Port histogram verb from C to Go

2026-01-23 02:14:13 +00:00 · 2021-02-28 00:43:50 -05:00 · 2021-02-28 00:43:50 -05:00 · d48226feb7
commit d48226feb7
parent ca320d8475
9 changed files with 478 additions and 78 deletions
--- a/go/mlr.go
+++ b/go/mlr.go
@ -14,7 +14,7 @@ import (

 // ----------------------------------------------------------------
 func main() {
-	runtime.GOMAXPROCS(4) // Seems reasonable these days
+	runtime.GOMAXPROCS(4)   // Seems reasonable these days
 	debug.SetGCPercent(500) // Empirical: See README-profiling.md

 	//  - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
--- a/go/reg-test/expected/case-c-verb-histogram.sh.out
+++ b/go/reg-test/expected/case-c-verb-histogram.sh.out
@ -1,71 +1,71 @@

 mlr --opprint histogram -f x,y --lo 0 --hi 1 --nbins 20 ./reg-test/input/small
-bin_lo   bin_hi   x_count y_count
-0.000000 0.050000 1       0
-0.050000 0.100000 0       0
-0.100000 0.150000 0       1
-0.150000 0.200000 0       1
-0.200000 0.250000 1       0
-0.250000 0.300000 0       0
-0.300000 0.350000 1       1
-0.350000 0.400000 1       0
-0.400000 0.450000 0       0
-0.450000 0.500000 0       1
-0.500000 0.550000 2       1
-0.550000 0.600000 2       0
-0.600000 0.650000 1       0
-0.650000 0.700000 0       0
-0.700000 0.750000 0       2
-0.750000 0.800000 1       0
-0.800000 0.850000 0       0
-0.850000 0.900000 0       1
-0.900000 0.950000 0       0
-0.950000 1.000000 0       2
+bin_lo bin_hi x_count y_count
+0      0.05   1       0
+0.05   0.1    0       0
+0.1    0.15   0       1
+0.15   0.2    0       1
+0.2    0.25   1       0
+0.25   0.3    0       0
+0.3    0.35   1       1
+0.35   0.4    1       0
+0.4    0.45   0       0
+0.45   0.5    0       1
+0.5    0.55   2       1
+0.55   0.6    2       0
+0.6    0.65   1       0
+0.65   0.7    0       0
+0.7    0.75   0       2
+0.75   0.8    1       0
+0.8    0.85   0       0
+0.85   0.9    0       1
+0.9    0.95   0       0
+0.95   1      0       2

 mlr --opprint histogram -f x,y --lo 0 --hi 1 --nbins 20 -o foo_ ./reg-test/input/small
 foo_bin_lo foo_bin_hi foo_x_count foo_y_count
-0.000000   0.050000   1           0
-0.050000   0.100000   0           0
-0.100000   0.150000   0           1
-0.150000   0.200000   0           1
-0.200000   0.250000   1           0
-0.250000   0.300000   0           0
-0.300000   0.350000   1           1
-0.350000   0.400000   1           0
-0.400000   0.450000   0           0
-0.450000   0.500000   0           1
-0.500000   0.550000   2           1
-0.550000   0.600000   2           0
-0.600000   0.650000   1           0
-0.650000   0.700000   0           0
-0.700000   0.750000   0           2
-0.750000   0.800000   1           0
-0.800000   0.850000   0           0
-0.850000   0.900000   0           1
-0.900000   0.950000   0           0
-0.950000   1.000000   0           2
+0          0.05       1           0
+0.05       0.1        0           0
+0.1        0.15       0           1
+0.15       0.2        0           1
+0.2        0.25       1           0
+0.25       0.3        0           0
+0.3        0.35       1           1
+0.35       0.4        1           0
+0.4        0.45       0           0
+0.45       0.5        0           1
+0.5        0.55       2           1
+0.55       0.6        2           0
+0.6        0.65       1           0
+0.65       0.7        0           0
+0.7        0.75       0           2
+0.75       0.8        1           0
+0.8        0.85       0           0
+0.85       0.9        0           1
+0.9        0.95       0           0
+0.95       1          0           2

 mlr --opprint histogram --nbins 9 --auto -f x,y ./reg-test/input/ints.dkvp
-bin_lo   bin_hi   x_count y_count
-0.000000 1.000000 8       1
-1.000000 2.000000 2       2
-2.000000 3.000000 5       5
-3.000000 4.000000 4       1
-4.000000 5.000000 3       2
-5.000000 6.000000 1       4
-6.000000 7.000000 3       4
-7.000000 8.000000 2       4
-8.000000 9.000000 2       7
+bin_lo bin_hi x_count y_count
+0      1      8       1
+1      2      2       2
+2      3      5       5
+3      4      4       1
+4      5      3       2
+5      6      1       4
+6      7      3       4
+7      8      2       4
+8      9      2       7

 mlr --opprint histogram --nbins 9 --auto -f x,y -o foo_ ./reg-test/input/ints.dkvp
-foo_bin_lo bin_hi   foo_x_count foo_y_count
-0.000000   1.000000 8           1
-1.000000   2.000000 2           2
-2.000000   3.000000 5           5
-3.000000   4.000000 4           1
-4.000000   5.000000 3           2
-5.000000   6.000000 1           4
-6.000000   7.000000 3           4
-7.000000   8.000000 2           4
-8.000000   9.000000 2           7
+foo_bin_lo foo_bin_hi foo_x_count foo_y_count
+0          1          8           1
+1          2          2           2
+2          3          5           5
+3          4          4           1
+4          5          3           2
+5          6          1           4
+6          7          3           4
+7          8          2           4
+8          9          2           7

--- a/go/src/cli/mlrcli_transformers.go
+++ b/go/src/cli/mlrcli_transformers.go
@ -29,6 +29,7 @@ var MAPPER_LOOKUP_TABLE = []transforming.TransformerSetup{
 	transformers.GroupLikeSetup,
 	transformers.HavingFieldsSetup,
 	transformers.HeadSetup,
+	transformers.HistogramSetup,
 	transformers.JoinSetup,
 	transformers.JSONParseSetup,
 	transformers.JSONStringifySetup,
--- a/go/src/cliutil/verb_utils.go
+++ b/go/src/cliutil/verb_utils.go
@ -54,3 +54,19 @@ func VerbGetIntArgOrDie(verb string, opt string, args []string, pargi *int, argc
 	}
 	return retval
 }
+
+// VerbGetFloatArgOrDie handles flags which take a floating-point value. E.g.
+// with ["-n", "10.3"], makes sure there is something in the "10.3" position,
+// scans it as float, and returns it.
+//
+// Fetching the value (and whatever argument-count checking and *pargi
+// advancing that entails) is delegated to VerbGetStringArgOrDie. If the value
+// cannot be scanned as a float64, a message naming the flag and the offending
+// text is written to stderr and the process exits with status 1.
+func VerbGetFloatArgOrDie(verb string, opt string, args []string, pargi *int, argc int) float64 {
+	// The flag is reported via opt rather than by re-reading args[*pargi]:
+	// callers pass the flag text in opt, and indexing args here would happen
+	// before VerbGetStringArgOrDie has had a chance to check the slot exists.
+	stringArg := VerbGetStringArgOrDie(verb, opt, args, pargi, argc)
+	retval, err := strconv.ParseFloat(stringArg, 64)
+	if err != nil {
+		fmt.Fprintf(os.Stderr,
+			"%s %s: could not scan flag \"%s\" argument \"%s\" as float.\n",
+			lib.MlrExeName(), verb, opt, stringArg,
+		)
+		os.Exit(1)
+	}
+	return retval
+}
--- a/go/src/lib/util.go
+++ b/go/src/lib/util.go
@ -100,9 +100,9 @@ func TryIntFromString(input string) (int, bool) {
 }

 func TryFloat64FromString(input string) (float64, bool) {
-	ival, err := strconv.ParseFloat(input, 64)
+	fval, err := strconv.ParseFloat(input, 64)
 	if err == nil {
-		return ival, true
+		return fval, true
 	} else {
 		return 0, false
 	}
--- a/go/src/transformers/cat.go
+++ b/go/src/transformers/cat.go
@ -55,7 +55,7 @@ func transformerCatParseCLI(
 	// Parse local flags
 	doCounters := false
 	counterFieldName := ""
-	groupByFieldNames := ""
+	var groupByFieldNames []string = nil

 	for argi < argc /* variable increment: 1 or 2 depending on flag */ {
 		opt := args[argi]
@ -74,7 +74,7 @@ func transformerCatParseCLI(
 			counterFieldName = cliutil.VerbGetStringArgOrDie(verb, opt, args, &argi, argc)

 		} else if opt == "-g" {
-			groupByFieldNames = cliutil.VerbGetStringArgOrDie(verb, opt, args, &argi, argc)
+			groupByFieldNames = cliutil.VerbGetStringArrayArgOrDie(verb, opt, args, &argi, argc)

 		} else {
 			transformerCatUsage(os.Stderr, true, 1)
@ -92,8 +92,8 @@ func transformerCatParseCLI(

 // ----------------------------------------------------------------
 type TransformerCat struct {
-	doCounters           bool
-	groupByFieldNameList []string
+	doCounters        bool
+	groupByFieldNames []string

 	counter          int
 	countsByGroup    map[string]int
@ -106,27 +106,25 @@ type TransformerCat struct {
 func NewTransformerCat(
 	doCounters bool,
 	counterFieldName string,
-	groupByFieldNames string,
+	groupByFieldNames []string,
 ) (*TransformerCat, error) {

-	groupByFieldNameList := lib.SplitString(groupByFieldNames, ",")
-
 	if counterFieldName != "" {
 		doCounters = true
 	}

 	this := &TransformerCat{
-		doCounters:           doCounters,
-		groupByFieldNameList: groupByFieldNameList,
-		counter:              0,
-		countsByGroup:        make(map[string]int),
-		counterFieldName:     counterFieldName,
+		doCounters:        doCounters,
+		groupByFieldNames: groupByFieldNames,
+		counter:           0,
+		countsByGroup:     make(map[string]int),
+		counterFieldName:  counterFieldName,
 	}

 	if !doCounters {
 		this.recordTransformerFunc = this.simpleCat
 	} else {
-		if groupByFieldNames == "" {
+		if groupByFieldNames == nil {
 			this.recordTransformerFunc = this.countersUngrouped
 		} else {
 			this.recordTransformerFunc = this.countersGrouped
@ -175,7 +173,7 @@ func (this *TransformerCat) countersGrouped(
 	if !inrecAndContext.EndOfStream {
 		inrec := inrecAndContext.Record

-		groupingKey, ok := inrec.GetSelectedValuesJoined(this.groupByFieldNameList)
+		groupingKey, ok := inrec.GetSelectedValuesJoined(this.groupByFieldNames)
 		var counter int = 0
 		if !ok {
 			// Treat as unkeyed
--- a/go/src/transformers/histogram.go
+++ b/go/src/transformers/histogram.go
@ -0,0 +1,366 @@
+package transformers
+
+import (
+	"fmt"
+	"os"
+	"strings"
+
+	"miller/src/cliutil"
+	"miller/src/lib"
+	"miller/src/transforming"
+	"miller/src/types"
+)
+
+// ----------------------------------------------------------------
+// verbNameHistogram is the name of this verb; it is used for the setup entry
+// below and in usage and error messages.
+const verbNameHistogram = "histogram"
+
+// HistogramSetup bundles the verb name with the functions that print its
+// usage and parse its command-line flags.
+var HistogramSetup = transforming.TransformerSetup{
+	Verb:         verbNameHistogram,
+	UsageFunc:    transformerHistogramUsage,
+	ParseCLIFunc: transformerHistogramParseCLI,
+	IgnoresInput: false,
+}
+
+// transformerHistogramUsage writes the help text for the histogram verb to o.
+// When doExit is true the process is then terminated with exitCode.
+func transformerHistogramUsage(
+	o *os.File,
+	doExit bool,
+	exitCode int,
+) {
+	exeName := lib.MlrExeName()
+
+	fmt.Fprint(o, "Just a histogram. Input values < lo or > hi are not counted.\n")
+	fmt.Fprintf(o, "Usage: %s %s [options]\n", exeName, verbNameHistogram)
+
+	// None of the option lines carries a format verb, so they are written
+	// out verbatim, one per call.
+	optionLines := []string{
+		"-f {a,b,c}    Value-field names for histogram counts\n",
+		"--lo {lo}     Histogram low value\n",
+		"--hi {hi}     Histogram high value\n",
+		"--nbins {n}   Number of histogram bins\n",
+		"--auto        Automatically computes limits, ignoring --lo and --hi.\n",
+		"              Holds all values in memory before producing any output.\n",
+		"-o {prefix}   Prefix for output field name. Default: no prefix.\n",
+		"-h|--help Show this message.\n",
+	}
+	for _, optionLine := range optionLines {
+		fmt.Fprint(o, optionLine)
+	}
+
+	if !doExit {
+		return
+	}
+	os.Exit(exitCode)
+}
+
+// transformerHistogramParseCLI parses the histogram verb's flags out of args,
+// starting at the verb name found at args[*pargi], and constructs the
+// transformer. On return *pargi is the index of the first argument which was
+// not consumed as a flag of this verb. The reader and writer options are not
+// used by this verb.
+//
+// Usage is printed to stderr and the process exits with status 1 when an
+// unrecognized flag is given, when no value fields are named, when the bin
+// count is not positive, or when --lo equals --hi without --auto.
+func transformerHistogramParseCLI(
+	pargi *int,
+	argc int,
+	args []string,
+	_ *cliutil.TReaderOptions,
+	__ *cliutil.TWriterOptions,
+) transforming.IRecordTransformer {
+
+	// Skip the verb name from the current spot in the mlr command line
+	argi := *pargi
+	verb := args[argi]
+	argi++
+
+	// Parse local flags
+	var valueFieldNames []string = nil
+	lo := 0.0
+	nbins := 0
+	hi := 0.0
+	doAuto := false
+	outputPrefix := ""
+
+	for argi < argc /* variable increment: 1 or 2 depending on flag */ {
+		opt := args[argi]
+		if !strings.HasPrefix(opt, "-") {
+			break // No more flag options to process
+		}
+		argi++
+
+		switch opt {
+		case "-h", "--help":
+			transformerHistogramUsage(os.Stdout, true, 0)
+
+		case "-f":
+			valueFieldNames = cliutil.VerbGetStringArrayArgOrDie(verb, opt, args, &argi, argc)
+
+		case "--lo":
+			lo = cliutil.VerbGetFloatArgOrDie(verb, opt, args, &argi, argc)
+
+		case "--nbins":
+			nbins = cliutil.VerbGetIntArgOrDie(verb, opt, args, &argi, argc)
+
+		case "--hi":
+			hi = cliutil.VerbGetFloatArgOrDie(verb, opt, args, &argi, argc)
+
+		case "--auto":
+			doAuto = true
+
+		case "-o":
+			outputPrefix = cliutil.VerbGetStringArgOrDie(verb, opt, args, &argi, argc)
+
+		default:
+			transformerHistogramUsage(os.Stderr, true, 1)
+		}
+	}
+
+	if len(valueFieldNames) == 0 {
+		transformerHistogramUsage(os.Stderr, true, 1)
+	}
+
+	// A negative bin count must be rejected along with zero: it would
+	// otherwise reach make([]int, nbins) in the constructor and panic.
+	if nbins <= 0 {
+		transformerHistogramUsage(os.Stderr, true, 1)
+	}
+
+	// In non-auto mode the bin width is (hi - lo) / nbins, so the two limits
+	// must differ.
+	if lo == hi && !doAuto {
+		transformerHistogramUsage(os.Stderr, true, 1)
+	}
+
+	transformer, err := NewTransformerHistogram(
+		valueFieldNames,
+		lo,
+		nbins,
+		hi,
+		doAuto,
+		outputPrefix,
+	)
+	if err != nil {
+		fmt.Fprintf(os.Stderr, "%s %s: %v\n", lib.MlrExeName(), verb, err)
+		os.Exit(1)
+	}
+
+	*pargi = argi
+	return transformer
+}
+
+// ----------------------------------------------------------------
+// histogramVectorInitialSize is the initial capacity given to each per-field
+// slice of retained values in auto mode.
+const histogramVectorInitialSize = 1024
+
+// TransformerHistogram counts, per value field, how many input values fall
+// into each of nbins bins. In non-auto mode the limits are fixed up front and
+// values are binned as they arrive; in auto mode the values are retained and
+// the limits are computed from them at end of stream.
+type TransformerHistogram struct {
+	// Names of the fields whose values are histogrammed.
+	valueFieldNames []string
+	// Lower limit; set by the constructor in non-auto mode only.
+	lo              float64
+	// Number of bins.
+	nbins           int
+	// Upper limit; set by the constructor in non-auto mode only.
+	hi              float64
+	// float64(nbins) / (hi - lo); set by the constructor in non-auto mode only.
+	mul             float64
+
+	// Per-field bin counts; each slice has length nbins.
+	countsByField      map[string][]int
+	vectorsByFieldName map[string][]float64 // For auto-mode
+	// Prepended to every output field name.
+	outputPrefix       string
+
+	// Either transformNonAuto or transformAuto, chosen by the constructor.
+	recordTransformerFunc transforming.RecordTransformerFunc
+}
+
+// ----------------------------------------------------------------
+// NewTransformerHistogram constructs a histogram transformer.
+//
+// valueFieldNames are the fields to be counted; nbins is the number of bins;
+// outputPrefix is prepended to every output field name. When doAuto is false,
+// lo and hi are the histogram limits and records are binned as they arrive;
+// the caller is expected to have ensured lo != hi. When doAuto is true, lo and
+// hi are ignored: values are retained and the limits are computed from the
+// data at end of stream.
+//
+// An error is returned if nbins is not positive.
+func NewTransformerHistogram(
+	valueFieldNames []string,
+	lo float64,
+	nbins int,
+	hi float64,
+	doAuto bool,
+	outputPrefix string,
+) (*TransformerHistogram, error) {
+
+	// Without this check a negative nbins panics in make below, and a zero
+	// nbins yields empty count slices which the ingesters would index into.
+	if nbins <= 0 {
+		return nil, fmt.Errorf("number of bins must be positive; got %d", nbins)
+	}
+
+	countsByField := make(map[string][]int, len(valueFieldNames))
+	for _, valueFieldName := range valueFieldNames {
+		// make returns a zero-filled slice, so no explicit clearing is needed.
+		countsByField[valueFieldName] = make([]int, nbins)
+	}
+
+	this := &TransformerHistogram{
+		valueFieldNames: valueFieldNames,
+		countsByField:   countsByField,
+		outputPrefix:    outputPrefix,
+		nbins:           nbins,
+	}
+
+	if !doAuto {
+		this.recordTransformerFunc = this.transformNonAuto
+		this.lo = lo
+		this.hi = hi
+		// Multiplying (value - lo) by mul maps [lo, hi) onto [0, nbins).
+		this.mul = float64(nbins) / (hi - lo)
+	} else {
+		this.vectorsByFieldName = make(map[string][]float64, len(valueFieldNames))
+		for _, valueFieldName := range valueFieldNames {
+			this.vectorsByFieldName[valueFieldName] = make([]float64, 0, histogramVectorInitialSize)
+		}
+
+		this.recordTransformerFunc = this.transformAuto
+	}
+
+	return this, nil
+}
+
+// ----------------------------------------------------------------
+// Transform hands the record (or end-of-stream marker) to whichever handler
+// the constructor selected: transformNonAuto or transformAuto.
+func (this *TransformerHistogram) Transform(
+	inrecAndContext *types.RecordAndContext,
+	outputChannel chan<- *types.RecordAndContext,
+) {
+	handler := this.recordTransformerFunc
+	handler(inrecAndContext, outputChannel)
+}
+
+// ----------------------------------------------------------------
+// transformNonAuto is the per-record handler for fixed-limits mode. Records
+// are binned as they arrive; at end of stream the histogram records are
+// emitted, followed by the end-of-stream marker itself.
+func (this *TransformerHistogram) transformNonAuto(
+	inrecAndContext *types.RecordAndContext,
+	outputChannel chan<- *types.RecordAndContext,
+) {
+	if inrecAndContext.EndOfStream {
+		this.emitNonAuto(&inrecAndContext.Context, outputChannel)
+		outputChannel <- inrecAndContext // end-of-stream marker
+		return
+	}
+
+	this.ingestNonAuto(inrecAndContext)
+}
+
+func (this *TransformerHistogram) ingestNonAuto(
+	inrecAndContext *types.RecordAndContext,
+) {
+	inrec := inrecAndContext.Record
+	for _, valueFieldName := range this.valueFieldNames {
+		stringValue := inrec.Get(valueFieldName)
+		if stringValue != nil {
+			floatValue, ok := stringValue.GetNumericToFloatValue()
+			if !ok {
+				fmt.Fprintf(
+					os.Stderr,
+					"%s %s: cannot parse \"%s\" as float.\n",
+					lib.MlrExeName(), verbNameHistogram, stringValue.String(),
+				)
+				os.Exit(1)
+			}
+			if (floatValue >= this.lo) && (floatValue < this.hi) {
+				idx := int((floatValue - this.lo) * this.mul)
+				this.countsByField[valueFieldName][idx]++
+			} else if floatValue == this.hi {
+				idx := this.nbins - 1
+				this.countsByField[valueFieldName][idx]++
+			}
+		}
+	}
+}
+
+// emitNonAuto writes one output record per bin to outputChannel, in bin
+// order. Each record has the bin's lower and upper bounds in "bin_lo" and
+// "bin_hi", followed by one "<field>_count" per value field; all of these
+// names are prefixed with the output prefix. The end-of-stream marker is not
+// written here.
+func (this *TransformerHistogram) emitNonAuto(
+	endOfStreamContext *types.Context,
+	outputChannel chan<- *types.RecordAndContext,
+) {
+	countFieldNames := make(map[string]string, len(this.valueFieldNames))
+	for _, valueFieldName := range this.valueFieldNames {
+		countFieldNames[valueFieldName] = this.outputPrefix + valueFieldName + "_count"
+	}
+
+	binLoFieldName := this.outputPrefix + "bin_lo"
+	binHiFieldName := this.outputPrefix + "bin_hi"
+
+	for i := 0; i < this.nbins; i++ {
+		outrec := types.NewMlrmapAsRecord()
+
+		// Bin i spans [lo + i*width, lo + (i+1)*width) with width = 1/mul.
+		// The offset lo must be added after dividing by mul, not before:
+		// otherwise the bounds are wrong whenever lo is nonzero.
+		binLo := this.lo + float64(i)/this.mul
+		binHi := this.lo + float64(i+1)/this.mul
+
+		outrec.PutReference(binLoFieldName, types.MlrvalPointerFromFloat64(binLo))
+		outrec.PutReference(binHiFieldName, types.MlrvalPointerFromFloat64(binHi))
+
+		for _, valueFieldName := range this.valueFieldNames {
+			outrec.PutReference(
+				countFieldNames[valueFieldName],
+				types.MlrvalPointerFromInt(this.countsByField[valueFieldName][i]),
+			)
+		}
+
+		outputChannel <- types.NewRecordAndContext(outrec, endOfStreamContext)
+	}
+}
+
+// ----------------------------------------------------------------
+// transformAuto is the per-record handler for auto-limits mode. Values are
+// retained as records arrive; at end of stream the limits are computed, the
+// histogram records are emitted, and then the end-of-stream marker itself is
+// passed along.
+func (this *TransformerHistogram) transformAuto(
+	inrecAndContext *types.RecordAndContext,
+	outputChannel chan<- *types.RecordAndContext,
+) {
+	if inrecAndContext.EndOfStream {
+		this.emitAuto(&inrecAndContext.Context, outputChannel)
+		outputChannel <- inrecAndContext // end-of-stream marker
+		return
+	}
+
+	this.ingestAuto(inrecAndContext)
+}
+
+// ingestAuto retains one record's values for auto-limits mode: for each value
+// field present in the record, its float value is appended to that field's
+// vector. The float conversion is done by GetNumericToFloatValueOrDie.
+func (this *TransformerHistogram) ingestAuto(
+	inrecAndContext *types.RecordAndContext,
+) {
+	record := inrecAndContext.Record
+	for _, fieldName := range this.valueFieldNames {
+		mvalue := record.Get(fieldName)
+		if mvalue == nil {
+			continue // Field absent from this record
+		}
+
+		floatValue := mvalue.GetNumericToFloatValueOrDie()
+		retained := this.vectorsByFieldName[fieldName]
+		this.vectorsByFieldName[fieldName] = append(retained, floatValue)
+	}
+}
+
+// emitAuto computes the histogram in auto-limits mode and writes one output
+// record per bin to outputChannel, in bin order. It makes three passes: find
+// the minimum and maximum over all retained values of all value fields; bin
+// every retained value between those limits; emit the bins. Output records
+// have the same layout as in non-auto mode. The end-of-stream marker is not
+// written here.
+//
+// If no values were retained, the limits default to 0 and 1. If all retained
+// values are equal then hi == lo and mul is +Inf: every value is counted in
+// the last bin, and every bin's bounds come out equal to lo.
+func (this *TransformerHistogram) emitAuto(
+	endOfStreamContext *types.Context,
+	outputChannel chan<- *types.RecordAndContext,
+) {
+	haveLoHi := false
+	lo := 0.0
+	hi := 1.0
+	nbins := this.nbins
+
+	// Limits pass
+	for _, valueFieldName := range this.valueFieldNames {
+		for _, value := range this.vectorsByFieldName[valueFieldName] {
+			if !haveLoHi {
+				lo = value
+				hi = value
+				haveLoHi = true
+				continue
+			}
+			if value < lo {
+				lo = value
+			}
+			if value > hi {
+				hi = value
+			}
+		}
+	}
+
+	// Binning pass
+	mul := float64(nbins) / (hi - lo)
+	for _, valueFieldName := range this.valueFieldNames {
+		counts := this.countsByField[valueFieldName]
+		lib.InternalCodingErrorIf(counts == nil)
+		for _, value := range this.vectorsByFieldName[valueFieldName] {
+			if (value >= lo) && (value < hi) {
+				idx := int((value - lo) * mul)
+				// For values just below hi, floating-point roundoff in the
+				// product can reach nbins; keep the index within the slice.
+				if idx >= nbins {
+					idx = nbins - 1
+				}
+				counts[idx]++
+			} else if value == hi {
+				counts[nbins-1]++
+			}
+		}
+	}
+
+	// Emission pass
+	countFieldNames := make(map[string]string, len(this.valueFieldNames))
+	for _, valueFieldName := range this.valueFieldNames {
+		countFieldNames[valueFieldName] = this.outputPrefix + valueFieldName + "_count"
+	}
+
+	binLoFieldName := this.outputPrefix + "bin_lo"
+	binHiFieldName := this.outputPrefix + "bin_hi"
+
+	for i := 0; i < nbins; i++ {
+		outrec := types.NewMlrmapAsRecord()
+
+		// Bin i spans [lo + i*width, lo + (i+1)*width) with width = 1/mul.
+		// The offset lo must be added after dividing by mul, not before:
+		// otherwise the bounds are wrong whenever lo is nonzero.
+		binLo := lo + float64(i)/mul
+		binHi := lo + float64(i+1)/mul
+
+		outrec.PutReference(binLoFieldName, types.MlrvalPointerFromFloat64(binLo))
+		outrec.PutReference(binHiFieldName, types.MlrvalPointerFromFloat64(binHi))
+
+		for _, valueFieldName := range this.valueFieldNames {
+			outrec.PutReference(
+				countFieldNames[valueFieldName],
+				types.MlrvalPointerFromInt(this.countsByField[valueFieldName][i]),
+			)
+		}
+
+		outputChannel <- types.NewRecordAndContext(outrec, endOfStreamContext)
+	}
+}
--- a/go/src/types/mlrval_accessors.go
+++ b/go/src/types/mlrval_accessors.go
@ -1,6 +1,8 @@
 package types

 import (
+	"fmt"
+	"os"
 	"strconv"

 	"miller/src/lib"
@ -132,6 +134,19 @@ func (this *Mlrval) GetNumericToFloatValue() (floatValue float64, isFloat bool)
 	}
 }

+// GetNumericToFloatValueOrDie returns the float value obtained from
+// GetNumericToFloatValue. If that conversion reports failure, an error
+// message is written to stderr and the process exits with status 1.
+func (this *Mlrval) GetNumericToFloatValueOrDie() (floatValue float64) {
+	floatValue, ok := this.GetNumericToFloatValue()
+	if !ok {
+		// The message ends with a newline so it does not run into whatever
+		// is printed to the terminal next.
+		fmt.Fprintf(
+			os.Stderr,
+			"%s: couldn't parse \"%s\" as number.\n",
+			lib.MlrExeName(), this.String(),
+		)
+		os.Exit(1)
+	}
+	return floatValue
+}
+
 func (this *Mlrval) GetBoolValue() (boolValue bool, isBool bool) {
 	if this.mvtype == MT_BOOL {
 		return this.boolval, true
--- a/go/todo.txt
+++ b/go/todo.txt
@ -1,6 +1,10 @@
 ================================================================
 TOP OF LIST:

+* audit
+  groupByFieldNames = cliutil.VerbGetStringArgOrDie
+  -> cliutil.VerbGetStringArrayArgOrDie
+
 * regexes
  o finish stats1 -r
  o regex captures ...