From d48226feb71afd972f512780084dfdd644bb1f3b Mon Sep 17 00:00:00 2001 From: John Kerl Date: Sun, 28 Feb 2021 00:43:50 -0500 Subject: [PATCH] Port histogram verb from C to Go --- go/mlr.go | 2 +- .../expected/case-c-verb-histogram.sh.out | 122 +++--- go/src/cli/mlrcli_transformers.go | 1 + go/src/cliutil/verb_utils.go | 16 + go/src/lib/util.go | 4 +- go/src/transformers/cat.go | 26 +- go/src/transformers/histogram.go | 366 ++++++++++++++++++ go/src/types/mlrval_accessors.go | 15 + go/todo.txt | 4 + 9 files changed, 478 insertions(+), 78 deletions(-) create mode 100644 go/src/transformers/histogram.go diff --git a/go/mlr.go b/go/mlr.go index 529271d62..71033945e 100644 --- a/go/mlr.go +++ b/go/mlr.go @@ -14,7 +14,7 @@ import ( // ---------------------------------------------------------------- func main() { - runtime.GOMAXPROCS(4) // Seems reasonable these days + runtime.GOMAXPROCS(4) // Seems reasonable these days debug.SetGCPercent(500) // Empirical: See README-profiling.md // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - diff --git a/go/reg-test/expected/case-c-verb-histogram.sh.out b/go/reg-test/expected/case-c-verb-histogram.sh.out index 0e112cbff..237f92249 100644 --- a/go/reg-test/expected/case-c-verb-histogram.sh.out +++ b/go/reg-test/expected/case-c-verb-histogram.sh.out @@ -1,71 +1,71 @@ mlr --opprint histogram -f x,y --lo 0 --hi 1 --nbins 20 ./reg-test/input/small -bin_lo bin_hi x_count y_count -0.000000 0.050000 1 0 -0.050000 0.100000 0 0 -0.100000 0.150000 0 1 -0.150000 0.200000 0 1 -0.200000 0.250000 1 0 -0.250000 0.300000 0 0 -0.300000 0.350000 1 1 -0.350000 0.400000 1 0 -0.400000 0.450000 0 0 -0.450000 0.500000 0 1 -0.500000 0.550000 2 1 -0.550000 0.600000 2 0 -0.600000 0.650000 1 0 -0.650000 0.700000 0 0 -0.700000 0.750000 0 2 -0.750000 0.800000 1 0 -0.800000 0.850000 0 0 -0.850000 0.900000 0 1 -0.900000 0.950000 0 0 -0.950000 1.000000 0 2 +bin_lo bin_hi x_count y_count +0 0.05 1 0 +0.05 0.1 0 0 +0.1 0.15 0 1 +0.15 0.2 0 1 +0.2 0.25 1 0 +0.25 0.3 0 0 +0.3 0.35 1 1 +0.35 0.4 1 0 +0.4 0.45 0 0 +0.45 0.5 0 1 +0.5 0.55 2 1 +0.55 0.6 2 0 +0.6 0.65 1 0 +0.65 0.7 0 0 +0.7 0.75 0 2 +0.75 0.8 1 0 +0.8 0.85 0 0 +0.85 0.9 0 1 +0.9 0.95 0 0 +0.95 1 0 2 mlr --opprint histogram -f x,y --lo 0 --hi 1 --nbins 20 -o foo_ ./reg-test/input/small foo_bin_lo foo_bin_hi foo_x_count foo_y_count -0.000000 0.050000 1 0 -0.050000 0.100000 0 0 -0.100000 0.150000 0 1 -0.150000 0.200000 0 1 -0.200000 0.250000 1 0 -0.250000 0.300000 0 0 -0.300000 0.350000 1 1 -0.350000 0.400000 1 0 -0.400000 0.450000 0 0 -0.450000 0.500000 0 1 -0.500000 0.550000 2 1 -0.550000 0.600000 2 0 -0.600000 0.650000 1 0 -0.650000 0.700000 0 0 -0.700000 0.750000 0 2 -0.750000 0.800000 1 0 -0.800000 0.850000 0 0 -0.850000 0.900000 0 1 -0.900000 0.950000 0 0 -0.950000 1.000000 0 2 +0 0.05 1 0 +0.05 0.1 0 0 +0.1 0.15 0 1 +0.15 0.2 0 1 +0.2 0.25 1 0 +0.25 0.3 0 0 +0.3 0.35 1 1 +0.35 0.4 1 0 +0.4 0.45 0 0 +0.45 0.5 0 1 +0.5 0.55 2 1 +0.55 0.6 2 0 +0.6 0.65 1 0 +0.65 0.7 0 0 +0.7 0.75 0 2 +0.75 0.8 1 0 +0.8 0.85 0 0 +0.85 0.9 0 1 +0.9 0.95 0 0 +0.95 1 0 2 mlr --opprint histogram --nbins 9 --auto -f x,y ./reg-test/input/ints.dkvp -bin_lo bin_hi x_count y_count -0.000000 1.000000 8 1 -1.000000 2.000000 2 2 -2.000000 3.000000 5 5 -3.000000 4.000000 4 1 -4.000000 5.000000 3 2 -5.000000 6.000000 1 4 -6.000000 7.000000 3 4 -7.000000 8.000000 2 4 -8.000000 9.000000 2 7 +bin_lo bin_hi x_count y_count +0 1 8 1 +1 2 2 2 +2 3 5 5 +3 4 4 1 +4 5 3 2 +5 6 1 4 +6 7 3 4 +7 8 2 4 +8 9 2 7 mlr --opprint histogram --nbins 9 --auto -f x,y -o foo_ ./reg-test/input/ints.dkvp -foo_bin_lo bin_hi foo_x_count foo_y_count -0.000000 1.000000 8 1 -1.000000 2.000000 2 2 -2.000000 3.000000 5 5 -3.000000 4.000000 4 1 -4.000000 5.000000 3 2 -5.000000 6.000000 1 4 -6.000000 7.000000 3 4 -7.000000 8.000000 2 4 -8.000000 9.000000 2 7 +foo_bin_lo foo_bin_hi foo_x_count foo_y_count +0 1 8 1 +1 2 2 2 +2 3 5 5 +3 4 4 1 +4 5 3 2 +5 6 1 4 +6 7 3 4 +7 8 2 4 +8 9 2 7 diff --git a/go/src/cli/mlrcli_transformers.go b/go/src/cli/mlrcli_transformers.go index 998e9ddd4..0637e0a31 100644 --- a/go/src/cli/mlrcli_transformers.go +++ b/go/src/cli/mlrcli_transformers.go @@ -29,6 +29,7 @@ var MAPPER_LOOKUP_TABLE = []transforming.TransformerSetup{ transformers.GroupLikeSetup, transformers.HavingFieldsSetup, transformers.HeadSetup, + transformers.HistogramSetup, transformers.JoinSetup, transformers.JSONParseSetup, transformers.JSONStringifySetup, diff --git a/go/src/cliutil/verb_utils.go b/go/src/cliutil/verb_utils.go index b1a382a08..cedf083ad 100644 --- a/go/src/cliutil/verb_utils.go +++ b/go/src/cliutil/verb_utils.go @@ -54,3 +54,19 @@ func VerbGetIntArgOrDie(verb string, opt string, args []string, pargi *int, argc } return retval } + +// E.g. with ["-n", "10.3"], makes sure there is something in the "10.3" +// position, scans it as float, and returns it. +func VerbGetFloatArgOrDie(verb string, opt string, args []string, pargi *int, argc int) float64 { + flag := args[*pargi] + stringArg := VerbGetStringArgOrDie(verb, opt, args, pargi, argc) + retval, err := strconv.ParseFloat(stringArg, 64) + if err != nil { + fmt.Fprintf(os.Stderr, + "%s %s: could not scan flag \"%s\" argument \"%s\" as float.\n", + lib.MlrExeName(), verb, flag, stringArg, + ) + os.Exit(1) + } + return retval +} diff --git a/go/src/lib/util.go b/go/src/lib/util.go index 03975df93..3a95066fd 100644 --- a/go/src/lib/util.go +++ b/go/src/lib/util.go @@ -100,9 +100,9 @@ func TryIntFromString(input string) (int, bool) { } func TryFloat64FromString(input string) (float64, bool) { - ival, err := strconv.ParseFloat(input, 64) + fval, err := strconv.ParseFloat(input, 64) if err == nil { - return ival, true + return fval, true } else { return 0, false } diff --git a/go/src/transformers/cat.go b/go/src/transformers/cat.go index 41cbb676a..9951c565f 100644 --- a/go/src/transformers/cat.go +++ b/go/src/transformers/cat.go @@ -55,7 +55,7 @@ func transformerCatParseCLI( // Parse local flags doCounters := false counterFieldName := "" - groupByFieldNames := "" + var groupByFieldNames []string = nil for argi < argc /* variable increment: 1 or 2 depending on flag */ { opt := args[argi] @@ -74,7 +74,7 @@ func transformerCatParseCLI( counterFieldName = cliutil.VerbGetStringArgOrDie(verb, opt, args, &argi, argc) } else if opt == "-g" { - groupByFieldNames = cliutil.VerbGetStringArgOrDie(verb, opt, args, &argi, argc) + groupByFieldNames = cliutil.VerbGetStringArrayArgOrDie(verb, opt, args, &argi, argc) } else { transformerCatUsage(os.Stderr, true, 1) @@ -92,8 +92,8 @@ func transformerCatParseCLI( // ---------------------------------------------------------------- type TransformerCat struct { - doCounters bool - groupByFieldNameList []string + doCounters bool + groupByFieldNames []string counter int countsByGroup map[string]int @@ -106,27 +106,25 @@ type TransformerCat struct { func NewTransformerCat( doCounters bool, counterFieldName string, - groupByFieldNames string, + groupByFieldNames []string, ) (*TransformerCat, error) { - groupByFieldNameList := lib.SplitString(groupByFieldNames, ",") - if counterFieldName != "" { doCounters = true } this := &TransformerCat{ - doCounters: doCounters, - groupByFieldNameList: groupByFieldNameList, - counter: 0, - countsByGroup: make(map[string]int), - counterFieldName: counterFieldName, + doCounters: doCounters, + groupByFieldNames: groupByFieldNames, + counter: 0, + countsByGroup: make(map[string]int), + counterFieldName: counterFieldName, } if !doCounters { this.recordTransformerFunc = this.simpleCat } else { - if groupByFieldNames == "" { + if groupByFieldNames == nil { this.recordTransformerFunc = this.countersUngrouped } else { this.recordTransformerFunc = this.countersGrouped @@ -175,7 +173,7 @@ func (this *TransformerCat) countersGrouped( if !inrecAndContext.EndOfStream { inrec := inrecAndContext.Record - groupingKey, ok := inrec.GetSelectedValuesJoined(this.groupByFieldNameList) + groupingKey, ok := inrec.GetSelectedValuesJoined(this.groupByFieldNames) var counter int = 0 if !ok { // Treat as unkeyed diff --git a/go/src/transformers/histogram.go b/go/src/transformers/histogram.go new file mode 100644 index 000000000..4f9076036 --- /dev/null +++ b/go/src/transformers/histogram.go @@ -0,0 +1,366 @@ +package transformers + +import ( + "fmt" + "os" + "strings" + + "miller/src/cliutil" + "miller/src/lib" + "miller/src/transforming" + "miller/src/types" +) + +// ---------------------------------------------------------------- +const verbNameHistogram = "histogram" + +var HistogramSetup = transforming.TransformerSetup{ + Verb: verbNameHistogram, + UsageFunc: transformerHistogramUsage, + ParseCLIFunc: transformerHistogramParseCLI, + IgnoresInput: false, +} + +func transformerHistogramUsage( + o *os.File, + doExit bool, + exitCode int, +) { + argv0 := lib.MlrExeName() + verb := verbNameHistogram + fmt.Fprintf(o, "Just a histogram. Input values < lo or > hi are not counted.\n") + fmt.Fprintf(o, "Usage: %s %s [options]\n", argv0, verb) + fmt.Fprintf(o, "-f {a,b,c} Value-field names for histogram counts\n") + fmt.Fprintf(o, "--lo {lo} Histogram low value\n") + fmt.Fprintf(o, "--hi {hi} Histogram high value\n") + fmt.Fprintf(o, "--nbins {n} Number of histogram bins\n") + fmt.Fprintf(o, "--auto Automatically computes limits, ignoring --lo and --hi.\n") + fmt.Fprintf(o, " Holds all values in memory before producing any output.\n") + fmt.Fprintf(o, "-o {prefix} Prefix for output field name. Default: no prefix.\n") + fmt.Fprintf(o, "-h|--help Show this message.\n") + + if doExit { + os.Exit(exitCode) + } +} + +func transformerHistogramParseCLI( + pargi *int, + argc int, + args []string, + _ *cliutil.TReaderOptions, + __ *cliutil.TWriterOptions, +) transforming.IRecordTransformer { + + // Skip the verb name from the current spot in the mlr command line + argi := *pargi + verb := args[argi] + argi++ + + // Parse local flags + var valueFieldNames []string = nil + lo := 0.0 + nbins := 0 + hi := 0.0 + doAuto := false + outputPrefix := "" + + for argi < argc /* variable increment: 1 or 2 depending on flag */ { + opt := args[argi] + if !strings.HasPrefix(opt, "-") { + break // No more flag options to process + } + argi++ + + if opt == "-h" || opt == "--help" { + transformerHistogramUsage(os.Stdout, true, 0) + + } else if opt == "-f" { + valueFieldNames = cliutil.VerbGetStringArrayArgOrDie(verb, opt, args, &argi, argc) + + } else if opt == "--lo" { + lo = cliutil.VerbGetFloatArgOrDie(verb, opt, args, &argi, argc) + + } else if opt == "--nbins" { + nbins = cliutil.VerbGetIntArgOrDie(verb, opt, args, &argi, argc) + + } else if opt == "--hi" { + hi = cliutil.VerbGetFloatArgOrDie(verb, opt, args, &argi, argc) + + } else if opt == "--auto" { + doAuto = true + + } else if opt == "-o" { + outputPrefix = cliutil.VerbGetStringArgOrDie(verb, opt, args, &argi, argc) + + } else { + transformerHistogramUsage(os.Stderr, true, 1) + } + } + + if valueFieldNames == nil { + transformerHistogramUsage(os.Stderr, true, 1) + } + + if nbins == 0 { + transformerHistogramUsage(os.Stderr, true, 1) + } + + if lo == hi && !doAuto { + transformerHistogramUsage(os.Stderr, true, 1) + } + + transformer, _ := NewTransformerHistogram( + valueFieldNames, + lo, + nbins, + hi, + doAuto, + outputPrefix, + ) + + *pargi = argi + return transformer +} + +// ---------------------------------------------------------------- +const histogramVectorInitialSize = 1024 + +type TransformerHistogram struct { + valueFieldNames []string + lo float64 + nbins int + hi float64 + mul float64 + + countsByField map[string][]int + vectorsByFieldName map[string][]float64 // For auto-mode + outputPrefix string + + recordTransformerFunc transforming.RecordTransformerFunc +} + +// ---------------------------------------------------------------- +func NewTransformerHistogram( + valueFieldNames []string, + lo float64, + nbins int, + hi float64, + doAuto bool, + outputPrefix string, +) (*TransformerHistogram, error) { + + countsByField := make(map[string][]int) + for _, valueFieldName := range valueFieldNames { + countsByField[valueFieldName] = make([]int, nbins) + for i := 0; i < nbins; i++ { + countsByField[valueFieldName][i] = 0 + } + } + + this := &TransformerHistogram{ + valueFieldNames: valueFieldNames, + countsByField: countsByField, + outputPrefix: outputPrefix, + nbins: nbins, + } + + if !doAuto { + this.recordTransformerFunc = this.transformNonAuto + this.lo = lo + this.hi = hi + this.mul = float64(nbins) / (hi - lo) + } else { + this.vectorsByFieldName = make(map[string][]float64) + for _, valueFieldName := range valueFieldNames { + this.vectorsByFieldName[valueFieldName] = make([]float64, 0, histogramVectorInitialSize) + } + + this.recordTransformerFunc = this.transformAuto + } + + return this, nil +} + +// ---------------------------------------------------------------- +func (this *TransformerHistogram) Transform( + inrecAndContext *types.RecordAndContext, + outputChannel chan<- *types.RecordAndContext, +) { + this.recordTransformerFunc(inrecAndContext, outputChannel) +} + +// ---------------------------------------------------------------- +func (this *TransformerHistogram) transformNonAuto( + inrecAndContext *types.RecordAndContext, + outputChannel chan<- *types.RecordAndContext, +) { + if !inrecAndContext.EndOfStream { + this.ingestNonAuto(inrecAndContext) + } else { + this.emitNonAuto(&inrecAndContext.Context, outputChannel) + outputChannel <- inrecAndContext // end-of-stream marker + } +} + +func (this *TransformerHistogram) ingestNonAuto( + inrecAndContext *types.RecordAndContext, +) { + inrec := inrecAndContext.Record + for _, valueFieldName := range this.valueFieldNames { + stringValue := inrec.Get(valueFieldName) + if stringValue != nil { + floatValue, ok := stringValue.GetNumericToFloatValue() + if !ok { + fmt.Fprintf( + os.Stderr, + "%s %s: cannot parse \"%s\" as float.\n", + lib.MlrExeName(), verbNameHistogram, stringValue.String(), + ) + os.Exit(1) + } + if (floatValue >= this.lo) && (floatValue < this.hi) { + idx := int((floatValue - this.lo) * this.mul) + this.countsByField[valueFieldName][idx]++ + } else if floatValue == this.hi { + idx := this.nbins - 1 + this.countsByField[valueFieldName][idx]++ + } + } + } +} + +func (this *TransformerHistogram) emitNonAuto( + endOfStreamContext *types.Context, + outputChannel chan<- *types.RecordAndContext, +) { + countFieldNames := make(map[string]string) + for _, valueFieldName := range this.valueFieldNames { + countFieldNames[valueFieldName] = this.outputPrefix + valueFieldName + "_count" + } + for i := 0; i < this.nbins; i++ { + outrec := types.NewMlrmapAsRecord() + + outrec.PutReference( + this.outputPrefix+"bin_lo", + types.MlrvalPointerFromFloat64((this.lo+float64(i))/this.mul), + ) + outrec.PutReference( + this.outputPrefix+"bin_hi", + types.MlrvalPointerFromFloat64((this.lo+float64(i+1))/this.mul), + ) + + for _, valueFieldName := range this.valueFieldNames { + outrec.PutReference( + countFieldNames[valueFieldName], + types.MlrvalPointerFromInt(this.countsByField[valueFieldName][i]), + ) + } + + outputChannel <- types.NewRecordAndContext(outrec, endOfStreamContext) + } +} + +// ---------------------------------------------------------------- +func (this *TransformerHistogram) transformAuto( + inrecAndContext *types.RecordAndContext, + outputChannel chan<- *types.RecordAndContext, +) { + if !inrecAndContext.EndOfStream { + this.ingestAuto(inrecAndContext) + } else { + this.emitAuto(&inrecAndContext.Context, outputChannel) + outputChannel <- inrecAndContext // end-of-stream marker + } +} + +func (this *TransformerHistogram) ingestAuto( + inrecAndContext *types.RecordAndContext, +) { + inrec := inrecAndContext.Record + for _, valueFieldName := range this.valueFieldNames { + mvalue := inrec.Get(valueFieldName) + if mvalue != nil { + value := mvalue.GetNumericToFloatValueOrDie() + this.vectorsByFieldName[valueFieldName] = append(this.vectorsByFieldName[valueFieldName], value) + } + } +} + +func (this *TransformerHistogram) emitAuto( + endOfStreamContext *types.Context, + outputChannel chan<- *types.RecordAndContext, +) { + haveLoHi := false + lo := 0.0 + hi := 1.0 + nbins := this.nbins + + // Limits pass + for _, valueFieldName := range this.valueFieldNames { + vector := this.vectorsByFieldName[valueFieldName] + n := len(vector) + for i := 0; i < n; i++ { + value := vector[i] + if haveLoHi { + if lo > value { + lo = value + } + if hi < value { + hi = value + } + } else { + lo = value + hi = value + haveLoHi = true + } + } + } + + // Binning pass + mul := float64(nbins) / (hi - lo) + for _, valueFieldName := range this.valueFieldNames { + vector := this.vectorsByFieldName[valueFieldName] + counts := this.countsByField[valueFieldName] + lib.InternalCodingErrorIf(counts == nil) + n := len(vector) + for i := 0; i < n; i++ { + value := vector[i] + if (value >= lo) && (value < hi) { + idx := int(((value - lo) * mul)) + counts[idx]++ + } else if value == hi { + idx := nbins - 1 + counts[idx]++ + } + } + } + + // Emission pass + countFieldNames := make(map[string]string) + for _, valueFieldName := range this.valueFieldNames { + countFieldNames[valueFieldName] = this.outputPrefix + valueFieldName + "_count" + } + + for i := 0; i < nbins; i++ { + outrec := types.NewMlrmapAsRecord() + + outrec.PutReference( + this.outputPrefix+"bin_lo", + types.MlrvalPointerFromFloat64((lo+float64(i))/mul), + ) + outrec.PutReference( + this.outputPrefix+"bin_hi", + types.MlrvalPointerFromFloat64((lo+float64(i+1))/mul), + ) + + for _, valueFieldName := range this.valueFieldNames { + outrec.PutReference( + countFieldNames[valueFieldName], + types.MlrvalPointerFromInt(this.countsByField[valueFieldName][i]), + ) + } + + outputChannel <- types.NewRecordAndContext(outrec, endOfStreamContext) + } +} diff --git a/go/src/types/mlrval_accessors.go b/go/src/types/mlrval_accessors.go index 39348a58b..18839faa2 100644 --- a/go/src/types/mlrval_accessors.go +++ b/go/src/types/mlrval_accessors.go @@ -1,6 +1,8 @@ package types import ( + "fmt" + "os" "strconv" "miller/src/lib" @@ -132,6 +134,19 @@ func (this *Mlrval) GetNumericToFloatValue() (floatValue float64, isFloat bool) } } +func (this *Mlrval) GetNumericToFloatValueOrDie() (floatValue float64) { + floatValue, ok := this.GetNumericToFloatValue() + if !ok { + fmt.Fprintf( + os.Stderr, + "%s: couldn't parse \"%s\" as number.", + lib.MlrExeName(), this.String(), + ) + os.Exit(1) + } + return floatValue +} + func (this *Mlrval) GetBoolValue() (boolValue bool, isBool bool) { if this.mvtype == MT_BOOL { return this.boolval, true diff --git a/go/todo.txt b/go/todo.txt index 7e393e950..d73f3aca3 100644 --- a/go/todo.txt +++ b/go/todo.txt @@ -1,6 +1,10 @@ ================================================================ TOP OF LIST: +* audit + groupByFieldNames = cliutil.VerbGetStringArgOrDie + -> cliutil.VerbGetStringArrayArgOrDie + * regexes o finish stats1 -r o regex captures ...