Port histogram verb from C to Go

This commit is contained in:
John Kerl 2021-02-28 00:43:50 -05:00
parent ca320d8475
commit d48226feb7
9 changed files with 478 additions and 78 deletions

View file

@ -14,7 +14,7 @@ import (
// ----------------------------------------------------------------
func main() {
runtime.GOMAXPROCS(4) // Seems reasonable these days
runtime.GOMAXPROCS(4) // Seems reasonable these days
debug.SetGCPercent(500) // Empirical: See README-profiling.md
// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

View file

@ -1,71 +1,71 @@
mlr --opprint histogram -f x,y --lo 0 --hi 1 --nbins 20 ./reg-test/input/small
bin_lo bin_hi x_count y_count
0.000000 0.050000 1 0
0.050000 0.100000 0 0
0.100000 0.150000 0 1
0.150000 0.200000 0 1
0.200000 0.250000 1 0
0.250000 0.300000 0 0
0.300000 0.350000 1 1
0.350000 0.400000 1 0
0.400000 0.450000 0 0
0.450000 0.500000 0 1
0.500000 0.550000 2 1
0.550000 0.600000 2 0
0.600000 0.650000 1 0
0.650000 0.700000 0 0
0.700000 0.750000 0 2
0.750000 0.800000 1 0
0.800000 0.850000 0 0
0.850000 0.900000 0 1
0.900000 0.950000 0 0
0.950000 1.000000 0 2
bin_lo bin_hi x_count y_count
0 0.05 1 0
0.05 0.1 0 0
0.1 0.15 0 1
0.15 0.2 0 1
0.2 0.25 1 0
0.25 0.3 0 0
0.3 0.35 1 1
0.35 0.4 1 0
0.4 0.45 0 0
0.45 0.5 0 1
0.5 0.55 2 1
0.55 0.6 2 0
0.6 0.65 1 0
0.65 0.7 0 0
0.7 0.75 0 2
0.75 0.8 1 0
0.8 0.85 0 0
0.85 0.9 0 1
0.9 0.95 0 0
0.95 1 0 2
mlr --opprint histogram -f x,y --lo 0 --hi 1 --nbins 20 -o foo_ ./reg-test/input/small
foo_bin_lo foo_bin_hi foo_x_count foo_y_count
0.000000 0.050000 1 0
0.050000 0.100000 0 0
0.100000 0.150000 0 1
0.150000 0.200000 0 1
0.200000 0.250000 1 0
0.250000 0.300000 0 0
0.300000 0.350000 1 1
0.350000 0.400000 1 0
0.400000 0.450000 0 0
0.450000 0.500000 0 1
0.500000 0.550000 2 1
0.550000 0.600000 2 0
0.600000 0.650000 1 0
0.650000 0.700000 0 0
0.700000 0.750000 0 2
0.750000 0.800000 1 0
0.800000 0.850000 0 0
0.850000 0.900000 0 1
0.900000 0.950000 0 0
0.950000 1.000000 0 2
0 0.05 1 0
0.05 0.1 0 0
0.1 0.15 0 1
0.15 0.2 0 1
0.2 0.25 1 0
0.25 0.3 0 0
0.3 0.35 1 1
0.35 0.4 1 0
0.4 0.45 0 0
0.45 0.5 0 1
0.5 0.55 2 1
0.55 0.6 2 0
0.6 0.65 1 0
0.65 0.7 0 0
0.7 0.75 0 2
0.75 0.8 1 0
0.8 0.85 0 0
0.85 0.9 0 1
0.9 0.95 0 0
0.95 1 0 2
mlr --opprint histogram --nbins 9 --auto -f x,y ./reg-test/input/ints.dkvp
bin_lo bin_hi x_count y_count
0.000000 1.000000 8 1
1.000000 2.000000 2 2
2.000000 3.000000 5 5
3.000000 4.000000 4 1
4.000000 5.000000 3 2
5.000000 6.000000 1 4
6.000000 7.000000 3 4
7.000000 8.000000 2 4
8.000000 9.000000 2 7
bin_lo bin_hi x_count y_count
0 1 8 1
1 2 2 2
2 3 5 5
3 4 4 1
4 5 3 2
5 6 1 4
6 7 3 4
7 8 2 4
8 9 2 7
mlr --opprint histogram --nbins 9 --auto -f x,y -o foo_ ./reg-test/input/ints.dkvp
foo_bin_lo bin_hi foo_x_count foo_y_count
0.000000 1.000000 8 1
1.000000 2.000000 2 2
2.000000 3.000000 5 5
3.000000 4.000000 4 1
4.000000 5.000000 3 2
5.000000 6.000000 1 4
6.000000 7.000000 3 4
7.000000 8.000000 2 4
8.000000 9.000000 2 7
foo_bin_lo foo_bin_hi foo_x_count foo_y_count
0 1 8 1
1 2 2 2
2 3 5 5
3 4 4 1
4 5 3 2
5 6 1 4
6 7 3 4
7 8 2 4
8 9 2 7

View file

@ -29,6 +29,7 @@ var MAPPER_LOOKUP_TABLE = []transforming.TransformerSetup{
transformers.GroupLikeSetup,
transformers.HavingFieldsSetup,
transformers.HeadSetup,
transformers.HistogramSetup,
transformers.JoinSetup,
transformers.JSONParseSetup,
transformers.JSONStringifySetup,

View file

@ -54,3 +54,19 @@ func VerbGetIntArgOrDie(verb string, opt string, args []string, pargi *int, argc
}
return retval
}
// VerbGetFloatArgOrDie handles a float-valued verb flag. E.g. with
// ["-n", "10.3"], it obtains the "10.3" argument via VerbGetStringArgOrDie,
// scans it as a float64, and returns it.
//
// verb and opt are used for error reporting; args, pargi, and argc are passed
// through to VerbGetStringArgOrDie unchanged. If the argument cannot be
// scanned as a float, a message is written to stderr and the process exits
// with status 1.
func VerbGetFloatArgOrDie(verb string, opt string, args []string, pargi *int, argc int) float64 {
	// The flag name is taken from opt rather than read back out of args.
	// Callers that have already stepped past the flag would otherwise have the
	// argument slot reported as the flag, and indexing args[*pargi] before the
	// argument has been fetched can run off the end of args when the flag is
	// the last word on the command line.
	stringArg := VerbGetStringArgOrDie(verb, opt, args, pargi, argc)
	retval, err := strconv.ParseFloat(stringArg, 64)
	if err != nil {
		fmt.Fprintf(os.Stderr,
			"%s %s: could not scan flag \"%s\" argument \"%s\" as float.\n",
			lib.MlrExeName(), verb, opt, stringArg,
		)
		os.Exit(1)
	}
	return retval
}

View file

@ -100,9 +100,9 @@ func TryIntFromString(input string) (int, bool) {
}
func TryFloat64FromString(input string) (float64, bool) {
ival, err := strconv.ParseFloat(input, 64)
fval, err := strconv.ParseFloat(input, 64)
if err == nil {
return ival, true
return fval, true
} else {
return 0, false
}

View file

@ -55,7 +55,7 @@ func transformerCatParseCLI(
// Parse local flags
doCounters := false
counterFieldName := ""
groupByFieldNames := ""
var groupByFieldNames []string = nil
for argi < argc /* variable increment: 1 or 2 depending on flag */ {
opt := args[argi]
@ -74,7 +74,7 @@ func transformerCatParseCLI(
counterFieldName = cliutil.VerbGetStringArgOrDie(verb, opt, args, &argi, argc)
} else if opt == "-g" {
groupByFieldNames = cliutil.VerbGetStringArgOrDie(verb, opt, args, &argi, argc)
groupByFieldNames = cliutil.VerbGetStringArrayArgOrDie(verb, opt, args, &argi, argc)
} else {
transformerCatUsage(os.Stderr, true, 1)
@ -92,8 +92,8 @@ func transformerCatParseCLI(
// ----------------------------------------------------------------
type TransformerCat struct {
doCounters bool
groupByFieldNameList []string
doCounters bool
groupByFieldNames []string
counter int
countsByGroup map[string]int
@ -106,27 +106,25 @@ type TransformerCat struct {
func NewTransformerCat(
doCounters bool,
counterFieldName string,
groupByFieldNames string,
groupByFieldNames []string,
) (*TransformerCat, error) {
groupByFieldNameList := lib.SplitString(groupByFieldNames, ",")
if counterFieldName != "" {
doCounters = true
}
this := &TransformerCat{
doCounters: doCounters,
groupByFieldNameList: groupByFieldNameList,
counter: 0,
countsByGroup: make(map[string]int),
counterFieldName: counterFieldName,
doCounters: doCounters,
groupByFieldNames: groupByFieldNames,
counter: 0,
countsByGroup: make(map[string]int),
counterFieldName: counterFieldName,
}
if !doCounters {
this.recordTransformerFunc = this.simpleCat
} else {
if groupByFieldNames == "" {
if groupByFieldNames == nil {
this.recordTransformerFunc = this.countersUngrouped
} else {
this.recordTransformerFunc = this.countersGrouped
@ -175,7 +173,7 @@ func (this *TransformerCat) countersGrouped(
if !inrecAndContext.EndOfStream {
inrec := inrecAndContext.Record
groupingKey, ok := inrec.GetSelectedValuesJoined(this.groupByFieldNameList)
groupingKey, ok := inrec.GetSelectedValuesJoined(this.groupByFieldNames)
var counter int = 0
if !ok {
// Treat as unkeyed

View file

@ -0,0 +1,366 @@
package transformers
import (
"fmt"
"os"
"strings"
"miller/src/cliutil"
"miller/src/lib"
"miller/src/transforming"
"miller/src/types"
)
// ----------------------------------------------------------------
// verbNameHistogram is the verb's name; it is used in the setup table entry
// below and in this file's usage and error messages.
const verbNameHistogram = "histogram"
// HistogramSetup ties together the histogram verb's name, its usage printer,
// and its command-line parser as a transforming.TransformerSetup.
var HistogramSetup = transforming.TransformerSetup{
Verb: verbNameHistogram,
UsageFunc: transformerHistogramUsage,
ParseCLIFunc: transformerHistogramParseCLI,
// NOTE(review): presumably false because this verb consumes its input
// records -- confirm against the TransformerSetup definition.
IgnoresInput: false,
}
// transformerHistogramUsage writes the histogram verb's help text to o. When
// doExit is true the process then exits with exitCode; otherwise the function
// simply returns.
func transformerHistogramUsage(
	o *os.File,
	doExit bool,
	exitCode int,
) {
	exeName := lib.MlrExeName()

	fmt.Fprint(o, "Just a histogram. Input values < lo or > hi are not counted.\n")
	fmt.Fprintf(o, "Usage: %s %s [options]\n", exeName, verbNameHistogram)

	// The per-flag help lines need no formatting, so they are emitted verbatim.
	flagHelpLines := []string{
		"-f {a,b,c} Value-field names for histogram counts\n",
		"--lo {lo} Histogram low value\n",
		"--hi {hi} Histogram high value\n",
		"--nbins {n} Number of histogram bins\n",
		"--auto Automatically computes limits, ignoring --lo and --hi.\n",
		" Holds all values in memory before producing any output.\n",
		"-o {prefix} Prefix for output field name. Default: no prefix.\n",
		"-h|--help Show this message.\n",
	}
	for _, helpLine := range flagHelpLines {
		fmt.Fprint(o, helpLine)
	}

	if !doExit {
		return
	}
	os.Exit(exitCode)
}
// transformerHistogramParseCLI parses the histogram verb's portion of the
// command line and builds the corresponding transformer.
//
// On entry args[*pargi] is the verb name. Flags are consumed until the first
// word not starting with "-"; on return *pargi indexes that word (or equals
// argc). The reader and writer options are unused by this verb.
//
// The usage message is printed and the process exits when an unknown flag is
// seen, when -f is absent, when --nbins is absent or not positive, or when
// --lo equals --hi without --auto.
func transformerHistogramParseCLI(
	pargi *int,
	argc int,
	args []string,
	_ *cliutil.TReaderOptions,
	__ *cliutil.TWriterOptions,
) transforming.IRecordTransformer {

	// Skip the verb name from the current spot in the mlr command line
	argi := *pargi
	verb := args[argi]
	argi++

	// Parse local flags
	var valueFieldNames []string = nil
	lo := 0.0
	nbins := 0
	hi := 0.0
	doAuto := false
	outputPrefix := ""

	for argi < argc /* variable increment: 1 or 2 depending on flag */ {
		opt := args[argi]
		if !strings.HasPrefix(opt, "-") {
			break // No more flag options to process
		}
		argi++

		switch opt {
		case "-h", "--help":
			transformerHistogramUsage(os.Stdout, true, 0)
		case "-f":
			valueFieldNames = cliutil.VerbGetStringArrayArgOrDie(verb, opt, args, &argi, argc)
		case "--lo":
			lo = cliutil.VerbGetFloatArgOrDie(verb, opt, args, &argi, argc)
		case "--nbins":
			nbins = cliutil.VerbGetIntArgOrDie(verb, opt, args, &argi, argc)
		case "--hi":
			hi = cliutil.VerbGetFloatArgOrDie(verb, opt, args, &argi, argc)
		case "--auto":
			doAuto = true
		case "-o":
			outputPrefix = cliutil.VerbGetStringArgOrDie(verb, opt, args, &argi, argc)
		default:
			transformerHistogramUsage(os.Stderr, true, 1)
		}
	}

	if valueFieldNames == nil {
		transformerHistogramUsage(os.Stderr, true, 1)
	}
	// A negative bin count must be rejected here as well as zero: it would
	// otherwise reach make([]int, nbins) in the constructor and panic.
	if nbins <= 0 {
		transformerHistogramUsage(os.Stderr, true, 1)
	}
	if lo == hi && !doAuto {
		transformerHistogramUsage(os.Stderr, true, 1)
	}

	transformer, err := NewTransformerHistogram(
		valueFieldNames,
		lo,
		nbins,
		hi,
		doAuto,
		outputPrefix,
	)
	if err != nil {
		fmt.Fprintf(os.Stderr, "%s %s: %v\n", lib.MlrExeName(), verb, err)
		os.Exit(1)
	}

	*pargi = argi
	return transformer
}
// ----------------------------------------------------------------
// histogramVectorInitialSize is the initial capacity given to each per-field
// value vector that the constructor allocates for auto mode.
const histogramVectorInitialSize = 1024
// TransformerHistogram counts, per value field, how many input values fall
// into each of nbins equal-width bins, and emits one record per bin at end of
// stream. In non-auto mode the limits lo and hi are fixed up front; in auto
// mode all values are retained and the limits are computed at end of stream.
type TransformerHistogram struct {
// Names of the input fields to be histogrammed.
valueFieldNames []string
// Lower limit; assigned by the constructor only in non-auto mode.
lo float64
// Number of bins; also the length of each slice in countsByField.
nbins int
// Upper limit; assigned by the constructor only in non-auto mode.
hi float64
// nbins / (hi - lo); assigned by the constructor only in non-auto mode.
mul float64
// Per-bin counts, keyed by value-field name.
countsByField map[string][]int
// Retained input values, keyed by value-field name. Allocated only in auto
// mode, where the limits are not known until every value has been seen.
vectorsByFieldName map[string][]float64 // For auto-mode
// Prepended to every output field name.
outputPrefix string
// The mode-specific handler (auto or non-auto) that Transform delegates to.
recordTransformerFunc transforming.RecordTransformerFunc
}
// ----------------------------------------------------------------
// NewTransformerHistogram builds a histogram transformer over the given value
// fields with nbins bins.
//
// When doAuto is false, lo and hi are the fixed histogram limits. When doAuto
// is true, lo and hi are ignored: input values are retained in memory and the
// limits are computed from them at end of stream. outputPrefix is prepended
// to every output field name.
//
// An error is returned if nbins is not positive.
func NewTransformerHistogram(
	valueFieldNames []string,
	lo float64,
	nbins int,
	hi float64,
	doAuto bool,
	outputPrefix string,
) (*TransformerHistogram, error) {

	// A negative count would panic in make below, and a zero count would leave
	// empty count slices that the ingest paths index into.
	if nbins <= 0 {
		return nil, fmt.Errorf("number of bins must be positive; got %d", nbins)
	}

	// make zero-fills the slices, so no explicit initialization is needed.
	countsByField := make(map[string][]int, len(valueFieldNames))
	for _, valueFieldName := range valueFieldNames {
		countsByField[valueFieldName] = make([]int, nbins)
	}

	this := &TransformerHistogram{
		valueFieldNames: valueFieldNames,
		countsByField:   countsByField,
		outputPrefix:    outputPrefix,
		nbins:           nbins,
	}

	if !doAuto {
		this.recordTransformerFunc = this.transformNonAuto
		this.lo = lo
		this.hi = hi
		// Reciprocal of the bin width: multiplying (value - lo) by this gives
		// the bin index.
		this.mul = float64(nbins) / (hi - lo)
	} else {
		this.vectorsByFieldName = make(map[string][]float64, len(valueFieldNames))
		for _, valueFieldName := range valueFieldNames {
			this.vectorsByFieldName[valueFieldName] = make([]float64, 0, histogramVectorInitialSize)
		}
		this.recordTransformerFunc = this.transformAuto
	}

	return this, nil
}
// ----------------------------------------------------------------
// Transform handles one stream item (a record or the end-of-stream marker) by
// handing it to the mode-specific handler chosen at construction time.
func (this *TransformerHistogram) Transform(
	inrecAndContext *types.RecordAndContext,
	outputChannel chan<- *types.RecordAndContext,
) {
	handler := this.recordTransformerFunc
	handler(inrecAndContext, outputChannel)
}
// ----------------------------------------------------------------
// transformNonAuto is the handler for fixed-limit mode. Records are tallied
// as they arrive; at end of stream the bin records are written out, followed
// by the end-of-stream marker itself.
func (this *TransformerHistogram) transformNonAuto(
	inrecAndContext *types.RecordAndContext,
	outputChannel chan<- *types.RecordAndContext,
) {
	if inrecAndContext.EndOfStream {
		this.emitNonAuto(&inrecAndContext.Context, outputChannel)
		outputChannel <- inrecAndContext // end-of-stream marker
		return
	}

	this.ingestNonAuto(inrecAndContext)
}
// ingestNonAuto tallies one record in fixed-limit mode. For each configured
// value field present in the record, the value is converted to float and the
// count of the bin containing it is incremented. Values below lo or above hi
// are not counted; a value exactly equal to hi is counted in the last bin.
//
// If a present value cannot be converted to float, a message is written to
// stderr and the process exits with status 1.
func (this *TransformerHistogram) ingestNonAuto(
	inrecAndContext *types.RecordAndContext,
) {
	inrec := inrecAndContext.Record
	lastBin := this.nbins - 1

	for _, valueFieldName := range this.valueFieldNames {
		mvalue := inrec.Get(valueFieldName)
		if mvalue == nil {
			continue // Field absent from this record
		}

		floatValue, ok := mvalue.GetNumericToFloatValue()
		if !ok {
			fmt.Fprintf(
				os.Stderr,
				"%s %s: cannot parse \"%s\" as float.\n",
				lib.MlrExeName(), verbNameHistogram, mvalue.String(),
			)
			os.Exit(1)
		}

		counts := this.countsByField[valueFieldName]
		if (floatValue >= this.lo) && (floatValue < this.hi) {
			idx := int((floatValue - this.lo) * this.mul)
			// For a value just below hi, floating-point rounding in the
			// product can land exactly on nbins; keep the index in range.
			if idx > lastBin {
				idx = lastBin
			}
			counts[idx]++
		} else if floatValue == this.hi {
			counts[lastBin]++
		}
	}
}
// emitNonAuto writes the fixed-limit histogram to outputChannel: one record
// per bin, holding the bin's lower and upper edges followed by that bin's
// count for each value field. All output field names carry outputPrefix.
func (this *TransformerHistogram) emitNonAuto(
	endOfStreamContext *types.Context,
	outputChannel chan<- *types.RecordAndContext,
) {
	countFieldNames := make(map[string]string, len(this.valueFieldNames))
	for _, valueFieldName := range this.valueFieldNames {
		countFieldNames[valueFieldName] = this.outputPrefix + valueFieldName + "_count"
	}
	binLoFieldName := this.outputPrefix + "bin_lo"
	binHiFieldName := this.outputPrefix + "bin_hi"

	for i := 0; i < this.nbins; i++ {
		outrec := types.NewMlrmapAsRecord()

		// mul is nbins/(hi-lo), i.e. the reciprocal of the bin width, so bin
		// i spans [lo + i/mul, lo + (i+1)/mul). The offset lo must be added
		// after the division; dividing (lo + i) by mul is correct only when
		// lo is zero.
		outrec.PutReference(
			binLoFieldName,
			types.MlrvalPointerFromFloat64(this.lo+float64(i)/this.mul),
		)
		outrec.PutReference(
			binHiFieldName,
			types.MlrvalPointerFromFloat64(this.lo+float64(i+1)/this.mul),
		)

		for _, valueFieldName := range this.valueFieldNames {
			outrec.PutReference(
				countFieldNames[valueFieldName],
				types.MlrvalPointerFromInt(this.countsByField[valueFieldName][i]),
			)
		}

		outputChannel <- types.NewRecordAndContext(outrec, endOfStreamContext)
	}
}
// ----------------------------------------------------------------
// transformAuto is the handler for auto-limit mode. Records have their values
// stashed as they arrive; at end of stream the limits are computed, the bin
// records are written out, and the end-of-stream marker is forwarded.
func (this *TransformerHistogram) transformAuto(
	inrecAndContext *types.RecordAndContext,
	outputChannel chan<- *types.RecordAndContext,
) {
	if inrecAndContext.EndOfStream {
		this.emitAuto(&inrecAndContext.Context, outputChannel)
		outputChannel <- inrecAndContext // end-of-stream marker
		return
	}

	this.ingestAuto(inrecAndContext)
}
// ingestAuto stashes one record's values in auto-limit mode. For each
// configured value field present in the record, the value is converted to
// float via GetNumericToFloatValueOrDie and appended to that field's vector
// for binning at end of stream.
func (this *TransformerHistogram) ingestAuto(
	inrecAndContext *types.RecordAndContext,
) {
	record := inrecAndContext.Record
	for _, fieldName := range this.valueFieldNames {
		mv := record.Get(fieldName)
		if mv == nil {
			continue // Field absent from this record
		}
		sample := mv.GetNumericToFloatValueOrDie()
		this.vectorsByFieldName[fieldName] = append(this.vectorsByFieldName[fieldName], sample)
	}
}
// emitAuto computes and writes the auto-limit histogram at end of stream, in
// three passes over the retained values:
//
//   - Limits: lo and hi are the minimum and maximum over all value fields
//     together. With no values at all they stay at 0 and 1.
//   - Binning: each value's bin count is incremented; a value equal to hi is
//     counted in the last bin.
//   - Emission: one record per bin is written to outputChannel, holding the
//     bin's lower and upper edges and the per-field counts, with all field
//     names carrying outputPrefix.
func (this *TransformerHistogram) emitAuto(
	endOfStreamContext *types.Context,
	outputChannel chan<- *types.RecordAndContext,
) {
	haveLoHi := false
	lo := 0.0
	hi := 1.0
	nbins := this.nbins
	lastBin := nbins - 1

	// Limits pass
	for _, valueFieldName := range this.valueFieldNames {
		for _, value := range this.vectorsByFieldName[valueFieldName] {
			if !haveLoHi {
				lo = value
				hi = value
				haveLoHi = true
				continue
			}
			if lo > value {
				lo = value
			}
			if hi < value {
				hi = value
			}
		}
	}

	// Binning pass. mul is the reciprocal of the bin width. If every value is
	// identical then hi == lo and mul is +Inf; in that case only the
	// value == hi branch below is ever taken.
	mul := float64(nbins) / (hi - lo)
	for _, valueFieldName := range this.valueFieldNames {
		counts := this.countsByField[valueFieldName]
		lib.InternalCodingErrorIf(counts == nil)
		for _, value := range this.vectorsByFieldName[valueFieldName] {
			if (value >= lo) && (value < hi) {
				idx := int((value - lo) * mul)
				// For a value just below hi, floating-point rounding in the
				// product can land exactly on nbins; keep the index in range.
				if idx > lastBin {
					idx = lastBin
				}
				counts[idx]++
			} else if value == hi {
				counts[lastBin]++
			}
		}
	}

	// Emission pass
	countFieldNames := make(map[string]string, len(this.valueFieldNames))
	for _, valueFieldName := range this.valueFieldNames {
		countFieldNames[valueFieldName] = this.outputPrefix + valueFieldName + "_count"
	}
	binLoFieldName := this.outputPrefix + "bin_lo"
	binHiFieldName := this.outputPrefix + "bin_hi"

	for i := 0; i < nbins; i++ {
		outrec := types.NewMlrmapAsRecord()

		// Bin i spans [lo + i/mul, lo + (i+1)/mul). The offset lo must be
		// added after the division; dividing (lo + i) by mul is correct only
		// when lo is zero.
		outrec.PutReference(
			binLoFieldName,
			types.MlrvalPointerFromFloat64(lo+float64(i)/mul),
		)
		outrec.PutReference(
			binHiFieldName,
			types.MlrvalPointerFromFloat64(lo+float64(i+1)/mul),
		)

		for _, valueFieldName := range this.valueFieldNames {
			outrec.PutReference(
				countFieldNames[valueFieldName],
				types.MlrvalPointerFromInt(this.countsByField[valueFieldName][i]),
			)
		}

		outputChannel <- types.NewRecordAndContext(outrec, endOfStreamContext)
	}
}

View file

@ -1,6 +1,8 @@
package types
import (
"fmt"
"os"
"strconv"
"miller/src/lib"
@ -132,6 +134,19 @@ func (this *Mlrval) GetNumericToFloatValue() (floatValue float64, isFloat bool)
}
}
// GetNumericToFloatValueOrDie returns the receiver's value as a float64, as
// produced by GetNumericToFloatValue. If that conversion reports failure, a
// message is written to stderr and the process exits with status 1.
func (this *Mlrval) GetNumericToFloatValueOrDie() (floatValue float64) {
	floatValue, ok := this.GetNumericToFloatValue()
	if !ok {
		// The message ends with a newline so that it does not run into
		// whatever is printed to the terminal next.
		fmt.Fprintf(
			os.Stderr,
			"%s: couldn't parse \"%s\" as number.\n",
			lib.MlrExeName(), this.String(),
		)
		os.Exit(1)
	}
	return floatValue
}
func (this *Mlrval) GetBoolValue() (boolValue bool, isBool bool) {
if this.mvtype == MT_BOOL {
return this.boolval, true

View file

@ -1,6 +1,10 @@
================================================================
TOP OF LIST:
* audit
groupByFieldNames = cliutil.VerbGetStringArgOrDie
-> cliutil.VerbGetStringArrayArgOrDie
* regexes
o finish stats1 -r
o regex captures ...