// miller/pkg/transformers/stats1.go
//
// 634 lines
// 20 KiB
// Go

package transformers
import (
"bytes"
"container/list"
"fmt"
"os"
"regexp"
"strings"
"github.com/johnkerl/miller/v6/pkg/cli"
"github.com/johnkerl/miller/v6/pkg/lib"
"github.com/johnkerl/miller/v6/pkg/mlrval"
"github.com/johnkerl/miller/v6/pkg/transformers/utils"
"github.com/johnkerl/miller/v6/pkg/types"
)
// ----------------------------------------------------------------
// verbNameStats1 is the verb name; it is used in the usage text and in
// error messages, and is registered below as the Verb of Stats1Setup.
const verbNameStats1 = "stats1"
// Stats1Setup ties the stats1 verb name to its usage-printing function and
// its command-line parsing function.
var Stats1Setup = TransformerSetup{
Verb: verbNameStats1,
UsageFunc: transformerStats1Usage,
ParseCLIFunc: transformerStats1ParseCLI,
IgnoresInput: false,
}
// transformerStats1Usage writes the help text for the stats1 verb to o.
//
// The list of available accumulators is not hard-coded here; it is printed
// by utils.ListStats1Accumulators in the middle of the options section.
//
// All output goes to o, so that "--help" (stdout) and usage-on-error
// (stderr) each produce the complete message on a single stream.
func transformerStats1Usage(
	o *os.File,
) {
	fmt.Fprintf(o, "Usage: %s %s [options]\n", "mlr", verbNameStats1)
	fmt.Fprint(o,
		`Computes univariate statistics for one or more given fields, accumulated across
the input record stream.
Options:
-a {sum,count,...} Names of accumulators: one or more of:
median This is the same as p50
p10 p25.2 p50 p98 p100 etc.
`)
	utils.ListStats1Accumulators(o)
	fmt.Fprint(o, `
-f {a,b,c} Value-field names on which to compute statistics
--fr {regex} Regex for value-field names on which to compute statistics
(compute statistics on values in all field names matching regex)
--fx {regex} Inverted regex for value-field names on which to compute statistics
(compute statistics on values in all field names not matching regex)
-g {d,e,f} Optional group-by-field names
--gr {regex} Regex for optional group-by-field names
(group by values in field names matching regex)
--gx {regex} Inverted regex for optional group-by-field names
(group by values in field names not matching regex)
--grfx {regex} Shorthand for --gr {regex} --fx {that same regex}
-i Use interpolated percentiles, like R's type=7; default like type=1.
Not sensical for string-valued fields.
-s Print iterative stats. Useful in tail -f contexts, in which
case please avoid pprint-format output since end of input
`)
	// These lines contain backticks, so they cannot live inside the raw
	// string literal above.
	fmt.Fprintln(o, " stream will never be seen. Likewise, if input is coming from `tail -f`")
	fmt.Fprintln(o, " be sure to use `--records-per-batch 1`.")
	fmt.Fprintln(o, "-h|--help Show this message.")
	fmt.Fprintln(o,
		"Example: mlr stats1 -a min,p10,p50,p90,max -f value -g size,shape")
	fmt.Fprintln(o,
		"Example: mlr stats1 -a count,mode -f size")
	fmt.Fprintln(o,
		"Example: mlr stats1 -a count,mode -f size -g shape")
	fmt.Fprintln(o,
		"Example: mlr stats1 -a count,mode --fr '^[a-h].*$' --gr '^k.*$'")
	fmt.Fprintln(o,
		` This computes count and mode statistics on all field names beginning
with a through h, grouped by all field names starting with k.`)
	// Blank separator line: written to o, not unconditionally to stdout.
	fmt.Fprintln(o)
	fmt.Fprint(o,
		`Notes:
* p50 and median are synonymous.
* min and max output the same results as p0 and p100, respectively, but use
less memory.
* String-valued data make sense unless arithmetic on them is required,
e.g. for sum, mean, interpolated percentiles, etc. In case of mixed data,
numbers are less than strings.
* count and mode allow text input; the rest require numeric input.
In particular, 1 and 1.0 are distinct text for count and mode.
* When there are mode ties, the first-encountered datum wins.
`)
}
// transformerStats1ParseCLI parses the stats1 verb's flags starting at
// args[*pargi] (which holds the verb name) and advances *pargi past every
// argument it consumes.
//
// -h/--help prints usage to stdout and exits 0. An unrecognized flag prints
// usage to stderr and exits 1. Missing accumulator names or value-field
// names print an error to stderr and exit 1.
//
// When doConstruct is false the function only validates and advances *pargi,
// returning nil. Otherwise it returns the constructed transformer, exiting
// with status 1 if construction fails.
func transformerStats1ParseCLI(
	pargi *int,
	argc int,
	args []string,
	_ *cli.TOptions,
	doConstruct bool, // false for first pass of CLI-parse, true for second pass
) IRecordTransformer {
	// args[*pargi] is the verb name itself; the flags start right after it.
	argi := *pargi
	verb := args[argi]
	argi++

	var (
		accumulatorNameList  = []string{}
		valueFieldNameList   = []string{}
		groupByFieldNameList = []string{}

		doRegexValueFieldNames       bool
		doRegexGroupByFieldNames     bool
		invertRegexValueFieldNames   bool
		invertRegexGroupByFieldNames bool
		doInterpolatedPercentiles    bool
		doIterativeStats             bool
	)

	// Each pass consumes one flag, plus its argument if the flag takes one.
	for argi < argc {
		opt := args[argi]
		if !strings.HasPrefix(opt, "-") {
			break // No more flag options to process
		}
		if opt == "--" {
			break // All transformers must do this so main-flags can follow verb-flags
		}
		argi++

		switch opt {
		case "-h", "--help":
			transformerStats1Usage(os.Stdout)
			os.Exit(0)

		case "-a":
			accumulatorNameList = cli.VerbGetStringArrayArgOrDie(verb, opt, args, &argi, argc)

		case "-f":
			valueFieldNameList = cli.VerbGetStringArrayArgOrDie(verb, opt, args, &argi, argc)

		case "-g":
			groupByFieldNameList = cli.VerbGetStringArrayArgOrDie(verb, opt, args, &argi, argc)

		case "--fr":
			valueFieldNameList = cli.VerbGetStringArrayArgOrDie(verb, opt, args, &argi, argc)
			doRegexValueFieldNames = true

		case "--fx":
			valueFieldNameList = cli.VerbGetStringArrayArgOrDie(verb, opt, args, &argi, argc)
			doRegexValueFieldNames = true
			invertRegexValueFieldNames = true

		case "--gr":
			groupByFieldNameList = cli.VerbGetStringArrayArgOrDie(verb, opt, args, &argi, argc)
			doRegexGroupByFieldNames = true

		case "--gx":
			groupByFieldNameList = cli.VerbGetStringArrayArgOrDie(verb, opt, args, &argi, argc)
			doRegexGroupByFieldNames = true
			invertRegexGroupByFieldNames = true

		case "--grfx":
			// Same regex for both roles: group by the matching field names,
			// compute statistics on the non-matching ones.
			doRegexValueFieldNames = true
			doRegexGroupByFieldNames = true
			invertRegexValueFieldNames = true
			valueFieldNameList = cli.VerbGetStringArrayArgOrDie(verb, opt, args, &argi, argc)
			groupByFieldNameList = lib.CopyStringArray(valueFieldNameList)

		case "-i":
			doInterpolatedPercentiles = true

		case "-s":
			doIterativeStats = true

		case "-S", "-F":
			// No-op pass-through for backward compatibility with Miller 5

		default:
			transformerStats1Usage(os.Stderr)
			os.Exit(1)
		}
	}

	// TODO: libify for use across verbs.
	if len(accumulatorNameList) == 0 {
		fmt.Fprintf(os.Stderr, "%s %s: -a option is required.\n", "mlr", verbNameStats1)
		fmt.Fprintf(os.Stderr, "Please see %s %s --help for more information.\n", "mlr", verbNameStats1)
		os.Exit(1)
	}
	if len(valueFieldNameList) == 0 {
		fmt.Fprintf(os.Stderr, "%s %s: -f option is required.\n", "mlr", verbNameStats1)
		fmt.Fprintf(os.Stderr, "Please see %s %s --help for more information.\n", "mlr", verbNameStats1)
		os.Exit(1)
	}

	*pargi = argi
	if !doConstruct { // All transformers must do this for main command-line parsing
		return nil
	}

	transformer, err := NewTransformerStats1(
		accumulatorNameList,
		valueFieldNameList,
		groupByFieldNameList,
		doRegexValueFieldNames,
		doRegexGroupByFieldNames,
		invertRegexValueFieldNames,
		invertRegexGroupByFieldNames,
		doInterpolatedPercentiles,
		doIterativeStats,
	)
	if err != nil {
		fmt.Fprintln(os.Stderr, err)
		os.Exit(1)
	}
	return transformer
}
// ----------------------------------------------------------------
// TransformerStats1 holds the command-line configuration and the running
// accumulator state for the stats1 verb.
type TransformerStats1 struct {
	// Input: names (or regex sources, when the corresponding doRegex flag is
	// set) exactly as supplied to the constructor.
	accumulatorNameList, valueFieldNameList, groupByFieldNameList []string

	// Group-by field names to write on output, in insertion order. Without
	// group-by regexes this is filled once from groupByFieldNameList; with
	// them, it grows to the union of every matching field name encountered
	// in the input, over all records.
	groupByFieldNamesForOutput *lib.OrderedMap

	// Compiled regexes; each is populated only when its doRegex flag is set.
	valueFieldRegexes, groupByFieldRegexes []*regexp.Regexp

	// Whether field names are selected by regex, and whether a regex match
	// selects (false) or excludes (true) a field name.
	doRegexValueFieldNames, doRegexGroupByFieldNames         bool
	invertRegexValueFieldNames, invertRegexGroupByFieldNames bool

	// Passed to the accumulator factory whenever an accumulator is created.
	doInterpolatedPercentiles bool
	// When set, statistics are emitted into every input record rather than
	// as new records at end of stream.
	doIterativeStats bool

	// State:
	accumulatorFactory *utils.Stats1AccumulatorFactory

	// Accumulators are indexed by
	//   grouping key -> value-field name -> accumulator name -> accumulator object
	// This would be
	//   map[string]map[string]map[string]*utils.Stats1NamedAccumulator
	// except we need maps that preserve insertion order.
	namedAccumulators *lib.OrderedMap

	// For each grouping key, the group-by field name -> value pairs of the
	// record that first produced that key:
	//   map[string]OrderedMap[string]*mlrval.Mlrval
	groupingKeysToGroupByFieldValues map[string]*lib.OrderedMap
}
// Given: accumulate count,sum on values x,y group by a,b.
//
// Example input: Example output:
// a b x y a b x_count x_sum y_count y_sum
// s t 1 2 s t 2 6 2 8
// u v 3 4 u v 1 3 1 4
// s t 5 6 u w 1 7 1 9
// u w 7 9
//
// Multilevel hashmap structure:
// {
// "s,t" : { <--- grouping key: the joined group-by field values
// "x" : { <--- value field name
// "count" : Stats1CountAccumulator object,
// "sum" : Stats1SumAccumulator object
// },
// "y" : {
// "count" : Stats1CountAccumulator object,
// "sum" : Stats1SumAccumulator object
// },
// },
// "u,v" : {
// "x" : {
// "count" : Stats1CountAccumulator object,
// "sum" : Stats1SumAccumulator object
// },
// "y" : {
// "count" : Stats1CountAccumulator object,
// "sum" : Stats1SumAccumulator object
// },
// },
// "u,w" : {
// "x" : {
// "count" : Stats1CountAccumulator object,
// "sum" : Stats1SumAccumulator object
// },
// "y" : {
// "count" : Stats1CountAccumulator object,
// "sum" : Stats1SumAccumulator object
// },
// },
// }
//
// NewTransformerStats1 checks every accumulator name and then builds a
// transformer with empty accumulator state.
//
// It returns an error if utils.ValidateStats1AccumulatorName rejects any
// name. Regex compilation is delegated to lib.CompileMillerRegexesOrDie and
// happens only for the field-name lists whose doRegex flag is set. When
// group-by names are literal, they are registered up front as the group-by
// fields to write on output.
func NewTransformerStats1(
	accumulatorNameList []string,
	valueFieldNameList []string,
	groupByFieldNameList []string,
	doRegexValueFieldNames bool,
	doRegexGroupByFieldNames bool,
	invertRegexValueFieldNames bool,
	invertRegexGroupByFieldNames bool,
	doInterpolatedPercentiles bool,
	doIterativeStats bool,
) (*TransformerStats1, error) {
	// Reject unknown accumulators before allocating anything.
	for i := range accumulatorNameList {
		accumulatorName := accumulatorNameList[i]
		if known := utils.ValidateStats1AccumulatorName(accumulatorName); !known {
			return nil, fmt.Errorf(`mlr stats1: accumulator "%s" not found`, accumulatorName)
		}
	}

	transformer := &TransformerStats1{
		// Input
		accumulatorNameList:  accumulatorNameList,
		valueFieldNameList:   valueFieldNameList,
		groupByFieldNameList: groupByFieldNameList,

		groupByFieldNamesForOutput: lib.NewOrderedMap(),

		doRegexValueFieldNames:       doRegexValueFieldNames,
		doRegexGroupByFieldNames:     doRegexGroupByFieldNames,
		invertRegexValueFieldNames:   invertRegexValueFieldNames,
		invertRegexGroupByFieldNames: invertRegexGroupByFieldNames,
		doInterpolatedPercentiles:    doInterpolatedPercentiles,
		doIterativeStats:             doIterativeStats,

		// State
		accumulatorFactory:               utils.NewStats1AccumulatorFactory(),
		namedAccumulators:                lib.NewOrderedMap(),
		groupingKeysToGroupByFieldValues: make(map[string]*lib.OrderedMap),
	}

	if !doRegexGroupByFieldNames {
		// Literal group-by names are known now, so the output field set is
		// fixed here rather than discovered record by record.
		for i := range groupByFieldNameList {
			transformer.groupByFieldNamesForOutput.Put(groupByFieldNameList[i], true)
		}
	} else {
		transformer.groupByFieldRegexes = lib.CompileMillerRegexesOrDie(groupByFieldNameList)
	}

	if doRegexValueFieldNames {
		transformer.valueFieldRegexes = lib.CompileMillerRegexesOrDie(valueFieldNameList)
	}

	return transformer, nil
}
// Transform is called once per input record and once more for the
// end-of-stream marker. It first delegates downstream-done handling to
// HandleDefaultDownstreamDone and then dispatches on whether the item is the
// end-of-stream marker.
func (tr *TransformerStats1) Transform(
	inrecAndContext *types.RecordAndContext,
	outputRecordsAndContexts *list.List, // list of *types.RecordAndContext
	inputDownstreamDoneChannel <-chan bool,
	outputDownstreamDoneChannel chan<- bool,
) {
	HandleDefaultDownstreamDone(inputDownstreamDoneChannel, outputDownstreamDoneChannel)

	if inrecAndContext.EndOfStream {
		tr.handleEndOfRecordStream(inrecAndContext, outputRecordsAndContexts)
		return
	}
	tr.handleInputRecord(inrecAndContext, outputRecordsAndContexts)
}
// handleInputRecord folds one input record into the accumulators for its
// group.
//
// Records for which the group-by lookup reports failure are dropped without
// being accumulated or emitted. In iterative-stats mode the current
// statistics for the record's group are written into the record itself,
// which is then passed downstream; otherwise nothing is emitted here.
func (tr *TransformerStats1) handleInputRecord(
	inrecAndContext *types.RecordAndContext,
	outputRecordsAndContexts *list.List, // list of *types.RecordAndContext
) {
	record := inrecAndContext.Record

	// E.g. if grouping by "a" and "b", and the current record has a=circle,
	// b=blue, then the grouping key is the string "circle,blue".
	lookupGroupBy := tr.getGroupByFieldNamesWithoutRegexes
	if tr.doRegexGroupByFieldNames {
		lookupGroupBy = tr.getGroupByFieldNamesWithRegexes
	}
	groupingKey, groupByFieldValues, ok := lookupGroupBy(record)
	if !ok {
		return
	}

	// Find this group's value-field-name -> accumulators map, creating it on
	// first sight of the grouping key.
	var accumulatorsByValueField *lib.OrderedMap
	if existing := tr.namedAccumulators.Get(groupingKey); existing != nil {
		accumulatorsByValueField = existing.(*lib.OrderedMap)
	} else {
		accumulatorsByValueField = lib.NewOrderedMap()
		tr.namedAccumulators.Put(groupingKey, accumulatorsByValueField)
		// E.g. if grouping by "color" and "shape", and the current record has
		// color=blue, shape=circle, then groupByFieldValues is the map
		// {"color": "blue", "shape": "circle"}.
		tr.groupingKeysToGroupByFieldValues[groupingKey] = groupByFieldValues
	}

	if tr.doRegexValueFieldNames {
		tr.ingestWithValueFieldRegexes(record, groupingKey, accumulatorsByValueField)
	} else {
		tr.ingestWithoutValueFieldRegexes(record, groupingKey, accumulatorsByValueField)
	}

	if !tr.doIterativeStats {
		return
	}
	// Iterative mode: the input record doubles as the output record.
	tr.emitIntoOutputRecord(
		record,
		groupByFieldValues,
		accumulatorsByValueField,
		record,
	)
	outputRecordsAndContexts.PushBack(inrecAndContext)
}
// getGroupByFieldNamesWithoutRegexes looks up the configured group-by fields
// in the record.
//
// E.g. if grouping by "a" and "b", and the current record has a=circle,
// b=blue, then groupingKey is the string "circle,blue". Without regexed
// group-by field names, the set of group-by field names is the same for
// every record.
//
// It returns the grouping key, an ordered map from each group-by field name
// to its value in this record, and ok. When the record lookup reports
// failure, the map is nil and ok is false.
func (tr *TransformerStats1) getGroupByFieldNamesWithoutRegexes(
	inrec *mlrval.Mlrmap,
) (
	groupingKey string,
	groupByFieldValues *lib.OrderedMap, // OrderedMap[string]*mlrval.Mlrval,
	ok bool,
) {
	joinedValues, selectedValues, found := inrec.GetSelectedValuesAndJoined(tr.groupByFieldNameList)
	if !found {
		return joinedValues, nil, false
	}

	// selectedValues is indexed in step with groupByFieldNameList.
	valuesByName := lib.NewOrderedMap()
	for i := range selectedValues {
		valuesByName.Put(tr.groupByFieldNameList[i], selectedValues[i])
	}
	return joinedValues, valuesByName, true
}
// getGroupByFieldNamesWithRegexes selects the record's fields whose names
// pass matchGroupByFieldName, in record order.
//
// E.g. if fields "a" and "b" are selected, and the current record has
// a=circle, b=blue, then groupingKey is the string "circle,blue". With
// regexed group-by field names, the selected names may or may not be the
// same on every record, so each selected name is also remembered in
// tr.groupByFieldNamesForOutput.
//
// It returns the grouping key, an ordered map from selected field name to a
// copy of its value, and ok, which is always true — including when no field
// is selected, in which case the key is empty.
func (tr *TransformerStats1) getGroupByFieldNamesWithRegexes(
	inrec *mlrval.Mlrmap,
) (
	groupingKey string,
	groupByFieldValues *lib.OrderedMap, // OrderedMap[string]*mlrval.Mlrval,
	ok bool,
) {
	var joinedValues bytes.Buffer
	selected := lib.NewOrderedMap()

	for entry := inrec.Head; entry != nil; entry = entry.Next {
		fieldName := entry.Key
		if tr.matchGroupByFieldName(fieldName) {
			// Remember the union of all encountered group-by field names
			// for output at the end of the record stream.
			tr.groupByFieldNamesForOutput.Put(fieldName, true)

			fieldValue := entry.Value.Copy()
			// Comma goes between values, so not before the first one.
			if !selected.IsEmpty() {
				joinedValues.WriteByte(',')
			}
			joinedValues.WriteString(fieldValue.String())
			selected.Put(fieldName, fieldValue)
		}
	}

	return joinedValues.String(), selected, true
}
// ingestWithoutValueFieldRegexes feeds the record's values for each
// configured value-field name into that field's accumulators within level2
// (the value-field-name -> accumulators map for the record's group).
//
// Value fields absent from the record are skipped entirely. For a present
// field, every configured accumulator is created if it does not yet exist;
// an empty (void) value is then ingested only by the "null_count"
// accumulator.
func (tr *TransformerStats1) ingestWithoutValueFieldRegexes(
	inrec *mlrval.Mlrmap,
	groupingKey string,
	level2 *lib.OrderedMap,
) {
	for _, valueFieldName := range tr.valueFieldNameList {
		valueFieldValue := inrec.Get(valueFieldName)
		if valueFieldValue == nil {
			continue
		}
		isVoid := valueFieldValue.IsVoid()

		// Accumulator-name -> accumulator map for this value field.
		var accumulatorsByName *lib.OrderedMap
		if existing := level2.Get(valueFieldName); existing != nil {
			accumulatorsByName = existing.(*lib.OrderedMap)
		} else {
			accumulatorsByName = lib.NewOrderedMap()
			level2.Put(valueFieldName, accumulatorsByName)
		}

		for _, accumulatorName := range tr.accumulatorNameList {
			accumulator := accumulatorsByName.Get(accumulatorName)
			if accumulator == nil {
				accumulator = tr.accumulatorFactory.MakeNamedAccumulator(
					accumulatorName,
					groupingKey,
					valueFieldName,
					tr.doInterpolatedPercentiles,
				)
				accumulatorsByName.Put(accumulatorName, accumulator)
			}

			// The void check sits after construction on purpose: the
			// accumulator must exist, with its default values, even if this
			// record contributes nothing to it.
			if isVoid && accumulatorName != "null_count" {
				continue
			}
			accumulator.(*utils.Stats1NamedAccumulator).Ingest(valueFieldValue)
		}
	}
}
// ingestWithValueFieldRegexes feeds every record field whose name passes
// matchValueFieldName into that field's accumulators within level2 (the
// value-field-name -> accumulators map for the record's group).
//
// For each selected field, every configured accumulator is created if it
// does not yet exist. An empty (void) value is then ingested only by the
// "null_count" accumulator, exactly as in the non-regex ingest path, so that
// null_count gives the same answer whether value fields are chosen by name
// or by regex.
func (tr *TransformerStats1) ingestWithValueFieldRegexes(
	inrec *mlrval.Mlrmap,
	groupingKey string,
	level2 *lib.OrderedMap,
) {
	for pe := inrec.Head; pe != nil; pe = pe.Next {
		valueFieldName := pe.Key
		if !tr.matchValueFieldName(valueFieldName) {
			continue
		}

		// We are already positioned on the entry, so use its value directly
		// rather than looking the key up in the record again.
		valueFieldValue := pe.Value
		if valueFieldValue == nil {
			continue
		}
		isVoid := valueFieldValue.IsVoid()

		// Accumulator-name -> accumulator map for this value field.
		var level3 *lib.OrderedMap
		if existing := level2.Get(valueFieldName); existing != nil {
			level3 = existing.(*lib.OrderedMap)
		} else {
			level3 = lib.NewOrderedMap()
			level2.Put(valueFieldName, level3)
		}

		for _, accumulatorName := range tr.accumulatorNameList {
			namedAccumulator := level3.Get(accumulatorName)
			if namedAccumulator == nil {
				namedAccumulator = tr.accumulatorFactory.MakeNamedAccumulator(
					accumulatorName,
					groupingKey,
					valueFieldName,
					tr.doInterpolatedPercentiles,
				)
				level3.Put(accumulatorName, namedAccumulator)
			}

			// The accumulator has been initialized with default values, so a
			// void value can be skipped here. (Skipping before construction
			// would fail to create the accumulator.) null_count is the
			// exception: it must see void values.
			if isVoid && accumulatorName != "null_count" {
				continue
			}
			namedAccumulator.(*utils.Stats1NamedAccumulator).Ingest(valueFieldValue)
		}
	}
}
// matchGroupByFieldName reports whether the field name is selected as a
// group-by field: it must match at least one group-by regex, or match none
// of them when tr.invertRegexGroupByFieldNames is set.
func (tr *TransformerStats1) matchGroupByFieldName(
	groupByFieldName string,
) bool {
	for _, re := range tr.groupByFieldRegexes {
		if re.MatchString(groupByFieldName) {
			// Matched: selected unless the sense is inverted.
			return !tr.invertRegexGroupByFieldNames
		}
	}
	// No regex matched: selected only if the sense is inverted.
	return tr.invertRegexGroupByFieldNames
}
// matchValueFieldName reports whether the field name is selected as a value
// field: it must match at least one value-field regex, or match none of them
// when tr.invertRegexValueFieldNames is set.
func (tr *TransformerStats1) matchValueFieldName(
	valueFieldName string,
) bool {
	for _, re := range tr.valueFieldRegexes {
		if re.MatchString(valueFieldName) {
			// Matched: selected unless the sense is inverted.
			return !tr.invertRegexValueFieldNames
		}
	}
	// No regex matched: selected only if the sense is inverted.
	return tr.invertRegexValueFieldNames
}
// handleEndOfRecordStream emits the final statistics and then forwards the
// end-of-stream marker.
//
// In iterative-stats mode only the marker is forwarded, since statistics
// were already emitted with each input record. Otherwise one new record is
// emitted per grouping key, in the order the keys were first inserted.
func (tr *TransformerStats1) handleEndOfRecordStream(
	inrecAndContext *types.RecordAndContext,
	outputRecordsAndContexts *list.List, // list of *types.RecordAndContext
) {
	if !tr.doIterativeStats {
		for group := tr.namedAccumulators.Head; group != nil; group = group.Next {
			accumulatorsByValueField := group.Value.(*lib.OrderedMap)
			groupByFieldValues := tr.groupingKeysToGroupByFieldValues[group.Key]

			outrec := mlrval.NewMlrmapAsRecord()
			tr.emitIntoOutputRecord(
				inrecAndContext.Record,
				groupByFieldValues,
				accumulatorsByValueField,
				outrec,
			)
			outputRecordsAndContexts.PushBack(
				types.NewRecordAndContext(outrec, &inrecAndContext.Context),
			)
		}
	}

	outputRecordsAndContexts.PushBack(inrecAndContext) // end-of-stream marker
}
// emitIntoOutputRecord writes one group's results into outrec: first the
// group-by fields, then each accumulator's emitted key and value.
//
// Group-by fields are written in the order of tr.groupByFieldNamesForOutput;
// names for which groupByFieldValues has no entry are skipped. Accumulator
// outputs follow in value-field order, then accumulator order. The inrec
// parameter is not used by this function.
func (tr *TransformerStats1) emitIntoOutputRecord(
	inrec *mlrval.Mlrmap,
	groupByFieldValues *lib.OrderedMap, // OrderedMap[string]*mlrval.Mlrval,
	level2accumulators *lib.OrderedMap,
	outrec *mlrval.Mlrmap,
) {
	for nameEntry := tr.groupByFieldNamesForOutput.Head; nameEntry != nil; nameEntry = nameEntry.Next {
		if groupByFieldValue := groupByFieldValues.Get(nameEntry.Key); groupByFieldValue != nil {
			outrec.PutCopy(nameEntry.Key, groupByFieldValue.(*mlrval.Mlrval))
		}
	}

	for fieldEntry := level2accumulators.Head; fieldEntry != nil; fieldEntry = fieldEntry.Next {
		accumulatorsByName := fieldEntry.Value.(*lib.OrderedMap)
		for accEntry := accumulatorsByName.Head; accEntry != nil; accEntry = accEntry.Next {
			accumulator := accEntry.Value.(*utils.Stats1NamedAccumulator)
			outputName, outputValue := accumulator.Emit()
			outrec.PutCopy(outputName, outputValue)
		}
	}
}