miller/pkg/transformers/top.go
Adam Lesperance 085e831668
The package version must match the major tag version (#1654)
* Update package version

* Update makefile targets

* Update readme packages

* Remaining old packages via rg/sd
2024-09-20 12:10:11 -04:00

290 lines
8.8 KiB
Go

package transformers
import (
"container/list"
"fmt"
"os"
"strings"
"github.com/johnkerl/miller/v6/pkg/cli"
"github.com/johnkerl/miller/v6/pkg/lib"
"github.com/johnkerl/miller/v6/pkg/mlrval"
"github.com/johnkerl/miller/v6/pkg/transformers/utils"
"github.com/johnkerl/miller/v6/pkg/types"
)
// ----------------------------------------------------------------
const (
	// verbNameTop is the name of this verb as typed on the mlr command line.
	verbNameTop = "top"

	// verbTopDefaultOutputFieldName is the name of the index field in output
	// records unless overridden with -o.
	verbTopDefaultOutputFieldName = "top_idx"
)
// TopSetup bundles the verb name, the usage function, and the CLI-parsing
// function for the top verb.
// NOTE(review): IgnoresInput is false, presumably meaning this verb consumes
// input records -- confirm against the TransformerSetup definition.
var TopSetup = TransformerSetup{
Verb: verbNameTop,
UsageFunc: transformerTopUsage,
ParseCLIFunc: transformerTopParseCLI,
IgnoresInput: false,
}
// transformerTopUsage writes the help text for the top verb to o: a usage
// line, one entry per flag, and a short description of the verb's output.
func transformerTopUsage(
	o *os.File,
) {
	fmt.Fprintf(o, "Usage: %s %s [options]\n", "mlr", verbNameTop)

	// Flag entries up to (not including) -o, which needs the default name
	// formatted in.
	for _, text := range []string{
		"-f {a,b,c} Value-field names for top counts.\n",
		"-g {d,e,f} Optional group-by-field names for top counts.\n",
		"-n {count} How many records to print per category; default 1.\n",
		"-a Print all fields for top-value records; default is\n",
		" to print only value and group-by fields. Requires a single\n",
		" value-field name only.\n",
		"--min Print top smallest values; default is top largest values.\n",
		"-F Keep top values as floats even if they look like integers.\n",
	} {
		fmt.Fprint(o, text)
	}

	fmt.Fprintf(o, "-o {name} Field name for output indices. Default \"%s\".\n", verbTopDefaultOutputFieldName)

	// Remainder of the -o entry, then the general description.
	for _, text := range []string{
		" This is ignored if -a is used.\n",
		"Prints the n records with smallest/largest values at specified fields,\n",
		"optionally by category. If -a is given, then the top records are emitted\n",
		"with the same fields as they appeared in the input. Without -a, only fields\n",
		"from -f, fields from -g, and the top-index field are emitted. For more information\n",
		"please see https://miller.readthedocs.io/en/latest/reference-verbs#top\n",
	} {
		fmt.Fprint(o, text)
	}
}
// transformerTopParseCLI parses the flags of the top verb. On entry
// args[*pargi] is the verb name; on return *pargi has been advanced past the
// verb name and every flag consumed here.
//
// The main-options argument is unused. When doConstruct is false (first pass
// of command-line parsing) the flags are still parsed and validated but nil
// is returned; when true, the configured transformer is returned.
//
// Exits the process in these cases:
//   - -h / --help: usage to stdout, exit status 0.
//   - unrecognized flag, no -f given, or -a together with more than one
//     value field: usage to stderr, exit status 1.
//   - the transformer cannot be constructed: the error to stderr, exit
//     status 1.
func transformerTopParseCLI(
	pargi *int,
	argc int,
	args []string,
	_ *cli.TOptions,
	doConstruct bool, // false for first pass of CLI-parse, true for second pass
) IRecordTransformer {
	// Skip the verb name from the current spot in the mlr command line
	argi := *pargi
	verb := args[argi]
	argi++

	// Local flags and their defaults
	topCount := int64(1)
	var valueFieldNames []string
	var groupByFieldNames []string
	showFullRecords := false
	doMax := true
	outputFieldName := verbTopDefaultOutputFieldName

	for argi < argc /* variable increment: 1 or 2 depending on flag */ {
		opt := args[argi]
		if !strings.HasPrefix(opt, "-") {
			break // No more flag options to process
		}
		if opt == "--" {
			break // All transformers must do this so main-flags can follow verb-flags
		}
		argi++

		switch opt {
		case "-h", "--help":
			transformerTopUsage(os.Stdout)
			os.Exit(0)
		case "-n":
			topCount = cli.VerbGetIntArgOrDie(verb, opt, args, &argi, argc)
		case "-f":
			valueFieldNames = cli.VerbGetStringArrayArgOrDie(verb, opt, args, &argi, argc)
		case "-g":
			groupByFieldNames = cli.VerbGetStringArrayArgOrDie(verb, opt, args, &argi, argc)
		case "-a":
			showFullRecords = true
		case "--max":
			doMax = true
		case "--min":
			doMax = false
		case "-F":
			// Ignored in Miller 6; allowed for command-line backward compatibility
		case "-o":
			outputFieldName = cli.VerbGetStringArgOrDie(verb, opt, args, &argi, argc)
		default:
			transformerTopUsage(os.Stderr)
			os.Exit(1)
		}
	}

	// -f is mandatory.
	if valueFieldNames == nil {
		transformerTopUsage(os.Stderr)
		os.Exit(1)
	}
	// Full-record output is only defined for a single value field.
	if len(valueFieldNames) > 1 && showFullRecords {
		transformerTopUsage(os.Stderr)
		os.Exit(1)
	}

	*pargi = argi
	if !doConstruct { // All transformers must do this for main command-line parsing
		return nil
	}

	transformer, err := NewTransformerTop(
		topCount,
		valueFieldNames,
		groupByFieldNames,
		showFullRecords,
		doMax,
		outputFieldName,
	)
	// Do not discard the constructor error: returning a nil *TransformerTop
	// here would yield a non-nil interface wrapping a nil pointer.
	if err != nil {
		fmt.Fprintf(os.Stderr, "mlr %s: %v\n", verb, err)
		os.Exit(1)
	}
	return transformer
}
// ----------------------------------------------------------------
// TransformerTop holds the options and the accumulated per-group state for
// the top verb.
type TransformerTop struct {
// Passed to each TopKeeper on creation; also the number of output records
// emitted per group when full records are not shown.
topCount int64
// Names of the fields whose values are ranked.
valueFieldNames []string
// Names of the fields whose values define the groups.
groupByFieldNames []string
// When true, whole input records are retained and emitted instead of
// synthesized records.
showFullRecords bool
// Passed to each TopKeeper on creation; set false by the --min flag.
doMax bool
// Name of the 1-up index field in synthesized output records.
outputFieldName string
// Two-level map from grouping key (string of joined-together group-by field values),
// to string value-field name, to *utils.TopKeeper
groups *lib.OrderedMap
// From grouping key to the group-by field values of the record that first
// produced that key.
groupingKeysToGroupByFieldValues map[string][]*mlrval.Mlrval
}
// ----------------------------------------------------------------
// NewTransformerTop returns a TransformerTop configured with the given
// options and with empty per-group state. The returned error is always nil.
func NewTransformerTop(
	topCount int64,
	valueFieldNames []string,
	groupByFieldNames []string,
	showFullRecords bool,
	doMax bool,
	outputFieldName string,
) (*TransformerTop, error) {
	return &TransformerTop{
		// Options, as supplied by the caller.
		topCount:          topCount,
		valueFieldNames:   valueFieldNames,
		groupByFieldNames: groupByFieldNames,
		showFullRecords:   showFullRecords,
		doMax:             doMax,
		outputFieldName:   outputFieldName,

		// State, filled in as records are ingested.
		groups:                           lib.NewOrderedMap(),
		groupingKeysToGroupByFieldValues: make(map[string][]*mlrval.Mlrval),
	}, nil
}
// Transform accumulates each input record into the per-group state; on the
// end-of-stream marker it instead writes the results to
// outputRecordsAndContexts.
func (tr *TransformerTop) Transform(
	inrecAndContext *types.RecordAndContext,
	outputRecordsAndContexts *list.List, // list of *types.RecordAndContext
	inputDownstreamDoneChannel <-chan bool,
	outputDownstreamDoneChannel chan<- bool,
) {
	HandleDefaultDownstreamDone(inputDownstreamDoneChannel, outputDownstreamDoneChannel)

	if inrecAndContext.EndOfStream {
		tr.emit(inrecAndContext, outputRecordsAndContexts)
		return
	}
	tr.ingest(inrecAndContext)
}
// ingest feeds one input record into the top-keepers of its group, creating
// the group and its keepers the first time they are seen. Records lacking
// any of the value fields or group-by fields are skipped.
func (tr *TransformerTop) ingest(
	inrecAndContext *types.RecordAndContext,
) {
	record := inrecAndContext.Record

	values, haveValues := record.ReferenceSelectedValues(tr.valueFieldNames)
	groupKey, groupValues, haveGroup := record.GetSelectedValuesAndJoined(tr.groupByFieldNames)

	// Heterogeneous data: this record does not carry every sought field.
	if !(haveValues && haveGroup) {
		return
	}

	// First level: grouping key -> per-value-field map.
	var perField *lib.OrderedMap
	if found := tr.groups.Get(groupKey); found != nil {
		perField = found.(*lib.OrderedMap)
	} else {
		perField = lib.NewOrderedMap()
		tr.groups.Put(groupKey, perField)
		tr.groupingKeysToGroupByFieldValues[groupKey] = groupValues
	}

	// The record itself is handed to the keepers only in full-record mode.
	var retained *types.RecordAndContext
	if tr.showFullRecords {
		retained = inrecAndContext
	}

	// Second level: value-field name -> top-keeper.
	for i, name := range tr.valueFieldNames {
		var keeper *utils.TopKeeper
		if found := perField.Get(name); found != nil {
			keeper = found.(*utils.TopKeeper)
		} else {
			keeper = utils.NewTopKeeper(tr.topCount, tr.doMax)
			perField.Put(name, keeper)
		}
		keeper.Add(values[i], retained)
	}
}
// ----------------------------------------------------------------
// emit appends the accumulated results to outputRecordsAndContexts, group by
// group in first-seen order, followed by the end-of-stream marker.
//
// In full-record mode, copies of the retained records are emitted. Otherwise
// topCount records are synthesized per group, each holding the group-by
// fields, the index field, and one "<name>_top" field per value field; ranks
// beyond what a keeper holds get a void value.
func (tr *TransformerTop) emit(
	inrecAndContext *types.RecordAndContext,
	outputRecordsAndContexts *list.List, // list of *types.RecordAndContext
) {
	for groupEntry := tr.groups.Head; groupEntry != nil; groupEntry = groupEntry.Next {
		perField := groupEntry.Value.(*lib.OrderedMap)

		// CLI parsing allows only one value field together with full
		// records, since here each retained record is emitted as-is, once
		// per keeper.
		if tr.showFullRecords {
			for fieldEntry := perField.Head; fieldEntry != nil; fieldEntry = fieldEntry.Next {
				keeper := fieldEntry.Value.(*utils.TopKeeper)
				for i := int64(0); i < keeper.GetSize(); i++ {
					outputRecordsAndContexts.PushBack(keeper.TopRecordsAndContexts[i].Copy())
				}
			}
			continue
		}

		groupValues := tr.groupingKeysToGroupByFieldValues[groupEntry.Key]

		for i := int64(0); i < tr.topCount; i++ {
			outrec := mlrval.NewMlrmapAsRecord()

			// Group-by fields first, e.g. a=s,b=t.
			for j, name := range tr.groupByFieldNames {
				outrec.PutCopy(name, groupValues[j])
			}

			// Then the index field and one x_top=# field per value field.
			for fieldEntry := perField.Head; fieldEntry != nil; fieldEntry = fieldEntry.Next {
				keeper := fieldEntry.Value.(*utils.TopKeeper)
				topFieldName := fieldEntry.Key + "_top"

				outrec.PutReference(tr.outputFieldName, mlrval.FromInt(i+1))
				if i >= keeper.GetSize() {
					outrec.PutCopy(topFieldName, mlrval.VOID)
				} else {
					outrec.PutReference(topFieldName, keeper.TopValues[i].Copy())
				}
			}

			outputRecordsAndContexts.PushBack(types.NewRecordAndContext(outrec, &inrecAndContext.Context))
		}
	}

	outputRecordsAndContexts.PushBack(inrecAndContext) // emit end-of-stream marker
}