mirror of
https://github.com/johnkerl/miller.git
synced 2026-01-23 02:14:13 +00:00
* Update package version * Update makefile targets * Update readme packages * Remaining old packages via rg/sd
290 lines
8.8 KiB
Go
290 lines
8.8 KiB
Go
package transformers
|
|
|
|
import (
|
|
"container/list"
|
|
"fmt"
|
|
"os"
|
|
"strings"
|
|
|
|
"github.com/johnkerl/miller/v6/pkg/cli"
|
|
"github.com/johnkerl/miller/v6/pkg/lib"
|
|
"github.com/johnkerl/miller/v6/pkg/mlrval"
|
|
"github.com/johnkerl/miller/v6/pkg/transformers/utils"
|
|
"github.com/johnkerl/miller/v6/pkg/types"
|
|
)
|
|
|
|
// ----------------------------------------------------------------
|
|
// Names used by the "top" verb.
const (
	// verbNameTop is the verb's name as typed on the mlr command line.
	verbNameTop = "top"
	// verbTopDefaultOutputFieldName is the output-index field name used
	// unless -o overrides it.
	verbTopDefaultOutputFieldName = "top_idx"
)
|
|
|
|
var TopSetup = TransformerSetup{
|
|
Verb: verbNameTop,
|
|
UsageFunc: transformerTopUsage,
|
|
ParseCLIFunc: transformerTopParseCLI,
|
|
IgnoresInput: false,
|
|
}
|
|
|
|
func transformerTopUsage(
|
|
o *os.File,
|
|
) {
|
|
argv0 := "mlr"
|
|
verb := verbNameTop
|
|
fmt.Fprintf(o, "Usage: %s %s [options]\n", argv0, verb)
|
|
fmt.Fprintf(o, "-f {a,b,c} Value-field names for top counts.\n")
|
|
fmt.Fprintf(o, "-g {d,e,f} Optional group-by-field names for top counts.\n")
|
|
fmt.Fprintf(o, "-n {count} How many records to print per category; default 1.\n")
|
|
fmt.Fprintf(o, "-a Print all fields for top-value records; default is\n")
|
|
fmt.Fprintf(o, " to print only value and group-by fields. Requires a single\n")
|
|
fmt.Fprintf(o, " value-field name only.\n")
|
|
fmt.Fprintf(o, "--min Print top smallest values; default is top largest values.\n")
|
|
fmt.Fprintf(o, "-F Keep top values as floats even if they look like integers.\n")
|
|
fmt.Fprintf(o, "-o {name} Field name for output indices. Default \"%s\".\n", verbTopDefaultOutputFieldName)
|
|
fmt.Fprintf(o, " This is ignored if -a is used.\n")
|
|
|
|
fmt.Fprintf(o, "Prints the n records with smallest/largest values at specified fields,\n")
|
|
fmt.Fprintf(o, "optionally by category. If -a is given, then the top records are emitted\n")
|
|
fmt.Fprintf(o, "with the same fields as they appeared in the input. Without -a, only fields\n")
|
|
fmt.Fprintf(o, "from -f, fields from -g, and the top-index field are emitted. For more information\n")
|
|
fmt.Fprintf(o, "please see https://miller.readthedocs.io/en/latest/reference-verbs#top\n")
|
|
}
|
|
|
|
func transformerTopParseCLI(
|
|
pargi *int,
|
|
argc int,
|
|
args []string,
|
|
_ *cli.TOptions,
|
|
doConstruct bool, // false for first pass of CLI-parse, true for second pass
|
|
) IRecordTransformer {
|
|
|
|
// Skip the verb name from the current spot in the mlr command line
|
|
argi := *pargi
|
|
verb := args[argi]
|
|
argi++
|
|
|
|
// Parse local flags
|
|
topCount := int64(1)
|
|
var valueFieldNames []string = nil
|
|
var groupByFieldNames []string = nil
|
|
showFullRecords := false
|
|
doMax := true
|
|
outputFieldName := verbTopDefaultOutputFieldName
|
|
|
|
for argi < argc /* variable increment: 1 or 2 depending on flag */ {
|
|
opt := args[argi]
|
|
if !strings.HasPrefix(opt, "-") {
|
|
break // No more flag options to process
|
|
}
|
|
if args[argi] == "--" {
|
|
break // All transformers must do this so main-flags can follow verb-flags
|
|
}
|
|
argi++
|
|
|
|
if opt == "-h" || opt == "--help" {
|
|
transformerTopUsage(os.Stdout)
|
|
os.Exit(0)
|
|
|
|
} else if opt == "-n" {
|
|
topCount = cli.VerbGetIntArgOrDie(verb, opt, args, &argi, argc)
|
|
} else if opt == "-f" {
|
|
valueFieldNames = cli.VerbGetStringArrayArgOrDie(verb, opt, args, &argi, argc)
|
|
} else if opt == "-g" {
|
|
groupByFieldNames = cli.VerbGetStringArrayArgOrDie(verb, opt, args, &argi, argc)
|
|
} else if opt == "-a" {
|
|
showFullRecords = true
|
|
} else if opt == "--max" {
|
|
doMax = true
|
|
} else if opt == "--min" {
|
|
doMax = false
|
|
} else if opt == "-F" {
|
|
// Ignored in Miller 6; allowed for command-line backward compatibility
|
|
} else if opt == "-o" {
|
|
outputFieldName = cli.VerbGetStringArgOrDie(verb, opt, args, &argi, argc)
|
|
|
|
} else {
|
|
transformerTopUsage(os.Stderr)
|
|
os.Exit(1)
|
|
}
|
|
}
|
|
|
|
if valueFieldNames == nil {
|
|
transformerTopUsage(os.Stderr)
|
|
os.Exit(1)
|
|
}
|
|
if len(valueFieldNames) > 1 && showFullRecords {
|
|
transformerTopUsage(os.Stderr)
|
|
os.Exit(1)
|
|
}
|
|
|
|
*pargi = argi
|
|
if !doConstruct { // All transformers must do this for main command-line parsing
|
|
return nil
|
|
}
|
|
|
|
transformer, _ := NewTransformerTop(
|
|
topCount,
|
|
valueFieldNames,
|
|
groupByFieldNames,
|
|
showFullRecords,
|
|
doMax,
|
|
outputFieldName,
|
|
)
|
|
|
|
return transformer
|
|
}
|
|
|
|
// ----------------------------------------------------------------
|
|
type TransformerTop struct {
|
|
topCount int64
|
|
valueFieldNames []string
|
|
groupByFieldNames []string
|
|
showFullRecords bool
|
|
doMax bool
|
|
outputFieldName string
|
|
|
|
// Two-level map from grouping key (string of joined-together group-by field values),
|
|
// to string value-field name, to *utils.TopKeeper
|
|
groups *lib.OrderedMap
|
|
groupingKeysToGroupByFieldValues map[string][]*mlrval.Mlrval
|
|
}
|
|
|
|
// ----------------------------------------------------------------
|
|
func NewTransformerTop(
|
|
topCount int64,
|
|
valueFieldNames []string,
|
|
groupByFieldNames []string,
|
|
showFullRecords bool,
|
|
doMax bool,
|
|
outputFieldName string,
|
|
) (*TransformerTop, error) {
|
|
|
|
tr := &TransformerTop{
|
|
topCount: topCount,
|
|
valueFieldNames: valueFieldNames,
|
|
groupByFieldNames: groupByFieldNames,
|
|
showFullRecords: showFullRecords,
|
|
doMax: doMax,
|
|
outputFieldName: outputFieldName,
|
|
|
|
groups: lib.NewOrderedMap(),
|
|
groupingKeysToGroupByFieldValues: make(map[string][]*mlrval.Mlrval),
|
|
}
|
|
|
|
return tr, nil
|
|
}
|
|
|
|
func (tr *TransformerTop) Transform(
|
|
inrecAndContext *types.RecordAndContext,
|
|
outputRecordsAndContexts *list.List, // list of *types.RecordAndContext
|
|
inputDownstreamDoneChannel <-chan bool,
|
|
outputDownstreamDoneChannel chan<- bool,
|
|
) {
|
|
HandleDefaultDownstreamDone(inputDownstreamDoneChannel, outputDownstreamDoneChannel)
|
|
if !inrecAndContext.EndOfStream {
|
|
tr.ingest(inrecAndContext)
|
|
} else {
|
|
tr.emit(inrecAndContext, outputRecordsAndContexts)
|
|
}
|
|
}
|
|
|
|
func (tr *TransformerTop) ingest(
|
|
inrecAndContext *types.RecordAndContext,
|
|
) {
|
|
inrec := inrecAndContext.Record
|
|
|
|
// ["s", "t"]
|
|
valueFieldValues, fok := inrec.ReferenceSelectedValues(tr.valueFieldNames)
|
|
groupingKey, groupByFieldValues, gok := inrec.GetSelectedValuesAndJoined(tr.groupByFieldNames)
|
|
|
|
// Heterogeneous-data case -- not all sought fields were present in record
|
|
if !fok || !gok {
|
|
return
|
|
}
|
|
iSecondLevel := tr.groups.Get(groupingKey)
|
|
var secondLevel *lib.OrderedMap = nil
|
|
if iSecondLevel == nil {
|
|
secondLevel = lib.NewOrderedMap()
|
|
tr.groups.Put(groupingKey, secondLevel)
|
|
tr.groupingKeysToGroupByFieldValues[groupingKey] = groupByFieldValues
|
|
} else {
|
|
secondLevel = iSecondLevel.(*lib.OrderedMap)
|
|
}
|
|
|
|
// for "x", "y" and "1", "2"
|
|
for i := range tr.valueFieldNames {
|
|
valueFieldName := tr.valueFieldNames[i]
|
|
valueFieldValue := valueFieldValues[i]
|
|
|
|
iTopKeeper := secondLevel.Get(valueFieldName)
|
|
var topKeeper *utils.TopKeeper
|
|
if iTopKeeper == nil {
|
|
topKeeper = utils.NewTopKeeper(tr.topCount, tr.doMax)
|
|
secondLevel.Put(valueFieldName, topKeeper)
|
|
} else {
|
|
topKeeper = iTopKeeper.(*utils.TopKeeper)
|
|
}
|
|
|
|
var maybeRecordAndContext *types.RecordAndContext = nil
|
|
if tr.showFullRecords {
|
|
maybeRecordAndContext = inrecAndContext
|
|
}
|
|
topKeeper.Add(
|
|
valueFieldValue,
|
|
maybeRecordAndContext,
|
|
)
|
|
}
|
|
|
|
}
|
|
|
|
// ----------------------------------------------------------------
|
|
func (tr *TransformerTop) emit(
|
|
inrecAndContext *types.RecordAndContext,
|
|
outputRecordsAndContexts *list.List, // list of *types.RecordAndContext
|
|
) {
|
|
for pa := tr.groups.Head; pa != nil; pa = pa.Next {
|
|
groupingKey := pa.Key
|
|
secondLevel := pa.Value.(*lib.OrderedMap)
|
|
groupByFieldValues := tr.groupingKeysToGroupByFieldValues[groupingKey]
|
|
|
|
// Above we required that there be only one value field in the
|
|
// show-full-records case. That's because here, we print each record at most
|
|
// once, which would need a change in the format presented as output.
|
|
if tr.showFullRecords {
|
|
for pb := secondLevel.Head; pb != nil; pb = pb.Next {
|
|
topKeeper := pb.Value.(*utils.TopKeeper)
|
|
for i := int64(0); i < topKeeper.GetSize(); i++ {
|
|
outputRecordsAndContexts.PushBack(topKeeper.TopRecordsAndContexts[i].Copy())
|
|
}
|
|
}
|
|
|
|
} else {
|
|
|
|
for i := int64(0); i < tr.topCount; i++ {
|
|
newrec := mlrval.NewMlrmapAsRecord()
|
|
|
|
// Add in a=s,b=t fields:
|
|
for j := range tr.groupByFieldNames {
|
|
newrec.PutCopy(tr.groupByFieldNames[j], groupByFieldValues[j])
|
|
}
|
|
|
|
// Add in fields such as x_top_1=#
|
|
// for "x", "y"
|
|
for pb := secondLevel.Head; pb != nil; pb = pb.Next {
|
|
valueFieldName := pb.Key
|
|
topKeeper := pb.Value.(*utils.TopKeeper)
|
|
key := valueFieldName + "_top"
|
|
if i < topKeeper.GetSize() {
|
|
newrec.PutReference(tr.outputFieldName, mlrval.FromInt(i+1))
|
|
newrec.PutReference(key, topKeeper.TopValues[i].Copy())
|
|
} else {
|
|
newrec.PutReference(tr.outputFieldName, mlrval.FromInt(i+1))
|
|
newrec.PutCopy(key, mlrval.VOID)
|
|
}
|
|
}
|
|
|
|
outputRecordsAndContexts.PushBack(types.NewRecordAndContext(newrec, &inrecAndContext.Context))
|
|
}
|
|
}
|
|
}
|
|
|
|
outputRecordsAndContexts.PushBack(inrecAndContext) // emit end-of-stream marker
|
|
}
|