mlr reorder with regex support [WIP] (#1473)

* mlr reorder with regex support for field-name selection

* neaten

* -r -b/-a; unit-test cases
This commit is contained in:
John Kerl 2024-01-21 15:17:33 -05:00 committed by GitHub
parent ac65675ab1
commit 81d11365a0
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
15 changed files with 269 additions and 145 deletions

View file

@ -27,6 +27,12 @@ type RecordTransformerFunc func(
outputDownstreamDoneChannel chan<- bool,
)
// RecordTransformerHelperFunc is the signature of a per-record helper used
// within some verbs. It receives only the input record-and-context and the
// output list; unlike RecordTransformerFunc it takes no downstream-done
// channels.
type RecordTransformerHelperFunc func(
inrecAndContext *types.RecordAndContext,
outputRecordsAndContexts *list.List, // list of *types.RecordAndContext
)
// TransformerUsageFunc is the signature of a function whose only argument is
// an output stream. Presumably it prints a verb's usage text to ostream --
// confirm against the per-verb implementations.
type TransformerUsageFunc func(
ostream *os.File,
)

View file

@ -4,6 +4,7 @@ import (
"container/list"
"fmt"
"os"
"regexp"
"strings"
"github.com/johnkerl/miller/pkg/cli"
@ -61,9 +62,9 @@ func transformerReorderParseCLI(
argi++
var fieldNames []string = nil
putAtEnd := false
beforeFieldName := ""
afterFieldName := ""
doRegexes := false
putAfter := false
centerFieldName := ""
for argi < argc /* variable increment: 1 or 2 depending on flag */ {
opt := args[argi]
@ -81,21 +82,23 @@ func transformerReorderParseCLI(
} else if opt == "-f" {
fieldNames = cli.VerbGetStringArrayArgOrDie(verb, opt, args, &argi, argc)
doRegexes = false
} else if opt == "-r" {
fieldNames = cli.VerbGetStringArrayArgOrDie(verb, opt, args, &argi, argc)
doRegexes = true
} else if opt == "-b" {
beforeFieldName = cli.VerbGetStringArgOrDie(verb, opt, args, &argi, argc)
afterFieldName = ""
putAtEnd = false
centerFieldName = cli.VerbGetStringArgOrDie(verb, opt, args, &argi, argc)
putAfter = false
} else if opt == "-a" {
afterFieldName = cli.VerbGetStringArgOrDie(verb, opt, args, &argi, argc)
beforeFieldName = ""
putAtEnd = false
centerFieldName = cli.VerbGetStringArgOrDie(verb, opt, args, &argi, argc)
putAfter = true
} else if opt == "-e" {
putAtEnd = true
beforeFieldName = ""
afterFieldName = ""
putAfter = true
centerFieldName = ""
} else {
transformerReorderUsage(os.Stderr)
@ -115,9 +118,9 @@ func transformerReorderParseCLI(
transformer, err := NewTransformerReorder(
fieldNames,
putAtEnd,
beforeFieldName,
afterFieldName,
doRegexes,
putAfter,
centerFieldName,
)
if err != nil {
fmt.Fprintln(os.Stderr, err)
@ -132,43 +135,71 @@ type TransformerReorder struct {
// input
fieldNames []string
fieldNamesSet map[string]bool
beforeFieldName string
afterFieldName string
regexes []*regexp.Regexp
centerFieldName string
putAfter bool
// state
recordTransformerFunc RecordTransformerFunc
recordTransformerFunc RecordTransformerHelperFunc
}
// NewTransformerReorder builds the reorder transformer. It picks the
// per-record helper stored in tr.recordTransformerFunc and, when doRegexes is
// set, compiles each entry of fieldNames as a Miller regex into tr.regexes.
//
// NOTE(review): this span is a rendered diff without +/- markers, so lines
// from the previous signature/body (putAtEnd, beforeFieldName,
// afterFieldName, and the if/else-if chain that follows the struct literal)
// appear interleaved with the new ones (doRegexes, putAfter, centerFieldName).
// Confirm against the actual file before relying on any single line.
func NewTransformerReorder(
fieldNames []string,
putAtEnd bool,
beforeFieldName string,
afterFieldName string,
doRegexes bool,
putAfter bool,
centerFieldName string,
) (*TransformerReorder, error) {
tr := &TransformerReorder{
fieldNames: fieldNames,
fieldNamesSet: lib.StringListToSet(fieldNames),
beforeFieldName: beforeFieldName,
afterFieldName: afterFieldName,
centerFieldName: centerFieldName,
putAfter: putAfter,
}
if putAtEnd {
tr.recordTransformerFunc = tr.reorderToEnd
} else if beforeFieldName != "" {
tr.recordTransformerFunc = tr.reorderBefore
} else if afterFieldName != "" {
tr.recordTransformerFunc = tr.reorderAfter
// An empty centerFieldName means "move to start or end of the record";
// putAfter then selects end (true) versus start (false).
if centerFieldName == "" {
if putAfter {
if doRegexes {
tr.recordTransformerFunc = tr.reorderToEndWithRegex
} else {
tr.recordTransformerFunc = tr.reorderToEndNoRegex
}
} else {
if doRegexes {
tr.recordTransformerFunc = tr.reorderToStartWithRegex
} else {
tr.recordTransformerFunc = tr.reorderToStartNoRegex
// The return value is unused, so this evidently reverses in place. Note
// that tr.fieldNames is the same slice the caller passed in.
// Presumably done so that per-name MoveToHead calls leave the
// first-listed name first -- confirm.
lib.ReverseStringList(tr.fieldNames)
}
}
} else {
tr.recordTransformerFunc = tr.reorderToStart
lib.ReverseStringList(tr.fieldNames)
// A non-empty centerFieldName means "place before/after that field".
if doRegexes {
tr.recordTransformerFunc = tr.reorderBeforeOrAfterWithRegex
} else {
tr.recordTransformerFunc = tr.reorderBeforeOrAfterNoRegex
}
}
if doRegexes {
// One compiled regex per entry of fieldNames, at the same index.
tr.regexes = make([]*regexp.Regexp, len(fieldNames))
for i, regexString := range fieldNames {
// Handles "a.*b"i Miller case-insensitive-regex specification
regex, err := lib.CompileMillerRegex(regexString)
if err != nil {
// NOTE(review): the message names verbNameCut although this is the
// reorder verb; this looks copy-pasted from the cut verb and presumably
// should use this verb's own name constant -- confirm.
// NOTE(review): this path exits the process even though the function
// returns an error; the only return below is (tr, nil).
fmt.Fprintf(
os.Stderr,
"%s %s: cannot compile regex [%s]\n",
"mlr", verbNameCut, regexString,
)
os.Exit(1)
}
tr.regexes[i] = regex
}
}
return tr, nil
}
// ----------------------------------------------------------------
func (tr *TransformerReorder) Transform(
inrecAndContext *types.RecordAndContext,
outputRecordsAndContexts *list.List, // list of *types.RecordAndContext
@ -176,156 +207,198 @@ func (tr *TransformerReorder) Transform(
outputDownstreamDoneChannel chan<- bool,
) {
HandleDefaultDownstreamDone(inputDownstreamDoneChannel, outputDownstreamDoneChannel)
tr.recordTransformerFunc(inrecAndContext, outputRecordsAndContexts, inputDownstreamDoneChannel, outputDownstreamDoneChannel)
}
// ----------------------------------------------------------------
func (tr *TransformerReorder) reorderToStart(
inrecAndContext *types.RecordAndContext,
outputRecordsAndContexts *list.List, // list of *types.RecordAndContext
inputDownstreamDoneChannel <-chan bool,
outputDownstreamDoneChannel chan<- bool,
) {
if !inrecAndContext.EndOfStream {
inrec := inrecAndContext.Record
for _, fieldName := range tr.fieldNames {
inrec.MoveToHead(fieldName)
}
outputRecordsAndContexts.PushBack(inrecAndContext)
tr.recordTransformerFunc(
inrecAndContext,
outputRecordsAndContexts,
)
} else {
outputRecordsAndContexts.PushBack(inrecAndContext) // end-of-stream marker
}
}
// ----------------------------------------------------------------
func (tr *TransformerReorder) reorderToEnd(
func (tr *TransformerReorder) reorderToStartNoRegex(
inrecAndContext *types.RecordAndContext,
outputRecordsAndContexts *list.List, // list of *types.RecordAndContext
inputDownstreamDoneChannel <-chan bool,
outputDownstreamDoneChannel chan<- bool,
) {
if !inrecAndContext.EndOfStream {
inrec := inrecAndContext.Record
for _, fieldName := range tr.fieldNames {
inrec.MoveToTail(fieldName)
}
outputRecordsAndContexts.PushBack(inrecAndContext)
} else {
outputRecordsAndContexts.PushBack(inrecAndContext) // end-of-stream marker
inrec := inrecAndContext.Record
for _, fieldName := range tr.fieldNames {
inrec.MoveToHead(fieldName)
}
outputRecordsAndContexts.PushBack(inrecAndContext)
}
// ----------------------------------------------------------------
func (tr *TransformerReorder) reorderBefore(
func (tr *TransformerReorder) reorderToStartWithRegex(
inrecAndContext *types.RecordAndContext,
outputRecordsAndContexts *list.List, // list of *types.RecordAndContext
inputDownstreamDoneChannel <-chan bool,
outputDownstreamDoneChannel chan<- bool,
) {
if !inrecAndContext.EndOfStream {
inrec := inrecAndContext.Record
if inrec.Get(tr.beforeFieldName) == nil {
outputRecordsAndContexts.PushBack(inrecAndContext)
return
}
inrec := inrecAndContext.Record
outrec := mlrval.NewMlrmapAsRecord()
pe := inrec.Head
// * inrec will be GC'ed
// * We will use outrec.PutReference not output.PutCopy since inrec will be GC'ed
for ; pe != nil; pe = pe.Next {
if pe.Key == tr.beforeFieldName {
outrec := mlrval.NewMlrmapAsRecord()
atEnds := list.New()
for pe := inrec.Head; pe != nil; pe = pe.Next {
found := false
for _, regex := range tr.regexes {
if regex.MatchString(pe.Key) {
outrec.PutReference(pe.Key, pe.Value)
found = true
break
}
if !tr.fieldNamesSet[pe.Key] {
outrec.PutReference(pe.Key, pe.Value)
}
if !found {
atEnds.PushBack(pe)
}
}
for atEnd := atEnds.Front(); atEnd != nil; atEnd = atEnd.Next() {
// Ownership transfer; no copy needed
pe := atEnd.Value.(*mlrval.MlrmapEntry)
outrec.PutReference(pe.Key, pe.Value)
}
outrecAndContext := types.NewRecordAndContext(outrec, &inrecAndContext.Context)
outputRecordsAndContexts.PushBack(outrecAndContext)
}
// reorderToEndNoRegex calls MoveToTail on the input record once for each name
// in tr.fieldNames, in list order, and then appends the same (mutated)
// record-and-context to the output list. No new record is allocated.
func (tr *TransformerReorder) reorderToEndNoRegex(
	inrecAndContext *types.RecordAndContext,
	outputRecordsAndContexts *list.List, // list of *types.RecordAndContext
) {
	record := inrecAndContext.Record
	for i := 0; i < len(tr.fieldNames); i++ {
		record.MoveToTail(tr.fieldNames[i])
	}
	outputRecordsAndContexts.PushBack(inrecAndContext)
}
// reorderToEndWithRegex builds a new record in which every field whose name
// matches at least one of tr.regexes is placed after all non-matching fields.
// Relative order is the input record's order within each group (not the order
// of the regexes). The new record is pushed to the output list together with
// the input's context.
//
// Matching entries are parked in a typed slice rather than a container/list,
// which avoids boxing each entry and the runtime type assertion on the way
// back out.
func (tr *TransformerReorder) reorderToEndWithRegex(
	inrecAndContext *types.RecordAndContext,
	outputRecordsAndContexts *list.List, // list of *types.RecordAndContext
) {
	inrec := inrecAndContext.Record
	outrec := mlrval.NewMlrmapAsRecord()

	// Entries to be appended once all non-matching fields have been emitted.
	var atEnds []*mlrval.MlrmapEntry

entries:
	for pe := inrec.Head; pe != nil; pe = pe.Next {
		for _, regex := range tr.regexes {
			if regex.MatchString(pe.Key) {
				// First matching regex wins; no need to try the rest.
				atEnds = append(atEnds, pe)
				continue entries
			}
		}
		outrec.PutReference(pe.Key, pe.Value)
	}

	for _, pe := range atEnds {
		// Ownership transfer; no copy needed
		outrec.PutReference(pe.Key, pe.Value)
	}

	outrecAndContext := types.NewRecordAndContext(outrec, &inrecAndContext.Context)
	outputRecordsAndContexts.PushBack(outrecAndContext)
}
func (tr *TransformerReorder) reorderBeforeOrAfterNoRegex(
inrecAndContext *types.RecordAndContext,
outputRecordsAndContexts *list.List, // list of *types.RecordAndContext
) {
inrec := inrecAndContext.Record
if inrec.Get(tr.centerFieldName) == nil {
outputRecordsAndContexts.PushBack(inrecAndContext)
return
}
outrec := mlrval.NewMlrmapAsRecord()
pe := inrec.Head
// We use outrec.PutReference not output.PutCopy since inrec will be GC'ed
for ; pe != nil; pe = pe.Next {
if pe.Key == tr.centerFieldName {
break
}
if !tr.fieldNamesSet[pe.Key] {
outrec.PutReference(pe.Key, pe.Value)
}
}
if !tr.putAfter {
for _, fieldName := range tr.fieldNames {
value := inrec.Get(fieldName)
if value != nil {
outrec.PutReference(fieldName, value)
}
}
value := inrec.Get(tr.beforeFieldName)
if value != nil {
outrec.PutReference(tr.beforeFieldName, value)
}
for ; pe != nil; pe = pe.Next {
if pe.Key != tr.beforeFieldName && !tr.fieldNamesSet[pe.Key] {
outrec.PutReference(pe.Key, pe.Value)
}
}
for _, fieldName := range tr.fieldNames {
inrec.MoveToHead(fieldName)
}
outputRecordsAndContexts.PushBack(types.NewRecordAndContext(outrec, &inrecAndContext.Context))
} else {
outputRecordsAndContexts.PushBack(inrecAndContext) // end-of-stream marker
}
}
// ----------------------------------------------------------------
func (tr *TransformerReorder) reorderAfter(
inrecAndContext *types.RecordAndContext,
outputRecordsAndContexts *list.List, // list of *types.RecordAndContext
inputDownstreamDoneChannel <-chan bool,
outputDownstreamDoneChannel chan<- bool,
) {
if !inrecAndContext.EndOfStream {
inrec := inrecAndContext.Record
if inrec.Get(tr.afterFieldName) == nil {
outputRecordsAndContexts.PushBack(inrecAndContext)
return
}
outrec := mlrval.NewMlrmapAsRecord()
pe := inrec.Head
// * inrec will be GC'ed
// * We will use outrec.PutReference not output.PutCopy since inrec will be GC'ed
for ; pe != nil; pe = pe.Next {
if pe.Key == tr.afterFieldName {
break
}
if !tr.fieldNamesSet[pe.Key] {
outrec.PutReference(pe.Key, pe.Value)
}
}
value := inrec.Get(tr.afterFieldName)
if value != nil {
outrec.PutReference(tr.afterFieldName, value)
}
value := inrec.Get(tr.centerFieldName)
if value != nil {
outrec.PutReference(tr.centerFieldName, value)
}
if tr.putAfter {
for _, fieldName := range tr.fieldNames {
value := inrec.Get(fieldName)
if value != nil {
outrec.PutReference(fieldName, value)
}
}
}
for ; pe != nil; pe = pe.Next {
if pe.Key != tr.afterFieldName && !tr.fieldNamesSet[pe.Key] {
outrec.PutReference(pe.Key, pe.Value)
for ; pe != nil; pe = pe.Next {
if pe.Key != tr.centerFieldName && !tr.fieldNamesSet[pe.Key] {
outrec.PutReference(pe.Key, pe.Value)
}
}
outputRecordsAndContexts.PushBack(types.NewRecordAndContext(outrec, &inrecAndContext.Context))
}
// reorderBeforeOrAfterWithRegex emits a new record in which the fields whose
// names match any of tr.regexes are grouped next to the field named
// tr.centerFieldName: immediately after it when tr.putAfter is true,
// immediately before it otherwise. The matched fields keep their
// input-record order. If the record has no center field, the input
// record-and-context is passed through unchanged.
func (tr *TransformerReorder) reorderBeforeOrAfterWithRegex(
inrecAndContext *types.RecordAndContext,
outputRecordsAndContexts *list.List, // list of *types.RecordAndContext
) {
inrec := inrecAndContext.Record
if inrec.Get(tr.centerFieldName) == nil {
outputRecordsAndContexts.PushBack(inrecAndContext)
return
}
// Pass 1: collect the matching (key, value) pairs in record order. The
// center field itself is never collected, so it cannot be emitted twice.
matchingFieldNamesSet := lib.NewOrderedMap()
for pe := inrec.Head; pe != nil; pe = pe.Next {
for _, regex := range tr.regexes {
if regex.MatchString(pe.Key) {
if pe.Key != tr.centerFieldName {
matchingFieldNamesSet.Put(pe.Key, pe.Value)
break
}
}
}
// NOTE(review): in this rendered diff the next several lines (the MoveToHead
// loop, a PushBack of outrec, and the end-of-stream else branch) appear to be
// removed lines from the previous implementation shown without -/+ markers,
// with one closing brace belonging to the loop above -- confirm against the
// actual file.
for _, fieldName := range tr.fieldNames {
inrec.MoveToHead(fieldName)
}
outputRecordsAndContexts.PushBack(types.NewRecordAndContext(outrec, &inrecAndContext.Context))
} else {
outputRecordsAndContexts.PushBack(inrecAndContext) // end-of-stream marker
}
// We use outrec.PutReference not output.PutCopy since inrec will be GC'ed
outrec := mlrval.NewMlrmapAsRecord()
// Pass 2: copy fields across in record order, splicing the collected group
// in at the center field and skipping collected fields everywhere else.
for pe := inrec.Head; pe != nil; pe = pe.Next {
if pe.Key == tr.centerFieldName {
if tr.putAfter {
outrec.PutReference(pe.Key, pe.Value)
}
for pf := matchingFieldNamesSet.Head; pf != nil; pf = pf.Next {
// Values were stored via Put in pass 1 and are asserted back to
// *mlrval.Mlrval here.
outrec.PutReference(pf.Key, pf.Value.(*mlrval.Mlrval))
}
if !tr.putAfter {
outrec.PutReference(pe.Key, pe.Value)
}
} else if !matchingFieldNamesSet.Has(pe.Key) {
outrec.PutReference(pe.Key, pe.Value)
}
}
outputRecordsAndContexts.PushBack(types.NewRecordAndContext(outrec, &inrecAndContext.Context))
}

View file

@ -0,0 +1 @@
mlr --n2x reorder -r 3,9,8 -a 6 test/input/reorder-regex.nidx

View file

@ -0,0 +1,10 @@
1 a
2 b
4 d
5 e
6 f
3 c
8 h
9 i
7 g
10 j

View file

@ -0,0 +1 @@
mlr --n2x reorder -r 3,9,8 -b 6 test/input/reorder-regex.nidx

View file

@ -0,0 +1,10 @@
1 a
2 b
4 d
5 e
3 c
8 h
9 i
6 f
7 g
10 j

View file

@ -0,0 +1 @@
mlr --n2x reorder -r 3,9,8 -e test/input/reorder-regex.nidx

View file

View file

@ -0,0 +1,10 @@
1 a
2 b
4 d
5 e
6 f
7 g
10 j
3 c
8 h
9 i

View file

@ -0,0 +1 @@
mlr --n2x reorder -r 3,9,8 test/input/reorder-regex.nidx

View file

@ -0,0 +1,10 @@
3 c
8 h
9 i
1 a
2 b
4 d
5 e
6 f
7 g
10 j

View file

@ -0,0 +1 @@
a b c d e f g h i j