From 9b806f9561686cd1035128d082e21048d353aeeb Mon Sep 17 00:00:00 2001 From: John Kerl Date: Wed, 23 Feb 2022 23:32:19 -0500 Subject: [PATCH] Add --left-keep-fields option for mlr join (#967) * Add --left-keep-fields option for mlr join * on-line help for mlr join --lk * doc-build artifacts * test cases --- docs/src/manpage.md | 3 ++ docs/src/manpage.txt | 3 ++ docs/src/reference-verbs.md | 3 ++ internal/pkg/transformers/join.go | 33 +++++++++++++++---- .../transformers/utils/join_bucket_keeper.go | 31 +++++++++++++++-- man/manpage.txt | 3 ++ man/mlr.1 | 3 ++ test/cases/cli-help/0001/expout | 3 ++ test/cases/verb-join/0213/cmd | 1 + test/cases/verb-join/0213/experr | 0 test/cases/verb-join/0213/expout | 3 ++ test/cases/verb-join/0214/cmd | 1 + test/cases/verb-join/0214/experr | 0 test/cases/verb-join/0214/expout | 3 ++ test/cases/verb-join/0215/cmd | 1 + test/cases/verb-join/0215/experr | 0 test/cases/verb-join/0215/expout | 3 ++ test/cases/verb-join/0216/cmd | 1 + test/cases/verb-join/0216/experr | 0 test/cases/verb-join/0216/expout | 3 ++ test/cases/verb-join/0217/cmd | 1 + test/cases/verb-join/0217/experr | 0 test/cases/verb-join/0217/expout | 3 ++ test/cases/verb-join/0218/cmd | 1 + test/cases/verb-join/0218/experr | 0 test/cases/verb-join/0218/expout | 3 ++ test/cases/verb-join/0219/cmd | 1 + test/cases/verb-join/0219/experr | 0 test/cases/verb-join/0219/expout | 3 ++ test/cases/verb-join/0220/cmd | 1 + test/cases/verb-join/0220/experr | 0 test/cases/verb-join/0220/expout | 3 ++ test/input/join-left-keep | 3 ++ test/input/join-right-keep | 5 +++ todo.txt | 7 +--- 35 files changed, 115 insertions(+), 14 deletions(-) create mode 100644 test/cases/verb-join/0213/cmd create mode 100644 test/cases/verb-join/0213/experr create mode 100644 test/cases/verb-join/0213/expout create mode 100644 test/cases/verb-join/0214/cmd create mode 100644 test/cases/verb-join/0214/experr create mode 100644 test/cases/verb-join/0214/expout create mode 100644 
test/cases/verb-join/0215/cmd create mode 100644 test/cases/verb-join/0215/experr create mode 100644 test/cases/verb-join/0215/expout create mode 100644 test/cases/verb-join/0216/cmd create mode 100644 test/cases/verb-join/0216/experr create mode 100644 test/cases/verb-join/0216/expout create mode 100644 test/cases/verb-join/0217/cmd create mode 100644 test/cases/verb-join/0217/experr create mode 100644 test/cases/verb-join/0217/expout create mode 100644 test/cases/verb-join/0218/cmd create mode 100644 test/cases/verb-join/0218/experr create mode 100644 test/cases/verb-join/0218/expout create mode 100644 test/cases/verb-join/0219/cmd create mode 100644 test/cases/verb-join/0219/experr create mode 100644 test/cases/verb-join/0219/expout create mode 100644 test/cases/verb-join/0220/cmd create mode 100644 test/cases/verb-join/0220/experr create mode 100644 test/cases/verb-join/0220/expout create mode 100644 test/input/join-left-keep create mode 100644 test/input/join-right-keep diff --git a/docs/src/manpage.md b/docs/src/manpage.md index 825ad8039..e00708424 100644 --- a/docs/src/manpage.md +++ b/docs/src/manpage.md @@ -1269,6 +1269,9 @@ VERBS defaults to -j values if omitted. -r {a,b,c} Comma-separated join-field names for right input file(s); defaults to -j values if omitted. + --lk|--left-keep-field-names {a,b,c} If supplied, this means keep only the specified field + names from the left file. Automatically includes the join-field name(s). Helpful + for when you only want a limited subset of information from the left file. --lp {text} Additional prefix for non-join output field names from the left file --rp {text} Additional prefix for non-join output field names from diff --git a/docs/src/manpage.txt b/docs/src/manpage.txt index b776d44a2..efdf5f9be 100644 --- a/docs/src/manpage.txt +++ b/docs/src/manpage.txt @@ -1248,6 +1248,9 @@ VERBS defaults to -j values if omitted. 
-r {a,b,c} Comma-separated join-field names for right input file(s); defaults to -j values if omitted. + --lk|--left-keep-field-names {a,b,c} If supplied, this means keep only the specified field + names from the left file. Automatically includes the join-field name(s). Helpful + for when you only want a limited subset of information from the left file. --lp {text} Additional prefix for non-join output field names from the left file --rp {text} Additional prefix for non-join output field names from diff --git a/docs/src/reference-verbs.md b/docs/src/reference-verbs.md index 3b4e73c5f..2b48162e5 100644 --- a/docs/src/reference-verbs.md +++ b/docs/src/reference-verbs.md @@ -1572,6 +1572,9 @@ Options: defaults to -j values if omitted. -r {a,b,c} Comma-separated join-field names for right input file(s); defaults to -j values if omitted. + --lk|--left-keep-field-names {a,b,c} If supplied, this means keep only the specified field + names from the left file. Automatically includes the join-field name(s). Helpful + for when you only want a limited subset of information from the left file. 
--lp {text} Additional prefix for non-join output field names from the left file --rp {text} Additional prefix for non-join output field names from diff --git a/internal/pkg/transformers/join.go b/internal/pkg/transformers/join.go index 59e40cb26..035f70e15 100644 --- a/internal/pkg/transformers/join.go +++ b/internal/pkg/transformers/join.go @@ -35,6 +35,7 @@ type tJoinOptions struct { rightPrefix string outputJoinFieldNames []string + leftKeepFieldNames []string leftJoinFieldNames []string rightJoinFieldNames []string @@ -57,6 +58,7 @@ func newJoinOptions() *tJoinOptions { rightPrefix: "", outputJoinFieldNames: nil, + leftKeepFieldNames: nil, leftJoinFieldNames: nil, rightJoinFieldNames: nil, @@ -89,6 +91,9 @@ func transformerJoinUsage( fmt.Fprintf(o, " defaults to -j values if omitted.\n") fmt.Fprintf(o, " -r {a,b,c} Comma-separated join-field names for right input file(s);\n") fmt.Fprintf(o, " defaults to -j values if omitted.\n") + fmt.Fprintf(o, " --lk|--left-keep-field-names {a,b,c} If supplied, this means keep only the specified field\n") + fmt.Fprintf(o, " names from the left file. Automatically includes the join-field name(s). 
Helpful\n") + fmt.Fprintf(o, " for when you only want a limited subset of information from the left file.\n") fmt.Fprintf(o, " --lp {text} Additional prefix for non-join output field names from\n") fmt.Fprintf(o, " the left file\n") fmt.Fprintf(o, " --rp {text} Additional prefix for non-join output field names from\n") @@ -185,6 +190,9 @@ func transformerJoinParseCLI( } else if opt == "-l" { opts.leftJoinFieldNames = cli.VerbGetStringArrayArgOrDie(verb, opt, args, &argi, argc) + } else if opt == "--lk" || opt == "--left-keep-field-names" { + opts.leftKeepFieldNames = cli.VerbGetStringArrayArgOrDie(verb, opt, args, &argi, argc) + } else if opt == "-r" { opts.rightJoinFieldNames = cli.VerbGetStringArrayArgOrDie(verb, opt, args, &argi, argc) @@ -280,8 +288,9 @@ func transformerJoinParseCLI( type TransformerJoin struct { opts *tJoinOptions - leftFieldNameSet map[string]bool - rightFieldNameSet map[string]bool + leftFieldNameSet map[string]bool + rightFieldNameSet map[string]bool + leftKeepFieldNameSet map[string]bool // For unsorted/half-streaming input ingested bool @@ -302,14 +311,23 @@ func NewTransformerJoin( tr := &TransformerJoin{ opts: opts, - leftFieldNameSet: lib.StringListToSet(opts.leftJoinFieldNames), - rightFieldNameSet: lib.StringListToSet(opts.rightJoinFieldNames), + leftFieldNameSet: lib.StringListToSet(opts.leftJoinFieldNames), + rightFieldNameSet: lib.StringListToSet(opts.rightJoinFieldNames), + leftKeepFieldNameSet: lib.StringListToSet(opts.leftKeepFieldNames), ingested: false, leftBucketsByJoinFieldValues: nil, leftUnpairableRecordsAndContexts: nil, joinBucketKeeper: nil, } + // Suppose left file has "id,foo,bar" and right has "id,baz,quux" and the join field name is + // "id". If they ask for --lk id,foo we should keep only id,foo from the left file. But if + // they ask for --lk foo we should keep id *and* foo from the left file. 
+ if tr.leftKeepFieldNameSet != nil { + for _, name := range opts.leftJoinFieldNames { + tr.leftKeepFieldNameSet[name] = true + } + } if opts.allowUnsortedInput { // Half-streaming (default) case: ingest entire left file first. @@ -325,10 +343,11 @@ func NewTransformerJoin( // too much RAM. tr.joinBucketKeeper = utils.NewJoinBucketKeeper( - // opts.prepipe, + // opts.prepipe, opts.leftFileName, &opts.joinFlagOptions.ReaderOptions, opts.leftJoinFieldNames, + tr.leftKeepFieldNameSet, ) tr.recordTransformerFunc = tr.transformDoublyStreaming @@ -346,7 +365,8 @@ func (tr *TransformerJoin) Transform( outputDownstreamDoneChannel chan<- bool, ) { HandleDefaultDownstreamDone(inputDownstreamDoneChannel, outputDownstreamDoneChannel) - tr.recordTransformerFunc(inrecAndContext, outputRecordsAndContexts, inputDownstreamDoneChannel, outputDownstreamDoneChannel) + tr.recordTransformerFunc(inrecAndContext, outputRecordsAndContexts, + inputDownstreamDoneChannel, outputDownstreamDoneChannel) } // ---------------------------------------------------------------- @@ -500,6 +520,7 @@ func (tr *TransformerJoin) ingestLeftFile() { // TODO: temp for batch-reader refactor lib.InternalCodingErrorIf(leftrecsAndContexts.Len() != 1) leftrecAndContext := leftrecsAndContexts.Front().Value.(*types.RecordAndContext) + leftrecAndContext.Record = utils.KeepLeftFieldNames(leftrecAndContext.Record, tr.leftKeepFieldNameSet) if leftrecAndContext.EndOfStream { done = true diff --git a/internal/pkg/transformers/utils/join_bucket_keeper.go b/internal/pkg/transformers/utils/join_bucket_keeper.go index 535479fb6..6f0e307b9 100644 --- a/internal/pkg/transformers/utils/join_bucket_keeper.go +++ b/internal/pkg/transformers/utils/join_bucket_keeper.go @@ -131,7 +131,8 @@ type JoinBucketKeeper struct { // TODO: merge with leof flag recordReaderDone bool - leftJoinFieldNames []string + leftJoinFieldNames []string + leftKeepFieldNameSet map[string]bool // Given a left-file of the following form (with left-join-field 
name "L"): // +-----+ @@ -163,6 +164,7 @@ func NewJoinBucketKeeper( leftFileName string, joinReaderOptions *cli.TReaderOptions, leftJoinFieldNames []string, + leftKeepFieldNameSet map[string]bool, ) *JoinBucketKeeper { // Instantiate the record-reader @@ -194,7 +196,8 @@ func NewJoinBucketKeeper( errorChannel: errorChannel, recordReaderDone: false, - leftJoinFieldNames: leftJoinFieldNames, + leftJoinFieldNames: leftJoinFieldNames, + leftKeepFieldNameSet: leftKeepFieldNameSet, JoinBucket: NewJoinBucket(nil), peekRecordAndContext: nil, @@ -575,6 +578,7 @@ func (keeper *JoinBucketKeeper) readRecord() *types.RecordAndContext { // TODO: temp lib.InternalCodingErrorIf(leftrecsAndContexts.Len() != 1) leftrecAndContext := leftrecsAndContexts.Front().Value.(*types.RecordAndContext) + leftrecAndContext.Record = KeepLeftFieldNames(leftrecAndContext.Record, keeper.leftKeepFieldNameSet) if leftrecAndContext.EndOfStream { // end-of-stream marker keeper.recordReaderDone = true return nil @@ -624,3 +628,26 @@ func compareLexically( } return 0 } + +// KeepLeftFieldNames is for when the user wants only selected fields out of the left file. +func KeepLeftFieldNames( + inrec *mlrval.Mlrmap, + leftKeepFieldNameSet map[string]bool, +) *mlrval.Mlrmap { + if inrec == nil { + return inrec + } else if leftKeepFieldNameSet == nil { + // Normal case + return inrec + } else { + outrec := mlrval.NewMlrmap() + for pe := inrec.Head; pe != nil; pe = pe.Next { + if leftKeepFieldNameSet[pe.Key] { + // PutReference, not PutCopy, since the inrec will be freed and this + // is an ownership transfer. + outrec.PutReference(pe.Key, pe.Value) + } + } + return outrec + } +} diff --git a/man/manpage.txt b/man/manpage.txt index b776d44a2..efdf5f9be 100644 --- a/man/manpage.txt +++ b/man/manpage.txt @@ -1248,6 +1248,9 @@ VERBS defaults to -j values if omitted. -r {a,b,c} Comma-separated join-field names for right input file(s); defaults to -j values if omitted. 
+ --lk|--left-keep-field-names {a,b,c} If supplied, this means keep only the specified field + names from the left file. Automatically includes the join-field name(s). Helpful + for when you only want a limited subset of information from the left file. --lp {text} Additional prefix for non-join output field names from the left file --rp {text} Additional prefix for non-join output field names from diff --git a/man/mlr.1 b/man/mlr.1 index ed2c566ac..f4fa79408 100644 --- a/man/mlr.1 +++ b/man/mlr.1 @@ -1573,6 +1573,9 @@ Options: defaults to -j values if omitted. -r {a,b,c} Comma-separated join-field names for right input file(s); defaults to -j values if omitted. + --lk|--left-keep-field-names {a,b,c} If supplied, this means keep only the specified field + names from the left file. Automatically includes the join-field name(s). Helpful + for when you only want a limited subset of information from the left file. --lp {text} Additional prefix for non-join output field names from the left file --rp {text} Additional prefix for non-join output field names from diff --git a/test/cases/cli-help/0001/expout b/test/cases/cli-help/0001/expout index 3ff353bc2..b5724e8f2 100644 --- a/test/cases/cli-help/0001/expout +++ b/test/cases/cli-help/0001/expout @@ -430,6 +430,9 @@ Options: defaults to -j values if omitted. -r {a,b,c} Comma-separated join-field names for right input file(s); defaults to -j values if omitted. + --lk|--left-keep-field-names {a,b,c} If supplied, this means keep only the specified field + names from the left file. Automatically includes the join-field name(s). Helpful + for when you only want a limited subset of information from the left file. 
--lp {text} Additional prefix for non-join output field names from the left file --rp {text} Additional prefix for non-join output field names from diff --git a/test/cases/verb-join/0213/cmd b/test/cases/verb-join/0213/cmd new file mode 100644 index 000000000..4dd7193bf --- /dev/null +++ b/test/cases/verb-join/0213/cmd @@ -0,0 +1 @@ +mlr --csv join -j a -f test/input/join-left-keep test/input/join-right-keep diff --git a/test/cases/verb-join/0213/experr b/test/cases/verb-join/0213/experr new file mode 100644 index 000000000..e69de29bb diff --git a/test/cases/verb-join/0213/expout b/test/cases/verb-join/0213/expout new file mode 100644 index 000000000..bd15430d4 --- /dev/null +++ b/test/cases/verb-join/0213/expout @@ -0,0 +1,3 @@ +a,b,c,d,x,y,z +1,2,3,4,5,6,7 +1,2,3,4,7,8,9 diff --git a/test/cases/verb-join/0214/cmd b/test/cases/verb-join/0214/cmd new file mode 100644 index 000000000..ef709c759 --- /dev/null +++ b/test/cases/verb-join/0214/cmd @@ -0,0 +1 @@ +mlr --csv join --lk b -j a -f test/input/join-left-keep test/input/join-right-keep diff --git a/test/cases/verb-join/0214/experr b/test/cases/verb-join/0214/experr new file mode 100644 index 000000000..e69de29bb diff --git a/test/cases/verb-join/0214/expout b/test/cases/verb-join/0214/expout new file mode 100644 index 000000000..531f3a6e5 --- /dev/null +++ b/test/cases/verb-join/0214/expout @@ -0,0 +1,3 @@ +a,b,x,y,z +1,2,5,6,7 +1,2,7,8,9 diff --git a/test/cases/verb-join/0215/cmd b/test/cases/verb-join/0215/cmd new file mode 100644 index 000000000..a65b53230 --- /dev/null +++ b/test/cases/verb-join/0215/cmd @@ -0,0 +1 @@ +mlr --csv join --lk a -j a -f test/input/join-left-keep test/input/join-right-keep diff --git a/test/cases/verb-join/0215/experr b/test/cases/verb-join/0215/experr new file mode 100644 index 000000000..e69de29bb diff --git a/test/cases/verb-join/0215/expout b/test/cases/verb-join/0215/expout new file mode 100644 index 000000000..8113a5b93 --- /dev/null +++ b/test/cases/verb-join/0215/expout @@ 
-0,0 +1,3 @@ +a,x,y,z +1,5,6,7 +1,7,8,9 diff --git a/test/cases/verb-join/0216/cmd b/test/cases/verb-join/0216/cmd new file mode 100644 index 000000000..4e67450b9 --- /dev/null +++ b/test/cases/verb-join/0216/cmd @@ -0,0 +1 @@ +mlr --csv join --lk a,b -j a -f test/input/join-left-keep test/input/join-right-keep diff --git a/test/cases/verb-join/0216/experr b/test/cases/verb-join/0216/experr new file mode 100644 index 000000000..e69de29bb diff --git a/test/cases/verb-join/0216/expout b/test/cases/verb-join/0216/expout new file mode 100644 index 000000000..531f3a6e5 --- /dev/null +++ b/test/cases/verb-join/0216/expout @@ -0,0 +1,3 @@ +a,b,x,y,z +1,2,5,6,7 +1,2,7,8,9 diff --git a/test/cases/verb-join/0217/cmd b/test/cases/verb-join/0217/cmd new file mode 100644 index 000000000..b6c83ccd7 --- /dev/null +++ b/test/cases/verb-join/0217/cmd @@ -0,0 +1 @@ +mlr --csv join -s -j a -f test/input/join-left-keep test/input/join-right-keep diff --git a/test/cases/verb-join/0217/experr b/test/cases/verb-join/0217/experr new file mode 100644 index 000000000..e69de29bb diff --git a/test/cases/verb-join/0217/expout b/test/cases/verb-join/0217/expout new file mode 100644 index 000000000..bd15430d4 --- /dev/null +++ b/test/cases/verb-join/0217/expout @@ -0,0 +1,3 @@ +a,b,c,d,x,y,z +1,2,3,4,5,6,7 +1,2,3,4,7,8,9 diff --git a/test/cases/verb-join/0218/cmd b/test/cases/verb-join/0218/cmd new file mode 100644 index 000000000..8d9b6eb92 --- /dev/null +++ b/test/cases/verb-join/0218/cmd @@ -0,0 +1 @@ +mlr --csv join -s --lk b -j a -f test/input/join-left-keep test/input/join-right-keep diff --git a/test/cases/verb-join/0218/experr b/test/cases/verb-join/0218/experr new file mode 100644 index 000000000..e69de29bb diff --git a/test/cases/verb-join/0218/expout b/test/cases/verb-join/0218/expout new file mode 100644 index 000000000..531f3a6e5 --- /dev/null +++ b/test/cases/verb-join/0218/expout @@ -0,0 +1,3 @@ +a,b,x,y,z +1,2,5,6,7 +1,2,7,8,9 diff --git a/test/cases/verb-join/0219/cmd 
b/test/cases/verb-join/0219/cmd new file mode 100644 index 000000000..067bb0d75 --- /dev/null +++ b/test/cases/verb-join/0219/cmd @@ -0,0 +1 @@ +mlr --csv join -s --lk a -j a -f test/input/join-left-keep test/input/join-right-keep diff --git a/test/cases/verb-join/0219/experr b/test/cases/verb-join/0219/experr new file mode 100644 index 000000000..e69de29bb diff --git a/test/cases/verb-join/0219/expout b/test/cases/verb-join/0219/expout new file mode 100644 index 000000000..8113a5b93 --- /dev/null +++ b/test/cases/verb-join/0219/expout @@ -0,0 +1,3 @@ +a,x,y,z +1,5,6,7 +1,7,8,9 diff --git a/test/cases/verb-join/0220/cmd b/test/cases/verb-join/0220/cmd new file mode 100644 index 000000000..79e133095 --- /dev/null +++ b/test/cases/verb-join/0220/cmd @@ -0,0 +1 @@ +mlr --csv join -s --lk a,b -j a -f test/input/join-left-keep test/input/join-right-keep diff --git a/test/cases/verb-join/0220/experr b/test/cases/verb-join/0220/experr new file mode 100644 index 000000000..e69de29bb diff --git a/test/cases/verb-join/0220/expout b/test/cases/verb-join/0220/expout new file mode 100644 index 000000000..531f3a6e5 --- /dev/null +++ b/test/cases/verb-join/0220/expout @@ -0,0 +1,3 @@ +a,b,x,y,z +1,2,5,6,7 +1,2,7,8,9 diff --git a/test/input/join-left-keep b/test/input/join-left-keep new file mode 100644 index 000000000..a7d6aa215 --- /dev/null +++ b/test/input/join-left-keep @@ -0,0 +1,3 @@ +a,b,c,d +1,2,3,4 +5,6,7,8 diff --git a/test/input/join-right-keep b/test/input/join-right-keep new file mode 100644 index 000000000..a81a22f56 --- /dev/null +++ b/test/input/join-right-keep @@ -0,0 +1,5 @@ +a,x,y,z +1,5,6,7 +1,7,8,9 +2,6,7,8 +2,8,9,9 diff --git a/todo.txt b/todo.txt index 5163ad020..bd5ae6dd6 100644 --- a/todo.txt +++ b/todo.txt @@ -5,13 +5,8 @@ RELEASES o mlr join --left-fields a,b,c o fmt/unfmt/regex doc o FAQ/examples reorg - ? ??? for doc-string contents search -- ? & mlr help namegoeshere foo ... 
- o several needs-doc issues - i https://github.com/johnkerl/miller/issues?q=is%3Aissue+is%3Aopen+label%3Aneeds-documentation - - 908 - :context - https://github.com/johnkerl/miller/issues/908#issuecomment-1032573038 NFR + o https://github.com/johnkerl/miller/issues?q=is%3Aissue+is%3Aopen+label%3Aneeds-documentation k strptime/882 k fmtifnum, & recursive fmtnum/fmtifnum