Add --left-keep-fields option for mlr join (#967)

* Add --left-keep-fields option for mlr join

* on-line help for mlr join --lk

* doc-build artifacts

* test cases
This commit is contained in:
John Kerl 2022-02-23 23:32:19 -05:00 committed by GitHub
parent f6d897bf7d
commit 9b806f9561
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
35 changed files with 115 additions and 14 deletions

View file

@ -1269,6 +1269,9 @@ VERBS
defaults to -j values if omitted.
-r {a,b,c} Comma-separated join-field names for right input file(s);
defaults to -j values if omitted.
--lk|--left-keep-field-names {a,b,c} If supplied, this means keep only the specified field
names from the left file. Automatically includes the join-field name(s). Helpful
for when you only want a limited subset of information from the left file.
--lp {text} Additional prefix for non-join output field names from
the left file
--rp {text} Additional prefix for non-join output field names from

View file

@ -1248,6 +1248,9 @@ VERBS
defaults to -j values if omitted.
-r {a,b,c} Comma-separated join-field names for right input file(s);
defaults to -j values if omitted.
--lk|--left-keep-field-names {a,b,c} If supplied, this means keep only the specified field
names from the left file. Automatically includes the join-field name(s). Helpful
for when you only want a limited subset of information from the left file.
--lp {text} Additional prefix for non-join output field names from
the left file
--rp {text} Additional prefix for non-join output field names from

View file

@ -1572,6 +1572,9 @@ Options:
defaults to -j values if omitted.
-r {a,b,c} Comma-separated join-field names for right input file(s);
defaults to -j values if omitted.
--lk|--left-keep-field-names {a,b,c} If supplied, this means keep only the specified field
names from the left file. Automatically includes the join-field name(s). Helpful
for when you only want a limited subset of information from the left file.
--lp {text} Additional prefix for non-join output field names from
the left file
--rp {text} Additional prefix for non-join output field names from

View file

@ -35,6 +35,7 @@ type tJoinOptions struct {
rightPrefix string
outputJoinFieldNames []string
leftKeepFieldNames []string
leftJoinFieldNames []string
rightJoinFieldNames []string
@ -57,6 +58,7 @@ func newJoinOptions() *tJoinOptions {
rightPrefix: "",
outputJoinFieldNames: nil,
leftKeepFieldNames: nil,
leftJoinFieldNames: nil,
rightJoinFieldNames: nil,
@ -89,6 +91,9 @@ func transformerJoinUsage(
fmt.Fprintf(o, " defaults to -j values if omitted.\n")
fmt.Fprintf(o, " -r {a,b,c} Comma-separated join-field names for right input file(s);\n")
fmt.Fprintf(o, " defaults to -j values if omitted.\n")
fmt.Fprintf(o, " --lk|--left-keep-field-names {a,b,c} If supplied, this means keep only the specified field\n")
fmt.Fprintf(o, " names from the left file. Automatically includes the join-field name(s). Helpful\n")
fmt.Fprintf(o, " for when you only want a limited subset of information from the left file.\n")
fmt.Fprintf(o, " --lp {text} Additional prefix for non-join output field names from\n")
fmt.Fprintf(o, " the left file\n")
fmt.Fprintf(o, " --rp {text} Additional prefix for non-join output field names from\n")
@ -185,6 +190,9 @@ func transformerJoinParseCLI(
} else if opt == "-l" {
opts.leftJoinFieldNames = cli.VerbGetStringArrayArgOrDie(verb, opt, args, &argi, argc)
} else if opt == "--lk" || opt == "--left-keep-field-names" {
opts.leftKeepFieldNames = cli.VerbGetStringArrayArgOrDie(verb, opt, args, &argi, argc)
} else if opt == "-r" {
opts.rightJoinFieldNames = cli.VerbGetStringArrayArgOrDie(verb, opt, args, &argi, argc)
@ -280,8 +288,9 @@ func transformerJoinParseCLI(
type TransformerJoin struct {
opts *tJoinOptions
leftFieldNameSet map[string]bool
rightFieldNameSet map[string]bool
leftFieldNameSet map[string]bool
rightFieldNameSet map[string]bool
leftKeepFieldNameSet map[string]bool
// For unsorted/half-streaming input
ingested bool
@ -302,14 +311,23 @@ func NewTransformerJoin(
tr := &TransformerJoin{
opts: opts,
leftFieldNameSet: lib.StringListToSet(opts.leftJoinFieldNames),
rightFieldNameSet: lib.StringListToSet(opts.rightJoinFieldNames),
leftFieldNameSet: lib.StringListToSet(opts.leftJoinFieldNames),
rightFieldNameSet: lib.StringListToSet(opts.rightJoinFieldNames),
leftKeepFieldNameSet: lib.StringListToSet(opts.leftKeepFieldNames),
ingested: false,
leftBucketsByJoinFieldValues: nil,
leftUnpairableRecordsAndContexts: nil,
joinBucketKeeper: nil,
}
// Suppose left file has "id,foo,bar" and right has "id,baz,quux" and the join field name is
// "id". If they ask for --lk id,foo we should keep only id,foo from the left file. But if
// they ask for --lk foo we should keep id *and* foo from the left file.
if tr.leftKeepFieldNameSet != nil {
for _, name := range opts.leftJoinFieldNames {
tr.leftKeepFieldNameSet[name] = true
}
}
if opts.allowUnsortedInput {
// Half-streaming (default) case: ingest entire left file first.
@ -325,10 +343,11 @@ func NewTransformerJoin(
// too much RAM.
tr.joinBucketKeeper = utils.NewJoinBucketKeeper(
// opts.prepipe,
// opts.prepipe,
opts.leftFileName,
&opts.joinFlagOptions.ReaderOptions,
opts.leftJoinFieldNames,
tr.leftKeepFieldNameSet,
)
tr.recordTransformerFunc = tr.transformDoublyStreaming
@ -346,7 +365,8 @@ func (tr *TransformerJoin) Transform(
outputDownstreamDoneChannel chan<- bool,
) {
HandleDefaultDownstreamDone(inputDownstreamDoneChannel, outputDownstreamDoneChannel)
tr.recordTransformerFunc(inrecAndContext, outputRecordsAndContexts, inputDownstreamDoneChannel, outputDownstreamDoneChannel)
tr.recordTransformerFunc(inrecAndContext, outputRecordsAndContexts,
inputDownstreamDoneChannel, outputDownstreamDoneChannel)
}
// ----------------------------------------------------------------
@ -500,6 +520,7 @@ func (tr *TransformerJoin) ingestLeftFile() {
// TODO: temp for batch-reader refactor
lib.InternalCodingErrorIf(leftrecsAndContexts.Len() != 1)
leftrecAndContext := leftrecsAndContexts.Front().Value.(*types.RecordAndContext)
leftrecAndContext.Record = utils.KeepLeftFieldNames(leftrecAndContext.Record, tr.leftKeepFieldNameSet)
if leftrecAndContext.EndOfStream {
done = true

View file

@ -131,7 +131,8 @@ type JoinBucketKeeper struct {
// TODO: merge with leof flag
recordReaderDone bool
leftJoinFieldNames []string
leftJoinFieldNames []string
leftKeepFieldNameSet map[string]bool
// Given a left-file of the following form (with left-join-field name "L"):
// +-----+
@ -163,6 +164,7 @@ func NewJoinBucketKeeper(
leftFileName string,
joinReaderOptions *cli.TReaderOptions,
leftJoinFieldNames []string,
leftKeepFieldNameSet map[string]bool,
) *JoinBucketKeeper {
// Instantiate the record-reader
@ -194,7 +196,8 @@ func NewJoinBucketKeeper(
errorChannel: errorChannel,
recordReaderDone: false,
leftJoinFieldNames: leftJoinFieldNames,
leftJoinFieldNames: leftJoinFieldNames,
leftKeepFieldNameSet: leftKeepFieldNameSet,
JoinBucket: NewJoinBucket(nil),
peekRecordAndContext: nil,
@ -575,6 +578,7 @@ func (keeper *JoinBucketKeeper) readRecord() *types.RecordAndContext {
// TODO: temp
lib.InternalCodingErrorIf(leftrecsAndContexts.Len() != 1)
leftrecAndContext := leftrecsAndContexts.Front().Value.(*types.RecordAndContext)
leftrecAndContext.Record = KeepLeftFieldNames(leftrecAndContext.Record, keeper.leftKeepFieldNameSet)
if leftrecAndContext.EndOfStream { // end-of-stream marker
keeper.recordReaderDone = true
return nil
@ -624,3 +628,26 @@ func compareLexically(
}
return 0
}
// KeepLeftFieldNames restricts a left-file record to the fields the user asked
// to keep.
//
// If inrec is nil, or leftKeepFieldNameSet is nil (the normal case, in which no
// restriction applies), inrec is returned as-is. Otherwise a newly created
// record is returned which holds only those entries of inrec whose keys map to
// true in leftKeepFieldNameSet, visited in inrec's own entry order.
func KeepLeftFieldNames(
	inrec *mlrval.Mlrmap,
	leftKeepFieldNameSet map[string]bool,
) *mlrval.Mlrmap {
	// Nothing to filter: either there is no record, or no keep-set was given.
	if inrec == nil || leftKeepFieldNameSet == nil {
		return inrec
	}

	kept := mlrval.NewMlrmap()
	for entry := inrec.Head; entry != nil; entry = entry.Next {
		if !leftKeepFieldNameSet[entry.Key] {
			continue
		}
		// Hand the value over by reference rather than copying it: the input
		// record is discarded after this call, so ownership moves to the
		// output record.
		kept.PutReference(entry.Key, entry.Value)
	}
	return kept
}

View file

@ -1248,6 +1248,9 @@ VERBS
defaults to -j values if omitted.
-r {a,b,c} Comma-separated join-field names for right input file(s);
defaults to -j values if omitted.
--lk|--left-keep-field-names {a,b,c} If supplied, this means keep only the specified field
names from the left file. Automatically includes the join-field name(s). Helpful
for when you only want a limited subset of information from the left file.
--lp {text} Additional prefix for non-join output field names from
the left file
--rp {text} Additional prefix for non-join output field names from

View file

@ -1573,6 +1573,9 @@ Options:
defaults to -j values if omitted.
-r {a,b,c} Comma-separated join-field names for right input file(s);
defaults to -j values if omitted.
--lk|--left-keep-field-names {a,b,c} If supplied, this means keep only the specified field
names from the left file. Automatically includes the join-field name(s). Helpful
for when you only want a limited subset of information from the left file.
--lp {text} Additional prefix for non-join output field names from
the left file
--rp {text} Additional prefix for non-join output field names from

View file

@ -430,6 +430,9 @@ Options:
defaults to -j values if omitted.
-r {a,b,c} Comma-separated join-field names for right input file(s);
defaults to -j values if omitted.
--lk|--left-keep-field-names {a,b,c} If supplied, this means keep only the specified field
names from the left file. Automatically includes the join-field name(s). Helpful
for when you only want a limited subset of information from the left file.
--lp {text} Additional prefix for non-join output field names from
the left file
--rp {text} Additional prefix for non-join output field names from

View file

@ -0,0 +1 @@
mlr --csv join -j a -f test/input/join-left-keep test/input/join-right-keep

View file

View file

@ -0,0 +1,3 @@
a,b,c,d,x,y,z
1,2,3,4,5,6,7
1,2,3,4,7,8,9

View file

@ -0,0 +1 @@
mlr --csv join --lk b -j a -f test/input/join-left-keep test/input/join-right-keep

View file

View file

@ -0,0 +1,3 @@
a,b,x,y,z
1,2,5,6,7
1,2,7,8,9

View file

@ -0,0 +1 @@
mlr --csv join --lk a -j a -f test/input/join-left-keep test/input/join-right-keep

View file

View file

@ -0,0 +1,3 @@
a,x,y,z
1,5,6,7
1,7,8,9

View file

@ -0,0 +1 @@
mlr --csv join --lk a,b -j a -f test/input/join-left-keep test/input/join-right-keep

View file

View file

@ -0,0 +1,3 @@
a,b,x,y,z
1,2,5,6,7
1,2,7,8,9

View file

@ -0,0 +1 @@
mlr --csv join -s -j a -f test/input/join-left-keep test/input/join-right-keep

View file

View file

@ -0,0 +1,3 @@
a,b,c,d,x,y,z
1,2,3,4,5,6,7
1,2,3,4,7,8,9

View file

@ -0,0 +1 @@
mlr --csv join -s --lk b -j a -f test/input/join-left-keep test/input/join-right-keep

View file

View file

@ -0,0 +1,3 @@
a,b,x,y,z
1,2,5,6,7
1,2,7,8,9

View file

@ -0,0 +1 @@
mlr --csv join -s --lk a -j a -f test/input/join-left-keep test/input/join-right-keep

View file

View file

@ -0,0 +1,3 @@
a,x,y,z
1,5,6,7
1,7,8,9

View file

@ -0,0 +1 @@
mlr --csv join -s --lk a,b -j a -f test/input/join-left-keep test/input/join-right-keep

View file

View file

@ -0,0 +1,3 @@
a,b,x,y,z
1,2,5,6,7
1,2,7,8,9

View file

@ -0,0 +1,3 @@
a,b,c,d
1,2,3,4
5,6,7,8

View file

@ -0,0 +1,5 @@
a,x,y,z
1,5,6,7
1,7,8,9
2,6,7,8
2,8,9,9

View file

@ -5,13 +5,8 @@ RELEASES
o mlr join --left-fields a,b,c
o fmt/unfmt/regex doc
o FAQ/examples reorg
? ??? for doc-string contents search -- ? & mlr help namegoeshere foo ...
o several needs-doc issues
i https://github.com/johnkerl/miller/issues?q=is%3Aissue+is%3Aopen+label%3Aneeds-documentation
- 908
:context
https://github.com/johnkerl/miller/issues/908#issuecomment-1032573038 NFR
o https://github.com/johnkerl/miller/issues?q=is%3Aissue+is%3Aopen+label%3Aneeds-documentation
k strptime/882
k fmtifnum, & recursive fmtnum/fmtifnum