Dedupe field names by default (#794)

This commit is contained in:
John Kerl 2021-12-22 21:07:29 -05:00 committed by GitHub
parent 6b87a121b0
commit 157e567909
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
75 changed files with 352 additions and 195 deletions

8
.vimrc
View file

@ -1,8 +1,2 @@
map \d :w<C-m>:!clear;echo Building ...; echo; make mlr<C-m>
map \f :w<C-m>:!clear;echo Building ...; echo; make tests-in-order<C-m>
"map \r :w<C-m>:!clear;echo Building ...; echo; make mlrval-tests<C-m>
"map \r :w<C-m>:!clear;echo Building ...; echo; make mlrmap-tests<C-m>
"map \r :w<C-m>:!clear;echo Building ...; echo; make input-tests<C-m>
"map \r :w<C-m>:!clear;echo Building ...; echo; make mlrval-format-test<C-m>
"map \r :w<C-m>:!clear;echo Building ...; echo; make bifs-tests<C-m>
map \r :w<C-m>:!clear;echo Building ...; echo; make bifs-collections-test<C-m>
map \f :w<C-m>:!clear;echo Building ...; echo; make ut<C-m>

120
Makefile
View file

@ -31,126 +31,6 @@ install: build
unit-test ut:
go test github.com/johnkerl/miller/internal/pkg/...
# Keystroke-savers
lib-unbackslash-test:
go test internal/pkg/lib/unbackslash_test.go internal/pkg/lib/unbackslash.go
lib_regex_test:
go test internal/pkg/lib/regex_test.go internal/pkg/lib/regex.go
lib-tests:
go test github.com/johnkerl/miller/internal/pkg/lib/...
mlrval-new-test:
go test internal/pkg/mlrval/new_test.go \
internal/pkg/mlrval/mlrval_type.go \
internal/pkg/mlrval/mlrval_constants.go \
internal/pkg/mlrval/mlrval_new.go \
internal/pkg/mlrval/mlrval_infer.go
mlrval-is-test:
go test internal/pkg/mlrval/is_test.go \
internal/pkg/mlrval/mlrval_type.go \
internal/pkg/mlrval/mlrval_constants.go \
internal/pkg/mlrval/mlrval_new.go \
internal/pkg/mlrval/mlrval_infer.go \
internal/pkg/mlrval/mlrval_is.go
mlrval-get-test:
go test internal/pkg/mlrval/get_test.go \
internal/pkg/mlrval/mlrval_type.go \
internal/pkg/mlrval/mlrval_constants.go \
internal/pkg/mlrval/mlrval_new.go \
internal/pkg/mlrval/mlrval_infer.go \
internal/pkg/mlrval/mlrval_is.go \
internal/pkg/mlrval/mlrval_get.go
mlrval-output-test:
go test internal/pkg/mlrval/output_test.go \
internal/pkg/mlrval/mlrval_type.go \
internal/pkg/mlrval/mlrval_constants.go \
internal/pkg/mlrval/mlrval_new.go \
internal/pkg/mlrval/mlrval_infer.go \
internal/pkg/mlrval/mlrval_is.go \
internal/pkg/mlrval/mlrval_get.go \
internal/pkg/mlrval/mlrval_output.go \
internal/pkg/mlrval/mlrval_format.go
mlrval-format-test:
go test internal/pkg/mlrval/format_test.go \
internal/pkg/mlrval/mlrval_type.go \
internal/pkg/mlrval/mlrval_constants.go \
internal/pkg/mlrval/mlrval_new.go \
internal/pkg/mlrval/mlrval_infer.go \
internal/pkg/mlrval/mlrval_is.go \
internal/pkg/mlrval/mlrval_get.go \
internal/pkg/mlrval/mlrval_output.go \
internal/pkg/mlrval/mlrval_format.go
mlrval-tests:
go test github.com/johnkerl/miller/internal/pkg/mlrval/...
mlrmap-new-test:
go test internal/pkg/mlrval/mlrmap_new_test.go \
internal/pkg/mlrval/mlrmap.go \
internal/pkg/mlrval/mlrval_type.go \
internal/pkg/mlrval/mlrval_constants.go \
internal/pkg/mlrval/mlrval_new.go \
internal/pkg/mlrval/mlrval_infer.go \
internal/pkg/mlrval/mlrval_is.go \
internal/pkg/mlrval/mlrval_get.go \
internal/pkg/mlrval/mlrval_output.go \
internal/pkg/mlrval/mlrval_format.go
mlrmap-accessors-test:
go test internal/pkg/mlrval/mlrmap_accessors_test.go \
internal/pkg/mlrval/mlrmap.go \
internal/pkg/mlrval/mlrmap_accessors.go \
internal/pkg/mlrval/mlrval_type.go \
internal/pkg/mlrval/mlrval_constants.go \
internal/pkg/mlrval/mlrval_new.go \
internal/pkg/mlrval/mlrval_cmp.go \
internal/pkg/mlrval/mlrval_copy.go \
internal/pkg/mlrval/mlrval_infer.go \
internal/pkg/mlrval/mlrval_is.go \
internal/pkg/mlrval/mlrval_get.go \
internal/pkg/mlrval/mlrval_output.go \
internal/pkg/mlrval/mlrval_format.go
mlrmap-tests: mlrmap-new-test mlrmap-accessors-test
input-dkvp-test:
go test internal/pkg/input/record_reader_dkvp_test.go \
internal/pkg/input/record_reader.go \
internal/pkg/input/record_reader_dkvp_nidx.go
input-tests: input-dkvp-test
bifs-arithmetic-test:
go test internal/pkg/bifs/arithmetic_test.go \
internal/pkg/bifs/base.go \
internal/pkg/bifs/arithmetic.go
bifs-bits-test:
go test internal/pkg/bifs/bits_test.go \
internal/pkg/bifs/base.go \
internal/pkg/bifs/arithmetic.go \
internal/pkg/bifs/bits.go
bifs-collections-test:
go test internal/pkg/bifs/collections_test.go \
internal/pkg/bifs/base.go \
internal/pkg/bifs/arithmetic.go \
internal/pkg/bifs/collections.go
bifs-hashing-test:
go test internal/pkg/bifs/hashing_test.go \
internal/pkg/bifs/base.go \
internal/pkg/bifs/arithmetic.go \
internal/pkg/bifs/hashing.go
bifs-sort-test:
go test internal/pkg/bifs/sort_test.go \
internal/pkg/bifs/base.go \
internal/pkg/bifs/arithmetic.go \
internal/pkg/bifs/sort.go
bifs-tests: bifs-arithmetic-test bifs-bits-test bifs-collections-test bifs-hashing-test bifs-sort-test
#mlrval_functions_test:
# go test internal/pkg/mlrval/mlrval_functions_test.go $(ls internal/pkg/types/*.go | grep -v test)
#mlrval_format_test:
# go test internal/pkg/mlrval/mlrval_format_test.go $(ls internal/pkg/types/*.go|grep -v test)
tests-in-order: mlrval-tests mlrmap-tests input-tests bifs-tests
# ----------------------------------------------------------------
# Regression tests (large number)
#

View file

@ -121,31 +121,31 @@ Here's some sample CSV data which is values-only, i.e. headerless:
</pre>
There are clearly nine fields here, but if we try to have Miller parse it as CSV, we
see there are fewer than nine columns:
see something happened:
<pre class="pre-highlight-in-pair">
<b>mlr --csv cat data/nas.csv</b>
</pre>
<pre class="pre-non-highlight-in-pair">
-349801.10097848,4537221.43295653,2,1,NA
-338681.59578181,4537221.43295653,14,1,0.964
-334975.09404959,4537221.43295653,18,1,NA
-332195.21775042,4537221.43295653,21,1,0.96
-331268.59231736,4537221.43295653,22,1,0.962
-330341.96688431,4537221.43295653,23,1,0.962
-326635.46515209,4537221.43295653,27,1,0.958
-349801.10097848,4537221.43295653,2,1,NA,NA_2,NA_3,NA_4,NA_5
-338681.59578181,4537221.43295653,14,1,13.1,1,0.978,0.964,0.964
-334975.09404959,4537221.43295653,18,1,13.1,1,NA,NA,NA
-332195.21775042,4537221.43295653,21,1,13.1,1,0.978,0.974,0.96
-331268.59231736,4537221.43295653,22,1,13.1,1,0.978,0.978,0.962
-330341.96688431,4537221.43295653,23,1,13.1,1,0.978,0.978,0.962
-326635.46515209,4537221.43295653,27,1,13.1,2,0.978,0.972,0.958
</pre>
What happened?
Miller is (by central design) a mapping from name to value, rather than integer
position to value as in most tools in the Unix toolkit such as `sort`, `cut`,
`awk`, etc. So given input `Yea=1,Yea=2` on the same input line, first `Yea=1`
is stored, then updated with `Yea=2`. This is in the input-parser and the value
`Yea=1` is unavailable to any further processing.
`awk`, etc. And its default behavior with repeated column/field names is to append `_2`, `_3`, etc. to dedupe them.
So given input `Yea=1,Yea=2` on the same input line, first `Yea=1`
is stored, then updated with `Yea_2=2`. This is in the input-parser.
Here, the first data line is being seen as a header ine, and the repeated `NA`
values are being seen as duplicate keys.
Here, the first data line is being seen as a header line, and the repeated `NA`
values are being seen as duplicate keys that need to be deduplicated.
One solution is to use `--implicit-csv-header`, or its shorter alias `--hi`:

View file

@ -48,7 +48,7 @@ cat data/nas.csv
GENMD-EOF
There are clearly nine fields here, but if we try to have Miller parse it as CSV, we
see there are fewer than nine columns:
see something happened:
GENMD-RUN-COMMAND
mlr --csv cat data/nas.csv
@ -58,12 +58,13 @@ What happened?
Miller is (by central design) a mapping from name to value, rather than integer
position to value as in most tools in the Unix toolkit such as `sort`, `cut`,
`awk`, etc. So given input `Yea=1,Yea=2` on the same input line, first `Yea=1`
is stored, then updated with `Yea=2`. This is in the input-parser and the value
`Yea=1` is unavailable to any further processing.
`awk`, etc. And its default behavior with repeated column/field names is to
append `_2`, `_3`, etc. to dedupe them. So given input `Yea=1,Yea=2` on the
same input line, first `Yea=1` is stored, then `Yea_2=2`. This is in the
input-parser.
Here, the first data line is being seen as a header ine, and the repeated `NA`
values are being seen as duplicate keys.
Here, the first data line is being seen as a header line, and the repeated `NA`
values are being seen as duplicate keys that need to be deduplicated.
One solution is to use `--implicit-csv-header`, or its shorter alias `--hi`:

View file

@ -507,6 +507,13 @@ MISCELLANEOUS FLAGS
what you might hope but `--mfrom *.csv --` does.
--mload {filenames} Like `--load` but works with more than one filename,
e.g. `--mload *.mlr --`.
--no-dedupe-field-names By default, if an input record has a field name x and
another also named x, the second will be renamed x_2,
and so on. With this flag provided, the second x's
value will replace the first x's value when the
record is read. This flag has no effect on JSON input
records, where duplicate keys always result in the
last one's value being retained.
--no-fflush Let buffered output not be written after every output
record. The default is flush output after every
record if the output is to the terminal, or less
@ -3010,5 +3017,5 @@ SEE ALSO
2021-12-22 MILLER(1)
2021-12-23 MILLER(1)
</pre>

View file

@ -486,6 +486,13 @@ MISCELLANEOUS FLAGS
what you might hope but `--mfrom *.csv --` does.
--mload {filenames} Like `--load` but works with more than one filename,
e.g. `--mload *.mlr --`.
--no-dedupe-field-names By default, if an input record has a field name x and
another also named x, the second will be renamed x_2,
and so on. With this flag provided, the second x's
value will replace the first x's value when the
record is read. This flag has no effect on JSON input
records, where duplicate keys always result in the
last one's value being retained.
--no-fflush Let buffered output not be written after every output
record. The default is flush output after every
record if the output is to the terminal, or less
@ -2989,4 +2996,4 @@ SEE ALSO
2021-12-22 MILLER(1)
2021-12-23 MILLER(1)

View file

@ -264,3 +264,4 @@ The following differences are rather technical. If they don't sound familiar to
* See also the [miscellaneous-flags reference](reference-main-flag-list.md#miscellaneous-flags).
* Emitting a map-valued expression now requires either a temporary variable or the new `emit1` keyword. Please see the
[page on emit statements](reference-dsl-output-statements.md#emit1-and-emitemitpemitf) for more information.
* By default, field names are deduped for all file formats except JSON. So if you have an input record with `x=8,x=9` then the second field's key is renamed to `x_2` and so on -- the record scans as `x=8,x_2=9`. Use `mlr --no-dedupe-field-names` to suppress this, and have the record be scanned as `x=9`. For JSON, the last duplicated key in an input record is always retained, regardless of `mlr --no-dedupe-field-names`: `{"x":8,"x":9}` scans as if it were `{"x":9}`.

View file

@ -222,3 +222,4 @@ The following differences are rather technical. If they don't sound familiar to
* See also the [miscellaneous-flags reference](reference-main-flag-list.md#miscellaneous-flags).
* Emitting a map-valued expression now requires either a temporary variable or the new `emit1` keyword. Please see the
[page on emit statements](reference-dsl-output-statements.md#emit1-and-emitemitpemitf) for more information.
* By default, field names are deduped for all file formats except JSON. So if you have an input record with `x=8,x=9` then the second field's key is renamed to `x_2` and so on -- the record scans as `x=8,x_2=9`. Use `mlr --no-dedupe-field-names` to suppress this, and have the record be scanned as `x=9`. For JSON, the last duplicated key in an input record is always retained, regardless of `mlr --no-dedupe-field-names`: `{"x":8,"x":9}` scans as if it were `{"x":9}`.

View file

@ -355,6 +355,8 @@ These are flags which don't fit into any other category.
`: Use this to specify one of more input files before the verb(s), rather than after. May be used more than once. The list of filename must end with `--`. This is useful for example since `--from *.csv` doesn't do what you might hope but `--mfrom *.csv --` does.
* `--mload {filenames}
`: Like `--load` but works with more than one filename, e.g. `--mload *.mlr --`.
* `--no-dedupe-field-names
`: By default, if an input record has a field name x and another also named x, the second will be renamed x_2, and so on. With this flag provided, the second x's value will replace the first x's value when the record is read. This flag has no effect on JSON input records, where duplicate keys always result in the last one's value being retained.
* `--no-fflush
`: Let buffered output not be written after every output record. The default is flush output after every record if the output is to the terminal, or less often if the output is to a file or a pipe. The default is a significant performance optimization for large files. Use this flag to allow less-frequent updates when output is to the terminal. This is unlikely to be a noticeable performance improvement, since direct-to-screen output for large files has its own overhead.
* `--no-hash-records

View file

@ -2557,6 +2557,18 @@ var MiscFlagSection = FlagSection{
},
},
{
name: "--no-dedupe-field-names",
help: `By default, if an input record has a field name x and
another also named x, the second will be renamed x_2, and so on. With this flag provided, the
second x's value will replace the first x's value when the record is read. This flag has no effect
on JSON input records, where duplicate keys always result in the last one's value being retained.`,
parser: func(args []string, argc int, pargi *int, options *TOptions) {
options.ReaderOptions.DedupeFieldNames = false
*pargi += 1
},
},
{
name: "--records-per-batch",
arg: "{n}",

View file

@ -46,6 +46,7 @@ type TReaderOptions struct {
IPSRegex *regexp.Regexp
SuppressIFSRegexing bool // e.g. if they want to do '--ifs .' since '.' is a regex metacharacter
SuppressIPSRegexing bool // e.g. if they want to do '--ips .' since '.' is a regex metacharacter
DedupeFieldNames bool
// If unspecified on the command line, these take input-format-dependent
// defaults. E.g. default FS is comma for DKVP but space for NIDX;
@ -186,6 +187,7 @@ func DefaultReaderOptions() TReaderOptions {
StepAsString: DEFAULT_GEN_STEP_AS_STRING,
StopAsString: DEFAULT_GEN_STOP_AS_STRING,
},
DedupeFieldNames: true,
// TODO: comment
RecordsPerBatch: DEFAULT_RECORDS_PER_BATCH,

View file

@ -182,6 +182,7 @@ func (reader *RecordReaderCSV) getRecordBatch(
eof bool,
) {
recordsAndContexts = list.New()
dedupeFieldNames := reader.readerOptions.DedupeFieldNames
csvRecords, more := <-csvRecordsChannel
if !more {
@ -226,7 +227,11 @@ func (reader *RecordReaderCSV) getRecordBatch(
for i := 0; i < nh; i++ {
key := reader.header[i]
value := mlrval.FromDeferredType(csvRecord[i])
record.PutReference(key, value)
_, err := record.PutReferenceMaybeDedupe(key, value, dedupeFieldNames)
if err != nil {
errorChannel <- err
return
}
}
} else {
@ -246,18 +251,30 @@ func (reader *RecordReaderCSV) getRecordBatch(
for i = 0; i < n; i++ {
key := reader.header[i]
value := mlrval.FromDeferredType(csvRecord[i])
record.PutReference(key, value)
_, err := record.PutReferenceMaybeDedupe(key, value, dedupeFieldNames)
if err != nil {
errorChannel <- err
return
}
}
if nh < nd {
// if header shorter than data: use 1-up itoa keys
key := strconv.Itoa(i + 1)
value := mlrval.FromDeferredType(csvRecord[i])
record.PutReference(key, value)
_, err := record.PutReferenceMaybeDedupe(key, value, dedupeFieldNames)
if err != nil {
errorChannel <- err
return
}
}
if nh > nd {
// if header longer than data: use "" values
for i = nd; i < nh; i++ {
record.PutCopy(reader.header[i], mlrval.VOID)
_, err := record.PutReferenceMaybeDedupe(reader.header[i], mlrval.VOID.Copy(), dedupeFieldNames)
if err != nil {
errorChannel <- err
return
}
}
}
}

View file

@ -179,6 +179,7 @@ func getRecordBatchExplicitCSVHeader(
eof bool,
) {
recordsAndContexts = list.New()
dedupeFieldNames := reader.readerOptions.DedupeFieldNames
lines, more := <-linesChannel
if !more {
@ -247,7 +248,11 @@ func getRecordBatchExplicitCSVHeader(
if !reader.readerOptions.AllowRaggedCSVInput {
for i, field := range fields {
value := mlrval.FromDeferredType(field)
record.PutReference(reader.headerStrings[i], value)
_, err := record.PutReferenceMaybeDedupe(reader.headerStrings[i], value, dedupeFieldNames)
if err != nil {
errorChannel <- err
return
}
}
} else {
nh := len(reader.headerStrings)
@ -256,14 +261,22 @@ func getRecordBatchExplicitCSVHeader(
var i int
for i = 0; i < n; i++ {
value := mlrval.FromDeferredType(fields[i])
record.PutReference(reader.headerStrings[i], value)
_, err := record.PutReferenceMaybeDedupe(reader.headerStrings[i], value, dedupeFieldNames)
if err != nil {
errorChannel <- err
return
}
}
if nh < nd {
// if header shorter than data: use 1-up itoa keys
for i = nh; i < nd; i++ {
key := strconv.Itoa(i + 1)
value := mlrval.FromDeferredType(fields[i])
record.PutReference(key, value)
_, err := record.PutReferenceMaybeDedupe(key, value, dedupeFieldNames)
if err != nil {
errorChannel <- err
return
}
}
}
if nh > nd {
@ -293,6 +306,7 @@ func getRecordBatchImplicitCSVHeader(
eof bool,
) {
recordsAndContexts = list.New()
dedupeFieldNames := reader.readerOptions.DedupeFieldNames
lines, more := <-linesChannel
if !more {
@ -364,7 +378,11 @@ func getRecordBatchImplicitCSVHeader(
if !reader.readerOptions.AllowRaggedCSVInput {
for i, field := range fields {
value := mlrval.FromDeferredType(field)
record.PutReference(reader.headerStrings[i], value)
_, err := record.PutReferenceMaybeDedupe(reader.headerStrings[i], value, dedupeFieldNames)
if err != nil {
errorChannel <- err
return
}
}
} else {
nh := len(reader.headerStrings)
@ -373,18 +391,30 @@ func getRecordBatchImplicitCSVHeader(
var i int
for i = 0; i < n; i++ {
value := mlrval.FromDeferredType(fields[i])
record.PutReference(reader.headerStrings[i], value)
_, err := record.PutReferenceMaybeDedupe(reader.headerStrings[i], value, dedupeFieldNames)
if err != nil {
errorChannel <- err
return
}
}
if nh < nd {
// if header shorter than data: use 1-up itoa keys
key := strconv.Itoa(i + 1)
value := mlrval.FromDeferredType(fields[i])
record.PutReference(key, value)
_, err := record.PutReferenceMaybeDedupe(key, value, dedupeFieldNames)
if err != nil {
errorChannel <- err
return
}
}
if nh > nd {
// if header longer than data: use "" values
for i = nd; i < nh; i++ {
record.PutCopy(reader.headerStrings[i], mlrval.VOID)
_, err := record.PutReferenceMaybeDedupe(reader.headerStrings[i], mlrval.VOID.Copy(), dedupeFieldNames)
if err != nil {
errorChannel <- err
return
}
}
}
}

View file

@ -16,7 +16,7 @@ import (
// splitter_DKVP_NIDX is a function type for the one bit of code differing
// between the DKVP reader and the NIDX reader, namely, how it splits lines.
type splitter_DKVP_NIDX func(reader *RecordReaderDKVPNIDX, line string) *mlrval.Mlrmap
type splitter_DKVP_NIDX func(reader *RecordReaderDKVPNIDX, line string) (*mlrval.Mlrmap, error)
type RecordReaderDKVPNIDX struct {
readerOptions *cli.TReaderOptions
@ -100,7 +100,7 @@ func (reader *RecordReaderDKVPNIDX) processHandle(
go channelizedLineScanner(lineScanner, linesChannel, downstreamDoneChannel, recordsPerBatch)
for {
recordsAndContexts, eof := reader.getRecordBatch(linesChannel, context)
recordsAndContexts, eof := reader.getRecordBatch(linesChannel, errorChannel, context)
if recordsAndContexts.Len() > 0 {
readerChannel <- recordsAndContexts
}
@ -113,6 +113,7 @@ func (reader *RecordReaderDKVPNIDX) processHandle(
// TODO: comment copiously we're trying to handle slow/fast/short/long reads: tail -f, smallfile, bigfile.
func (reader *RecordReaderDKVPNIDX) getRecordBatch(
linesChannel <-chan *list.List,
errorChannel chan<- error,
context *types.Context,
) (
recordsAndContexts *list.List,
@ -142,7 +143,11 @@ func (reader *RecordReaderDKVPNIDX) getRecordBatch(
}
}
record := reader.splitter(reader, line)
record, err := reader.splitter(reader, line)
if err != nil {
errorChannel <- err
return
}
context.UpdateForInputRecord()
recordAndContext := types.NewRecordAndContext(record, context)
recordsAndContexts.PushBack(recordAndContext)
@ -151,8 +156,9 @@ func (reader *RecordReaderDKVPNIDX) getRecordBatch(
return recordsAndContexts, false
}
func recordFromDKVPLine(reader *RecordReaderDKVPNIDX, line string) *mlrval.Mlrmap {
func recordFromDKVPLine(reader *RecordReaderDKVPNIDX, line string) (*mlrval.Mlrmap, error) {
record := mlrval.NewMlrmapAsRecord()
dedupeFieldNames := reader.readerOptions.DedupeFieldNames
var pairs []string
// TODO: func-pointer this away
@ -181,17 +187,23 @@ func recordFromDKVPLine(reader *RecordReaderDKVPNIDX, line string) *mlrval.Mlrma
// DKVP is a generalization of NIDX.
key := strconv.Itoa(i + 1) // Miller userspace indices are 1-up
value := mlrval.FromDeferredType(kv[0])
record.PutReference(key, value)
_, err := record.PutReferenceMaybeDedupe(key, value, dedupeFieldNames)
if err != nil {
return nil, err
}
} else {
key := kv[0]
value := mlrval.FromDeferredType(kv[1])
record.PutReference(key, value)
_, err := record.PutReferenceMaybeDedupe(key, value, dedupeFieldNames)
if err != nil {
return nil, err
}
}
}
return record
return record, nil
}
func recordFromNIDXLine(reader *RecordReaderDKVPNIDX, line string) *mlrval.Mlrmap {
func recordFromNIDXLine(reader *RecordReaderDKVPNIDX, line string) (*mlrval.Mlrmap, error) {
record := mlrval.NewMlrmapAsRecord()
var values []string
@ -212,5 +224,5 @@ func recordFromNIDXLine(reader *RecordReaderDKVPNIDX, line string) *mlrval.Mlrma
mval := mlrval.FromDeferredType(value)
record.PutReference(key, mval)
}
return record
return record, nil
}

View file

@ -16,13 +16,15 @@ func TestRecordFromDKVPLine(t *testing.T) {
assert.Nil(t, err)
line := ""
record := recordFromDKVPLine(reader, line)
record, err := recordFromDKVPLine(reader, line)
assert.NotNil(t, record)
assert.Nil(t, err)
assert.Equal(t, 0, record.FieldCount)
line = "a=1,b=2,c=3"
record = recordFromDKVPLine(reader, line)
record, err = recordFromDKVPLine(reader, line)
assert.NotNil(t, record)
assert.Nil(t, err)
assert.Equal(t, 3, record.FieldCount)
assert.NotNil(t, record.Head)
@ -33,20 +35,25 @@ func TestRecordFromDKVPLine(t *testing.T) {
assert.Equal(t, record.Head.Next.Key, "b")
assert.Equal(t, record.Head.Next.Next.Key, "c")
// Default is to dedupe to a=1,b=2,b_2=3
line = "a=1,b=2,b=3"
record = recordFromDKVPLine(reader, line)
record, err = recordFromDKVPLine(reader, line)
assert.NotNil(t, record)
assert.Equal(t, 2, record.FieldCount)
assert.Nil(t, err)
assert.Equal(t, 3, record.FieldCount)
assert.NotNil(t, record.Head)
assert.NotNil(t, record.Head.Next)
assert.Nil(t, record.Head.Next.Next)
assert.NotNil(t, record.Head.Next.Next)
assert.Nil(t, record.Head.Next.Next.Next)
assert.Equal(t, record.Head.Key, "a")
assert.Equal(t, record.Head.Next.Key, "b")
assert.Equal(t, record.Head.Next.Next.Key, "b_2")
line = "a,b,c"
record = recordFromDKVPLine(reader, line)
record, err = recordFromDKVPLine(reader, line)
assert.NotNil(t, record)
assert.Nil(t, err)
assert.Equal(t, 3, record.FieldCount)
assert.NotNil(t, record.Head)

View file

@ -257,6 +257,7 @@ func (reader *RecordReaderXTAB) recordFromXTABLines(
stanza *list.List,
) (*mlrval.Mlrmap, error) {
record := mlrval.NewMlrmapAsRecord()
dedupeFieldNames := reader.readerOptions.DedupeFieldNames
for e := stanza.Front(); e != nil; e = e.Next() {
line := e.Value.(string)
@ -274,10 +275,16 @@ func (reader *RecordReaderXTAB) recordFromXTABLines(
key := kv[0]
if len(kv) == 1 {
value := mlrval.VOID
record.PutReference(key, value)
_, err := record.PutReferenceMaybeDedupe(key, value, dedupeFieldNames)
if err != nil {
return nil, err
}
} else {
value := mlrval.FromDeferredType(kv[1])
record.PutReference(key, value)
_, err := record.PutReferenceMaybeDedupe(key, value, dedupeFieldNames)
if err != nil {
return nil, err
}
}
}

View file

@ -3,6 +3,8 @@ package mlrval
import (
"bytes"
"errors"
"fmt"
"strconv"
"github.com/johnkerl/miller/internal/pkg/lib"
)
@ -28,30 +30,65 @@ func (mlrmap *Mlrmap) Get(key string) *Mlrval {
// PutReference copies the key but not the value. This is not safe for DSL use,
// where we could create undesired references between different objects. Only
// intended to be used at callsites which allocate a mlrval solely for the
// purpose of putting into a map, e.g. input-record readers.
// intended to be used at callsites which allocate a mlrval on the spot, solely
// for the purpose of putting into the map.
func (mlrmap *Mlrmap) PutReference(key string, value *Mlrval) {
pe := mlrmap.findEntry(key)
if pe == nil {
pe = newMlrmapEntry(key, value)
if mlrmap.Head == nil {
mlrmap.Head = pe
mlrmap.Tail = pe
} else {
pe.Prev = mlrmap.Tail
pe.Next = nil
mlrmap.Tail.Next = pe
mlrmap.Tail = pe
}
if mlrmap.keysToEntries != nil {
mlrmap.keysToEntries[key] = pe
}
mlrmap.FieldCount++
mlrmap.putReferenceNewAux(key, value)
} else {
pe.Value = value
}
}
// putReferenceNewAux is a helper function for code shared between PutReference
// and PutReferenceMaybeDedupe. It should not be invoked from anywhere else --
// it doesn't do its own check if the key already exists in the record or not.
func (mlrmap *Mlrmap) putReferenceNewAux(key string, value *Mlrval) {
pe := newMlrmapEntry(key, value)
if mlrmap.Head == nil {
mlrmap.Head = pe
mlrmap.Tail = pe
} else {
pe.Prev = mlrmap.Tail
pe.Next = nil
mlrmap.Tail.Next = pe
mlrmap.Tail = pe
}
if mlrmap.keysToEntries != nil {
mlrmap.keysToEntries[key] = pe
}
mlrmap.FieldCount++
}
// PutReferenceMaybeDedupe is the default inserter for key-value pairs in input records --
// if the input is `x=8,x=9` then we make a record with x=8 and x_2=9. This can be suppressed
// via a command-line flag which this method's dedupe flag respects.
func (mlrmap *Mlrmap) PutReferenceMaybeDedupe(key string, value *Mlrval, dedupe bool) (string, error) {
if !dedupe {
mlrmap.PutReference(key, value)
return key, nil
}
pe := mlrmap.findEntry(key)
if pe == nil {
mlrmap.putReferenceNewAux(key, value)
return key, nil
}
for i := 2; i < 1000; i++ {
newKey := key + "_" + strconv.Itoa(i)
pe := mlrmap.findEntry(newKey)
if pe == nil {
mlrmap.putReferenceNewAux(newKey, value)
return newKey, nil
}
}
return key, errors.New(
fmt.Sprintf("record has too many input fields named \"%s\"", key),
)
}
// PutCopy copies the key and value (deep-copying in case the value is array/map).
// This is safe for DSL use. See also PutReference.
func (mlrmap *Mlrmap) PutCopy(key string, value *Mlrval) {

View file

@ -486,6 +486,13 @@ MISCELLANEOUS FLAGS
what you might hope but `--mfrom *.csv --` does.
--mload {filenames} Like `--load` but works with more than one filename,
e.g. `--mload *.mlr --`.
--no-dedupe-field-names By default, if an input record has a field name x and
another also named x, the second will be renamed x_2,
and so on. With this flag provided, the second x's
value will replace the first x's value when the
record is read. This flag has no effect on JSON input
records, where duplicate keys always result in the
last one's value being retained.
--no-fflush Let buffered output not be written after every output
record. The default is flush output after every
record if the output is to the terminal, or less
@ -2989,4 +2996,4 @@ SEE ALSO
2021-12-22 MILLER(1)
2021-12-23 MILLER(1)

View file

@ -2,12 +2,12 @@
.\" Title: mlr
.\" Author: [see the "AUTHOR" section]
.\" Generator: ./mkman.rb
.\" Date: 2021-12-22
.\" Date: 2021-12-23
.\" Manual: \ \&
.\" Source: \ \&
.\" Language: English
.\"
.TH "MILLER" "1" "2021-12-22" "\ \&" "\ \&"
.TH "MILLER" "1" "2021-12-23" "\ \&" "\ \&"
.\" -----------------------------------------------------------------
.\" * Portability definitions
.\" ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@ -605,6 +605,13 @@ These are flags which don't fit into any other category.
what you might hope but `--mfrom *.csv --` does.
--mload {filenames} Like `--load` but works with more than one filename,
e.g. `--mload *.mlr --`.
--no-dedupe-field-names By default, if an input record has a field name x and
another also named x, the second will be renamed x_2,
and so on. With this flag provided, the second x's
value will replace the first x's value when the
record is read. This flag has no effect on JSON input
records, where duplicate keys always result in the
last one's value being retained.
--no-fflush Let buffered output not be written after every output
record. The default is flush output after every
record if the output is to the terminal, or less

View file

@ -0,0 +1 @@
mlr --no-dedupe-field-names --icsv --ojson cat ${CASEDIR}/input

View file

@ -0,0 +1,5 @@
{
"a": 1,
"b": 5,
"c": 4
}

View file

@ -0,0 +1,2 @@
a,b,b,c,b
1,2,3,4,5

View file

@ -0,0 +1 @@
mlr --icsv --ojson cat ${CASEDIR}/input

View file

@ -0,0 +1,7 @@
{
"a": 1,
"b": 2,
"b_2": 3,
"c": 4,
"b_3": 5
}

View file

@ -0,0 +1,2 @@
a,b,b,c,b
1,2,3,4,5

View file

@ -0,0 +1 @@
mlr --no-dedupe-field-names --icsvlite --ojson cat ${CASEDIR}/input

View file

@ -0,0 +1,5 @@
{
"a": 1,
"b": 5,
"c": 4
}

View file

@ -0,0 +1,2 @@
a,b,b,c,b
1,2,3,4,5

View file

@ -0,0 +1 @@
mlr --icsvlite --ojson cat ${CASEDIR}/input

View file

@ -0,0 +1,7 @@
{
"a": 1,
"b": 2,
"b_2": 3,
"c": 4,
"b_3": 5
}

View file

@ -0,0 +1,2 @@
a,b,b,c,b
1,2,3,4,5

View file

@ -0,0 +1 @@
mlr --no-dedupe-field-names --idkvp --ojson cat ${CASEDIR}/input

View file

@ -0,0 +1,5 @@
{
"a": 1,
"b": 5,
"c": 4
}

View file

@ -0,0 +1 @@
a=1,b=2,b=3,c=4,b=5

View file

@ -0,0 +1 @@
mlr --idkvp --ojson cat ${CASEDIR}/input

View file

@ -0,0 +1,7 @@
{
"a": 1,
"b": 2,
"b_2": 3,
"c": 4,
"b_3": 5
}

View file

@ -0,0 +1 @@
a=1,b=2,b=3,c=4,b=5

View file

@ -0,0 +1 @@
mlr --no-dedupe-field-names --ijson --ojson cat ${CASEDIR}/input

View file

@ -0,0 +1,5 @@
{
"a": 1,
"b": 5,
"c": 4
}

View file

@ -0,0 +1 @@
{"a":1,"b":2,"b":3,"c":4,"b":5}

View file

@ -0,0 +1 @@
mlr --ijson --ojson cat ${CASEDIR}/input

View file

@ -0,0 +1,5 @@
{
"a": 1,
"b": 5,
"c": 4
}

View file

@ -0,0 +1 @@
{"a":1,"b":2,"b":3,"c":4,"b":5}

View file

@ -0,0 +1 @@
mlr --no-dedupe-field-names --inidx --ojson cat ${CASEDIR}/input

View file

@ -0,0 +1,7 @@
{
"1": 1,
"2": 2,
"3": 3,
"4": 4,
"5": 5
}

View file

@ -0,0 +1 @@
1 2 3 4 5

View file

@ -0,0 +1 @@
mlr --inidx --ojson cat ${CASEDIR}/input

View file

@ -0,0 +1,7 @@
{
"1": 1,
"2": 2,
"3": 3,
"4": 4,
"5": 5
}

View file

@ -0,0 +1 @@
1 2 3 4 5

View file

@ -0,0 +1 @@
mlr --no-dedupe-field-names --ipprint --ojson cat ${CASEDIR}/input

View file

@ -0,0 +1,5 @@
{
"a": 1,
"b": 5,
"c": 4
}

View file

@ -0,0 +1,2 @@
a b b c b
1 2 3 4 5

View file

@ -0,0 +1 @@
mlr --ipprint --ojson cat ${CASEDIR}/input

View file

@ -0,0 +1,7 @@
{
"a": 1,
"b": 2,
"b_2": 3,
"c": 4,
"b_3": 5
}

View file

@ -0,0 +1,2 @@
a b b c b
1 2 3 4 5

View file

@ -0,0 +1 @@
mlr --no-dedupe-field-names --ixtab --ojson cat ${CASEDIR}/input

View file

@ -0,0 +1,5 @@
{
"a": 1,
"b": 5,
"c": 4
}

View file

@ -0,0 +1,5 @@
a 1
b 2
b 3
c 4
b 5

View file

@ -0,0 +1 @@
mlr --ixtab --ojson cat ${CASEDIR}/input

View file

@ -0,0 +1,7 @@
{
"a": 1,
"b": 2,
"b_2": 3,
"c": 4,
"b_3": 5
}

View file

@ -0,0 +1,5 @@
a 1
b 2
b 3
c 4
b 5