mirror of
https://github.com/johnkerl/miller.git
synced 2026-01-23 02:14:13 +00:00
Fix unflatten with field names like . .x or x..y (#1735)
* Fix unflatten with field name like `.` `.x` or `x..y` * docs & test data
This commit is contained in:
parent
8088850505
commit
cc1cd954ea
12 changed files with 164 additions and 43 deletions
2
docs/src/data/flatten-dots.csv
Normal file
2
docs/src/data/flatten-dots.csv
Normal file
|
|
@ -0,0 +1,2 @@
|
|||
a,b.,.c,.,d..e,f.g
|
||||
1,2,3,4,5,6
|
||||
|
|
|
@ -348,6 +348,59 @@ a.1,a.3,a.5
|
|||
]
|
||||
</pre>
|
||||
|
||||
## Non-inferencing cases
|
||||
|
||||
An additional heuristic is that if a field name starts with a `.`, ends with
|
||||
a `.`, or has two or more consecutive `.` characters, no attempt is made
|
||||
to unflatten it on conversion from non-JSON to JSON.
|
||||
|
||||
<pre class="pre-highlight-in-pair">
|
||||
<b>cat data/flatten-dots.csv</b>
|
||||
</pre>
|
||||
<pre class="pre-non-highlight-in-pair">
|
||||
a,b.,.c,.,d..e,f.g
|
||||
1,2,3,4,5,6
|
||||
</pre>
|
||||
|
||||
<pre class="pre-highlight-in-pair">
|
||||
<b>mlr --icsv --oxtab cat data/flatten-dots.csv</b>
|
||||
</pre>
|
||||
<pre class="pre-non-highlight-in-pair">
|
||||
a 1
|
||||
b. 2
|
||||
.c 3
|
||||
. 4
|
||||
d..e 5
|
||||
f.g 6
|
||||
</pre>
|
||||
|
||||
<pre class="pre-highlight-in-pair">
|
||||
<b>mlr --icsv --ojson cat data/flatten-dots.csv</b>
|
||||
</pre>
|
||||
<pre class="pre-non-highlight-in-pair">
|
||||
[
|
||||
{
|
||||
"a": 1,
|
||||
"b.": 2,
|
||||
".c": 3,
|
||||
".": 4,
|
||||
"d..e": 5,
|
||||
"f": {
|
||||
"g": 6
|
||||
}
|
||||
}
|
||||
]
|
||||
</pre>
|
||||
|
||||
## Non-inferencing cases
|
||||
|
||||
An additional heuristic is that if a field name starts with a `.`, ends with
|
||||
a `.`, or has two or more consecutive `.` characters, no attempt is made
|
||||
to unflatten it on conversion from non-JSON to JSON.
|
||||
|
||||
## Manual control
|
||||
|
||||
|
||||
## Manual control
|
||||
|
||||
To see what our options are for manually controlling flattening and
|
||||
|
|
|
|||
|
|
@ -156,6 +156,33 @@ GENMD-RUN-COMMAND
|
|||
mlr --c2j cat data/non-consecutive.csv
|
||||
GENMD-EOF
|
||||
|
||||
## Non-inferencing cases
|
||||
|
||||
An additional heuristic is that if a field name starts with a `.`, ends with
|
||||
a `.`, or has two or more consecutive `.` characters, no attempt is made
|
||||
to unflatten it on conversion from non-JSON to JSON.
|
||||
|
||||
GENMD-RUN-COMMAND
|
||||
cat data/flatten-dots.csv
|
||||
GENMD-EOF
|
||||
|
||||
GENMD-RUN-COMMAND
|
||||
mlr --icsv --oxtab cat data/flatten-dots.csv
|
||||
GENMD-EOF
|
||||
|
||||
GENMD-RUN-COMMAND
|
||||
mlr --icsv --ojson cat data/flatten-dots.csv
|
||||
GENMD-EOF
|
||||
|
||||
## Non-inferencing cases
|
||||
|
||||
An additional heuristic is that if a field name starts with a `.`, ends with
|
||||
a `.`, or has two or more consecutive `.` characters, no attempt is made
|
||||
to unflatten it on conversion from non-JSON to JSON.
|
||||
|
||||
## Manual control
|
||||
|
||||
|
||||
## Manual control
|
||||
|
||||
To see what our options are for manually controlling flattening and
|
||||
|
|
|
|||
|
|
@ -424,7 +424,7 @@ This is simply a copy of what you should see on running `man mlr` at a command p
|
|||
1mFLATTEN-UNFLATTEN FLAGS0m
|
||||
These flags control how Miller converts record values which are maps or arrays, when input is JSON and output is non-JSON (flattening) or input is non-JSON and output is JSON (unflattening).
|
||||
|
||||
See the Flatten/unflatten doc page for more information.
|
||||
See the flatten/unflatten doc page https://miller.readthedocs.io/en/latest/flatten-unflatten for more information.
|
||||
|
||||
--flatsep or --jflatsep {string}
|
||||
Separator for flattening multi-level JSON keys, e.g.
|
||||
|
|
@ -435,10 +435,10 @@ This is simply a copy of what you should see on running `man mlr` at a command p
|
|||
then this flattens to `y.1=7,y.2=8,y.3=9`, and
|
||||
similarly for maps. With `--no-auto-flatten`, instead
|
||||
we get `$y=[1, 2, 3]`.
|
||||
--no-auto-unflatten When input non-JSON and output is JSON, suppress the
|
||||
default auto-unflatten behavior. Default: if the
|
||||
--no-auto-unflatten When input is non-JSON and output is JSON, suppress
|
||||
the default auto-unflatten behavior. Default: if the
|
||||
input has `y.1=7,y.2=8,y.3=9` then this unflattens to
|
||||
`$y=[7,8,9]`. flattens to `y.1=7,y.2=8,y.3=9. With
|
||||
`$y=[7,8,9]`. With
|
||||
`--no-auto-flatten`, instead we get
|
||||
`${y.1}=7,${y.2}=8,${y.3}=9`.
|
||||
|
||||
|
|
@ -3737,5 +3737,5 @@ This is simply a copy of what you should see on running `man mlr` at a command p
|
|||
MIME Type for Comma-Separated Values (CSV) Files, the Miller docsite
|
||||
https://miller.readthedocs.io
|
||||
|
||||
2024-11-23 4mMILLER24m(1)
|
||||
2024-12-23 4mMILLER24m(1)
|
||||
</pre>
|
||||
|
|
|
|||
|
|
@ -403,7 +403,7 @@
|
|||
1mFLATTEN-UNFLATTEN FLAGS0m
|
||||
These flags control how Miller converts record values which are maps or arrays, when input is JSON and output is non-JSON (flattening) or input is non-JSON and output is JSON (unflattening).
|
||||
|
||||
See the Flatten/unflatten doc page for more information.
|
||||
See the flatten/unflatten doc page https://miller.readthedocs.io/en/latest/flatten-unflatten for more information.
|
||||
|
||||
--flatsep or --jflatsep {string}
|
||||
Separator for flattening multi-level JSON keys, e.g.
|
||||
|
|
@ -414,10 +414,10 @@
|
|||
then this flattens to `y.1=7,y.2=8,y.3=9`, and
|
||||
similarly for maps. With `--no-auto-flatten`, instead
|
||||
we get `$y=[1, 2, 3]`.
|
||||
--no-auto-unflatten When input non-JSON and output is JSON, suppress the
|
||||
default auto-unflatten behavior. Default: if the
|
||||
--no-auto-unflatten When input is non-JSON and output is JSON, suppress
|
||||
the default auto-unflatten behavior. Default: if the
|
||||
input has `y.1=7,y.2=8,y.3=9` then this unflattens to
|
||||
`$y=[7,8,9]`. flattens to `y.1=7,y.2=8,y.3=9. With
|
||||
`$y=[7,8,9]`. With
|
||||
`--no-auto-flatten`, instead we get
|
||||
`${y.1}=7,${y.2}=8,${y.3}=9`.
|
||||
|
||||
|
|
@ -3716,4 +3716,4 @@
|
|||
MIME Type for Comma-Separated Values (CSV) Files, the Miller docsite
|
||||
https://miller.readthedocs.io
|
||||
|
||||
2024-11-23 4mMILLER24m(1)
|
||||
2024-12-23 4mMILLER24m(1)
|
||||
|
|
|
|||
|
|
@ -195,14 +195,14 @@ are overridden in all cases by setting output format to `format2`.
|
|||
|
||||
These flags control how Miller converts record values which are maps or arrays, when input is JSON and output is non-JSON (flattening) or input is non-JSON and output is JSON (unflattening).
|
||||
|
||||
See the Flatten/unflatten doc page for more information.
|
||||
See the flatten/unflatten doc page https://miller.readthedocs.io/en/latest/flatten-unflatten for more information.
|
||||
|
||||
|
||||
**Flags:**
|
||||
|
||||
* `--flatsep or --jflatsep {string}`: Separator for flattening multi-level JSON keys, e.g. `{"a":{"b":3}}` becomes `a:b => 3` for non-JSON formats. Defaults to `.`.
|
||||
* `--no-auto-flatten`: When output is non-JSON, suppress the default auto-flatten behavior. Default: if `$y = [7,8,9]` then this flattens to `y.1=7,y.2=8,y.3=9, and similarly for maps. With `--no-auto-flatten`, instead we get `$y=[1, 2, 3]`.
|
||||
* `--no-auto-unflatten`: When input non-JSON and output is JSON, suppress the default auto-unflatten behavior. Default: if the input has `y.1=7,y.2=8,y.3=9` then this unflattens to `$y=[7,8,9]`. flattens to `y.1=7,y.2=8,y.3=9. With `--no-auto-flatten`, instead we get `${y.1}=7,${y.2}=8,${y.3}=9`.
|
||||
* `--no-auto-flatten`: When output is non-JSON, suppress the default auto-flatten behavior. Default: if `$y = [7,8,9]` then this flattens to `y.1=7,y.2=8,y.3=9`, and similarly for maps. With `--no-auto-flatten`, instead we get `$y=[7, 8, 9]`.
|
||||
* `--no-auto-unflatten`: When input is non-JSON and output is JSON, suppress the default auto-unflatten behavior. Default: if the input has `y.1=7,y.2=8,y.3=9` then this unflattens to `$y=[7,8,9]`. With `--no-auto-unflatten`, instead we get `${y.1}=7,${y.2}=8,${y.3}=9`.
|
||||
|
||||
## Format-conversion keystroke-saver flags
|
||||
|
||||
|
|
|
|||
|
|
@ -403,7 +403,7 @@
|
|||
1mFLATTEN-UNFLATTEN FLAGS0m
|
||||
These flags control how Miller converts record values which are maps or arrays, when input is JSON and output is non-JSON (flattening) or input is non-JSON and output is JSON (unflattening).
|
||||
|
||||
See the Flatten/unflatten doc page for more information.
|
||||
See the flatten/unflatten doc page https://miller.readthedocs.io/en/latest/flatten-unflatten for more information.
|
||||
|
||||
--flatsep or --jflatsep {string}
|
||||
Separator for flattening multi-level JSON keys, e.g.
|
||||
|
|
@ -414,10 +414,10 @@
|
|||
then this flattens to `y.1=7,y.2=8,y.3=9`, and
|
||||
similarly for maps. With `--no-auto-flatten`, instead
|
||||
we get `$y=[1, 2, 3]`.
|
||||
--no-auto-unflatten When input non-JSON and output is JSON, suppress the
|
||||
default auto-unflatten behavior. Default: if the
|
||||
--no-auto-unflatten When input is non-JSON and output is JSON, suppress
|
||||
the default auto-unflatten behavior. Default: if the
|
||||
input has `y.1=7,y.2=8,y.3=9` then this unflattens to
|
||||
`$y=[7,8,9]`. flattens to `y.1=7,y.2=8,y.3=9. With
|
||||
`$y=[7,8,9]`. With
|
||||
`--no-auto-flatten`, instead we get
|
||||
`${y.1}=7,${y.2}=8,${y.3}=9`.
|
||||
|
||||
|
|
@ -3716,4 +3716,4 @@
|
|||
MIME Type for Comma-Separated Values (CSV) Files, the Miller docsite
|
||||
https://miller.readthedocs.io
|
||||
|
||||
2024-11-23 4mMILLER24m(1)
|
||||
2024-12-23 4mMILLER24m(1)
|
||||
|
|
|
|||
12
man/mlr.1
12
man/mlr.1
|
|
@ -2,12 +2,12 @@
|
|||
.\" Title: mlr
|
||||
.\" Author: [see the "AUTHOR" section]
|
||||
.\" Generator: ./mkman.rb
|
||||
.\" Date: 2024-11-23
|
||||
.\" Date: 2024-12-23
|
||||
.\" Manual: \ \&
|
||||
.\" Source: \ \&
|
||||
.\" Language: English
|
||||
.\"
|
||||
.TH "MILLER" "1" "2024-11-23" "\ \&" "\ \&"
|
||||
.TH "MILLER" "1" "2024-12-23" "\ \&" "\ \&"
|
||||
.\" -----------------------------------------------------------------
|
||||
.\" * Portability definitions
|
||||
.\" ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
|
@ -492,7 +492,7 @@ are overridden in all cases by setting output format to `format2`.
|
|||
.nf
|
||||
These flags control how Miller converts record values which are maps or arrays, when input is JSON and output is non-JSON (flattening) or input is non-JSON and output is JSON (unflattening).
|
||||
|
||||
See the Flatten/unflatten doc page for more information.
|
||||
See the flatten/unflatten doc page https://miller.readthedocs.io/en/latest/flatten-unflatten for more information.
|
||||
|
||||
--flatsep or --jflatsep {string}
|
||||
Separator for flattening multi-level JSON keys, e.g.
|
||||
|
|
@ -503,10 +503,10 @@ See the Flatten/unflatten doc page for more information.
|
|||
then this flattens to `y.1=7,y.2=8,y.3=9`, and
|
||||
similarly for maps. With `--no-auto-flatten`, instead
|
||||
we get `$y=[1, 2, 3]`.
|
||||
--no-auto-unflatten When input non-JSON and output is JSON, suppress the
|
||||
default auto-unflatten behavior. Default: if the
|
||||
--no-auto-unflatten When input is non-JSON and output is JSON, suppress
|
||||
the default auto-unflatten behavior. Default: if the
|
||||
input has `y.1=7,y.2=8,y.3=9` then this unflattens to
|
||||
`$y=[7,8,9]`. flattens to `y.1=7,y.2=8,y.3=9. With
|
||||
`$y=[7,8,9]`. With
|
||||
`--no-auto-flatten`, instead we get
|
||||
`${y.1}=7,${y.2}=8,${y.3}=9`.
|
||||
.fi
|
||||
|
|
|
|||
|
|
@ -2877,7 +2877,7 @@ var OutputColorizationFlagSection = FlagSection{
|
|||
func FlattenUnflattenPrintInfo() {
|
||||
fmt.Println("These flags control how Miller converts record values which are maps or arrays, when input is JSON and output is non-JSON (flattening) or input is non-JSON and output is JSON (unflattening).")
|
||||
fmt.Println()
|
||||
fmt.Println("See the Flatten/unflatten doc page for more information.")
|
||||
fmt.Println("See the flatten/unflatten doc page https://miller.readthedocs.io/en/latest/flatten-unflatten for more information.")
|
||||
}
|
||||
|
||||
func init() { FlattenUnflattenFlagSection.Sort() }
|
||||
|
|
@ -2901,7 +2901,7 @@ var FlattenUnflattenFlagSection = FlagSection{
|
|||
|
||||
{
|
||||
name: "--no-auto-flatten",
|
||||
help: "When output is non-JSON, suppress the default auto-flatten behavior. Default: if `$y = [7,8,9]` then this flattens to `y.1=7,y.2=8,y.3=9, and similarly for maps. With `--no-auto-flatten`, instead we get `$y=[1, 2, 3]`.",
|
||||
help: "When output is non-JSON, suppress the default auto-flatten behavior. Default: if `$y = [7,8,9]` then this flattens to `y.1=7,y.2=8,y.3=9`, and similarly for maps. With `--no-auto-flatten`, instead we get `$y=[1, 2, 3]`.",
|
||||
parser: func(args []string, argc int, pargi *int, options *TOptions) {
|
||||
options.WriterOptions.AutoFlatten = false
|
||||
*pargi += 1
|
||||
|
|
@ -2910,7 +2910,7 @@ var FlattenUnflattenFlagSection = FlagSection{
|
|||
|
||||
{
|
||||
name: "--no-auto-unflatten",
|
||||
help: "When input non-JSON and output is JSON, suppress the default auto-unflatten behavior. Default: if the input has `y.1=7,y.2=8,y.3=9` then this unflattens to `$y=[7,8,9]`. flattens to `y.1=7,y.2=8,y.3=9. With `--no-auto-flatten`, instead we get `${y.1}=7,${y.2}=8,${y.3}=9`.",
|
||||
help: "When input is non-JSON and output is JSON, suppress the default auto-unflatten behavior. Default: if the input has `y.1=7,y.2=8,y.3=9` then this unflattens to `$y=[7,8,9]`. With `--no-auto-flatten`, instead we get `${y.1}=7,${y.2}=8,${y.3}=9`.",
|
||||
parser: func(args []string, argc int, pargi *int, options *TOptions) {
|
||||
options.WriterOptions.AutoUnflatten = false
|
||||
*pargi += 1
|
||||
|
|
|
|||
|
|
@ -106,7 +106,18 @@ func (mlrmap *Mlrmap) isFlattenable() bool {
|
|||
// For mlr unflatten without -f. This undoes Flatten. This is for conversion
|
||||
// from non-JSON to JSON. If there are fields x.a, x.b, x.c, etc. they're put
|
||||
// into a single field x with map-valued value keyed by "a", "b", "c".
|
||||
|
||||
//
|
||||
// There is a heuristic here though. Miller is (wildly) multi-format and needs
|
||||
// to accommodate all manner of data. In the JSON world, "." is the default
|
||||
// delimiter for nested data, and we're here to handle that. But in the R world,
|
||||
// "." is just like "_" in other languages: witness "data.frame" rather than
|
||||
// "data_frame". If the "." was intended as punctuation, in, say, a field named
|
||||
// "a.b" with value 3, then unflatten-to-JSON will make `{"a": {"b": 3}}`. This
|
||||
// is just our default behavior; users can use --no-auto-unflatten. Weirder
|
||||
// are field names like ".", ".x", "x.", "x..y", etc. The heuristic here
|
||||
// is that when we split on "." and any of the pieces around/between the dots
|
||||
// are empty string, we don't try to unflatten that field.
|
||||
//
|
||||
// Special case: if the resulting string keys are string representations of 1,
|
||||
// 2, 3, etc -- without gaps -- then the map is converted to an array.
|
||||
//
|
||||
|
|
@ -134,22 +145,38 @@ func (mlrmap *Mlrmap) CopyUnflattened(
|
|||
|
||||
// We'll come through this loop once for x.a, another for x.b, etc.
|
||||
for pe := mlrmap.Head; pe != nil; pe = pe.Next {
|
||||
// Is the field name something dot something?
|
||||
if strings.Contains(pe.Key, separator) {
|
||||
arrayOfIndices := SplitAXHelper(pe.Key, separator)
|
||||
arrayval := arrayOfIndices.intf.([]*Mlrval)
|
||||
lib.InternalCodingErrorIf(len(arrayval) < 1)
|
||||
// If the input field name was "x.a" then remember the "x".
|
||||
baseIndex := arrayval[0].String()
|
||||
affectedBaseIndices[baseIndex] = true
|
||||
// Use PutIndexed to assign $x["a"] = 7, or $x["b"] = 8, etc.
|
||||
other.PutIndexed(
|
||||
CopyMlrvalArray(arrayval),
|
||||
unflattenTerminal(pe.Value).Copy(),
|
||||
)
|
||||
} else {
|
||||
// If there are no dots in the field name, treat it as a terminal.
|
||||
if !strings.Contains(pe.Key, separator) {
|
||||
other.PutReference(pe.Key, unflattenTerminal(pe.Value))
|
||||
continue
|
||||
}
|
||||
|
||||
arrayOfIndices := SplitAXHelper(pe.Key, separator)
|
||||
arrayval := arrayOfIndices.intf.([]*Mlrval)
|
||||
lib.InternalCodingErrorIf(len(arrayval) < 1)
|
||||
|
||||
// Check for "" in any of the split pieces; treat the field as terminal if so.
|
||||
legitDots := true
|
||||
for i, _ := range arrayval {
|
||||
piece := arrayval[i].String()
|
||||
if piece == "" {
|
||||
legitDots = false
|
||||
break
|
||||
}
|
||||
}
|
||||
if !legitDots {
|
||||
other.PutReference(pe.Key, unflattenTerminal(pe.Value))
|
||||
continue
|
||||
}
|
||||
|
||||
// If the input field name was "x.a" then remember the "x".
|
||||
baseIndex := arrayval[0].String()
|
||||
affectedBaseIndices[baseIndex] = true
|
||||
// Use PutIndexed to assign $x["a"] = 7, or $x["b"] = 8, etc.
|
||||
other.PutIndexed(
|
||||
CopyMlrvalArray(arrayval),
|
||||
unflattenTerminal(pe.Value).Copy(),
|
||||
)
|
||||
}
|
||||
|
||||
// Go through all the field names which were turned into maps -- e.g. "x"
|
||||
|
|
|
|||
|
|
@ -24,6 +24,13 @@
|
|||
"wrapper": {
|
||||
"empty3": {},
|
||||
"emtpy4": []
|
||||
}
|
||||
},
|
||||
"x": {
|
||||
"y": 1
|
||||
},
|
||||
"@": 2,
|
||||
"x@": 3,
|
||||
"@y": 4,
|
||||
"x@@y": 5
|
||||
}
|
||||
]
|
||||
|
|
|
|||
|
|
@ -13,3 +13,8 @@ empty1 {}
|
|||
empty2 []
|
||||
wrapper@empty3 {}
|
||||
wrapper@emtpy4 []
|
||||
x@y 1
|
||||
@ 2
|
||||
x@ 3
|
||||
@y 4
|
||||
x@@y 5
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue