mirror of
https://github.com/johnkerl/miller.git
synced 2026-01-23 10:15:36 +00:00
Input separators as regexes
This commit is contained in:
parent
2c432e58f1
commit
dcb0252c19
66 changed files with 687 additions and 264 deletions
|
|
@ -553,6 +553,7 @@ Essentials:
|
|||
mlr help file-formats
|
||||
Flags:
|
||||
mlr help flags
|
||||
mlr help list-separator-aliases
|
||||
mlr help comments-in-data-flags
|
||||
mlr help compressed-data-flags
|
||||
mlr help csv-only-flags
|
||||
|
|
@ -628,6 +629,7 @@ Essentials:
|
|||
mlr help file-formats
|
||||
Flags:
|
||||
mlr help flags
|
||||
mlr help list-separator-aliases
|
||||
mlr help comments-in-data-flags
|
||||
mlr help compressed-data-flags
|
||||
mlr help csv-only-flags
|
||||
|
|
|
|||
|
|
@ -131,6 +131,7 @@ HELP OPTIONS
|
|||
mlr help file-formats
|
||||
Flags:
|
||||
mlr help flags
|
||||
mlr help list-separator-aliases
|
||||
mlr help comments-in-data-flags
|
||||
mlr help compressed-data-flags
|
||||
mlr help csv-only-flags
|
||||
|
|
@ -626,6 +627,17 @@ SEPARATOR FLAGS
|
|||
- C-style escape sequences, e.g. `--rs '\r\n' --fs '\t'`.
|
||||
- To avoid backslashing, you can use any of the following names:
|
||||
|
||||
ascii_esc = "\x1b"
|
||||
ascii_etx = "\x04"
|
||||
ascii_fs = "\x1c"
|
||||
ascii_gs = "\x1d"
|
||||
ascii_null = "\x01"
|
||||
ascii_rs = "\x1e"
|
||||
ascii_soh = "\x02"
|
||||
ascii_stx = "\x03"
|
||||
ascii_us = "\x1f"
|
||||
asv_fs = "\x1f"
|
||||
asv_rs = "\x1e"
|
||||
colon = ":"
|
||||
comma = ","
|
||||
cr = "\r"
|
||||
|
|
@ -640,7 +652,12 @@ SEPARATOR FLAGS
|
|||
semicolon = ";"
|
||||
slash = "/"
|
||||
space = " "
|
||||
spaces = "( )+"
|
||||
tab = "\t"
|
||||
tabs = "(\t)+"
|
||||
usv_fs = "\xe2\x90\x9f"
|
||||
usv_rs = "\xe2\x90\x9e"
|
||||
whitespace = "([ \t])+"
|
||||
|
||||
* Default separators by format:
|
||||
|
||||
|
|
@ -2741,5 +2758,5 @@ SEE ALSO
|
|||
|
||||
|
||||
|
||||
2021-09-20 MILLER(1)
|
||||
2021-09-21 MILLER(1)
|
||||
</pre>
|
||||
|
|
|
|||
|
|
@ -110,6 +110,7 @@ HELP OPTIONS
|
|||
mlr help file-formats
|
||||
Flags:
|
||||
mlr help flags
|
||||
mlr help list-separator-aliases
|
||||
mlr help comments-in-data-flags
|
||||
mlr help compressed-data-flags
|
||||
mlr help csv-only-flags
|
||||
|
|
@ -605,6 +606,17 @@ SEPARATOR FLAGS
|
|||
- C-style escape sequences, e.g. `--rs '\r\n' --fs '\t'`.
|
||||
- To avoid backslashing, you can use any of the following names:
|
||||
|
||||
ascii_esc = "\x1b"
|
||||
ascii_etx = "\x04"
|
||||
ascii_fs = "\x1c"
|
||||
ascii_gs = "\x1d"
|
||||
ascii_null = "\x01"
|
||||
ascii_rs = "\x1e"
|
||||
ascii_soh = "\x02"
|
||||
ascii_stx = "\x03"
|
||||
ascii_us = "\x1f"
|
||||
asv_fs = "\x1f"
|
||||
asv_rs = "\x1e"
|
||||
colon = ":"
|
||||
comma = ","
|
||||
cr = "\r"
|
||||
|
|
@ -619,7 +631,12 @@ SEPARATOR FLAGS
|
|||
semicolon = ";"
|
||||
slash = "/"
|
||||
space = " "
|
||||
spaces = "( )+"
|
||||
tab = "\t"
|
||||
tabs = "(\t)+"
|
||||
usv_fs = "\xe2\x90\x9f"
|
||||
usv_rs = "\xe2\x90\x9e"
|
||||
whitespace = "([ \t])+"
|
||||
|
||||
* Default separators by format:
|
||||
|
||||
|
|
@ -2720,4 +2737,4 @@ SEE ALSO
|
|||
|
||||
|
||||
|
||||
2021-09-20 MILLER(1)
|
||||
2021-09-21 MILLER(1)
|
||||
|
|
|
|||
|
|
@ -137,6 +137,15 @@ For example (see [https://github.com/johnkerl/miller/issues/178](https://github.
|
|||
|
||||
Miller now has a read-evaluate-print-loop ([REPL](repl.md)) where you can single-step through your data-file record, express arbitrary statements to converse with the data, etc.
|
||||
|
||||
## Regex support for IFS and IPS
|
||||
|
||||
You can now split fields on whitespace when whitespace is a mix of tabs and
|
||||
spaces. As well, you can use regular expressions for the input field-separator
|
||||
and the input pair-separator. Please see the section on
|
||||
[multi-character and regular-expression separators](reference-main-separators.md#multi-character-and-regular-expression-separators).
|
||||
|
||||
In particular, for NIDX format, the default IFS now allows splitting on one or more of space or tab.
|
||||
|
||||
## Case-folded sorting options
|
||||
|
||||
The [sort](reference-verbs.md#sort) verb now accepts `-c` and `-cr` options for case-folded ascending/descending sort, respectively.
|
||||
|
|
|
|||
|
|
@ -97,6 +97,15 @@ GENMD_EOF
|
|||
|
||||
Miller now has a read-evaluate-print-loop ([REPL](repl.md)) where you can single-step through your data-file record, express arbitrary statements to converse with the data, etc.
|
||||
|
||||
## Regex support for IFS and IPS
|
||||
|
||||
You can now split fields on whitespace when whitespace is a mix of tabs and
|
||||
spaces. As well, you can use regular expressions for the input field-separator
|
||||
and the input pair-separator. Please see the section on
|
||||
[multi-character and regular-expression separators](reference-main-separators.md#multi-character-and-regular-expression-separators).
|
||||
|
||||
In particular, for NIDX format, the default IFS now allows splitting on one or more of space or tab.
|
||||
|
||||
## Case-folded sorting options
|
||||
|
||||
The [sort](reference-verbs.md#sort) verb now accepts `-c` and `-cr` options for case-folded ascending/descending sort, respectively.
|
||||
|
|
|
|||
|
|
@ -49,6 +49,7 @@ Essentials:
|
|||
mlr help file-formats
|
||||
Flags:
|
||||
mlr help flags
|
||||
mlr help list-separator-aliases
|
||||
mlr help comments-in-data-flags
|
||||
mlr help compressed-data-flags
|
||||
mlr help csv-only-flags
|
||||
|
|
@ -102,6 +103,7 @@ Essentials:
|
|||
mlr help file-formats
|
||||
Flags:
|
||||
mlr help flags
|
||||
mlr help list-separator-aliases
|
||||
mlr help comments-in-data-flags
|
||||
mlr help compressed-data-flags
|
||||
mlr help csv-only-flags
|
||||
|
|
|
|||
|
|
@ -4,8 +4,13 @@
|
|||
? twi-dm re all-contribs: all-contributors.org
|
||||
* nikos materials -> fold in
|
||||
|
||||
C! repifs !! https://pkg.go.dev/regexp#Regexp.Split 2-for-1 -- get regexp as well ?
|
||||
- make table at seps.md.in, ffmt down vs XS across
|
||||
c force CSV IFS single-char at CLIP? or CSV setup? IFS1?
|
||||
C need multi-IRS reader for ASV & explicit CRLF (if supported); also need no-trailing-lf handling
|
||||
|
||||
? look at:
|
||||
regtest/cases-pending-go-port/dsl-output-redirects/0071/cmd
|
||||
regtest/cases-pending-go-port/dsl-redirects/0004/cmd wtf no diff?
|
||||
regtest/cases-pending-go-port/dsl-redirects/0010/cmd jvstack ?
|
||||
|
||||
* r-strings branch!
|
||||
C stats1 --fr
|
||||
|
|
@ -20,6 +25,8 @@ e fzf-ish w/ head -n 4, --from, up-arrow & append verb, then cat -- find & updat
|
|||
https://github.com/johnkerl/miller/issues/77#issuecomment-538553828
|
||||
|
||||
c! seps \001 etc !
|
||||
mlrc --iasv --oxtab cat regtest/input/example.asv
|
||||
mlr --iasv --oxtab cat regtest/input/example.asv
|
||||
mlrc --iusv --oxtab cat regtest/input/example.usv
|
||||
mlr --iusv --oxtab cat regtest/input/example.usv
|
||||
|
||||
|
|
@ -27,6 +34,9 @@ o check for determinism regtest/cases/verb-join-prepipe/0003/cmd
|
|||
* UT for https://github.com/johnkerl/miller/issues/653
|
||||
|
||||
----------------------------------------------------------------
|
||||
headerless CSV: separate page
|
||||
e nidx, --implicit-csv-header, --headerless-csv-output; survey miller/issues ...
|
||||
e make a --headerless-csv-input alias
|
||||
|
||||
record-heterogeneity:
|
||||
l link-to's:
|
||||
|
|
|
|||
|
|
@ -138,8 +138,9 @@ keys from the header line together with the values from each data line, the
|
|||
second record has a missing value for key `c` (which ought to be fillable),
|
||||
while the third record has a value `10` with no key for it.
|
||||
|
||||
Using the `--allow-ragged-csv-input` option we can fill values in too-short
|
||||
rows, and provide a key (column number starting with 1) for too-long rows:
|
||||
Using the [`--allow-ragged-csv-input` flag](reference-main-flag-list.md#csv-only-flags)
|
||||
we can fill values in too-short rows, and provide a key (column number starting
|
||||
with 1) for too-long rows:
|
||||
|
||||
<pre class="pre-highlight-in-pair">
|
||||
<b>mlr --icsv --ojson --allow-ragged-csv-input cat data/het/ragged.csv</b>
|
||||
|
|
|
|||
|
|
@ -489,6 +489,17 @@ Notes about all other separators:
|
|||
- C-style escape sequences, e.g. `--rs '\r\n' --fs '\t'`.
|
||||
- To avoid backslashing, you can use any of the following names:
|
||||
|
||||
ascii_esc = "\x1b"
|
||||
ascii_etx = "\x04"
|
||||
ascii_fs = "\x1c"
|
||||
ascii_gs = "\x1d"
|
||||
ascii_null = "\x01"
|
||||
ascii_rs = "\x1e"
|
||||
ascii_soh = "\x02"
|
||||
ascii_stx = "\x03"
|
||||
ascii_us = "\x1f"
|
||||
asv_fs = "\x1f"
|
||||
asv_rs = "\x1e"
|
||||
colon = ":"
|
||||
comma = ","
|
||||
cr = "\r"
|
||||
|
|
@ -503,7 +514,12 @@ Notes about all other separators:
|
|||
semicolon = ";"
|
||||
slash = "/"
|
||||
space = " "
|
||||
spaces = "( )+"
|
||||
tab = "\t"
|
||||
tabs = "(\t)+"
|
||||
usv_fs = "\xe2\x90\x9f"
|
||||
usv_rs = "\xe2\x90\x9e"
|
||||
whitespace = "([ \t])+"
|
||||
|
||||
* Default separators by format:
|
||||
|
||||
|
|
|
|||
|
|
@ -50,8 +50,9 @@ part of the JSON specification.
|
|||
|
||||
## Input and output separators
|
||||
|
||||
Miller lets you use the same separators for input and output, or, to change
|
||||
them between input and output, if you wish to transform your data in that way.
|
||||
Miller lets you use the same separators for input and output (e.g. CSV input,
|
||||
CSV output), or, to change them between input and output (e.g. CSV input, JSON
|
||||
output), if you wish to transform your data in that way.
|
||||
|
||||
Miller uses the names `IRS` and `ORS` for the input and output record
|
||||
separators, `IFS` and `OFS` for the input and output field separators, and
|
||||
|
|
@ -59,6 +60,14 @@ separators, `IFS` and `OFS` for the input and output field separators, and
|
|||
|
||||
For example:
|
||||
|
||||
<pre class="pre-highlight-in-pair">
|
||||
<b>cat data/a.dkvp</b>
|
||||
</pre>
|
||||
<pre class="pre-non-highlight-in-pair">
|
||||
a=1,b=2,c=3
|
||||
a=4,b=5,c=6
|
||||
</pre>
|
||||
|
||||
<pre class="pre-highlight-in-pair">
|
||||
<b>mlr --ifs , --ofs ';' --ips = --ops : cut -o -f c,a,b data/a.dkvp</b>
|
||||
</pre>
|
||||
|
|
@ -71,6 +80,14 @@ If your data has non-default separators and you don't want to change those
|
|||
between input and output, you can use `--rs`, `--fs`, and `--ps`. Setting `--fs
|
||||
:` is the same as setting `--ifs : --ofs :`, but with fewer keystrokes.
|
||||
|
||||
<pre class="pre-highlight-in-pair">
|
||||
<b>cat data/modsep.dkvp</b>
|
||||
</pre>
|
||||
<pre class="pre-non-highlight-in-pair">
|
||||
a:1;b:2;c:3
|
||||
a:4;b:5;c:6
|
||||
</pre>
|
||||
|
||||
<pre class="pre-highlight-in-pair">
|
||||
<b>mlr --fs ';' --ps : cut -o -f c,a,b data/modsep.dkvp</b>
|
||||
</pre>
|
||||
|
|
@ -79,7 +96,7 @@ c:3;a:1;b:2
|
|||
c:6;a:4;b:5
|
||||
</pre>
|
||||
|
||||
## Multi-character separators
|
||||
## Multi-character and regular-expression separators
|
||||
|
||||
The separators default to single characters, but can be multiple characters if you like:
|
||||
|
||||
|
|
@ -91,16 +108,17 @@ c:=3;;;a:=1;;;b:=2
|
|||
c:=6;;;a:=4;;;b:=5
|
||||
</pre>
|
||||
|
||||
While the separators can be multiple characters, [regular
|
||||
expressions](reference-main-regular-expressions.md) (which Miller supports in
|
||||
many ways) are not (as of mid-2021) supported by Miller. So, in the above
|
||||
example, you can say the field-separator is one semicolon, or three, but two or
|
||||
four won't be recognized using `--ifs ';;;'`.
|
||||
As of September 2021:
|
||||
|
||||
To fill this need, in the absence of full regular-expression support, Miller
|
||||
has a `--repifs` option for input. This means, for example, using `--ifs
|
||||
' ' --repifs` you can have the field separator be one _or more_ spaces. (Mixes
|
||||
of spaces and tabs, however, won't be recognized as a separator.)
|
||||
* `IFS` and `IPS` can be regular expressions.
|
||||
* `IRS` must be a single character (nominally `\n`).
|
||||
* `OFS`, `OPS`, and `ORS` can be multi-character.
|
||||
|
||||
Since `IFS` and `IPS` can be regular expressions, if your data has field
|
||||
separators which are one or more consecutive spaces, you can use `--ifs '(
|
||||
)+'`. But that gets a little tedious, so Miller has the `--repifs` and
|
||||
`--repips` flags you can use if you like. This wraps the `IFS` or `IPS`, say
|
||||
`X`, as `(X)+`.
|
||||
|
||||
The `--repifs` flag means that multiple successive occurrences of the field
|
||||
separator count as one. For example, in CSV data we often signify nulls by
|
||||
|
|
@ -120,31 +138,19 @@ see by the dawn's
|
|||
early light what so
|
||||
</pre>
|
||||
|
||||
(TODO: FIXME)
|
||||
|
||||
<pre class="pre-highlight-in-pair">
|
||||
<b>mlr --ifs ' ' --repifs --inidx --oxtab cat data/extra-spaces.txt</b>
|
||||
</pre>
|
||||
<pre class="pre-non-highlight-in-pair">
|
||||
1 oh
|
||||
2
|
||||
3
|
||||
4
|
||||
5 say
|
||||
6
|
||||
7
|
||||
8 can
|
||||
9 you
|
||||
2 say
|
||||
3 can
|
||||
4 you
|
||||
|
||||
1 see
|
||||
2
|
||||
3
|
||||
4 by
|
||||
5
|
||||
6
|
||||
7
|
||||
8 the
|
||||
9 dawn's
|
||||
2 by
|
||||
3 the
|
||||
4 dawn's
|
||||
|
||||
1 early
|
||||
2 light
|
||||
|
|
@ -152,6 +158,51 @@ early light what so
|
|||
4 so
|
||||
</pre>
|
||||
|
||||
## Aliases
|
||||
|
||||
Many things we'd like to write as separators need to be escaped from the shell
|
||||
-- e.g. `--ifs ';'` or `--ofs '|'`, and so on. You can use the following if you like:
|
||||
|
||||
<pre class="pre-highlight-in-pair">
|
||||
<b>mlr help list-separator-aliases</b>
|
||||
</pre>
|
||||
<pre class="pre-non-highlight-in-pair">
|
||||
ascii_esc = "\x1b"
|
||||
ascii_etx = "\x04"
|
||||
ascii_fs = "\x1c"
|
||||
ascii_gs = "\x1d"
|
||||
ascii_null = "\x01"
|
||||
ascii_rs = "\x1e"
|
||||
ascii_soh = "\x02"
|
||||
ascii_stx = "\x03"
|
||||
ascii_us = "\x1f"
|
||||
asv_fs = "\x1f"
|
||||
asv_rs = "\x1e"
|
||||
colon = ":"
|
||||
comma = ","
|
||||
cr = "\r"
|
||||
crcr = "\r\r"
|
||||
crlf = "\r\n"
|
||||
crlfcrlf = "\r\n\r\n"
|
||||
equals = "="
|
||||
lf = "\n"
|
||||
lflf = "\n\n"
|
||||
newline = "\n"
|
||||
pipe = "|"
|
||||
semicolon = ";"
|
||||
slash = "/"
|
||||
space = " "
|
||||
spaces = "( )+"
|
||||
tab = "\t"
|
||||
tabs = "(\t)+"
|
||||
usv_fs = "\xe2\x90\x9f"
|
||||
usv_rs = "\xe2\x90\x9e"
|
||||
whitespace = "([ \t])+"
|
||||
</pre>
|
||||
|
||||
Note that `spaces`, `tabs`, and `whitespace` already are regexes so you
|
||||
shouldn't use `--repifs` with them.
|
||||
|
||||
## Command-line flags
|
||||
|
||||
Given the above, we now have seen the following flags:
|
||||
|
|
@ -162,15 +213,13 @@ Given the above, we now have seen the following flags:
|
|||
--ps --ips --ops
|
||||
</pre>
|
||||
|
||||
Also note that you can use names for certain characters: e.g. `--fs space` is
|
||||
the same as `--fs ' '`. A full list is: `colon`, `comma`, `equals`, `newline`,
|
||||
`pipe`, `semicolon`, `slash`, `space`, `tab`.
|
||||
See also the [separator-flags section](reference-main-flag-list.md#separator-flags).
|
||||
|
||||
## DSL built-in variables
|
||||
|
||||
Miller exposes for you read-only [built-in variables](reference-dsl-variables.md#built-in-variables) with
|
||||
names `IRS`, `ORS`, `IFS`, `OFS`, `IPS`, and `OPS`. Unlike in AWK, you can't set these in begin-blocks --
|
||||
their values indicate what you set at the command line -- so their use is limited.
|
||||
their values indicate what you specified at the command line -- so their use is limited.
|
||||
|
||||
<pre class="pre-highlight-in-pair">
|
||||
<b>mlr --ifs , --ofs ';' --ips = --ops : --from data/a.dkvp put '$d = ">>>" . IFS . "|||" . OFS . "<<<"'</b>
|
||||
|
|
@ -182,21 +231,22 @@ a:4;b:5;c:6;d:>>>,|||;<<<
|
|||
|
||||
## Which separators apply to which file formats
|
||||
|
||||
TODO:
|
||||
Notes:
|
||||
|
||||
* If the CSV field separator is tab, we have TSV; see more examples (ASV, USV, etc.) in the [CSV section](file-formats.md#csvtsvasvusvetc).
|
||||
* JSON: ignores all separator flags from the command line.
|
||||
* Headerless CSV overlaps quite a bit with NIDX format using comma for IFS. See also the page on [CSV with and without headers](csv-with-and-without-headers.md).
|
||||
|
||||
| | **RS** | **FS** | **PS** |
|
||||
|------------|---------|---------|----------|
|
||||
| **CSV** | Default `\n` * | Default `,` | None |
|
||||
| **TSV** | Default `\n` * | Default `\t` | None |
|
||||
| **JSON** | N/A; records are between `{` and `}` | `,` but not alterable | `:` but not alterable |
|
||||
| **DKVP** | Default `\n` | Default `,` | Default `=` |
|
||||
| **NIDX** | Default `\n` | Default space | None |
|
||||
| **XTAB** | `\n\n` ** | `\n` * | Space with repeats |
|
||||
| **PPRINT** | Default `\n` * | Space with repeats | None |
|
||||
| **Markdown** | `\n` * but not alterable | One or more spaces then `|` then one or more spaces | None |
|
||||
| [**CSV and CSV-lite**](file-formats.md#csvtsvasvusvetc) | Default `\n` * | Default `,` | None |
|
||||
| [**TSV**](file-formats.md#csvtsvasvusvetc) | Default `\n` * | Default `\t` | None |
|
||||
| [**JSON**](file-formats.md#json) | N/A; records are between `{` and `}` | `,` but not alterable | `:` but not alterable |
|
||||
| [**DKVP**](file-formats.md#dkvp-key-value-pairs) | Default `\n` | Default `,` | Default `=` |
|
||||
| [**NIDX**](file-formats.md#nidx-index-numbered-toolkit-style) | Default `\n` | Default space | None |
|
||||
| [**XTAB**](file-formats.md#xtab-vertical-tabular) | `\n\n` ** | `\n` * | Space with repeats |
|
||||
| [**PPRINT**](file-formats.md#pprint-pretty-printed-tabular) | Default `\n` * | Space with repeats | None |
|
||||
| [**Markdown**](file-formats.md#markdown-tabular) | `\n` * but not alterable | One or more spaces then `|` then one or more spaces; not alterable | None |
|
||||
|
||||
\* or `\r\n` on Windows
|
||||
|
||||
|
|
|
|||
|
|
@ -30,8 +30,9 @@ part of the JSON specification.
|
|||
|
||||
## Input and output separators
|
||||
|
||||
Miller lets you use the same separators for input and output, or, to change
|
||||
them between input and output, if you wish to transform your data in that way.
|
||||
Miller lets you use the same separators for input and output (e.g. CSV input,
|
||||
CSV output), or, to change them between input and output (e.g. CSV input, JSON
|
||||
output), if you wish to transform your data in that way.
|
||||
|
||||
Miller uses the names `IRS` and `ORS` for the input and output record
|
||||
separators, `IFS` and `OFS` for the input and output field separators, and
|
||||
|
|
@ -39,6 +40,10 @@ separators, `IFS` and `OFS` for the input and output field separators, and
|
|||
|
||||
For example:
|
||||
|
||||
GENMD_RUN_COMMAND
|
||||
cat data/a.dkvp
|
||||
GENMD_EOF
|
||||
|
||||
GENMD_RUN_COMMAND
|
||||
mlr --ifs , --ofs ';' --ips = --ops : cut -o -f c,a,b data/a.dkvp
|
||||
GENMD_EOF
|
||||
|
|
@ -47,11 +52,15 @@ If your data has non-default separators and you don't want to change those
|
|||
between input and output, you can use `--rs`, `--fs`, and `--ps`. Setting `--fs
|
||||
:` is the same as setting `--ifs : --ofs :`, but with fewer keystrokes.
|
||||
|
||||
GENMD_RUN_COMMAND
|
||||
cat data/modsep.dkvp
|
||||
GENMD_EOF
|
||||
|
||||
GENMD_RUN_COMMAND
|
||||
mlr --fs ';' --ps : cut -o -f c,a,b data/modsep.dkvp
|
||||
GENMD_EOF
|
||||
|
||||
## Multi-character separators
|
||||
## Multi-character and regular-expression separators
|
||||
|
||||
The separators default to single characters, but can be multiple characters if you like:
|
||||
|
||||
|
|
@ -59,16 +68,17 @@ GENMD_RUN_COMMAND
|
|||
mlr --ifs ';' --ips : --ofs ';;;' --ops := cut -o -f c,a,b data/modsep.dkvp
|
||||
GENMD_EOF
|
||||
|
||||
While the separators can be multiple characters, [regular
|
||||
expressions](reference-main-regular-expressions.md) (which Miller supports in
|
||||
many ways) are not (as of mid-2021) supported by Miller. So, in the above
|
||||
example, you can say the field-separator is one semicolon, or three, but two or
|
||||
four won't be recognized using `--ifs ';;;'`.
|
||||
As of September 2021:
|
||||
|
||||
To fill this need, in the absence of full regular-expression support, Miller
|
||||
has a `--repifs` option for input. This means, for example, using `--ifs
|
||||
' ' --repifs` you can have the field separator be one _or more_ spaces. (Mixes
|
||||
of spaces and tabs, however, won't be recognized as a separator.)
|
||||
* `IFS` and `IPS` can be regular expressions.
|
||||
* `IRS` must be a single character (nominally `\n`).
|
||||
* `OFS`, `OPS`, and `ORS` can be multi-character.
|
||||
|
||||
Since `IFS` and `IPS` can be regular expressions, if your data has field
|
||||
separators which are one or more consecutive spaces, you can use `--ifs '(
|
||||
)+'`. But that gets a little tedious, so Miller has the `--repifs` and
|
||||
`--repips` flags you can use if you like. This wraps the `IFS` or `IPS`, say
|
||||
`X`, as `(X)+`.
|
||||
|
||||
The `--repifs` flag means that multiple successive occurrences of the field
|
||||
separator count as one. For example, in CSV data we often signify nulls by
|
||||
|
|
@ -83,12 +93,22 @@ GENMD_RUN_COMMAND
|
|||
cat data/extra-spaces.txt
|
||||
GENMD_EOF
|
||||
|
||||
(TODO: FIXME)
|
||||
|
||||
GENMD_RUN_COMMAND
|
||||
mlr --ifs ' ' --repifs --inidx --oxtab cat data/extra-spaces.txt
|
||||
GENMD_EOF
|
||||
|
||||
## Aliases
|
||||
|
||||
Many things we'd like to write as separators need to be escaped from the shell
|
||||
-- e.g. `--ifs ';'` or `--ofs '|'`, and so on. You can use the following if you like:
|
||||
|
||||
GENMD_RUN_COMMAND
|
||||
mlr help list-separator-aliases
|
||||
GENMD_EOF
|
||||
|
||||
Note that `spaces`, `tabs`, and `whitespace` already are regexes so you
|
||||
shouldn't use `--repifs` with them.
|
||||
|
||||
## Command-line flags
|
||||
|
||||
Given the above, we now have seen the following flags:
|
||||
|
|
@ -99,15 +119,13 @@ GENMD_CARDIFY
|
|||
--ps --ips --ops
|
||||
GENMD_EOF
|
||||
|
||||
Also note that you can use names for certain characters: e.g. `--fs space` is
|
||||
the same as `--fs ' '`. A full list is: `colon`, `comma`, `equals`, `newline`,
|
||||
`pipe`, `semicolon`, `slash`, `space`, `tab`.
|
||||
See also the [separator-flags section](reference-main-flag-list.md#separator-flags).
|
||||
|
||||
## DSL built-in variables
|
||||
|
||||
Miller exposes for you read-only [built-in variables](reference-dsl-variables.md#built-in-variables) with
|
||||
names `IRS`, `ORS`, `IFS`, `OFS`, `IPS`, and `OPS`. Unlike in AWK, you can't set these in begin-blocks --
|
||||
their values indicate what you set at the command line -- so their use is limited.
|
||||
their values indicate what you specified at the command line -- so their use is limited.
|
||||
|
||||
GENMD_RUN_COMMAND
|
||||
mlr --ifs , --ofs ';' --ips = --ops : --from data/a.dkvp put '$d = ">>>" . IFS . "|||" . OFS . "<<<"'
|
||||
|
|
@ -115,21 +133,22 @@ GENMD_EOF
|
|||
|
||||
## Which separators apply to which file formats
|
||||
|
||||
TODO:
|
||||
Notes:
|
||||
|
||||
* If the CSV field separator is tab, we have TSV; see more examples (ASV, USV, etc.) in the [CSV section](file-formats.md#csvtsvasvusvetc).
|
||||
* JSON: ignores all separator flags from the command line.
|
||||
* Headerless CSV overlaps quite a bit with NIDX format using comma for IFS. See also the page on [CSV with and without headers](csv-with-and-without-headers.md).
|
||||
|
||||
| | **RS** | **FS** | **PS** |
|
||||
|------------|---------|---------|----------|
|
||||
| **CSV** | Default `\n` * | Default `,` | None |
|
||||
| **TSV** | Default `\n` * | Default `\t` | None |
|
||||
| **JSON** | N/A; records are between `{` and `}` | `,` but not alterable | `:` but not alterable |
|
||||
| **DKVP** | Default `\n` | Default `,` | Default `=` |
|
||||
| **NIDX** | Default `\n` | Default space | None |
|
||||
| **XTAB** | `\n\n` ** | `\n` * | Space with repeats |
|
||||
| **PPRINT** | Default `\n` * | Space with repeats | None |
|
||||
| **Markdown** | `\n` * but not alterable | One or more spaces then `|` then one or more spaces | None |
|
||||
| [**CSV and CSV-lite**](file-formats.md#csvtsvasvusvetc) | Default `\n` * | Default `,` | None |
|
||||
| [**TSV**](file-formats.md#csvtsvasvusvetc) | Default `\n` * | Default `\t` | None |
|
||||
| [**JSON**](file-formats.md#json) | N/A; records are between `{` and `}` | `,` but not alterable | `:` but not alterable |
|
||||
| [**DKVP**](file-formats.md#dkvp-key-value-pairs) | Default `\n` | Default `,` | Default `=` |
|
||||
| [**NIDX**](file-formats.md#nidx-index-numbered-toolkit-style) | Default `\n` | Default space | None |
|
||||
| [**XTAB**](file-formats.md#xtab-vertical-tabular) | `\n\n` ** | `\n` * | Space with repeats |
|
||||
| [**PPRINT**](file-formats.md#pprint-pretty-printed-tabular) | Default `\n` * | Space with repeats | None |
|
||||
| [**Markdown**](file-formats.md#markdown-tabular) | `\n` * but not alterable | One or more spaces then `|` then one or more spaces; not alterable | None |
|
||||
|
||||
\* or `\r\n` on Windows
|
||||
|
||||
|
|
|
|||
1
go/regtest/cases-pending-go-port/io-rfc-csv/0017/cmd
Normal file
1
go/regtest/cases-pending-go-port/io-rfc-csv/0017/cmd
Normal file
|
|
@ -0,0 +1 @@
|
|||
mlr --iasv --oxtab cat regtest/input/example.asv
|
||||
7
go/regtest/cases-pending-go-port/io-rfc-csv/0017/expout
Normal file
7
go/regtest/cases-pending-go-port/io-rfc-csv/0017/expout
Normal file
|
|
@ -0,0 +1,7 @@
|
|||
a d
|
||||
b e
|
||||
c f
|
||||
|
||||
a g
|
||||
b h
|
||||
c i
|
||||
|
|
@ -1,2 +0,0 @@
|
|||
a b
|
||||
x_y_cov 2.000000
|
||||
1
go/regtest/cases/io-separator-aliases/0001/cmd
Normal file
1
go/regtest/cases/io-separator-aliases/0001/cmd
Normal file
|
|
@ -0,0 +1 @@
|
|||
mlr --from ${CASEDIR}/input --n2j cat
|
||||
0
go/regtest/cases/io-separator-aliases/0001/experr
Normal file
0
go/regtest/cases/io-separator-aliases/0001/experr
Normal file
20
go/regtest/cases/io-separator-aliases/0001/expout
Normal file
20
go/regtest/cases/io-separator-aliases/0001/expout
Normal file
|
|
@ -0,0 +1,20 @@
|
|||
{
|
||||
"1": "a",
|
||||
"2": "b",
|
||||
"3": "c"
|
||||
}
|
||||
{
|
||||
"1": "a",
|
||||
"2": "b",
|
||||
"3": "c"
|
||||
}
|
||||
{
|
||||
"1": "a",
|
||||
"2": "b",
|
||||
"3": "c"
|
||||
}
|
||||
{
|
||||
"1": "a",
|
||||
"2": "b",
|
||||
"3": "c"
|
||||
}
|
||||
4
go/regtest/cases/io-separator-aliases/0001/input
Normal file
4
go/regtest/cases/io-separator-aliases/0001/input
Normal file
|
|
@ -0,0 +1,4 @@
|
|||
a b c
|
||||
a b c
|
||||
a b c
|
||||
a b c
|
||||
1
go/regtest/cases/io-separator-aliases/0002/cmd
Normal file
1
go/regtest/cases/io-separator-aliases/0002/cmd
Normal file
|
|
@ -0,0 +1 @@
|
|||
mlr --from ${CASEDIR}/input --n2j --ifs space cat
|
||||
0
go/regtest/cases/io-separator-aliases/0002/experr
Normal file
0
go/regtest/cases/io-separator-aliases/0002/experr
Normal file
23
go/regtest/cases/io-separator-aliases/0002/expout
Normal file
23
go/regtest/cases/io-separator-aliases/0002/expout
Normal file
|
|
@ -0,0 +1,23 @@
|
|||
{
|
||||
"1": "a\tb",
|
||||
"2": "c"
|
||||
}
|
||||
{
|
||||
"1": "a",
|
||||
"2": "b\tc"
|
||||
}
|
||||
{
|
||||
"1": "a\t\t\tb",
|
||||
"2": "",
|
||||
"3": "",
|
||||
"4": "",
|
||||
"5": "c"
|
||||
}
|
||||
{
|
||||
"1": "a\t\t",
|
||||
"2": "\tb",
|
||||
"3": "",
|
||||
"4": "\t",
|
||||
"5": "",
|
||||
"6": "c"
|
||||
}
|
||||
4
go/regtest/cases/io-separator-aliases/0002/input
Normal file
4
go/regtest/cases/io-separator-aliases/0002/input
Normal file
|
|
@ -0,0 +1,4 @@
|
|||
a b c
|
||||
a b c
|
||||
a b c
|
||||
a b c
|
||||
1
go/regtest/cases/io-separator-aliases/0003/cmd
Normal file
1
go/regtest/cases/io-separator-aliases/0003/cmd
Normal file
|
|
@ -0,0 +1 @@
|
|||
mlr --from ${CASEDIR}/input --n2j --ifs spaces cat
|
||||
0
go/regtest/cases/io-separator-aliases/0003/experr
Normal file
0
go/regtest/cases/io-separator-aliases/0003/experr
Normal file
18
go/regtest/cases/io-separator-aliases/0003/expout
Normal file
18
go/regtest/cases/io-separator-aliases/0003/expout
Normal file
|
|
@ -0,0 +1,18 @@
|
|||
{
|
||||
"1": "a\tb",
|
||||
"2": "c"
|
||||
}
|
||||
{
|
||||
"1": "a",
|
||||
"2": "b\tc"
|
||||
}
|
||||
{
|
||||
"1": "a\t\t\tb",
|
||||
"2": "c"
|
||||
}
|
||||
{
|
||||
"1": "a\t\t",
|
||||
"2": "\tb",
|
||||
"3": "\t",
|
||||
"4": "c"
|
||||
}
|
||||
4
go/regtest/cases/io-separator-aliases/0003/input
Normal file
4
go/regtest/cases/io-separator-aliases/0003/input
Normal file
|
|
@ -0,0 +1,4 @@
|
|||
a b c
|
||||
a b c
|
||||
a b c
|
||||
a b c
|
||||
1
go/regtest/cases/io-separator-aliases/0004/cmd
Normal file
1
go/regtest/cases/io-separator-aliases/0004/cmd
Normal file
|
|
@ -0,0 +1 @@
|
|||
mlr --from ${CASEDIR}/input --n2j --ifs tab cat
|
||||
0
go/regtest/cases/io-separator-aliases/0004/experr
Normal file
0
go/regtest/cases/io-separator-aliases/0004/experr
Normal file
21
go/regtest/cases/io-separator-aliases/0004/expout
Normal file
21
go/regtest/cases/io-separator-aliases/0004/expout
Normal file
|
|
@ -0,0 +1,21 @@
|
|||
{
|
||||
"1": "a",
|
||||
"2": "b c"
|
||||
}
|
||||
{
|
||||
"1": "a b",
|
||||
"2": "c"
|
||||
}
|
||||
{
|
||||
"1": "a",
|
||||
"2": "",
|
||||
"3": "",
|
||||
"4": "b c"
|
||||
}
|
||||
{
|
||||
"1": "a",
|
||||
"2": "",
|
||||
"3": " ",
|
||||
"4": "b ",
|
||||
"5": " c"
|
||||
}
|
||||
4
go/regtest/cases/io-separator-aliases/0004/input
Normal file
4
go/regtest/cases/io-separator-aliases/0004/input
Normal file
|
|
@ -0,0 +1,4 @@
|
|||
a b c
|
||||
a b c
|
||||
a b c
|
||||
a b c
|
||||
1
go/regtest/cases/io-separator-aliases/0005/cmd
Normal file
1
go/regtest/cases/io-separator-aliases/0005/cmd
Normal file
|
|
@ -0,0 +1 @@
|
|||
mlr --from ${CASEDIR}/input --n2j --ifs tabs cat
|
||||
0
go/regtest/cases/io-separator-aliases/0005/experr
Normal file
0
go/regtest/cases/io-separator-aliases/0005/experr
Normal file
18
go/regtest/cases/io-separator-aliases/0005/expout
Normal file
18
go/regtest/cases/io-separator-aliases/0005/expout
Normal file
|
|
@ -0,0 +1,18 @@
|
|||
{
|
||||
"1": "a",
|
||||
"2": "b c"
|
||||
}
|
||||
{
|
||||
"1": "a b",
|
||||
"2": "c"
|
||||
}
|
||||
{
|
||||
"1": "a",
|
||||
"2": "b c"
|
||||
}
|
||||
{
|
||||
"1": "a",
|
||||
"2": " ",
|
||||
"3": "b ",
|
||||
"4": " c"
|
||||
}
|
||||
4
go/regtest/cases/io-separator-aliases/0005/input
Normal file
4
go/regtest/cases/io-separator-aliases/0005/input
Normal file
|
|
@ -0,0 +1,4 @@
|
|||
a b c
|
||||
a b c
|
||||
a b c
|
||||
a b c
|
||||
1
go/regtest/cases/io-separator-aliases/0006/cmd
Normal file
1
go/regtest/cases/io-separator-aliases/0006/cmd
Normal file
|
|
@ -0,0 +1 @@
|
|||
mlr --from ${CASEDIR}/input --n2j --ifs whitespace cat
|
||||
0
go/regtest/cases/io-separator-aliases/0006/experr
Normal file
0
go/regtest/cases/io-separator-aliases/0006/experr
Normal file
20
go/regtest/cases/io-separator-aliases/0006/expout
Normal file
20
go/regtest/cases/io-separator-aliases/0006/expout
Normal file
|
|
@ -0,0 +1,20 @@
|
|||
{
|
||||
"1": "a",
|
||||
"2": "b",
|
||||
"3": "c"
|
||||
}
|
||||
{
|
||||
"1": "a",
|
||||
"2": "b",
|
||||
"3": "c"
|
||||
}
|
||||
{
|
||||
"1": "a",
|
||||
"2": "b",
|
||||
"3": "c"
|
||||
}
|
||||
{
|
||||
"1": "a",
|
||||
"2": "b",
|
||||
"3": "c"
|
||||
}
|
||||
4
go/regtest/cases/io-separator-aliases/0006/input
Normal file
4
go/regtest/cases/io-separator-aliases/0006/input
Normal file
|
|
@ -0,0 +1,4 @@
|
|||
a b c
|
||||
a b c
|
||||
a b c
|
||||
a b c
|
||||
0
go/regtest/cases/mix-null-field-input/0018/experr
Normal file
0
go/regtest/cases/mix-null-field-input/0018/experr
Normal file
2
go/regtest/cases/mix-null-field-input/0018/expout
Normal file
2
go/regtest/cases/mix-null-field-input/0018/expout
Normal file
|
|
@ -0,0 +1,2 @@
|
|||
a b
|
||||
x_y_cov 2
|
||||
1
go/regtest/input/example.asv
Normal file
1
go/regtest/input/example.asv
Normal file
|
|
@ -0,0 +1 @@
|
|||
abcdefghi
|
||||
|
|
@ -71,6 +71,7 @@ func init() {
|
|||
name: "Flags",
|
||||
handlerInfos: []tHandlerInfo{
|
||||
{name: "flags", zaryHandlerFunc: showFlagHelp},
|
||||
{name: "list-separator-aliases", zaryHandlerFunc: listSeparatorAliases},
|
||||
// Per-section entries will be computed and installed below
|
||||
},
|
||||
},
|
||||
|
|
@ -292,6 +293,10 @@ func showFlagHelp() {
|
|||
cli.FLAG_TABLE.ShowHelp()
|
||||
}
|
||||
|
||||
func listSeparatorAliases() {
|
||||
cli.ListSeparatorAliasesForOnlineHelp()
|
||||
}
|
||||
|
||||
// ----------------------------------------------------------------
|
||||
func helpAuxents() {
|
||||
fmt.Print(`Miller has a few otherwise-standalone executables packaged within it.
|
||||
|
|
|
|||
|
|
@ -144,8 +144,8 @@ func ReplMain(args []string) int {
|
|||
}
|
||||
}
|
||||
|
||||
cli.ApplyReaderOptionDefaults(&options.ReaderOptions)
|
||||
cli.ApplyWriterOptionDefaults(&options.WriterOptions)
|
||||
cli.FinalizeReaderOptions(&options.ReaderOptions)
|
||||
cli.FinalizeWriterOptions(&options.WriterOptions)
|
||||
|
||||
// --auto-flatten is on by default. But if input and output formats are both JSON,
|
||||
// then we don't need to actually do anything. See also mlrcli_parse.go.
|
||||
|
|
|
|||
|
|
@ -50,14 +50,11 @@ import (
|
|||
// Data types used within the flags table.
|
||||
|
||||
// FlagParser is a function which takes a flag such as `--foo`.
|
||||
//
|
||||
// * It should assume that a flag.Owns method has already been invoked to be
|
||||
// sure that this function is indeed the right one to call for `--foo`.
|
||||
//
|
||||
// * The FlagParser function is responsible for advancing *pargi by 1 (if
|
||||
// `--foo`) or 2 (if `--foo bar`), checking to see if argc is long enough in
|
||||
// the latter case, and mutating the options struct.
|
||||
//
|
||||
// * Successful handling of the flag is indicated by this function making a
|
||||
// non-zero increment of *pargi.
|
||||
type FlagParser func(
|
||||
|
|
@ -68,6 +65,7 @@ type FlagParser func(
|
|||
)
|
||||
|
||||
// ----------------------------------------------------------------
|
||||
|
||||
// FlagTable holds all the flags for Miller, organized into sections.
|
||||
type FlagTable struct {
|
||||
sections []*FlagSection
|
||||
|
|
|
|||
|
|
@ -3,12 +3,10 @@ package cli
|
|||
import (
|
||||
"fmt"
|
||||
"os"
|
||||
|
||||
"mlr/src/lib"
|
||||
)
|
||||
|
||||
// For flags with values, e.g. ["-n" "10"], while we're looking at the "-n"
|
||||
// this let us see if the "10" slot exists.
|
||||
// CheckArgCount is for flags with values, e.g. ["-n" "10"], while we're
|
||||
// looking at the "-n": this lets us see if the "10" slot exists.
|
||||
func CheckArgCount(args []string, argi int, argc int, n int) {
|
||||
if (argc - argi) < n {
|
||||
fmt.Fprintf(os.Stderr, "%s: option \"%s\" missing argument(s).\n", "mlr", args[argi])
|
||||
|
|
@ -17,43 +15,13 @@ func CheckArgCount(args []string, argi int, argc int, n int) {
|
|||
}
|
||||
}
|
||||
|
||||
// ----------------------------------------------------------------
|
||||
// TODO: give symbolic name to all the RHSes
|
||||
|
||||
var SEPARATOR_NAMES_TO_VALUES = map[string]string{
|
||||
"colon": ":",
|
||||
"comma": ",",
|
||||
"cr": "\\r",
|
||||
"crcr": "\\r\\r",
|
||||
"crlf": "\\r\\n",
|
||||
"crlfcrlf": "\\r\\n\\r\\n",
|
||||
"equals": "=",
|
||||
"lf": "\\n",
|
||||
"lflf": "\\n\\n",
|
||||
"newline": "\\n",
|
||||
"pipe": "|",
|
||||
"semicolon": ";",
|
||||
"slash": "/",
|
||||
"space": " ",
|
||||
"tab": "\\t",
|
||||
|
||||
"ascii_null": "\\x01",
|
||||
"ascii_soh": "\\x02",
|
||||
"ascii_stx": "\\x03",
|
||||
"ascii_etx": "\\x04",
|
||||
|
||||
"ascii_esc": "\\x1b",
|
||||
"ascii_fs": "\\x1c",
|
||||
"ascii_gs": "\\x1d",
|
||||
"ascii_rs": "\\x1e",
|
||||
"ascii_us": "\\x1f",
|
||||
}
|
||||
|
||||
// SeparatorFromArg is for letting people do things like `--ifs pipe`
|
||||
// rather than `--ifs '|'`.
|
||||
func SeparatorFromArg(name string) string {
|
||||
sep, ok := SEPARATOR_NAMES_TO_VALUES[name]
|
||||
if !ok {
|
||||
// "\001" -> control-A, etc.
|
||||
return lib.UnbackslashStringLiteral(name)
|
||||
if ok {
|
||||
return sep
|
||||
} else {
|
||||
return name
|
||||
}
|
||||
return sep
|
||||
}
|
||||
|
|
|
|||
|
|
@ -15,79 +15,13 @@ import (
|
|||
"mlr/src/lib"
|
||||
)
|
||||
|
||||
const ASV_FS = "\x1f"
|
||||
const ASV_RS = "\x1e"
|
||||
const USV_FS = "\xe2\x90\x9f"
|
||||
const USV_RS = "\xe2\x90\x9e"
|
||||
|
||||
const ASV_FS_FOR_HELP = "\\x1f"
|
||||
const ASV_RS_FOR_HELP = "\\x1e"
|
||||
const USV_FS_FOR_HELP = "U+241F (UTF-8 \\xe2\\x90\\x9f)"
|
||||
const USV_RS_FOR_HELP = "U+241E (UTF-8 \\xe2\\x90\\x9e)"
|
||||
const DEFAULT_JSON_FLATTEN_SEPARATOR = "."
|
||||
|
||||
// ----------------------------------------------------------------
|
||||
// TODO: move these to their own file
|
||||
|
||||
// E.g. if IFS isn't specified, it's space for NIDX and comma for DKVP, etc.
|
||||
|
||||
var defaultFSes = map[string]string{
|
||||
// "gen" : // TODO
|
||||
"csv": ",",
|
||||
"csvlite": ",",
|
||||
"dkvp": ",",
|
||||
"json": "N/A", // not honored; not parameterizable in JSON format
|
||||
"nidx": " ",
|
||||
"markdown": " ",
|
||||
"pprint": " ",
|
||||
"xtab": "\n", // todo: windows-dependent ...
|
||||
}
|
||||
|
||||
var defaultPSes = map[string]string{
|
||||
"csv": "N/A",
|
||||
"csvlite": "N/A",
|
||||
"dkvp": "=",
|
||||
"json": "N/A", // not honored; not parameterizable in JSON format
|
||||
"markdown": "N/A",
|
||||
"nidx": "N/A",
|
||||
"pprint": "N/A",
|
||||
"xtab": " ", // todo: windows-dependent ...
|
||||
}
|
||||
|
||||
var defaultRSes = map[string]string{
|
||||
"csv": "\n",
|
||||
"csvlite": "\n",
|
||||
"dkvp": "\n",
|
||||
"json": "N/A", // not honored; not parameterizable in JSON format
|
||||
"markdown": "\n",
|
||||
"nidx": "\n",
|
||||
"pprint": "\n",
|
||||
"xtab": "\n\n", // todo: maybe jettison the idea of this being alterable
|
||||
}
|
||||
|
||||
var defaultAllowRepeatIFSes = map[string]bool{
|
||||
"csv": false,
|
||||
"csvlite": false,
|
||||
"dkvp": false,
|
||||
"json": false,
|
||||
"markdown": false,
|
||||
"nidx": false,
|
||||
"pprint": true,
|
||||
"xtab": false,
|
||||
}
|
||||
|
||||
var defaultAllowRepeatIPSes = map[string]bool{
|
||||
"csv": false,
|
||||
"csvlite": false,
|
||||
"dkvp": false,
|
||||
"json": false,
|
||||
"markdown": false,
|
||||
"nidx": false,
|
||||
"pprint": false,
|
||||
"xtab": true,
|
||||
}
|
||||
|
||||
func ApplyReaderOptionDefaults(readerOptions *TReaderOptions) {
|
||||
// FinalizeReaderOptions does a few things. One is if a file format was
|
||||
// specified but one or more separators were not, a default specific to that
|
||||
// file format is applied. The second is computing regexes for IPS and IFS, and
|
||||
// unbackslashing IRS. This is because the '\n' at the command line which is
|
||||
// Go "\\n" (a backslash and an n) needs to become the single newline
|
||||
// character, and likewise for "\t", etc.
|
||||
func FinalizeReaderOptions(readerOptions *TReaderOptions) {
|
||||
if !readerOptions.IFSWasSpecified {
|
||||
readerOptions.IFS = defaultFSes[readerOptions.InputFileFormat]
|
||||
}
|
||||
|
|
@ -98,14 +32,37 @@ func ApplyReaderOptionDefaults(readerOptions *TReaderOptions) {
|
|||
readerOptions.IRS = defaultRSes[readerOptions.InputFileFormat]
|
||||
}
|
||||
if !readerOptions.AllowRepeatIFSWasSpecified {
|
||||
readerOptions.AllowRepeatIFS = defaultAllowRepeatIFSes[readerOptions.InputFileFormat]
|
||||
// Special case for Miller 6 upgrade -- now that we have regexing for mixes of tabs
|
||||
// and spaces, that should now be the default for NIDX. But *only* for NIDX format,
|
||||
// and if IFS wasn't specified.
|
||||
if readerOptions.InputFileFormat == "nidx" && !readerOptions.IFSWasSpecified {
|
||||
readerOptions.IFS = WHITESPACE
|
||||
} else {
|
||||
readerOptions.AllowRepeatIFS = defaultAllowRepeatIFSes[readerOptions.InputFileFormat]
|
||||
}
|
||||
}
|
||||
if !readerOptions.AllowRepeatIPSWasSpecified {
|
||||
readerOptions.AllowRepeatIPS = defaultAllowRepeatIPSes[readerOptions.InputFileFormat]
|
||||
}
|
||||
|
||||
if readerOptions.AllowRepeatIFS {
|
||||
readerOptions.IFSRegex = lib.CompileMillerRegexOrDie("(" + readerOptions.IFS + ")+")
|
||||
} else {
|
||||
readerOptions.IFSRegex = lib.CompileMillerRegexOrDie(readerOptions.IFS)
|
||||
}
|
||||
if readerOptions.AllowRepeatIPS {
|
||||
readerOptions.IPSRegex = lib.CompileMillerRegexOrDie("(" + readerOptions.IPS + ")+")
|
||||
} else {
|
||||
readerOptions.IPSRegex = lib.CompileMillerRegexOrDie(readerOptions.IPS)
|
||||
}
|
||||
|
||||
readerOptions.IRS = lib.UnbackslashStringLiteral(readerOptions.IRS)
|
||||
}
|
||||
|
||||
func ApplyWriterOptionDefaults(writerOptions *TWriterOptions) {
|
||||
// FinalizeWriterOptions unbackslashes OPS, OFS, and ORS. This is because
|
||||
// the '\n' at the command line which is Go "\\n" (a backslash and an
|
||||
// n) needs to become the single newline character, and likewise for "\t", etc.
|
||||
func FinalizeWriterOptions(writerOptions *TWriterOptions) {
|
||||
if !writerOptions.OFSWasSpecified {
|
||||
writerOptions.OFS = defaultFSes[writerOptions.OutputFileFormat]
|
||||
}
|
||||
|
|
@ -115,6 +72,10 @@ func ApplyWriterOptionDefaults(writerOptions *TWriterOptions) {
|
|||
if !writerOptions.ORSWasSpecified {
|
||||
writerOptions.ORS = defaultRSes[writerOptions.OutputFileFormat]
|
||||
}
|
||||
|
||||
writerOptions.OFS = lib.UnbackslashStringLiteral(writerOptions.OFS)
|
||||
writerOptions.OPS = lib.UnbackslashStringLiteral(writerOptions.OPS)
|
||||
writerOptions.ORS = lib.UnbackslashStringLiteral(writerOptions.ORS)
|
||||
}
|
||||
|
||||
// ================================================================
|
||||
|
|
@ -211,11 +172,11 @@ Notes about all other separators:
|
|||
fmt.Println()
|
||||
|
||||
// Go doesn't preserve insertion order in its maps so here we are inlining a sort.
|
||||
aliases := lib.GetArrayKeysSorted(SEPARATOR_NAMES_TO_VALUES_FOR_ONLINE_HELP)
|
||||
aliases := lib.GetArrayKeysSorted(SEPARATOR_NAMES_TO_VALUES)
|
||||
for _, alias := range aliases {
|
||||
// Really absurd level of indent needed to get fixed-width font in mkdocs here,
|
||||
// I don't know why. Usually it only takes 4, not 10.
|
||||
fmt.Printf(" %-10s = \"%s\"\n", alias, SEPARATOR_NAMES_TO_VALUES_FOR_ONLINE_HELP[alias])
|
||||
fmt.Printf(" %-10s = \"%s\"\n", alias, SEPARATOR_NAMES_TO_VALUES[alias])
|
||||
}
|
||||
fmt.Println()
|
||||
|
||||
|
|
@ -243,6 +204,16 @@ Notes about all other separators:
|
|||
}
|
||||
}
|
||||
|
||||
func ListSeparatorAliasesForOnlineHelp() {
|
||||
// Go doesn't preserve insertion order in its maps so here we are inlining a sort.
|
||||
aliases := lib.GetArrayKeysSorted(SEPARATOR_NAMES_TO_VALUES)
|
||||
for _, alias := range aliases {
|
||||
// Really absurd level of indent needed to get fixed-width font in mkdocs here,
|
||||
// I don't know why. Usually it only takes 4, not 10.
|
||||
fmt.Printf("%-10s = \"%s\"\n", alias, SEPARATOR_NAMES_TO_VALUES[alias])
|
||||
}
|
||||
}
|
||||
|
||||
func init() { SeparatorFlagSection.Sort() }
|
||||
|
||||
var SeparatorFlagSection = FlagSection{
|
||||
|
|
|
|||
|
|
@ -7,6 +7,8 @@
|
|||
package cli
|
||||
|
||||
import (
|
||||
"regexp"
|
||||
|
||||
"mlr/src/lib"
|
||||
)
|
||||
|
||||
|
|
@ -36,6 +38,8 @@ type TReaderOptions struct {
|
|||
IRS string
|
||||
AllowRepeatIFS bool
|
||||
AllowRepeatIPS bool
|
||||
IFSRegex *regexp.Regexp
|
||||
IPSRegex *regexp.Regexp
|
||||
|
||||
// If unspecified on the command line, these take input-format-dependent
|
||||
// defaults. E.g. default FS is comma for DKVP but space for NIDX;
|
||||
|
|
|
|||
134
go/src/cli/separators.go
Normal file
134
go/src/cli/separators.go
Normal file
|
|
@ -0,0 +1,134 @@
|
|||
package cli
|
||||
|
||||
const COLON = ":"
|
||||
const COMMA = ","
|
||||
const CR = "\\r"
|
||||
const CRCR = "\\r\\r"
|
||||
const CRLF = "\\r\\n"
|
||||
const CRLFCRLF = "\\r\\n\\r\\n"
|
||||
const EQUALS = "="
|
||||
const LF = "\\n"
|
||||
const LFLF = "\\n\\n"
|
||||
const NEWLINE = "\\n"
|
||||
const PIPE = "|"
|
||||
const SEMICOLON = ";"
|
||||
const SLASH = "/"
|
||||
const SPACE = " "
|
||||
const SPACES = "( )+"
|
||||
const TAB = "\\t"
|
||||
const TABS = "(\\t)+"
|
||||
const WHITESPACE = "([ \\t])+"
|
||||
|
||||
const ASCII_ESC = "\\x1b"
|
||||
const ASCII_ETX = "\\x04"
|
||||
const ASCII_FS = "\\x1c"
|
||||
const ASCII_GS = "\\x1d"
|
||||
const ASCII_NULL = "\\x01"
|
||||
const ASCII_RS = "\\x1e"
|
||||
const ASCII_SOH = "\\x02"
|
||||
const ASCII_STX = "\\x03"
|
||||
const ASCII_US = "\\x1f"
|
||||
|
||||
const ASV_FS = "\\x1f"
|
||||
const ASV_RS = "\\x1e"
|
||||
const USV_FS = "\\xe2\\x90\\x9f"
|
||||
const USV_RS = "\\xe2\\x90\\x9e"
|
||||
|
||||
const ASV_FS_FOR_HELP = "\\x1f"
|
||||
const ASV_RS_FOR_HELP = "\\x1e"
|
||||
const USV_FS_FOR_HELP = "U+241F (UTF-8 \\xe2\\x90\\x9f)"
|
||||
const USV_RS_FOR_HELP = "U+241E (UTF-8 \\xe2\\x90\\x9e)"
|
||||
|
||||
const DEFAULT_JSON_FLATTEN_SEPARATOR = "."
|
||||
|
||||
var SEPARATOR_NAMES_TO_VALUES = map[string]string{
|
||||
"ascii_esc": ASCII_ESC,
|
||||
"ascii_etx": ASCII_ETX,
|
||||
"ascii_fs": ASCII_FS,
|
||||
"ascii_gs": ASCII_GS,
|
||||
"ascii_null": ASCII_NULL,
|
||||
"ascii_rs": ASCII_RS,
|
||||
"ascii_soh": ASCII_SOH,
|
||||
"ascii_stx": ASCII_STX,
|
||||
"ascii_us": ASCII_US,
|
||||
"asv_fs": ASV_FS,
|
||||
"asv_rs": ASV_RS,
|
||||
"colon": COLON,
|
||||
"comma": COMMA,
|
||||
"cr": CR,
|
||||
"crcr": CRCR,
|
||||
"crlf": CRLF,
|
||||
"crlfcrlf": CRLFCRLF,
|
||||
"equals": EQUALS,
|
||||
"lf": LF,
|
||||
"lflf": LFLF,
|
||||
"newline": NEWLINE,
|
||||
"pipe": PIPE,
|
||||
"semicolon": SEMICOLON,
|
||||
"slash": SLASH,
|
||||
"space": SPACE,
|
||||
"spaces": SPACES,
|
||||
"tab": TAB,
|
||||
"tabs": TABS,
|
||||
"usv_fs": USV_FS,
|
||||
"usv_rs": USV_RS,
|
||||
"whitespace": WHITESPACE,
|
||||
}
|
||||
|
||||
// E.g. if IFS isn't specified, it's space for NIDX and comma for DKVP, etc.
|
||||
|
||||
var defaultFSes = map[string]string{
|
||||
// "gen" : // TODO
|
||||
"csv": ",",
|
||||
"csvlite": ",",
|
||||
"dkvp": ",",
|
||||
"json": "N/A", // not alterable; not parameterizable in JSON format
|
||||
"nidx": " ",
|
||||
"markdown": " ",
|
||||
"pprint": " ",
|
||||
"xtab": "\n", // todo: windows-dependent ...
|
||||
}
|
||||
|
||||
var defaultPSes = map[string]string{
|
||||
"csv": "N/A",
|
||||
"csvlite": "N/A",
|
||||
"dkvp": "=",
|
||||
"json": "N/A", // not alterable; not parameterizable in JSON format
|
||||
"markdown": "N/A",
|
||||
"nidx": "N/A",
|
||||
"pprint": "N/A",
|
||||
"xtab": " ", // todo: windows-dependent ...
|
||||
}
|
||||
|
||||
var defaultRSes = map[string]string{
|
||||
"csv": "\n",
|
||||
"csvlite": "\n",
|
||||
"dkvp": "\n",
|
||||
"json": "N/A", // not alterable; not parameterizable in JSON format
|
||||
"markdown": "\n",
|
||||
"nidx": "\n",
|
||||
"pprint": "\n",
|
||||
"xtab": "\n\n", // todo: maybe jettison the idea of this being alterable
|
||||
}
|
||||
|
||||
var defaultAllowRepeatIFSes = map[string]bool{
|
||||
"csv": false,
|
||||
"csvlite": false,
|
||||
"dkvp": false,
|
||||
"json": false,
|
||||
"markdown": false,
|
||||
"nidx": false,
|
||||
"pprint": true,
|
||||
"xtab": false,
|
||||
}
|
||||
|
||||
var defaultAllowRepeatIPSes = map[string]bool{
|
||||
"csv": false,
|
||||
"csvlite": false,
|
||||
"dkvp": false,
|
||||
"json": false,
|
||||
"markdown": false,
|
||||
"nidx": false,
|
||||
"pprint": false,
|
||||
"xtab": true,
|
||||
}
|
||||
|
|
@ -11,13 +11,10 @@ import (
|
|||
"mlr/src/cli"
|
||||
)
|
||||
|
||||
// ----------------------------------------------------------------
|
||||
// * If $MLRRC is set, use it and only it.
|
||||
// * Otherwise try first $HOME/.mlrrc and then ./.mlrrc but let them
|
||||
// stack: e.g. $HOME/.mlrrc is lots of settings and maybe in one
|
||||
// subdir you want to override just a setting or two.
|
||||
|
||||
// TODO: move to separate file?
|
||||
// loadMlrrcOrDie rule: If $MLRRC is set, use it and only it. Otherwise try
|
||||
// first $HOME/.mlrrc and then ./.mlrrc but let them stack: e.g. $HOME/.mlrrc
|
||||
// is lots of settings and maybe in one subdir you want to override just a
|
||||
// setting or two.
|
||||
func loadMlrrcOrDie(
|
||||
options *cli.TOptions,
|
||||
) {
|
||||
|
|
@ -41,6 +38,7 @@ func loadMlrrcOrDie(
|
|||
tryLoadMlrrc(options, "./.mlrrc")
|
||||
}
|
||||
|
||||
// tryLoadMlrrc is a helper function for loadMlrrcOrDie.
|
||||
func tryLoadMlrrc(
|
||||
options *cli.TOptions,
|
||||
path string,
|
||||
|
|
@ -85,6 +83,7 @@ func tryLoadMlrrc(
|
|||
return true
|
||||
}
|
||||
|
||||
// handleMlrrcLine is a helper function for loadMlrrcOrDie.
|
||||
func handleMlrrcLine(
|
||||
options *cli.TOptions,
|
||||
line string,
|
||||
|
|
|
|||
|
|
@ -12,7 +12,8 @@ import (
|
|||
"mlr/src/version"
|
||||
)
|
||||
|
||||
// ----------------------------------------------------------------
|
||||
// ParseCommandLine is the entrypoint for handling the Miller command line:
|
||||
// flags, verbs and their flags, and input file name(s).
|
||||
func ParseCommandLine(args []string) (
|
||||
options cli.TOptions,
|
||||
recordTransformers []transformers.IRecordTransformer,
|
||||
|
|
@ -60,8 +61,8 @@ func ParseCommandLine(args []string) (
|
|||
}
|
||||
}
|
||||
|
||||
cli.ApplyReaderOptionDefaults(&options.ReaderOptions)
|
||||
cli.ApplyWriterOptionDefaults(&options.WriterOptions)
|
||||
cli.FinalizeReaderOptions(&options.ReaderOptions)
|
||||
cli.FinalizeWriterOptions(&options.WriterOptions)
|
||||
|
||||
// Set an optional global formatter for floating-point values
|
||||
if options.WriterOptions.FPOFMT != "" {
|
||||
|
|
@ -120,10 +121,9 @@ func ParseCommandLine(args []string) (
|
|||
return options, recordTransformers, nil
|
||||
}
|
||||
|
||||
// ----------------------------------------------------------------
|
||||
// Returns a list of transformers, from the starting point in args given by *pargi.
|
||||
// Bumps *pargi to point to remaining post-transformer-setup args, i.e. filenames.
|
||||
|
||||
// parseTransformers returns a list of transformers, from the starting point in
|
||||
// args given by *pargi. Bumps *pargi to point to remaining
|
||||
// post-transformer-setup args, i.e. filenames.
|
||||
func parseTransformers(
|
||||
args []string,
|
||||
pargi *int,
|
||||
|
|
|
|||
|
|
@ -33,23 +33,20 @@ import (
|
|||
|
||||
// ----------------------------------------------------------------
|
||||
type RecordReaderCSVLite struct {
|
||||
readerOptions *cli.TReaderOptions
|
||||
emptyStringMlrval types.Mlrval
|
||||
readerOptions *cli.TReaderOptions
|
||||
}
|
||||
|
||||
// ----------------------------------------------------------------
|
||||
func NewRecordReaderCSVLite(readerOptions *cli.TReaderOptions) *RecordReaderCSVLite {
|
||||
return &RecordReaderCSVLite{
|
||||
readerOptions: readerOptions,
|
||||
emptyStringMlrval: types.MlrvalFromString(""),
|
||||
readerOptions: readerOptions,
|
||||
}
|
||||
}
|
||||
|
||||
// ----------------------------------------------------------------
|
||||
func NewRecordReaderPPRINT(readerOptions *cli.TReaderOptions) *RecordReaderCSVLite {
|
||||
return &RecordReaderCSVLite{
|
||||
readerOptions: readerOptions,
|
||||
emptyStringMlrval: types.MlrvalFromString(""),
|
||||
readerOptions: readerOptions,
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -170,7 +167,7 @@ func (reader *RecordReaderCSVLite) processHandleExplicitCSVHeader(
|
|||
continue
|
||||
}
|
||||
|
||||
fields := lib.SplitString(line, reader.readerOptions.IFS)
|
||||
fields := lib.RegexSplitString(reader.readerOptions.IFSRegex, line, -1)
|
||||
if reader.readerOptions.AllowRepeatIFS {
|
||||
fields = reader.stripEmpties(fields)
|
||||
}
|
||||
|
|
@ -216,7 +213,7 @@ func (reader *RecordReaderCSVLite) processHandleExplicitCSVHeader(
|
|||
if nh > nd {
|
||||
// if header longer than data: use "" values
|
||||
for i = nd; i < nh; i++ {
|
||||
record.PutCopy(headerStrings[i], &reader.emptyStringMlrval)
|
||||
record.PutCopy(headerStrings[i], types.MLRVAL_VOID)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -279,7 +276,7 @@ func (reader *RecordReaderCSVLite) processHandleImplicitCSVHeader(
|
|||
continue
|
||||
}
|
||||
|
||||
fields := lib.SplitString(line, reader.readerOptions.IFS)
|
||||
fields := lib.RegexSplitString(reader.readerOptions.IFSRegex, line, -1)
|
||||
if reader.readerOptions.AllowRepeatIFS {
|
||||
fields = reader.stripEmpties(fields)
|
||||
}
|
||||
|
|
@ -327,7 +324,7 @@ func (reader *RecordReaderCSVLite) processHandleImplicitCSVHeader(
|
|||
if nh > nd {
|
||||
// if header longer than data: use "" values
|
||||
for i = nd; i < nh; i++ {
|
||||
record.PutCopy(headerStrings[i], &reader.emptyStringMlrval)
|
||||
record.PutCopy(headerStrings[i], types.MLRVAL_VOID)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -13,7 +13,6 @@ import (
|
|||
|
||||
type RecordReaderDKVP struct {
|
||||
readerOptions *cli.TReaderOptions
|
||||
// TODO: parameterize IRS
|
||||
}
|
||||
|
||||
func NewRecordReaderDKVP(readerOptions *cli.TReaderOptions) *RecordReaderDKVP {
|
||||
|
|
@ -99,7 +98,7 @@ func (reader *RecordReaderDKVP) processHandle(
|
|||
// xxx temp pending autodetect, and pending more windows-port work
|
||||
line = strings.TrimRight(line, "\r")
|
||||
|
||||
record := reader.recordFromDKVPLine(&line)
|
||||
record := reader.recordFromDKVPLine(line)
|
||||
context.UpdateForInputRecord()
|
||||
inputChannel <- types.NewRecordAndContext(
|
||||
record,
|
||||
|
|
@ -110,12 +109,13 @@ func (reader *RecordReaderDKVP) processHandle(
|
|||
|
||||
// ----------------------------------------------------------------
|
||||
func (reader *RecordReaderDKVP) recordFromDKVPLine(
|
||||
line *string,
|
||||
line string,
|
||||
) *types.Mlrmap {
|
||||
record := types.NewMlrmap()
|
||||
pairs := lib.SplitString(*line, reader.readerOptions.IFS)
|
||||
pairs := lib.RegexSplitString(reader.readerOptions.IFSRegex, line, -1)
|
||||
|
||||
for i, pair := range pairs {
|
||||
kv := strings.SplitN(pair, reader.readerOptions.IPS, 2)
|
||||
kv := lib.RegexSplitString(reader.readerOptions.IPSRegex, pair, 2)
|
||||
// TODO check length 0. also, check input is empty since "".split() -> [""] not []
|
||||
if len(kv) == 1 {
|
||||
// E.g the pair has no equals sign: "a" rather than "a=1" or
|
||||
|
|
|
|||
|
|
@ -12,7 +12,6 @@ import (
|
|||
)
|
||||
|
||||
type RecordReaderNIDX struct {
|
||||
// TODO: use the parameterization for readerOptions.IFS/readerOptions.IPS
|
||||
readerOptions *cli.TReaderOptions
|
||||
}
|
||||
|
||||
|
|
@ -100,7 +99,7 @@ func (reader *RecordReaderNIDX) processHandle(
|
|||
line = strings.TrimRight(line, "\n")
|
||||
line = strings.TrimRight(line, "\r")
|
||||
|
||||
record := recordFromNIDXLine(line, reader.readerOptions.IFS)
|
||||
record := reader.recordFromNIDXLine(line)
|
||||
|
||||
context.UpdateForInputRecord()
|
||||
inputChannel <- types.NewRecordAndContext(
|
||||
|
|
@ -111,12 +110,11 @@ func (reader *RecordReaderNIDX) processHandle(
|
|||
}
|
||||
|
||||
// ----------------------------------------------------------------
|
||||
func recordFromNIDXLine(
|
||||
func (reader *RecordReaderNIDX) recordFromNIDXLine(
|
||||
line string,
|
||||
ifs string,
|
||||
) *types.Mlrmap {
|
||||
record := types.NewMlrmap()
|
||||
values := lib.SplitString(line, ifs) // TODO: repifs ...
|
||||
values := lib.RegexSplitString(reader.readerOptions.IFSRegex, line, -1)
|
||||
var i int = 0
|
||||
for _, value := range values {
|
||||
i++
|
||||
|
|
|
|||
|
|
@ -5,7 +5,6 @@ import (
|
|||
"container/list"
|
||||
"errors"
|
||||
"io"
|
||||
"regexp"
|
||||
"strings"
|
||||
|
||||
"mlr/src/cli"
|
||||
|
|
@ -15,21 +14,13 @@ import (
|
|||
|
||||
type RecordReaderXTAB struct {
|
||||
readerOptions *cli.TReaderOptions
|
||||
ifsRegex *regexp.Regexp
|
||||
// TODO: parameterize IRS
|
||||
|
||||
// TODO: port from C
|
||||
// int allow_repeat_ips;
|
||||
// int do_auto_line_term;
|
||||
// int at_eof;
|
||||
}
|
||||
|
||||
// ----------------------------------------------------------------
|
||||
func NewRecordReaderXTAB(readerOptions *cli.TReaderOptions) *RecordReaderXTAB {
|
||||
return &RecordReaderXTAB{
|
||||
readerOptions: readerOptions,
|
||||
// TODO: incorporate IFS
|
||||
ifsRegex: regexp.MustCompile("\\s+"),
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -154,8 +145,7 @@ func (reader *RecordReaderXTAB) recordFromXTABLines(
|
|||
for entry := lines.Front(); entry != nil; entry = entry.Next() {
|
||||
line := entry.Value.(string)
|
||||
|
||||
// TODO -- incorporate IFS
|
||||
kv := reader.ifsRegex.Split(line, 2)
|
||||
kv := lib.RegexSplitString(reader.readerOptions.IPSRegex, line, 2)
|
||||
if len(kv) < 1 {
|
||||
return nil, errors.New("mlr: internal coding error in XTAB reader")
|
||||
}
|
||||
|
|
|
|||
|
|
@ -94,6 +94,17 @@ func CompileMillerRegexOrDie(regexString string) *regexp.Regexp {
|
|||
return regex
|
||||
}
|
||||
|
||||
// In Go as in all languages I'm aware of with a string-split, "a,b,c" splits
|
||||
// on "," to ["a", "b", "c"] and "a" splits to ["a"], both of which are fine --
|
||||
// but "" splits to [""] when I wish it were []. This function does the latter.
|
||||
func RegexSplitString(regex *regexp.Regexp, input string, n int) []string {
|
||||
if input == "" {
|
||||
return make([]string, 0)
|
||||
} else {
|
||||
return regex.Split(input, n)
|
||||
}
|
||||
}
|
||||
|
||||
// MakeEmptyRegexCaptures is for initial CST state at the start of executing
|
||||
// the DSL expression for the current record. Even if '$x =~ "(..)_(...)" set
|
||||
// "\1" and "\2" on the previous record, at start of processing for the current
|
||||
|
|
|
|||
|
|
@ -12,6 +12,7 @@ import (
|
|||
|
||||
type RecordWriterMarkdown struct {
|
||||
writerOptions *cli.TWriterOptions
|
||||
ors string
|
||||
|
||||
numHeaderLinesOutput int
|
||||
lastJoinedHeader string
|
||||
|
|
|
|||
|
|
@ -10,6 +10,8 @@ import (
|
|||
|
||||
type RecordWriterNIDX struct {
|
||||
writerOptions *cli.TWriterOptions
|
||||
ofs string
|
||||
ors string
|
||||
}
|
||||
|
||||
func NewRecordWriterNIDX(writerOptions *cli.TWriterOptions) *RecordWriterNIDX {
|
||||
|
|
|
|||
|
|
@ -67,7 +67,7 @@ func (writer *RecordWriterPPRINT) Write(
|
|||
)
|
||||
if nonEmpty {
|
||||
// Print a newline
|
||||
ostream.Write([]byte("\n"))
|
||||
ostream.Write([]byte(writer.writerOptions.ORS))
|
||||
}
|
||||
// Start a new batch
|
||||
writer.batch = list.New()
|
||||
|
|
@ -166,13 +166,13 @@ func (writer *RecordWriterPPRINT) writeHeterogenousListNonBarred(
|
|||
buffer.WriteString(colorizer.MaybeColorizeKey(formatted, outputIsStdout))
|
||||
} else {
|
||||
buffer.WriteString(colorizer.MaybeColorizeKey(pe.Key, outputIsStdout))
|
||||
buffer.WriteString("\n") // TODO: ORS
|
||||
buffer.WriteString(writer.writerOptions.ORS)
|
||||
}
|
||||
} else {
|
||||
formatted := fmt.Sprintf("%*s ", maxWidths[pe.Key], pe.Key)
|
||||
buffer.WriteString(colorizer.MaybeColorizeKey(formatted, outputIsStdout))
|
||||
if pe.Next == nil {
|
||||
buffer.WriteString("\n") // TODO: ORS
|
||||
buffer.WriteString(writer.writerOptions.ORS)
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -194,13 +194,13 @@ func (writer *RecordWriterPPRINT) writeHeterogenousListNonBarred(
|
|||
buffer.WriteString(colorizer.MaybeColorizeValue(formatted, outputIsStdout))
|
||||
} else {
|
||||
buffer.WriteString(colorizer.MaybeColorizeValue(s, outputIsStdout))
|
||||
buffer.WriteString("\n") // TODO: ORS
|
||||
buffer.WriteString(writer.writerOptions.ORS)
|
||||
}
|
||||
} else {
|
||||
formatted := fmt.Sprintf("%*s ", maxWidths[pe.Key], s)
|
||||
buffer.WriteString(colorizer.MaybeColorizeValue(formatted, outputIsStdout))
|
||||
if pe.Next == nil {
|
||||
buffer.WriteString("\n") // TODO: ORS
|
||||
buffer.WriteString(writer.writerOptions.ORS)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -257,7 +257,7 @@ func (writer *RecordWriterPPRINT) writeHeterogenousListBarred(
|
|||
buffer.WriteString(horizontalMiddle)
|
||||
} else {
|
||||
buffer.WriteString(horizontalEnd)
|
||||
buffer.WriteString("\n") // TOOD: ORS
|
||||
buffer.WriteString(writer.writerOptions.ORS)
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -274,7 +274,7 @@ func (writer *RecordWriterPPRINT) writeHeterogenousListBarred(
|
|||
buffer.WriteString(verticalMiddle)
|
||||
} else {
|
||||
buffer.WriteString(verticalEnd)
|
||||
buffer.WriteString("\n") // TOOD: ORS
|
||||
buffer.WriteString(writer.writerOptions.ORS)
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -285,7 +285,7 @@ func (writer *RecordWriterPPRINT) writeHeterogenousListBarred(
|
|||
buffer.WriteString(horizontalMiddle)
|
||||
} else {
|
||||
buffer.WriteString(horizontalEnd)
|
||||
buffer.WriteString("\n") // TOOD: ORS
|
||||
buffer.WriteString(writer.writerOptions.ORS)
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -309,7 +309,7 @@ func (writer *RecordWriterPPRINT) writeHeterogenousListBarred(
|
|||
buffer.WriteString(fmt.Sprint(verticalMiddle))
|
||||
} else {
|
||||
buffer.WriteString(verticalEnd)
|
||||
buffer.WriteString("\n") // TOOD: ORS
|
||||
buffer.WriteString(writer.writerOptions.ORS)
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -321,7 +321,7 @@ func (writer *RecordWriterPPRINT) writeHeterogenousListBarred(
|
|||
buffer.WriteString(horizontalMiddle)
|
||||
} else {
|
||||
buffer.WriteString(horizontalEnd)
|
||||
buffer.WriteString("\n") // TOOD: ORS
|
||||
buffer.WriteString(writer.writerOptions.ORS)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -46,7 +46,7 @@ func (writer *RecordWriterXTAB) Write(
|
|||
if writer.onFirst {
|
||||
writer.onFirst = false
|
||||
} else {
|
||||
buffer.WriteString("\n")
|
||||
buffer.WriteString("\n") // TODO: ORS
|
||||
}
|
||||
|
||||
for pe := outrec.Head; pe != nil; pe = pe.Next {
|
||||
|
|
@ -56,10 +56,10 @@ func (writer *RecordWriterXTAB) Write(
|
|||
buffer.WriteString(colorizer.MaybeColorizeKey(pe.Key, outputIsStdout))
|
||||
buffer.WriteString(" ")
|
||||
for i := 0; i < padLength; i++ {
|
||||
buffer.WriteString(" ")
|
||||
buffer.WriteString(writer.writerOptions.OPS)
|
||||
}
|
||||
buffer.WriteString(colorizer.MaybeColorizeValue(pe.Value.String(), outputIsStdout))
|
||||
buffer.WriteString("\n")
|
||||
buffer.WriteString("\n") // TODO: ORS
|
||||
}
|
||||
ostream.Write(buffer.Bytes())
|
||||
}
|
||||
|
|
|
|||
|
|
@ -219,7 +219,7 @@ func transformerJoinParseCLI(
|
|||
}
|
||||
}
|
||||
|
||||
cli.ApplyReaderOptionDefaults(&opts.joinFlagOptions.ReaderOptions)
|
||||
cli.FinalizeReaderOptions(&opts.joinFlagOptions.ReaderOptions)
|
||||
|
||||
if opts.leftFileName == "" {
|
||||
fmt.Fprintf(os.Stderr, "%s %s: need left file name\n", "mlr", verb)
|
||||
|
|
|
|||
|
|
@ -249,7 +249,7 @@ func transformerPutOrFilterParseCLI(
|
|||
}
|
||||
}
|
||||
|
||||
cli.ApplyWriterOptionDefaults(&options.WriterOptions)
|
||||
cli.FinalizeWriterOptions(&options.WriterOptions)
|
||||
|
||||
// If they've used either of 'mlr put -f {filename}' or 'mlr put -e
|
||||
// {expression}' then that specifies their DSL expression. But if they've
|
||||
|
|
|
|||
|
|
@ -98,7 +98,7 @@ func transformerTeeParseCLI(
|
|||
}
|
||||
}
|
||||
|
||||
cli.ApplyWriterOptionDefaults(&localOptions.WriterOptions)
|
||||
cli.FinalizeWriterOptions(&localOptions.WriterOptions)
|
||||
|
||||
// Get the filename/command from the command line, after the flags
|
||||
if argi >= argc {
|
||||
|
|
|
|||
|
|
@ -110,6 +110,7 @@ HELP OPTIONS
|
|||
mlr help file-formats
|
||||
Flags:
|
||||
mlr help flags
|
||||
mlr help list-separator-aliases
|
||||
mlr help comments-in-data-flags
|
||||
mlr help compressed-data-flags
|
||||
mlr help csv-only-flags
|
||||
|
|
@ -605,6 +606,17 @@ SEPARATOR FLAGS
|
|||
- C-style escape sequences, e.g. `--rs '\r\n' --fs '\t'`.
|
||||
- To avoid backslashing, you can use any of the following names:
|
||||
|
||||
ascii_esc = "\x1b"
|
||||
ascii_etx = "\x04"
|
||||
ascii_fs = "\x1c"
|
||||
ascii_gs = "\x1d"
|
||||
ascii_null = "\x01"
|
||||
ascii_rs = "\x1e"
|
||||
ascii_soh = "\x02"
|
||||
ascii_stx = "\x03"
|
||||
ascii_us = "\x1f"
|
||||
asv_fs = "\x1f"
|
||||
asv_rs = "\x1e"
|
||||
colon = ":"
|
||||
comma = ","
|
||||
cr = "\r"
|
||||
|
|
@ -619,7 +631,12 @@ SEPARATOR FLAGS
|
|||
semicolon = ";"
|
||||
slash = "/"
|
||||
space = " "
|
||||
spaces = "( )+"
|
||||
tab = "\t"
|
||||
tabs = "(\t)+"
|
||||
usv_fs = "\xe2\x90\x9f"
|
||||
usv_rs = "\xe2\x90\x9e"
|
||||
whitespace = "([ \t])+"
|
||||
|
||||
* Default separators by format:
|
||||
|
||||
|
|
@ -2720,4 +2737,4 @@ SEE ALSO
|
|||
|
||||
|
||||
|
||||
2021-09-20 MILLER(1)
|
||||
2021-09-21 MILLER(1)
|
||||
|
|
|
|||
21
man6/mlr6.1
21
man6/mlr6.1
|
|
@ -2,12 +2,12 @@
|
|||
.\" Title: mlr
|
||||
.\" Author: [see the "AUTHOR" section]
|
||||
.\" Generator: ./mkman.rb
|
||||
.\" Date: 2021-09-20
|
||||
.\" Date: 2021-09-21
|
||||
.\" Manual: \ \&
|
||||
.\" Source: \ \&
|
||||
.\" Language: English
|
||||
.\"
|
||||
.TH "MILLER" "1" "2021-09-20" "\ \&" "\ \&"
|
||||
.TH "MILLER" "1" "2021-09-21" "\ \&" "\ \&"
|
||||
.\" -----------------------------------------------------------------
|
||||
.\" * Portability definitions
|
||||
.\" ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
|
@ -145,6 +145,7 @@ Essentials:
|
|||
mlr help file-formats
|
||||
Flags:
|
||||
mlr help flags
|
||||
mlr help list-separator-aliases
|
||||
mlr help comments-in-data-flags
|
||||
mlr help compressed-data-flags
|
||||
mlr help csv-only-flags
|
||||
|
|
@ -748,6 +749,17 @@ Notes about all other separators:
|
|||
- C-style escape sequences, e.g. `--rs '\er\en' --fs '\et'`.
|
||||
- To avoid backslashing, you can use any of the following names:
|
||||
|
||||
ascii_esc = "\ex1b"
|
||||
ascii_etx = "\ex04"
|
||||
ascii_fs = "\ex1c"
|
||||
ascii_gs = "\ex1d"
|
||||
ascii_null = "\ex01"
|
||||
ascii_rs = "\ex1e"
|
||||
ascii_soh = "\ex02"
|
||||
ascii_stx = "\ex03"
|
||||
ascii_us = "\ex1f"
|
||||
asv_fs = "\ex1f"
|
||||
asv_rs = "\ex1e"
|
||||
colon = ":"
|
||||
comma = ","
|
||||
cr = "\er"
|
||||
|
|
@ -762,7 +774,12 @@ Notes about all other separators:
|
|||
semicolon = ";"
|
||||
slash = "/"
|
||||
space = " "
|
||||
spaces = "( )+"
|
||||
tab = "\et"
|
||||
tabs = "(\et)+"
|
||||
usv_fs = "\exe2\ex90\ex9f"
|
||||
usv_rs = "\exe2\ex90\ex9e"
|
||||
whitespace = "([ \et])+"
|
||||
|
||||
* Default separators by format:
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue