mirror of
https://github.com/johnkerl/miller.git
synced 2026-01-23 02:14:13 +00:00
Make --ifs-regex and --ips-regex explicit command-line flags (#799)
* Function-pointerize IXS/IXSRegex to reduce runtime iffelsing * remove IsRegexString and SuppressIXSRegex * regression tests passing * doc updates
This commit is contained in:
parent
c43a7109d1
commit
096bb9bc12
26 changed files with 417 additions and 245 deletions
|
|
@ -132,6 +132,7 @@ HELP OPTIONS
|
|||
Flags:
|
||||
mlr help flags
|
||||
mlr help list-separator-aliases
|
||||
mlr help list-separator-regex-aliases
|
||||
mlr help comments-in-data-flags
|
||||
mlr help compressed-data-flags
|
||||
mlr help csv-only-flags
|
||||
|
|
@ -709,11 +710,14 @@ SEPARATOR FLAGS
|
|||
semicolon = ";"
|
||||
slash = "/"
|
||||
space = " "
|
||||
spaces = "( )+"
|
||||
tab = "\t"
|
||||
tabs = "(\t)+"
|
||||
usv_fs = "\xe2\x90\x9f"
|
||||
usv_rs = "\xe2\x90\x9e"
|
||||
|
||||
- Similarly, you can use the following for `--ifs-regex` and `--ips-regex`:
|
||||
|
||||
spaces = "( )+"
|
||||
tabs = "(\t)+"
|
||||
whitespace = "([ \t])+"
|
||||
|
||||
* Default separators by format:
|
||||
|
|
@ -730,12 +734,10 @@ SEPARATOR FLAGS
|
|||
|
||||
--fs {string} Specify FS for input and output.
|
||||
--ifs {string} Specify FS for input.
|
||||
--ifs-regex {string} Specify FS for input as a regular expression.
|
||||
--ips {string} Specify PS for input.
|
||||
--ips-regex {string} Specify PS for input as a regular expression.
|
||||
--irs {string} Specify RS for input.
|
||||
--no-ifs-regex Don't treat IFS value as a regular expression. Useful
|
||||
if your IFS is ".".
|
||||
--no-ips-regex Don't treat IPS value as a regular expression. Useful
|
||||
if your IPS is ".".
|
||||
--ofs {string} Specify FS for output.
|
||||
--ops {string} Specify PS for output.
|
||||
--ors {string} Specify RS for output.
|
||||
|
|
@ -1250,7 +1252,6 @@ VERBS
|
|||
--ifs {field-separator character}
|
||||
--ips {pair-separator character}
|
||||
--repifs
|
||||
--repips
|
||||
--implicit-csv-header
|
||||
--no-implicit-csv-header
|
||||
For example, if you have 'mlr --csv ... join -l foo ... ' then the left-file format will
|
||||
|
|
@ -3017,5 +3018,5 @@ SEE ALSO
|
|||
|
||||
|
||||
|
||||
2021-12-23 MILLER(1)
|
||||
2021-12-25 MILLER(1)
|
||||
</pre>
|
||||
|
|
|
|||
|
|
@ -111,6 +111,7 @@ HELP OPTIONS
|
|||
Flags:
|
||||
mlr help flags
|
||||
mlr help list-separator-aliases
|
||||
mlr help list-separator-regex-aliases
|
||||
mlr help comments-in-data-flags
|
||||
mlr help compressed-data-flags
|
||||
mlr help csv-only-flags
|
||||
|
|
@ -688,11 +689,14 @@ SEPARATOR FLAGS
|
|||
semicolon = ";"
|
||||
slash = "/"
|
||||
space = " "
|
||||
spaces = "( )+"
|
||||
tab = "\t"
|
||||
tabs = "(\t)+"
|
||||
usv_fs = "\xe2\x90\x9f"
|
||||
usv_rs = "\xe2\x90\x9e"
|
||||
|
||||
- Similarly, you can use the following for `--ifs-regex` and `--ips-regex`:
|
||||
|
||||
spaces = "( )+"
|
||||
tabs = "(\t)+"
|
||||
whitespace = "([ \t])+"
|
||||
|
||||
* Default separators by format:
|
||||
|
|
@ -709,12 +713,10 @@ SEPARATOR FLAGS
|
|||
|
||||
--fs {string} Specify FS for input and output.
|
||||
--ifs {string} Specify FS for input.
|
||||
--ifs-regex {string} Specify FS for input as a regular expression.
|
||||
--ips {string} Specify PS for input.
|
||||
--ips-regex {string} Specify PS for input as a regular expression.
|
||||
--irs {string} Specify RS for input.
|
||||
--no-ifs-regex Don't treat IFS value as a regular expression. Useful
|
||||
if your IFS is ".".
|
||||
--no-ips-regex Don't treat IPS value as a regular expression. Useful
|
||||
if your IPS is ".".
|
||||
--ofs {string} Specify FS for output.
|
||||
--ops {string} Specify PS for output.
|
||||
--ors {string} Specify RS for output.
|
||||
|
|
@ -1229,7 +1231,6 @@ VERBS
|
|||
--ifs {field-separator character}
|
||||
--ips {pair-separator character}
|
||||
--repifs
|
||||
--repips
|
||||
--implicit-csv-header
|
||||
--no-implicit-csv-header
|
||||
For example, if you have 'mlr --csv ... join -l foo ... ' then the left-file format will
|
||||
|
|
@ -2996,4 +2997,4 @@ SEE ALSO
|
|||
|
||||
|
||||
|
||||
2021-12-23 MILLER(1)
|
||||
2021-12-25 MILLER(1)
|
||||
|
|
|
|||
|
|
@ -50,6 +50,7 @@ Essentials:
|
|||
Flags:
|
||||
mlr help flags
|
||||
mlr help list-separator-aliases
|
||||
mlr help list-separator-regex-aliases
|
||||
mlr help comments-in-data-flags
|
||||
mlr help compressed-data-flags
|
||||
mlr help csv-only-flags
|
||||
|
|
|
|||
|
|
@ -541,11 +541,14 @@ Notes about all other separators:
|
|||
semicolon = ";"
|
||||
slash = "/"
|
||||
space = " "
|
||||
spaces = "( )+"
|
||||
tab = "\t"
|
||||
tabs = "(\t)+"
|
||||
usv_fs = "\xe2\x90\x9f"
|
||||
usv_rs = "\xe2\x90\x9e"
|
||||
|
||||
- Similarly, you can use the following for `--ifs-regex` and `--ips-regex`:
|
||||
|
||||
spaces = "( )+"
|
||||
tabs = "(\t)+"
|
||||
whitespace = "([ \t])+"
|
||||
|
||||
* Default separators by format:
|
||||
|
|
@ -567,14 +570,14 @@ Notes about all other separators:
|
|||
`: Specify FS for input and output.
|
||||
* `--ifs {string}
|
||||
`: Specify FS for input.
|
||||
* `--ifs-regex {string}
|
||||
`: Specify FS for input as a regular expression.
|
||||
* `--ips {string}
|
||||
`: Specify PS for input.
|
||||
* `--ips-regex {string}
|
||||
`: Specify PS for input as a regular expression.
|
||||
* `--irs {string}
|
||||
`: Specify RS for input.
|
||||
* `--no-ifs-regex
|
||||
`: Don't treat IFS value as a regular expression. Useful if your IFS is ".".
|
||||
* `--no-ips-regex
|
||||
`: Don't treat IPS value as a regular expression. Useful if your IPS is ".".
|
||||
* `--ofs {string}
|
||||
`: Specify FS for output.
|
||||
* `--ops {string}
|
||||
|
|
|
|||
|
|
@ -76,6 +76,24 @@ c:3;a:1;b:2
|
|||
c:6;a:4;b:5
|
||||
</pre>
|
||||
|
||||
<pre class="pre-highlight-in-pair">
|
||||
<b>mlr --csv head -n 2 example.csv</b>
|
||||
</pre>
|
||||
<pre class="pre-non-highlight-in-pair">
|
||||
color,shape,flag,k,index,quantity,rate
|
||||
yellow,triangle,true,1,11,43.6498,9.8870
|
||||
red,square,true,2,15,79.2778,0.0130
|
||||
</pre>
|
||||
|
||||
<pre class="pre-highlight-in-pair">
|
||||
<b>mlr --csv --ofs pipe head -n 2 example.csv</b>
|
||||
</pre>
|
||||
<pre class="pre-non-highlight-in-pair">
|
||||
color|shape|flag|k|index|quantity|rate
|
||||
yellow|triangle|true|1|11|43.6498|9.8870
|
||||
red|square|true|2|15|79.2778|0.0130
|
||||
</pre>
|
||||
|
||||
If your data has non-default separators and you don't want to change those
|
||||
between input and output, you can use `--rs`, `--fs`, and `--ps`. Setting `--fs
|
||||
:` is the same as setting `--ifs : --ofs :`, but with fewer keystrokes.
|
||||
|
|
@ -96,9 +114,11 @@ c:3;a:1;b:2
|
|||
c:6;a:4;b:5
|
||||
</pre>
|
||||
|
||||
## Multi-character and regular-expression separators
|
||||
## Multi-character separators
|
||||
|
||||
The separators default to single characters, but can be multiple characters if you like:
|
||||
All separators can be multi-character, except for file formats which don't
|
||||
allow parameterization (see below). And for CSV (CSV-lite doesn't have these
|
||||
restrictions), IRS must be `\n` and IFS must be a single character.
|
||||
|
||||
<pre class="pre-highlight-in-pair">
|
||||
<b>mlr --ifs ';' --ips : --ofs ';;;' --ops := cut -o -f c,a,b data/modsep.dkvp</b>
|
||||
|
|
@ -108,23 +128,13 @@ c:=3;;;a:=1;;;b:=2
|
|||
c:=6;;;a:=4;;;b:=5
|
||||
</pre>
|
||||
|
||||
As of September 2021:
|
||||
|
||||
* `IFS` and `IPS` can be regular expressions.
|
||||
* `IRS` can be multi-character (except for file formats which don't allow parameterization -- see below)
|
||||
* `OFS`, `OPS`, and `ORS` can be multi-character.
|
||||
|
||||
Since `IFS` and `IPS` can be regular expressions, if your data has field
|
||||
separators which are one or more consecutive spaces, you can use `--ifs '(
|
||||
)+'`. But that gets a little tedious, so Miller has the `--repifs` and
|
||||
`--repips` flags you can use if you like. This wraps the `IFS` or `IPS`, say
|
||||
`X`, as `(X)+`.
|
||||
|
||||
The `--repifs` flag means that multiple successive occurrences of the field
|
||||
If your data has field separators which are one or more consecutive spaces, you
|
||||
can use `--ifs space --repifs`.
|
||||
More generally, the `--repifs` flag means that multiple successive occurrences of the field
|
||||
separator count as one. For example, in CSV data we often signify nulls by
|
||||
empty strings, e.g. `2,9,,,,,6,5,4`. On the other hand, if the field separator
|
||||
is a space, it might be more natural to parse `2 4    5` the same as `2 4 5`:
|
||||
`--repifs --ifs ' '` lets this happen. In fact, the `--ipprint` option above
|
||||
`--repifs --ifs ' '` lets this happen. In fact, the `--ipprint` option
|
||||
is internally implemented in terms of `--repifs`.
|
||||
|
||||
For example:
|
||||
|
|
@ -158,6 +168,15 @@ early light what so
|
|||
4 so
|
||||
</pre>
|
||||
|
||||
## Regular-expression separators
|
||||
|
||||
`IFS` and `IPS` can be regular expressions: use `--ifs-regex` or `--ips-regex` in place of
|
||||
`--ifs` or `--ips`, respectively.
|
||||
|
||||
You can also use either `--ifs space --repifs` or `--ifs-regex '( )+'`. (But that gets a little tedious,
|
||||
so there are aliases listed below.) Note however that `--ifs space --repifs` is about 3x faster than
|
||||
`--ifs-regex '( )+'` -- regular expressions are powerful, but slower.
|
||||
|
||||
## Aliases
|
||||
|
||||
Many things we'd like to write as separators need to be escaped from the shell
|
||||
|
|
@ -192,16 +211,25 @@ pipe = "|"
|
|||
semicolon = ";"
|
||||
slash = "/"
|
||||
space = " "
|
||||
spaces = "( )+"
|
||||
tab = "\t"
|
||||
tabs = "(\t)+"
|
||||
usv_fs = "\xe2\x90\x9f"
|
||||
usv_rs = "\xe2\x90\x9e"
|
||||
</pre>
|
||||
|
||||
And for `--ifs-regex` and `--ips-regex`:
|
||||
|
||||
<pre class="pre-highlight-in-pair">
|
||||
<b>mlr help list-separator-regex-aliases</b>
|
||||
</pre>
|
||||
<pre class="pre-non-highlight-in-pair">
|
||||
spaces = "( )+"
|
||||
tabs = "(\t)+"
|
||||
whitespace = "([ \t])+"
|
||||
</pre>
|
||||
|
||||
Note that `spaces`, `tabs`, and `whitespace` already are regexes so you
|
||||
shouldn't use `--repifs` with them.
|
||||
shouldn't use `--repifs` with them. (In fact, the `--repifs` flag is ignored
|
||||
when `--ifs-regex` is provided.)
|
||||
|
||||
## Command-line flags
|
||||
|
||||
|
|
@ -209,8 +237,8 @@ Given the above, we now have seen the following flags:
|
|||
|
||||
<pre class="pre-non-highlight-non-pair">
|
||||
--rs --irs --ors
|
||||
--fs --ifs --ofs --repifs
|
||||
--ps --ips --ops
|
||||
--fs --ifs --ofs --repifs --ifs-regex
|
||||
--ps --ips --ops --ips-regex
|
||||
</pre>
|
||||
|
||||
See also the [separator-flags section](reference-main-flag-list.md#separator-flags).
|
||||
|
|
|
|||
|
|
@ -48,6 +48,14 @@ GENMD-RUN-COMMAND
|
|||
mlr --ifs , --ofs ';' --ips = --ops : cut -o -f c,a,b data/a.dkvp
|
||||
GENMD-EOF
|
||||
|
||||
GENMD-RUN-COMMAND
|
||||
mlr --csv head -n 2 example.csv
|
||||
GENMD-EOF
|
||||
|
||||
GENMD-RUN-COMMAND
|
||||
mlr --csv --ofs pipe head -n 2 example.csv
|
||||
GENMD-EOF
|
||||
|
||||
If your data has non-default separators and you don't want to change those
|
||||
between input and output, you can use `--rs`, `--fs`, and `--ps`. Setting `--fs
|
||||
:` is the same as setting `--ifs : --ofs :`, but with fewer keystrokes.
|
||||
|
|
@ -60,31 +68,23 @@ GENMD-RUN-COMMAND
|
|||
mlr --fs ';' --ps : cut -o -f c,a,b data/modsep.dkvp
|
||||
GENMD-EOF
|
||||
|
||||
## Multi-character and regular-expression separators
|
||||
## Multi-character separators
|
||||
|
||||
The separators default to single characters, but can be multiple characters if you like:
|
||||
All separators can be multi-character, except for file formats which don't
|
||||
allow parameterization (see below). And for CSV (CSV-lite doesn't have these
|
||||
restrictions), IRS must be `\n` and IFS must be a single character.
|
||||
|
||||
GENMD-RUN-COMMAND
|
||||
mlr --ifs ';' --ips : --ofs ';;;' --ops := cut -o -f c,a,b data/modsep.dkvp
|
||||
GENMD-EOF
|
||||
|
||||
As of September 2021:
|
||||
|
||||
* `IFS` and `IPS` can be regular expressions.
|
||||
* `IRS` can be multi-character (except for file formats which don't allow parameterization -- see below)
|
||||
* `OFS`, `OPS`, and `ORS` can be multi-character.
|
||||
|
||||
Since `IFS` and `IPS` can be regular expressions, if your data has field
|
||||
separators which are one or more consecutive spaces, you can use `--ifs '(
|
||||
)+'`. But that gets a little tedious, so Miller has the `--repifs` and
|
||||
`--repips` flags you can use if you like. This wraps the `IFS` or `IPS`, say
|
||||
`X`, as `(X)+`.
|
||||
|
||||
The `--repifs` flag means that multiple successive occurrences of the field
|
||||
If your data has field separators which are one or more consecutive spaces, you
|
||||
can use `--ifs space --repifs`.
|
||||
More generally, the `--repifs` flag means that multiple successive occurrences of the field
|
||||
separator count as one. For example, in CSV data we often signify nulls by
|
||||
empty strings, e.g. `2,9,,,,,6,5,4`. On the other hand, if the field separator
|
||||
is a space, it might be more natural to parse `2 4    5` the same as `2 4 5`:
|
||||
`--repifs --ifs ' '` lets this happen. In fact, the `--ipprint` option above
|
||||
`--repifs --ifs ' '` lets this happen. In fact, the `--ipprint` option
|
||||
is internally implemented in terms of `--repifs`.
|
||||
|
||||
For example:
|
||||
|
|
@ -97,6 +97,15 @@ GENMD-RUN-COMMAND
|
|||
mlr --ifs ' ' --repifs --inidx --oxtab cat data/extra-spaces.txt
|
||||
GENMD-EOF
|
||||
|
||||
## Regular-expression separators
|
||||
|
||||
`IFS` and `IPS` can be regular expressions: use `--ifs-regex` or `--ips-regex` in place of
|
||||
`--ifs` or `--ips`, respectively.
|
||||
|
||||
You can also use either `--ifs space --repifs` or `--ifs-regex '( )+'`. (But that gets a little tedious,
|
||||
so there are aliases listed below.) Note however that `--ifs space --repifs` is about 3x faster than
|
||||
`--ifs-regex '( )+'` -- regular expressions are powerful, but slower.
|
||||
|
||||
## Aliases
|
||||
|
||||
Many things we'd like to write as separators need to be escaped from the shell
|
||||
|
|
@ -106,8 +115,15 @@ GENMD-RUN-COMMAND
|
|||
mlr help list-separator-aliases
|
||||
GENMD-EOF
|
||||
|
||||
And for `--ifs-regex` and `--ips-regex`:
|
||||
|
||||
GENMD-RUN-COMMAND
|
||||
mlr help list-separator-regex-aliases
|
||||
GENMD-EOF
|
||||
|
||||
Note that `spaces`, `tabs`, and `whitespace` already are regexes so you
|
||||
shouldn't use `--repifs` with them.
|
||||
shouldn't use `--repifs` with them. (In fact, the `--repifs` flag is ignored
|
||||
when `--ifs-regex` is provided.)
|
||||
|
||||
## Command-line flags
|
||||
|
||||
|
|
@ -115,8 +131,8 @@ Given the above, we now have seen the following flags:
|
|||
|
||||
GENMD-CARDIFY
|
||||
--rs --irs --ors
|
||||
--fs --ifs --ofs --repifs
|
||||
--ps --ips --ops
|
||||
--fs --ifs --ofs --repifs --ifs-regex
|
||||
--ps --ips --ops --ips-regex
|
||||
GENMD-EOF
|
||||
|
||||
See also the [separator-flags section](reference-main-flag-list.md#separator-flags).
|
||||
|
|
|
|||
|
|
@ -1589,7 +1589,6 @@ the main "mlr --help" for more information on syntax for these arguments:
|
|||
--ifs {field-separator character}
|
||||
--ips {pair-separator character}
|
||||
--repifs
|
||||
--repips
|
||||
--implicit-csv-header
|
||||
--no-implicit-csv-header
|
||||
For example, if you have 'mlr --csv ... join -l foo ... ' then the left-file format will
|
||||
|
|
|
|||
|
|
@ -73,6 +73,7 @@ func init() {
|
|||
handlerInfos: []tHandlerInfo{
|
||||
{name: "flags", zaryHandlerFunc: showFlagHelp},
|
||||
{name: "list-separator-aliases", zaryHandlerFunc: listSeparatorAliases},
|
||||
{name: "list-separator-regex-aliases", zaryHandlerFunc: listSeparatorRegexAliases},
|
||||
// Per-section entries will be computed and installed below
|
||||
},
|
||||
},
|
||||
|
|
@ -315,6 +316,10 @@ func listSeparatorAliases() {
|
|||
cli.ListSeparatorAliasesForOnlineHelp()
|
||||
}
|
||||
|
||||
func listSeparatorRegexAliases() {
|
||||
cli.ListSeparatorRegexAliasesForOnlineHelp()
|
||||
}
|
||||
|
||||
// ----------------------------------------------------------------
|
||||
func helpAuxents() {
|
||||
fmt.Print(`Miller has a few otherwise-standalone executables packaged within it.
|
||||
|
|
|
|||
|
|
@ -25,3 +25,14 @@ func SeparatorFromArg(name string) string {
|
|||
return name
|
||||
}
|
||||
}
|
||||
|
||||
// SeparatorRegexFromArg is for letting people do things like `--ifs-regex whitespace`
|
||||
// rather than `--ifs '([ \t])+'`.
|
||||
func SeparatorRegexFromArg(name string) string {
|
||||
sep, ok := SEPARATOR_REGEX_NAMES_TO_VALUES[name]
|
||||
if ok {
|
||||
return sep
|
||||
} else {
|
||||
return name
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -36,7 +36,11 @@ func FinalizeReaderOptions(readerOptions *TReaderOptions) {
|
|||
readerOptions.IFS = defaultFSes[readerOptions.InputFileFormat]
|
||||
}
|
||||
if !readerOptions.ipsWasSpecified {
|
||||
//if readerOptions.InputFileFormat == "xtab" && !readerOptions.ipsWasSpecified {
|
||||
//readerOptions.IPSRegex = lib.CompileMillerRegexOrDie(WHITESPACE_REGEX)
|
||||
//} else {
|
||||
readerOptions.IPS = defaultPSes[readerOptions.InputFileFormat]
|
||||
//}
|
||||
}
|
||||
if !readerOptions.irsWasSpecified {
|
||||
readerOptions.IRS = defaultRSes[readerOptions.InputFileFormat]
|
||||
|
|
@ -46,40 +50,14 @@ func FinalizeReaderOptions(readerOptions *TReaderOptions) {
|
|||
// and spaces, that should now be the default for NIDX. But *only* for NIDX format,
|
||||
// and if IFS wasn't specified.
|
||||
if readerOptions.InputFileFormat == "nidx" && !readerOptions.ifsWasSpecified {
|
||||
readerOptions.IFS = WHITESPACE
|
||||
readerOptions.IFSRegex = lib.CompileMillerRegexOrDie(WHITESPACE_REGEX)
|
||||
} else {
|
||||
readerOptions.AllowRepeatIFS = defaultAllowRepeatIFSes[readerOptions.InputFileFormat]
|
||||
}
|
||||
}
|
||||
if !readerOptions.allowRepeatIPSWasSpecified {
|
||||
readerOptions.AllowRepeatIPS = defaultAllowRepeatIPSes[readerOptions.InputFileFormat]
|
||||
}
|
||||
|
||||
if readerOptions.SuppressIFSRegexing {
|
||||
readerOptions.IFSRegex = nil
|
||||
} else if readerOptions.AllowRepeatIFS {
|
||||
readerOptions.IFSRegex = lib.CompileMillerRegexOrDie("(" + readerOptions.IFS + ")+")
|
||||
} else if !lib.IsRegexString(readerOptions.IFS) {
|
||||
// Using regex-splitting on IFS/IPS in record-readers that support it is a HUGE perf hit (almost 2x).
|
||||
// Don't use it unless these are actually value-adding regexes.
|
||||
readerOptions.IFSRegex = nil
|
||||
} else {
|
||||
readerOptions.IFSRegex = lib.CompileMillerRegexOrDie(readerOptions.IFS)
|
||||
}
|
||||
|
||||
if readerOptions.SuppressIPSRegexing {
|
||||
readerOptions.IPSRegex = nil
|
||||
} else if readerOptions.AllowRepeatIPS {
|
||||
readerOptions.IPSRegex = lib.CompileMillerRegexOrDie("(" + readerOptions.IPS + ")+")
|
||||
} else if !lib.IsRegexString(readerOptions.IPS) {
|
||||
// Using regex-splitting on IFS/IPS in record-readers that support it
|
||||
// is a HUGE perf hit (almost 2x). Don't use it unless these are
|
||||
// actually value-adding regexes.
|
||||
readerOptions.IPSRegex = nil
|
||||
} else {
|
||||
readerOptions.IPSRegex = lib.CompileMillerRegexOrDie(readerOptions.IPS)
|
||||
}
|
||||
|
||||
readerOptions.IFS = lib.UnbackslashStringLiteral(readerOptions.IFS)
|
||||
readerOptions.IPS = lib.UnbackslashStringLiteral(readerOptions.IPS)
|
||||
readerOptions.IRS = lib.UnbackslashStringLiteral(readerOptions.IRS)
|
||||
}
|
||||
|
||||
|
|
@ -193,6 +171,14 @@ Notes about all other separators:
|
|||
}
|
||||
fmt.Println()
|
||||
|
||||
fmt.Println(" - Similarly, you can use the following for `--ifs-regex` and `--ips-regex`:")
|
||||
fmt.Println()
|
||||
aliases = lib.GetArrayKeysSorted(SEPARATOR_REGEX_NAMES_TO_VALUES)
|
||||
for _, alias := range aliases {
|
||||
fmt.Printf(" %-10s = \"%s\"\n", alias, SEPARATOR_REGEX_NAMES_TO_VALUES[alias])
|
||||
}
|
||||
fmt.Println()
|
||||
|
||||
fmt.Println("* Default separators by format:")
|
||||
fmt.Println()
|
||||
|
||||
|
|
@ -227,6 +213,16 @@ func ListSeparatorAliasesForOnlineHelp() {
|
|||
}
|
||||
}
|
||||
|
||||
func ListSeparatorRegexAliasesForOnlineHelp() {
|
||||
// Go doesn't preserve insertion order in its arrays so here we are inlining a sort.
|
||||
aliases := lib.GetArrayKeysSorted(SEPARATOR_REGEX_NAMES_TO_VALUES)
|
||||
for _, alias := range aliases {
|
||||
// Really absurd level of indent needed to get fixed-with font in mkdocs here,
|
||||
// I don't know why. Usually it only takes 4, not 10.
|
||||
fmt.Printf("%-10s = \"%s\"\n", alias, SEPARATOR_REGEX_NAMES_TO_VALUES[alias])
|
||||
}
|
||||
}
|
||||
|
||||
func init() { SeparatorFlagSection.Sort() }
|
||||
|
||||
var SeparatorFlagSection = FlagSection{
|
||||
|
|
@ -251,6 +247,23 @@ var SeparatorFlagSection = FlagSection{
|
|||
},
|
||||
},
|
||||
|
||||
{
	name: "--ifs-regex",
	arg:  "{string}",
	help: "Specify FS for input as a regular expression.",
	// Consumes the flag and its one argument. The argument is first looked up
	// as a regex alias (e.g. `whitespace`) via SeparatorRegexFromArg, then
	// compiled and stored as the input field-separator regex.
	//
	// Unlike the record-separator flags there is no special "auto" value
	// here: every argument is treated as a regex (or regex alias), matching
	// the behavior of --ips-regex.
	parser: func(args []string, argc int, pargi *int, options *TOptions) {
		CheckArgCount(args, *pargi, argc, 2)
		options.ReaderOptions.IFSRegex = lib.CompileMillerRegexOrDie(SeparatorRegexFromArg(args[*pargi+1]))
		// Marks IFS as user-specified so later option finalization does not
		// overwrite it with a per-format default.
		options.ReaderOptions.ifsWasSpecified = true
		*pargi += 2
	},
},
|
||||
|
||||
{
|
||||
name: "--ips",
|
||||
arg: "{string}",
|
||||
|
|
@ -263,6 +276,18 @@ var SeparatorFlagSection = FlagSection{
|
|||
},
|
||||
},
|
||||
|
||||
{
	name: "--ips-regex",
	arg:  "{string}",
	help: "Specify PS for input as a regular expression.",
	// Consumes the flag and its one argument: the argument is resolved
	// through SeparatorRegexFromArg (so regex aliases are accepted), compiled,
	// and stored as the input pair-separator regex.
	parser: func(args []string, argc int, pargi *int, options *TOptions) {
		CheckArgCount(args, *pargi, argc, 2)
		regexOrAlias := args[*pargi+1]
		compiled := lib.CompileMillerRegexOrDie(SeparatorRegexFromArg(regexOrAlias))
		options.ReaderOptions.IPSRegex = compiled
		options.ReaderOptions.ipsWasSpecified = true
		*pargi += 2
	},
},
|
||||
|
||||
{
|
||||
name: "--irs",
|
||||
arg: "{string}",
|
||||
|
|
@ -382,24 +407,6 @@ var SeparatorFlagSection = FlagSection{
|
|||
*pargi += 2
|
||||
},
|
||||
},
|
||||
|
||||
{
|
||||
name: "--no-ifs-regex",
|
||||
help: `Don't treat IFS value as a regular expression. Useful if your IFS is ".".`,
|
||||
parser: func(args []string, argc int, pargi *int, options *TOptions) {
|
||||
options.ReaderOptions.SuppressIFSRegexing = true
|
||||
*pargi += 1
|
||||
},
|
||||
},
|
||||
|
||||
{
|
||||
name: "--no-ips-regex",
|
||||
help: `Don't treat IPS value as a regular expression. Useful if your IPS is ".".`,
|
||||
parser: func(args []string, argc int, pargi *int, options *TOptions) {
|
||||
options.ReaderOptions.SuppressIPSRegexing = true
|
||||
*pargi += 1
|
||||
},
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -36,17 +36,14 @@ type TGeneratorOptions struct {
|
|||
}
|
||||
|
||||
type TReaderOptions struct {
|
||||
InputFileFormat string
|
||||
IFS string
|
||||
IPS string
|
||||
IRS string
|
||||
AllowRepeatIFS bool
|
||||
AllowRepeatIPS bool
|
||||
IFSRegex *regexp.Regexp
|
||||
IPSRegex *regexp.Regexp
|
||||
SuppressIFSRegexing bool // e.g. if they want to do '--ifs .' since '.' is a regex metacharacter
|
||||
SuppressIPSRegexing bool // e.g. if they want to do '--ips .' since '.' is a regex metacharacter
|
||||
DedupeFieldNames bool
|
||||
InputFileFormat string
|
||||
IFS string
|
||||
IPS string
|
||||
IRS string
|
||||
AllowRepeatIFS bool
|
||||
IFSRegex *regexp.Regexp
|
||||
IPSRegex *regexp.Regexp
|
||||
DedupeFieldNames bool
|
||||
|
||||
// If unspecified on the command line, these take input-format-dependent
|
||||
// defaults. E.g. default FS is comma for DKVP but space for NIDX;
|
||||
|
|
@ -55,7 +52,6 @@ type TReaderOptions struct {
|
|||
ipsWasSpecified bool
|
||||
irsWasSpecified bool
|
||||
allowRepeatIFSWasSpecified bool
|
||||
allowRepeatIPSWasSpecified bool
|
||||
|
||||
UseImplicitCSVHeader bool
|
||||
AllowRaggedCSVInput bool
|
||||
|
|
|
|||
|
|
@ -14,10 +14,11 @@ const PIPE = "|"
|
|||
const SEMICOLON = ";"
|
||||
const SLASH = "/"
|
||||
const SPACE = " "
|
||||
const SPACES = "( )+"
|
||||
const TAB = "\\t"
|
||||
const TABS = "(\\t)+"
|
||||
const WHITESPACE = "([ \\t])+"
|
||||
|
||||
const SPACES_REGEX = "( )+"
|
||||
const TABS_REGEX = "(\\t)+"
|
||||
const WHITESPACE_REGEX = "([ \\t])+"
|
||||
|
||||
const ASCII_ESC = "\\x1b"
|
||||
const ASCII_ETX = "\\x04"
|
||||
|
|
@ -67,12 +68,15 @@ var SEPARATOR_NAMES_TO_VALUES = map[string]string{
|
|||
"semicolon": SEMICOLON,
|
||||
"slash": SLASH,
|
||||
"space": SPACE,
|
||||
"spaces": SPACES,
|
||||
"tab": TAB,
|
||||
"tabs": TABS,
|
||||
"usv_fs": USV_FS,
|
||||
"usv_rs": USV_RS,
|
||||
"whitespace": WHITESPACE,
|
||||
}
|
||||
|
||||
var SEPARATOR_REGEX_NAMES_TO_VALUES = map[string]string{
|
||||
"spaces": SPACES_REGEX,
|
||||
"tabs": TABS_REGEX,
|
||||
"whitespace": WHITESPACE_REGEX,
|
||||
}
|
||||
|
||||
// E.g. if IFS isn't specified, it's space for NIDX and comma for DKVP, etc.
|
||||
|
|
@ -96,7 +100,7 @@ var defaultPSes = map[string]string{
|
|||
"markdown": "N/A",
|
||||
"nidx": "N/A",
|
||||
"pprint": "N/A",
|
||||
"xtab": " ", // todo: windows-dependent ...
|
||||
"xtab": " ",
|
||||
}
|
||||
|
||||
var defaultRSes = map[string]string{
|
||||
|
|
@ -120,14 +124,3 @@ var defaultAllowRepeatIFSes = map[string]bool{
|
|||
"pprint": true,
|
||||
"xtab": false,
|
||||
}
|
||||
|
||||
var defaultAllowRepeatIPSes = map[string]bool{
|
||||
"csv": false,
|
||||
"csvlite": false,
|
||||
"dkvp": false,
|
||||
"json": false,
|
||||
"markdown": false,
|
||||
"nidx": false,
|
||||
"pprint": false,
|
||||
"xtab": true,
|
||||
}
|
||||
|
|
|
|||
|
|
@ -1,10 +1,17 @@
|
|||
// This file contains the interface for file-format-specific record-readers, as
|
||||
// well as a collection of utility functions.
|
||||
|
||||
package input
|
||||
|
||||
import (
|
||||
"bufio"
|
||||
"container/list"
|
||||
"io"
|
||||
"regexp"
|
||||
"strings"
|
||||
|
||||
"github.com/johnkerl/miller/internal/pkg/cli"
|
||||
"github.com/johnkerl/miller/internal/pkg/lib"
|
||||
"github.com/johnkerl/miller/internal/pkg/types"
|
||||
)
|
||||
|
||||
|
|
@ -123,3 +130,68 @@ func channelizedLineScanner(
|
|||
linesChannel <- lines
|
||||
close(linesChannel) // end-of-stream marker
|
||||
}
|
||||
|
||||
// iPairSplitter splits a string into a left and a right piece, e.g. a key and
// a value separated by IPS. Putting the operation behind an interface lets the
// readers share one code path whether IPS is a plain string or a regex.
type iPairSplitter interface {
	Split(input string) []string
}
|
||||
|
||||
func newPairSplitter(options *cli.TReaderOptions) iPairSplitter {
|
||||
if options.IPSRegex == nil {
|
||||
return &tIPSSplitter{ips: options.IPS}
|
||||
} else {
|
||||
return &tIPSRegexSplitter{ipsRegex: options.IPSRegex}
|
||||
}
|
||||
}
|
||||
|
||||
// tIPSSplitter is the pair splitter for a plain-string (non-regex) IPS.
type tIPSSplitter struct {
	ips string
}

// Split cuts input at the first occurrence of the separator, producing at most
// two pieces; any later occurrences of the separator stay in the second piece.
func (splitter *tIPSSplitter) Split(input string) []string {
	const maxPieces = 2
	pieces := strings.SplitN(input, splitter.ips, maxPieces)
	return pieces
}
|
||||
|
||||
type tIPSRegexSplitter struct {
|
||||
ipsRegex *regexp.Regexp
|
||||
}
|
||||
|
||||
func (s *tIPSRegexSplitter) Split(input string) []string {
|
||||
return lib.RegexSplitString(s.ipsRegex, input, 2)
|
||||
}
|
||||
|
||||
// iFieldSplitter splits a string into pieces, e.g. a line into fields
// separated by IFS. Putting the operation behind an interface lets the readers
// share one code path whether IFS is a plain string or a regex.
type iFieldSplitter interface {
	Split(input string) []string
}
|
||||
|
||||
func newFieldSplitter(options *cli.TReaderOptions) iFieldSplitter {
|
||||
if options.IFSRegex == nil {
|
||||
return &tIFSSplitter{ifs: options.IFS, allowRepeatIFS: options.AllowRepeatIFS}
|
||||
} else {
|
||||
return &tIFSRegexSplitter{ifsRegex: options.IFSRegex}
|
||||
}
|
||||
}
|
||||
|
||||
type tIFSSplitter struct {
|
||||
ifs string
|
||||
allowRepeatIFS bool
|
||||
}
|
||||
|
||||
func (s *tIFSSplitter) Split(input string) []string {
|
||||
fields := lib.SplitString(input, s.ifs)
|
||||
if s.allowRepeatIFS {
|
||||
fields = lib.StripEmpties(fields) // left/right trim
|
||||
}
|
||||
return fields
|
||||
}
|
||||
|
||||
type tIFSRegexSplitter struct {
|
||||
ifsRegex *regexp.Regexp
|
||||
}
|
||||
|
||||
func (s *tIFSRegexSplitter) Split(input string) []string {
|
||||
return lib.RegexSplitString(s.ifsRegex, input, -1)
|
||||
}
|
||||
|
|
|
|||
|
|
@ -49,6 +49,7 @@ type RecordReaderCSVLite struct {
|
|||
readerOptions *cli.TReaderOptions
|
||||
recordsPerBatch int // distinct from readerOptions.RecordsPerBatch for join/repl
|
||||
|
||||
fieldSplitter iFieldSplitter
|
||||
recordBatchGetter recordBatchGetterCSV
|
||||
|
||||
inputLineNumber int
|
||||
|
|
@ -62,6 +63,7 @@ func NewRecordReaderCSVLite(
|
|||
reader := &RecordReaderCSVLite{
|
||||
readerOptions: readerOptions,
|
||||
recordsPerBatch: recordsPerBatch,
|
||||
fieldSplitter: newFieldSplitter(readerOptions),
|
||||
}
|
||||
if reader.readerOptions.UseImplicitCSVHeader {
|
||||
reader.recordBatchGetter = getRecordBatchImplicitCSVHeader
|
||||
|
|
@ -78,6 +80,7 @@ func NewRecordReaderPPRINT(
|
|||
reader := &RecordReaderCSVLite{
|
||||
readerOptions: readerOptions,
|
||||
recordsPerBatch: recordsPerBatch,
|
||||
fieldSplitter: newFieldSplitter(readerOptions),
|
||||
}
|
||||
if reader.readerOptions.UseImplicitCSVHeader {
|
||||
reader.recordBatchGetter = getRecordBatchImplicitCSVHeader
|
||||
|
|
@ -218,15 +221,7 @@ func getRecordBatchExplicitCSVHeader(
|
|||
continue
|
||||
}
|
||||
|
||||
var fields []string
|
||||
if reader.readerOptions.IFSRegex == nil { // e.g. --no-ifs-regex
|
||||
fields = lib.SplitString(line, reader.readerOptions.IFS)
|
||||
} else {
|
||||
fields = lib.RegexSplitString(reader.readerOptions.IFSRegex, line, -1)
|
||||
}
|
||||
if reader.readerOptions.AllowRepeatIFS {
|
||||
fields = lib.StripEmpties(fields) // left/right trim
|
||||
}
|
||||
fields := reader.fieldSplitter.Split(line)
|
||||
|
||||
if reader.headerStrings == nil {
|
||||
reader.headerStrings = fields
|
||||
|
|
@ -343,16 +338,7 @@ func getRecordBatchImplicitCSVHeader(
|
|||
continue
|
||||
}
|
||||
|
||||
var fields []string
|
||||
// TODO: function-pointer this
|
||||
if reader.readerOptions.IFSRegex == nil { // e.g. --no-ifs-regex
|
||||
fields = lib.SplitString(line, reader.readerOptions.IFS)
|
||||
} else {
|
||||
fields = lib.RegexSplitString(reader.readerOptions.IFSRegex, line, -1)
|
||||
}
|
||||
if reader.readerOptions.AllowRepeatIFS {
|
||||
fields = lib.StripEmpties(fields) // left/right trim
|
||||
}
|
||||
fields := reader.fieldSplitter.Split(line)
|
||||
|
||||
if reader.headerStrings == nil {
|
||||
n := len(fields)
|
||||
|
|
|
|||
|
|
@ -16,12 +16,14 @@ import (
|
|||
|
||||
// splitter_DKVP_NIDX is a function type for the one bit of code differing
|
||||
// between the DKVP reader and the NIDX reader, namely, how it splits lines.
|
||||
type splitter_DKVP_NIDX func(reader *RecordReaderDKVPNIDX, line string) (*mlrval.Mlrmap, error)
|
||||
type line_splitter_DKVP_NIDX func(reader *RecordReaderDKVPNIDX, line string) (*mlrval.Mlrmap, error)
|
||||
|
||||
type RecordReaderDKVPNIDX struct {
|
||||
readerOptions *cli.TReaderOptions
|
||||
recordsPerBatch int // distinct from readerOptions.RecordsPerBatch for join/repl
|
||||
splitter splitter_DKVP_NIDX
|
||||
lineSplitter line_splitter_DKVP_NIDX
|
||||
fieldSplitter iFieldSplitter
|
||||
pairSplitter iPairSplitter
|
||||
}
|
||||
|
||||
func NewRecordReaderDKVP(
|
||||
|
|
@ -31,7 +33,9 @@ func NewRecordReaderDKVP(
|
|||
return &RecordReaderDKVPNIDX{
|
||||
readerOptions: readerOptions,
|
||||
recordsPerBatch: recordsPerBatch,
|
||||
splitter: recordFromDKVPLine,
|
||||
lineSplitter: recordFromDKVPLine,
|
||||
fieldSplitter: newFieldSplitter(readerOptions),
|
||||
pairSplitter: newPairSplitter(readerOptions),
|
||||
}, nil
|
||||
}
|
||||
|
||||
|
|
@ -42,7 +46,9 @@ func NewRecordReaderNIDX(
|
|||
return &RecordReaderDKVPNIDX{
|
||||
readerOptions: readerOptions,
|
||||
recordsPerBatch: recordsPerBatch,
|
||||
splitter: recordFromNIDXLine,
|
||||
lineSplitter: recordFromNIDXLine,
|
||||
fieldSplitter: newFieldSplitter(readerOptions),
|
||||
pairSplitter: newPairSplitter(readerOptions),
|
||||
}, nil
|
||||
}
|
||||
|
||||
|
|
@ -143,7 +149,7 @@ func (reader *RecordReaderDKVPNIDX) getRecordBatch(
|
|||
}
|
||||
}
|
||||
|
||||
record, err := reader.splitter(reader, line)
|
||||
record, err := reader.lineSplitter(reader, line)
|
||||
if err != nil {
|
||||
errorChannel <- err
|
||||
return
|
||||
|
|
@ -160,24 +166,10 @@ func recordFromDKVPLine(reader *RecordReaderDKVPNIDX, line string) (*mlrval.Mlrm
|
|||
record := mlrval.NewMlrmapAsRecord()
|
||||
dedupeFieldNames := reader.readerOptions.DedupeFieldNames
|
||||
|
||||
var pairs []string
|
||||
// TODO: func-pointer this away
|
||||
if reader.readerOptions.IFSRegex == nil { // e.g. --no-ifs-regex
|
||||
pairs = lib.SplitString(line, reader.readerOptions.IFS)
|
||||
} else {
|
||||
pairs = lib.RegexSplitString(reader.readerOptions.IFSRegex, line, -1)
|
||||
}
|
||||
if reader.readerOptions.AllowRepeatIFS {
|
||||
pairs = lib.StripEmpties(pairs) // left/right trim
|
||||
}
|
||||
pairs := reader.fieldSplitter.Split(line)
|
||||
|
||||
for i, pair := range pairs {
|
||||
var kv []string
|
||||
if reader.readerOptions.IPSRegex == nil { // e.g. --no-ips-regex
|
||||
kv = strings.SplitN(pair, reader.readerOptions.IPS, 2)
|
||||
} else {
|
||||
kv = lib.RegexSplitString(reader.readerOptions.IPSRegex, pair, 2)
|
||||
}
|
||||
kv := reader.pairSplitter.Split(pair)
|
||||
|
||||
if len(kv) == 0 || (len(kv) == 1 && kv[0] == "") {
|
||||
// Ignore. This is expected when splitting with repeated IFS.
|
||||
|
|
@ -206,16 +198,7 @@ func recordFromDKVPLine(reader *RecordReaderDKVPNIDX, line string) (*mlrval.Mlrm
|
|||
func recordFromNIDXLine(reader *RecordReaderDKVPNIDX, line string) (*mlrval.Mlrmap, error) {
|
||||
record := mlrval.NewMlrmapAsRecord()
|
||||
|
||||
var values []string
|
||||
// TODO: func-pointer this away
|
||||
if reader.readerOptions.IFSRegex == nil { // e.g. --no-ifs-regex
|
||||
values = lib.SplitString(line, reader.readerOptions.IFS)
|
||||
} else {
|
||||
values = lib.RegexSplitString(reader.readerOptions.IFSRegex, line, -1)
|
||||
}
|
||||
if reader.readerOptions.AllowRepeatIFS {
|
||||
values = lib.StripEmpties(values) // left/right trim
|
||||
}
|
||||
values := reader.fieldSplitter.Split(line)
|
||||
|
||||
var i int = 0
|
||||
for _, value := range values {
|
||||
|
|
|
|||
|
|
@ -5,6 +5,7 @@ import (
|
|||
"container/list"
|
||||
"errors"
|
||||
"io"
|
||||
"regexp"
|
||||
"strings"
|
||||
|
||||
"github.com/johnkerl/miller/internal/pkg/cli"
|
||||
|
|
@ -13,9 +14,14 @@ import (
|
|||
"github.com/johnkerl/miller/internal/pkg/types"
|
||||
)
|
||||
|
||||
// iXTABPairSplitter is the abstraction the XTAB reader uses to break one input
// line into a key and a value. The implementation is chosen once, when the
// reader is constructed, so that no per-line branching on separator type is
// needed.
type iXTABPairSplitter interface {
|
||||
// Split returns the key and value parsed from input, or a non-nil err if
// the line cannot be split.
Split(input string) (key, value string, err error)
|
||||
}
|
||||
|
||||
// RecordReaderXTAB holds the state for reading XTAB-formatted input: the
// reader options, the batch size, and the key/value splitter applied to each
// line of a stanza.
type RecordReaderXTAB struct {
|
||||
readerOptions *cli.TReaderOptions
|
||||
recordsPerBatch int // distinct from readerOptions.RecordsPerBatch for join/repl
|
||||
// Splits each XTAB line into key and value; built from readerOptions by
// newXTABPairSplitter when the reader is constructed.
pairSplitter iXTABPairSplitter
|
||||
|
||||
// Note: XTAB uses two consecutive IFS in place of an IRS; IRS is ignored
|
||||
}
|
||||
|
|
@ -45,6 +51,7 @@ func NewRecordReaderXTAB(
|
|||
return &RecordReaderXTAB{
|
||||
readerOptions: readerOptions,
|
||||
recordsPerBatch: recordsPerBatch,
|
||||
pairSplitter: newXTABPairSplitter(readerOptions),
|
||||
}, nil
|
||||
}
|
||||
|
||||
|
|
@ -262,31 +269,103 @@ func (reader *RecordReaderXTAB) recordFromXTABLines(
|
|||
for e := stanza.Front(); e != nil; e = e.Next() {
|
||||
line := e.Value.(string)
|
||||
|
||||
var kv []string
|
||||
if reader.readerOptions.IPSRegex == nil { // e.g. --no-ips-regex
|
||||
kv = strings.SplitN(line, reader.readerOptions.IPS, 2)
|
||||
} else {
|
||||
kv = lib.RegexSplitString(reader.readerOptions.IPSRegex, line, 2)
|
||||
}
|
||||
if len(kv) < 1 {
|
||||
return nil, errors.New("mlr: internal coding error in XTAB reader")
|
||||
key, value, err := reader.pairSplitter.Split(line)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
key := kv[0]
|
||||
if len(kv) == 1 {
|
||||
value := mlrval.VOID
|
||||
_, err := record.PutReferenceMaybeDedupe(key, value, dedupeFieldNames)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
} else {
|
||||
value := mlrval.FromDeferredType(kv[1])
|
||||
_, err := record.PutReferenceMaybeDedupe(key, value, dedupeFieldNames)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
_, err = record.PutReferenceMaybeDedupe(key, mlrval.FromDeferredType(value), dedupeFieldNames)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
}
|
||||
|
||||
return record, nil
|
||||
}
|
||||
|
||||
// IPairSplitter splits a string into left and right, e.g. for IPS.
|
||||
// This is similar to the general one for multiple formats; the exception
|
||||
// is that for XTAB we always allow repeat IPS.
|
||||
func newXTABPairSplitter(options *cli.TReaderOptions) iXTABPairSplitter {
|
||||
if options.IPSRegex == nil {
|
||||
return &tXTABIPSSplitter{ips: options.IPS, ipslen: len(options.IPS)}
|
||||
} else {
|
||||
return &tXTABIPSRegexSplitter{ipsRegex: options.IPSRegex}
|
||||
}
|
||||
}
|
||||
|
||||
// tXTABIPSSplitter splits XTAB lines on a literal (non-regex) IPS string.
type tXTABIPSSplitter struct {
	ips    string // the pair separator, e.g. " "
	ipslen int    // len(ips), cached so it is not recomputed per line
}

// Split is a splitter for XTAB lines, like 'abc 123'. It's not quite the same
// as the field/pair-splitter functions shared by DKVP, NIDX, and CSV-lite.
// XTAB is the only format for which we need to produce just a pair of items --
// a key and a value -- delimited by one or more IPS. For example, with IPS
// being a space, in 'abc 123' we need to get key 'abc' and value '123'; for
// 'abc 123 456' we need key 'abc' and value '123 456'.
//
// It's super-elegant to simply regex-split the line like
// 'kv = lib.RegexSplitString(reader.readerOptions.IPSRegex, line, 2)' --
// however, that's 3x slower than the current implementation. It turns out
// regexes are great but we should use them only when we must, since they are
// expensive.
//
// Returns:
//   - key:   the text before the first IPS (empty if the line starts with IPS).
//   - value: the text after the run of IPS following the key (empty if none).
//   - err:   non-nil only when input is empty, which callers are not expected
//     to pass.
func (s *tXTABIPSSplitter) Split(input string) (key, value string, err error) {
	// An empty line is never a valid key/value line; report it as an error.
	if len(input) == 0 {
		return "", "", errors.New("mlr: internal coding error in XTAB reader")
	}

	// An empty IPS cannot delimit anything: the whole line is the key. Without
	// this guard the skip loops below would never advance.
	if s.ips == "" {
		return input, "", nil
	}

	// ' abc 123' splits as key '', value 'abc 123'.
	if strings.HasPrefix(input, s.ips) {
		return "", input[s.skipIPSRun(input, 0):], nil
	}

	// Find the first IPS, if any. If there isn't any in the input line then
	// there is no value, only key: e.g. the line is 'abc'.
	keyEnd := strings.Index(input, s.ips)
	if keyEnd < 0 {
		return input, "", nil
	}

	// Skip the whole run of repeated IPS after the key, one full IPS at a time
	// so that a multi-byte IPS is never split in the middle. If nothing follows
	// the run (e.g. the line is 'abc  ') the value is empty.
	valueStart := s.skipIPSRun(input, keyEnd)
	return input[:keyEnd], input[valueStart:], nil
}

// skipIPSRun returns the offset of the first byte at or after start that does
// not begin another copy of the IPS. The IPS must be non-empty.
func (s *tXTABIPSSplitter) skipIPSRun(input string, start int) int {
	pos := start
	for strings.HasPrefix(input[pos:], s.ips) {
		pos += s.ipslen
	}
	return pos
}
|
||||
|
||||
type tXTABIPSRegexSplitter struct {
|
||||
ipsRegex *regexp.Regexp
|
||||
}
|
||||
|
||||
func (s *tXTABIPSRegexSplitter) Split(input string) (key, value string, err error) {
|
||||
kv := lib.RegexSplitString(s.ipsRegex, input, 2)
|
||||
if len(kv) == 0 {
|
||||
return "", "", errors.New("mlr: internal coding error in XTAB reader")
|
||||
} else if len(kv) == 1 {
|
||||
return kv[0], "", nil
|
||||
} else if len(kv) == 2 {
|
||||
return kv[0], kv[1], nil
|
||||
} else {
|
||||
return "", "", errors.New("mlr: internal coding error in XTAB reader")
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -44,16 +44,6 @@ var captureDetector = regexp.MustCompile("\\\\[0-9]")
|
|||
// "\2:\1" so they don't need to be recomputed on every record.
|
||||
var captureSplitter = regexp.MustCompile("(\\\\[0-9])")
|
||||
|
||||
// IsRegexString supports the IFS/IPS-as-regex feature: it reports whether s
// contains regex metacharacters, i.e. whether quoting it with regexp.QuoteMeta
// would change it. Single-byte strings are never reported as regexes, since
// unfortunately '|' and '.' qualify as "regex metacharacters" yet are common
// literal separators.
// TODO: probably put this entirely under user control, so people can explicitly say '--ifs-regex something'.
func IsRegexString(s string) bool {
	return len(s) != 1 && regexp.QuoteMeta(s) != s
}
|
||||
|
||||
// CompileMillerRegex wraps Go regex-compile with some Miller-specific syntax
|
||||
// which predate the port of Miller from C to Go. Miller regexes use a final
|
||||
// 'i' to indicate case-insensitivity; Go regexes use an initial "(?i)".
|
||||
|
|
|
|||
|
|
@ -115,7 +115,6 @@ func transformerJoinUsage(
|
|||
fmt.Fprintf(o, " --ifs {field-separator character}\n")
|
||||
fmt.Fprintf(o, " --ips {pair-separator character}\n")
|
||||
fmt.Fprintf(o, " --repifs\n")
|
||||
fmt.Fprintf(o, " --repips\n")
|
||||
fmt.Fprintf(o, " --implicit-csv-header\n")
|
||||
fmt.Fprintf(o, " --no-implicit-csv-header\n")
|
||||
fmt.Fprintf(o, "For example, if you have 'mlr --csv ... join -l foo ... ' then the left-file format will\n")
|
||||
|
|
|
|||
|
|
@ -111,6 +111,7 @@ HELP OPTIONS
|
|||
Flags:
|
||||
mlr help flags
|
||||
mlr help list-separator-aliases
|
||||
mlr help list-separator-regex-aliases
|
||||
mlr help comments-in-data-flags
|
||||
mlr help compressed-data-flags
|
||||
mlr help csv-only-flags
|
||||
|
|
@ -688,11 +689,14 @@ SEPARATOR FLAGS
|
|||
semicolon = ";"
|
||||
slash = "/"
|
||||
space = " "
|
||||
spaces = "( )+"
|
||||
tab = "\t"
|
||||
tabs = "(\t)+"
|
||||
usv_fs = "\xe2\x90\x9f"
|
||||
usv_rs = "\xe2\x90\x9e"
|
||||
|
||||
- Similarly, you can use the following for `--ifs-regex` and `--ips-regex`:
|
||||
|
||||
spaces = "( )+"
|
||||
tabs = "(\t)+"
|
||||
whitespace = "([ \t])+"
|
||||
|
||||
* Default separators by format:
|
||||
|
|
@ -709,12 +713,10 @@ SEPARATOR FLAGS
|
|||
|
||||
--fs {string} Specify FS for input and output.
|
||||
--ifs {string} Specify FS for input.
|
||||
--ifs-regex {string} Specify FS for input as a regular expression.
|
||||
--ips {string} Specify PS for input.
|
||||
--ips-regex {string} Specify PS for input as a regular expression.
|
||||
--irs {string} Specify RS for input.
|
||||
--no-ifs-regex Don't treat IFS value as a regular expression. Useful
|
||||
if your IFS is ".".
|
||||
--no-ips-regex Don't treat IPS value as a regular expression. Useful
|
||||
if your IPS is ".".
|
||||
--ofs {string} Specify FS for output.
|
||||
--ops {string} Specify PS for output.
|
||||
--ors {string} Specify RS for output.
|
||||
|
|
@ -1229,7 +1231,6 @@ VERBS
|
|||
--ifs {field-separator character}
|
||||
--ips {pair-separator character}
|
||||
--repifs
|
||||
--repips
|
||||
--implicit-csv-header
|
||||
--no-implicit-csv-header
|
||||
For example, if you have 'mlr --csv ... join -l foo ... ' then the left-file format will
|
||||
|
|
@ -2996,4 +2997,4 @@ SEE ALSO
|
|||
|
||||
|
||||
|
||||
2021-12-23 MILLER(1)
|
||||
2021-12-25 MILLER(1)
|
||||
|
|
|
|||
19
man/mlr.1
19
man/mlr.1
|
|
@ -2,12 +2,12 @@
|
|||
.\" Title: mlr
|
||||
.\" Author: [see the "AUTHOR" section]
|
||||
.\" Generator: ./mkman.rb
|
||||
.\" Date: 2021-12-23
|
||||
.\" Date: 2021-12-25
|
||||
.\" Manual: \ \&
|
||||
.\" Source: \ \&
|
||||
.\" Language: English
|
||||
.\"
|
||||
.TH "MILLER" "1" "2021-12-23" "\ \&" "\ \&"
|
||||
.TH "MILLER" "1" "2021-12-25" "\ \&" "\ \&"
|
||||
.\" -----------------------------------------------------------------
|
||||
.\" * Portability definitions
|
||||
.\" ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
|
@ -146,6 +146,7 @@ Essentials:
|
|||
Flags:
|
||||
mlr help flags
|
||||
mlr help list-separator-aliases
|
||||
mlr help list-separator-regex-aliases
|
||||
mlr help comments-in-data-flags
|
||||
mlr help compressed-data-flags
|
||||
mlr help csv-only-flags
|
||||
|
|
@ -831,11 +832,14 @@ Notes about all other separators:
|
|||
semicolon = ";"
|
||||
slash = "/"
|
||||
space = " "
|
||||
spaces = "( )+"
|
||||
tab = "\et"
|
||||
tabs = "(\et)+"
|
||||
usv_fs = "\exe2\ex90\ex9f"
|
||||
usv_rs = "\exe2\ex90\ex9e"
|
||||
|
||||
- Similarly, you can use the following for `--ifs-regex` and `--ips-regex`:
|
||||
|
||||
spaces = "( )+"
|
||||
tabs = "(\et)+"
|
||||
whitespace = "([ \et])+"
|
||||
|
||||
* Default separators by format:
|
||||
|
|
@ -852,12 +856,10 @@ Notes about all other separators:
|
|||
|
||||
--fs {string} Specify FS for input and output.
|
||||
--ifs {string} Specify FS for input.
|
||||
--ifs-regex {string} Specify FS for input as a regular expression.
|
||||
--ips {string} Specify PS for input.
|
||||
--ips-regex {string} Specify PS for input as a regular expression.
|
||||
--irs {string} Specify RS for input.
|
||||
--no-ifs-regex Don't treat IFS value as a regular expression. Useful
|
||||
if your IFS is ".".
|
||||
--no-ips-regex Don't treat IPS value as a regular expression. Useful
|
||||
if your IPS is ".".
|
||||
--ofs {string} Specify FS for output.
|
||||
--ops {string} Specify PS for output.
|
||||
--ors {string} Specify RS for output.
|
||||
|
|
@ -1554,7 +1556,6 @@ the main "mlr --help" for more information on syntax for these arguments:
|
|||
--ifs {field-separator character}
|
||||
--ips {pair-separator character}
|
||||
--repifs
|
||||
--repips
|
||||
--implicit-csv-header
|
||||
--no-implicit-csv-header
|
||||
For example, if you have 'mlr --csv ... join -l foo ... ' then the left-file format will
|
||||
|
|
|
|||
|
|
@ -455,7 +455,6 @@ the main "mlr --help" for more information on syntax for these arguments:
|
|||
--ifs {field-separator character}
|
||||
--ips {pair-separator character}
|
||||
--repifs
|
||||
--repips
|
||||
--implicit-csv-header
|
||||
--no-implicit-csv-header
|
||||
For example, if you have 'mlr --csv ... join -l foo ... ' then the left-file format will
|
||||
|
|
|
|||
|
|
@ -1 +1 @@
|
|||
mlr --xtab --ips '\.' --ops @ cut -x -f b test/input/dots.xtab
|
||||
mlr --xtab --ips '.' --ops @ cut -x -f b test/input/dots.xtab
|
||||
|
|
|
|||
|
|
@ -1 +1 @@
|
|||
mlr --from ${CASEDIR}/input --n2j --ifs spaces cat
|
||||
mlr --from ${CASEDIR}/input --n2j --ifs-regex spaces cat
|
||||
|
|
|
|||
|
|
@ -1 +1 @@
|
|||
mlr --from ${CASEDIR}/input --n2j --ifs tabs cat
|
||||
mlr --from ${CASEDIR}/input --n2j --ifs-regex tabs cat
|
||||
|
|
|
|||
|
|
@ -1 +1 @@
|
|||
mlr --from ${CASEDIR}/input --n2j --ifs whitespace cat
|
||||
mlr --from ${CASEDIR}/input --n2j --ifs-regex whitespace cat
|
||||
|
|
|
|||
3
todo.txt
3
todo.txt
|
|
@ -2,7 +2,6 @@
|
|||
PUNCHDOWN LIST
|
||||
|
||||
* blockers:
|
||||
! --ifs-regex & --ips-regex -- guessing is not safe, as evidenced by '.' and '|'
|
||||
- allow-repeat-ixs nidx perf mod w/o regex split string ...
|
||||
|
||||
- fractional-strptime
|
||||
|
|
@ -107,6 +106,8 @@ PUNCHDOWN LIST
|
|||
================================================================
|
||||
NON-BLOCKERS
|
||||
|
||||
* xtab splitter UT; nidx too
|
||||
|
||||
* integrate:
|
||||
o https://www.libhunt.com/r/miller
|
||||
o https://repology.org/project/miller/information
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue