diff --git a/docs/src/manpage.md b/docs/src/manpage.md index 27cf9bb97..e6ed7dd25 100644 --- a/docs/src/manpage.md +++ b/docs/src/manpage.md @@ -132,6 +132,7 @@ HELP OPTIONS Flags: mlr help flags mlr help list-separator-aliases + mlr help list-separator-regex-aliases mlr help comments-in-data-flags mlr help compressed-data-flags mlr help csv-only-flags @@ -709,11 +710,14 @@ SEPARATOR FLAGS semicolon = ";" slash = "/" space = " " - spaces = "( )+" tab = "\t" - tabs = "(\t)+" usv_fs = "\xe2\x90\x9f" usv_rs = "\xe2\x90\x9e" + + - Similarly, you can use the following for `--ifs-regex` and `--ips-regex`: + + spaces = "( )+" + tabs = "(\t)+" whitespace = "([ \t])+" * Default separators by format: @@ -730,12 +734,10 @@ SEPARATOR FLAGS --fs {string} Specify FS for input and output. --ifs {string} Specify FS for input. + --ifs-regex {string} Specify FS for input as a regular expression. --ips {string} Specify PS for input. + --ips-regex {string} Specify PS for input as a regular expression. --irs {string} Specify RS for input. - --no-ifs-regex Don't treat IFS value as a regular expression. Useful - if your IFS is ".". - --no-ips-regex Don't treat IPS value as a regular expression. Useful - if your IPS is ".". --ofs {string} Specify FS for output. --ops {string} Specify PS for output. --ors {string} Specify RS for output. @@ -1250,7 +1252,6 @@ VERBS --ifs {field-separator character} --ips {pair-separator character} --repifs - --repips --implicit-csv-header --no-implicit-csv-header For example, if you have 'mlr --csv ... join -l foo ... 
' then the left-file format will @@ -3017,5 +3018,5 @@ SEE ALSO - 2021-12-23 MILLER(1) + 2021-12-25 MILLER(1) diff --git a/docs/src/manpage.txt b/docs/src/manpage.txt index bc31f7d73..77f07daf2 100644 --- a/docs/src/manpage.txt +++ b/docs/src/manpage.txt @@ -111,6 +111,7 @@ HELP OPTIONS Flags: mlr help flags mlr help list-separator-aliases + mlr help list-separator-regex-aliases mlr help comments-in-data-flags mlr help compressed-data-flags mlr help csv-only-flags @@ -688,11 +689,14 @@ SEPARATOR FLAGS semicolon = ";" slash = "/" space = " " - spaces = "( )+" tab = "\t" - tabs = "(\t)+" usv_fs = "\xe2\x90\x9f" usv_rs = "\xe2\x90\x9e" + + - Similarly, you can use the following for `--ifs-regex` and `--ips-regex`: + + spaces = "( )+" + tabs = "(\t)+" whitespace = "([ \t])+" * Default separators by format: @@ -709,12 +713,10 @@ SEPARATOR FLAGS --fs {string} Specify FS for input and output. --ifs {string} Specify FS for input. + --ifs-regex {string} Specify FS for input as a regular expression. --ips {string} Specify PS for input. + --ips-regex {string} Specify PS for input as a regular expression. --irs {string} Specify RS for input. - --no-ifs-regex Don't treat IFS value as a regular expression. Useful - if your IFS is ".". - --no-ips-regex Don't treat IPS value as a regular expression. Useful - if your IPS is ".". --ofs {string} Specify FS for output. --ops {string} Specify PS for output. --ors {string} Specify RS for output. @@ -1229,7 +1231,6 @@ VERBS --ifs {field-separator character} --ips {pair-separator character} --repifs - --repips --implicit-csv-header --no-implicit-csv-header For example, if you have 'mlr --csv ... join -l foo ... 
' then the left-file format will @@ -2996,4 +2997,4 @@ SEE ALSO - 2021-12-23 MILLER(1) + 2021-12-25 MILLER(1) diff --git a/docs/src/online-help.md b/docs/src/online-help.md index 4bea51ab4..d74921e90 100644 --- a/docs/src/online-help.md +++ b/docs/src/online-help.md @@ -50,6 +50,7 @@ Essentials: Flags: mlr help flags mlr help list-separator-aliases + mlr help list-separator-regex-aliases mlr help comments-in-data-flags mlr help compressed-data-flags mlr help csv-only-flags diff --git a/docs/src/reference-main-flag-list.md b/docs/src/reference-main-flag-list.md index b677c2363..124b1f027 100644 --- a/docs/src/reference-main-flag-list.md +++ b/docs/src/reference-main-flag-list.md @@ -541,11 +541,14 @@ Notes about all other separators: semicolon = ";" slash = "/" space = " " - spaces = "( )+" tab = "\t" - tabs = "(\t)+" usv_fs = "\xe2\x90\x9f" usv_rs = "\xe2\x90\x9e" + + - Similarly, you can use the following for `--ifs-regex` and `--ips-regex`: + + spaces = "( )+" + tabs = "(\t)+" whitespace = "([ \t])+" * Default separators by format: @@ -567,14 +570,14 @@ Notes about all other separators: `: Specify FS for input and output. * `--ifs {string} `: Specify FS for input. +* `--ifs-regex {string} +`: Specify FS for input as a regular expression. * `--ips {string} `: Specify PS for input. +* `--ips-regex {string} +`: Specify PS for input as a regular expression. * `--irs {string} `: Specify RS for input. -* `--no-ifs-regex -`: Don't treat IFS value as a regular expression. Useful if your IFS is ".". -* `--no-ips-regex -`: Don't treat IPS value as a regular expression. Useful if your IPS is ".". * `--ofs {string} `: Specify FS for output. * `--ops {string} diff --git a/docs/src/reference-main-separators.md b/docs/src/reference-main-separators.md index e096dfbd5..8b939dbca 100644 --- a/docs/src/reference-main-separators.md +++ b/docs/src/reference-main-separators.md @@ -76,6 +76,24 @@ c:3;a:1;b:2 c:6;a:4;b:5 +
+mlr --csv head -n 2 example.csv
+
+
+color,shape,flag,k,index,quantity,rate
+yellow,triangle,true,1,11,43.6498,9.8870
+red,square,true,2,15,79.2778,0.0130
+
+ +
+mlr --csv --ofs pipe head -n 2 example.csv
+
+
+color|shape|flag|k|index|quantity|rate
+yellow|triangle|true|1|11|43.6498|9.8870
+red|square|true|2|15|79.2778|0.0130
+
+ If your data has non-default separators and you don't want to change those between input and output, you can use `--rs`, `--fs`, and `--ps`. Setting `--fs :` is the same as setting `--ifs : --ofs :`, but with fewer keystrokes. @@ -96,9 +114,11 @@ c:3;a:1;b:2 c:6;a:4;b:5 -## Multi-character and regular-expression separators +## Multi-character separators -The separators default to single characters, but can be multiple characters if you like: +All separators can be multi-character, except for file formats which don't +allow parameterization (see below). And for CSV (CSV-lite doesn't have these +restrictions), IRS must be `\n` and IFS must be a single character.
 mlr --ifs ';' --ips : --ofs ';;;' --ops := cut -o -f c,a,b data/modsep.dkvp
@@ -108,23 +128,13 @@ c:=3;;;a:=1;;;b:=2
 c:=6;;;a:=4;;;b:=5
 
-As of September 2021: - -* `IFS` and `IPS` can be regular expressions. -* `IRS` can be multi-character (except for file formats which don't allow parameterization -- see below) -* `OFS`, `OPS`, and `ORS` can be multi-character. - -Since `IFS` and `IPS` can be regular expressions, if your data has field -separators which are one or more consecutive spaces, you can use `--ifs '( -)+'`. But that gets a little tedious, so Miller has the `--repifs` and -`--repips` flags you can use if you like. This wraps the `IFS` or `IPS`, say -`X`, as `(X)+`. - -The `--repifs` flag means that multiple successive occurrences of the field +If your data has field separators which are one or more consecutive spaces, you +can use `--ifs space --repifs`. +More generally, the `--repifs` flag means that multiple successive occurrences of the field separator count as one. For example, in CSV data we often signify nulls by empty strings, e.g. `2,9,,,,,6,5,4`. On the other hand, if the field separator is a space, it might be more natural to parse `2 4 5` the same as `2 4 5`: -`--repifs --ifs ' '` lets this happen. In fact, the `--ipprint` option above +`--repifs --ifs ' '` lets this happen. In fact, the `--ipprint` option is internally implemented in terms of `--repifs`. For example: @@ -158,6 +168,15 @@ early light what so 4 so +## Regular-expression separators + +`IFS` and `IPS` can be regular expressions: use `--ifs-regex` or `--ips-regex` in place of +`--ifs` or `--ips`, respectively. + +You can also use either `--ifs space --repifs` or `--ifs-regex '( )+'`. (But that gets a little tedious, +so there are aliases listed below.) Note however that `--ifs space --repifs` is about 3x faster than +`--ifs-regex '( )+'` -- regular expressions are powerful, but slower. 
+ ## Aliases Many things we'd like to write as separators need to be escaped from the shell @@ -192,16 +211,25 @@ pipe = "|" semicolon = ";" slash = "/" space = " " -spaces = "( )+" tab = "\t" -tabs = "(\t)+" usv_fs = "\xe2\x90\x9f" usv_rs = "\xe2\x90\x9e" + + +And for `--ifs-regex` and `--ips-regex`: + +
+mlr help list-separator-regex-aliases
+
+
+spaces     = "( )+"
+tabs       = "(\t)+"
 whitespace = "([ \t])+"
 
Note that `spaces`, `tabs`, and `whitespace` already are regexes so you -shouldn't use `--repifs` with them. +shouldn't use `--repifs` with them. (In fact, the `--repifs` flag is ignored +when `--ifs-regex` is provided.) ## Command-line flags @@ -209,8 +237,8 @@ Given the above, we now have seen the following flags:
 --rs --irs --ors
---fs --ifs --ofs --repifs
---ps --ips --ops
+--fs --ifs --ofs --repifs --ifs-regex
+--ps --ips --ops --ips-regex
 
See also the [separator-flags section](reference-main-flag-list.md#separator-flags). diff --git a/docs/src/reference-main-separators.md.in b/docs/src/reference-main-separators.md.in index ce6ec50ff..921b3098c 100644 --- a/docs/src/reference-main-separators.md.in +++ b/docs/src/reference-main-separators.md.in @@ -48,6 +48,14 @@ GENMD-RUN-COMMAND mlr --ifs , --ofs ';' --ips = --ops : cut -o -f c,a,b data/a.dkvp GENMD-EOF +GENMD-RUN-COMMAND +mlr --csv head -n 2 example.csv +GENMD-EOF + +GENMD-RUN-COMMAND +mlr --csv --ofs pipe head -n 2 example.csv +GENMD-EOF + If your data has non-default separators and you don't want to change those between input and output, you can use `--rs`, `--fs`, and `--ps`. Setting `--fs :` is the same as setting `--ifs : --ofs :`, but with fewer keystrokes. @@ -60,31 +68,23 @@ GENMD-RUN-COMMAND mlr --fs ';' --ps : cut -o -f c,a,b data/modsep.dkvp GENMD-EOF -## Multi-character and regular-expression separators +## Multi-character separators -The separators default to single characters, but can be multiple characters if you like: +All separators can be multi-character, except for file formats which don't +allow parameterization (see below). And for CSV (CSV-lite doesn't have these +restrictions), IRS must be `\n` and IFS must be a single character. GENMD-RUN-COMMAND mlr --ifs ';' --ips : --ofs ';;;' --ops := cut -o -f c,a,b data/modsep.dkvp GENMD-EOF -As of September 2021: - -* `IFS` and `IPS` can be regular expressions. -* `IRS` can be multi-character (except for file formats which don't allow parameterization -- see below) -* `OFS`, `OPS`, and `ORS` can be multi-character. - -Since `IFS` and `IPS` can be regular expressions, if your data has field -separators which are one or more consecutive spaces, you can use `--ifs '( -)+'`. But that gets a little tedious, so Miller has the `--repifs` and -`--repips` flags you can use if you like. This wraps the `IFS` or `IPS`, say -`X`, as `(X)+`. 
- -The `--repifs` flag means that multiple successive occurrences of the field +If your data has field separators which are one or more consecutive spaces, you +can use `--ifs space --repifs`. +More generally, the `--repifs` flag means that multiple successive occurrences of the field separator count as one. For example, in CSV data we often signify nulls by empty strings, e.g. `2,9,,,,,6,5,4`. On the other hand, if the field separator is a space, it might be more natural to parse `2 4 5` the same as `2 4 5`: -`--repifs --ifs ' '` lets this happen. In fact, the `--ipprint` option above +`--repifs --ifs ' '` lets this happen. In fact, the `--ipprint` option is internally implemented in terms of `--repifs`. For example: @@ -97,6 +97,15 @@ GENMD-RUN-COMMAND mlr --ifs ' ' --repifs --inidx --oxtab cat data/extra-spaces.txt GENMD-EOF +## Regular-expression separators + +`IFS` and `IPS` can be regular expressions: use `--ifs-regex` or `--ips-regex` in place of +`--ifs` or `--ips`, respectively. + +You can also use either `--ifs space --repifs` or `--ifs-regex '( )+'`. (But that gets a little tedious, +so there are aliases listed below.) Note however that `--ifs space --repifs` is about 3x faster than +`--ifs-regex '( )+'` -- regular expressions are powerful, but slower. + ## Aliases Many things we'd like to write as separators need to be escaped from the shell @@ -106,8 +115,15 @@ GENMD-RUN-COMMAND mlr help list-separator-aliases GENMD-EOF +And for `--ifs-regex` and `--ips-regex`: + +GENMD-RUN-COMMAND +mlr help list-separator-regex-aliases +GENMD-EOF + Note that `spaces`, `tabs`, and `whitespace` already are regexes so you -shouldn't use `--repifs` with them. +shouldn't use `--repifs` with them. (In fact, the `--repifs` flag is ignored +when `--ifs-regex` is provided.) 
## Command-line flags @@ -115,8 +131,8 @@ Given the above, we now have seen the following flags: GENMD-CARDIFY --rs --irs --ors ---fs --ifs --ofs --repifs ---ps --ips --ops +--fs --ifs --ofs --repifs --ifs-regex +--ps --ips --ops --ips-regex GENMD-EOF See also the [separator-flags section](reference-main-flag-list.md#separator-flags). diff --git a/docs/src/reference-verbs.md b/docs/src/reference-verbs.md index 1dcb00762..eecdc27e8 100644 --- a/docs/src/reference-verbs.md +++ b/docs/src/reference-verbs.md @@ -1589,7 +1589,6 @@ the main "mlr --help" for more information on syntax for these arguments: --ifs {field-separator character} --ips {pair-separator character} --repifs - --repips --implicit-csv-header --no-implicit-csv-header For example, if you have 'mlr --csv ... join -l foo ... ' then the left-file format will diff --git a/internal/pkg/auxents/help/entry.go b/internal/pkg/auxents/help/entry.go index c223279e8..24bdd3f5d 100644 --- a/internal/pkg/auxents/help/entry.go +++ b/internal/pkg/auxents/help/entry.go @@ -73,6 +73,7 @@ func init() { handlerInfos: []tHandlerInfo{ {name: "flags", zaryHandlerFunc: showFlagHelp}, {name: "list-separator-aliases", zaryHandlerFunc: listSeparatorAliases}, + {name: "list-separator-regex-aliases", zaryHandlerFunc: listSeparatorRegexAliases}, // Per-section entries will be computed and installed below }, }, @@ -315,6 +316,10 @@ func listSeparatorAliases() { cli.ListSeparatorAliasesForOnlineHelp() } +func listSeparatorRegexAliases() { + cli.ListSeparatorRegexAliasesForOnlineHelp() +} + // ---------------------------------------------------------------- func helpAuxents() { fmt.Print(`Miller has a few otherwise-standalone executables packaged within it. 
diff --git a/internal/pkg/cli/mlrcli_util.go b/internal/pkg/cli/mlrcli_util.go index f0bc1063f..4da61000c 100644 --- a/internal/pkg/cli/mlrcli_util.go +++ b/internal/pkg/cli/mlrcli_util.go @@ -25,3 +25,14 @@ func SeparatorFromArg(name string) string { return name } } + +// SeparatorRegexFromArg is for letting people do things like `--ifs-regex whitespace` +// rather than `--ifs-regex '([ \t])+'`. +func SeparatorRegexFromArg(name string) string { + sep, ok := SEPARATOR_REGEX_NAMES_TO_VALUES[name] + if ok { + return sep + } else { + return name + } +} diff --git a/internal/pkg/cli/option_parse.go b/internal/pkg/cli/option_parse.go index 92abc72d8..b8e637981 100644 --- a/internal/pkg/cli/option_parse.go +++ b/internal/pkg/cli/option_parse.go @@ -36,7 +36,11 @@ func FinalizeReaderOptions(readerOptions *TReaderOptions) { readerOptions.IFS = defaultFSes[readerOptions.InputFileFormat] } if !readerOptions.ipsWasSpecified { + //if readerOptions.InputFileFormat == "xtab" && !readerOptions.ipsWasSpecified { + //readerOptions.IPSRegex = lib.CompileMillerRegexOrDie(WHITESPACE_REGEX) + //} else { readerOptions.IPS = defaultPSes[readerOptions.InputFileFormat] + //} } if !readerOptions.irsWasSpecified { readerOptions.IRS = defaultRSes[readerOptions.InputFileFormat] @@ -46,40 +50,14 @@ func FinalizeReaderOptions(readerOptions *TReaderOptions) { // and spaces, that should now be the default for NIDX. But *only* for NIDX format, // and if IFS wasn't specified. 
if readerOptions.InputFileFormat == "nidx" && !readerOptions.ifsWasSpecified { - readerOptions.IFS = WHITESPACE + readerOptions.IFSRegex = lib.CompileMillerRegexOrDie(WHITESPACE_REGEX) } else { readerOptions.AllowRepeatIFS = defaultAllowRepeatIFSes[readerOptions.InputFileFormat] } } - if !readerOptions.allowRepeatIPSWasSpecified { - readerOptions.AllowRepeatIPS = defaultAllowRepeatIPSes[readerOptions.InputFileFormat] - } - - if readerOptions.SuppressIFSRegexing { - readerOptions.IFSRegex = nil - } else if readerOptions.AllowRepeatIFS { - readerOptions.IFSRegex = lib.CompileMillerRegexOrDie("(" + readerOptions.IFS + ")+") - } else if !lib.IsRegexString(readerOptions.IFS) { - // Using regex-splitting on IFS/IPS in record-readers that support it is a HUGE perf hit (almost 2x). - // Don't use it unless these are actually value-adding regexes. - readerOptions.IFSRegex = nil - } else { - readerOptions.IFSRegex = lib.CompileMillerRegexOrDie(readerOptions.IFS) - } - - if readerOptions.SuppressIPSRegexing { - readerOptions.IPSRegex = nil - } else if readerOptions.AllowRepeatIPS { - readerOptions.IPSRegex = lib.CompileMillerRegexOrDie("(" + readerOptions.IPS + ")+") - } else if !lib.IsRegexString(readerOptions.IPS) { - // Using regex-splitting on IFS/IPS in record-readers that support it - // is a HUGE perf hit (almost 2x). Don't use it unless these are - // actually value-adding regexes. 
- readerOptions.IPSRegex = nil - } else { - readerOptions.IPSRegex = lib.CompileMillerRegexOrDie(readerOptions.IPS) - } + readerOptions.IFS = lib.UnbackslashStringLiteral(readerOptions.IFS) + readerOptions.IPS = lib.UnbackslashStringLiteral(readerOptions.IPS) readerOptions.IRS = lib.UnbackslashStringLiteral(readerOptions.IRS) } @@ -193,6 +171,14 @@ Notes about all other separators: } fmt.Println() + fmt.Println(" - Similarly, you can use the following for `--ifs-regex` and `--ips-regex`:") + fmt.Println() + aliases = lib.GetArrayKeysSorted(SEPARATOR_REGEX_NAMES_TO_VALUES) + for _, alias := range aliases { + fmt.Printf(" %-10s = \"%s\"\n", alias, SEPARATOR_REGEX_NAMES_TO_VALUES[alias]) + } + fmt.Println() + fmt.Println("* Default separators by format:") fmt.Println() @@ -227,6 +213,16 @@ func ListSeparatorAliasesForOnlineHelp() { } } +func ListSeparatorRegexAliasesForOnlineHelp() { + // Go doesn't preserve insertion order in its maps so here we are inlining a sort. + aliases := lib.GetArrayKeysSorted(SEPARATOR_REGEX_NAMES_TO_VALUES) + for _, alias := range aliases { + // Really absurd level of indent needed to get fixed-width font in mkdocs here, + // I don't know why. Usually it only takes 4, not 10. + fmt.Printf("%-10s = \"%s\"\n", alias, SEPARATOR_REGEX_NAMES_TO_VALUES[alias]) + } +} + func init() { SeparatorFlagSection.Sort() } var SeparatorFlagSection = FlagSection{ @@ -251,6 +247,23 @@ var SeparatorFlagSection = FlagSection{ }, }, + { + name: "--ifs-regex", + arg: "{string}", + help: "Specify FS for input as a regular expression.", + parser: func(args []string, argc int, pargi *int, options *TOptions) { + CheckArgCount(args, *pargi, argc, 2) + // Backward compatibility with Miller <= 5. Auto-inference of + // LF vs CR/LF line endings is handled within Go libraries so + // we needn't do anything ourselves. 
+ if args[*pargi+1] != "auto" { + options.ReaderOptions.IFSRegex = lib.CompileMillerRegexOrDie(SeparatorRegexFromArg(args[*pargi+1])) + options.ReaderOptions.ifsWasSpecified = true + } + *pargi += 2 + }, + }, + { name: "--ips", arg: "{string}", @@ -263,6 +276,18 @@ var SeparatorFlagSection = FlagSection{ }, }, + { + name: "--ips-regex", + arg: "{string}", + help: "Specify PS for input as a regular expression.", + parser: func(args []string, argc int, pargi *int, options *TOptions) { + CheckArgCount(args, *pargi, argc, 2) + options.ReaderOptions.IPSRegex = lib.CompileMillerRegexOrDie(SeparatorRegexFromArg(args[*pargi+1])) + options.ReaderOptions.ipsWasSpecified = true + *pargi += 2 + }, + }, + { name: "--irs", arg: "{string}", @@ -382,24 +407,6 @@ var SeparatorFlagSection = FlagSection{ *pargi += 2 }, }, - - { - name: "--no-ifs-regex", - help: `Don't treat IFS value as a regular expression. Useful if your IFS is ".".`, - parser: func(args []string, argc int, pargi *int, options *TOptions) { - options.ReaderOptions.SuppressIFSRegexing = true - *pargi += 1 - }, - }, - - { - name: "--no-ips-regex", - help: `Don't treat IPS value as a regular expression. Useful if your IPS is ".".`, - parser: func(args []string, argc int, pargi *int, options *TOptions) { - options.ReaderOptions.SuppressIPSRegexing = true - *pargi += 1 - }, - }, }, } diff --git a/internal/pkg/cli/option_types.go b/internal/pkg/cli/option_types.go index c008c9553..68c08fc8a 100644 --- a/internal/pkg/cli/option_types.go +++ b/internal/pkg/cli/option_types.go @@ -36,17 +36,14 @@ type TGeneratorOptions struct { } type TReaderOptions struct { - InputFileFormat string - IFS string - IPS string - IRS string - AllowRepeatIFS bool - AllowRepeatIPS bool - IFSRegex *regexp.Regexp - IPSRegex *regexp.Regexp - SuppressIFSRegexing bool // e.g. if they want to do '--ifs .' since '.' is a regex metacharacter - SuppressIPSRegexing bool // e.g. if they want to do '--ips .' since '.' 
is a regex metacharacter - DedupeFieldNames bool + InputFileFormat string + IFS string + IPS string + IRS string + AllowRepeatIFS bool + IFSRegex *regexp.Regexp + IPSRegex *regexp.Regexp + DedupeFieldNames bool // If unspecified on the command line, these take input-format-dependent // defaults. E.g. default FS is comma for DKVP but space for NIDX; @@ -55,7 +52,6 @@ type TReaderOptions struct { ipsWasSpecified bool irsWasSpecified bool allowRepeatIFSWasSpecified bool - allowRepeatIPSWasSpecified bool UseImplicitCSVHeader bool AllowRaggedCSVInput bool diff --git a/internal/pkg/cli/separators.go b/internal/pkg/cli/separators.go index 2a6282fe8..e5e0c385f 100644 --- a/internal/pkg/cli/separators.go +++ b/internal/pkg/cli/separators.go @@ -14,10 +14,11 @@ const PIPE = "|" const SEMICOLON = ";" const SLASH = "/" const SPACE = " " -const SPACES = "( )+" const TAB = "\\t" -const TABS = "(\\t)+" -const WHITESPACE = "([ \\t])+" + +const SPACES_REGEX = "( )+" +const TABS_REGEX = "(\\t)+" +const WHITESPACE_REGEX = "([ \\t])+" const ASCII_ESC = "\\x1b" const ASCII_ETX = "\\x04" @@ -67,12 +68,15 @@ var SEPARATOR_NAMES_TO_VALUES = map[string]string{ "semicolon": SEMICOLON, "slash": SLASH, "space": SPACE, - "spaces": SPACES, "tab": TAB, - "tabs": TABS, "usv_fs": USV_FS, "usv_rs": USV_RS, - "whitespace": WHITESPACE, +} + +var SEPARATOR_REGEX_NAMES_TO_VALUES = map[string]string{ + "spaces": SPACES_REGEX, + "tabs": TABS_REGEX, + "whitespace": WHITESPACE_REGEX, } // E.g. if IFS isn't specified, it's space for NIDX and comma for DKVP, etc. @@ -96,7 +100,7 @@ var defaultPSes = map[string]string{ "markdown": "N/A", "nidx": "N/A", "pprint": "N/A", - "xtab": " ", // todo: windows-dependent ... 
+ "xtab": " ", } var defaultRSes = map[string]string{ @@ -120,14 +124,3 @@ var defaultAllowRepeatIFSes = map[string]bool{ "pprint": true, "xtab": false, } - -var defaultAllowRepeatIPSes = map[string]bool{ - "csv": false, - "csvlite": false, - "dkvp": false, - "json": false, - "markdown": false, - "nidx": false, - "pprint": false, - "xtab": true, -} diff --git a/internal/pkg/input/record_reader.go b/internal/pkg/input/record_reader.go index bf6542314..2228614e0 100644 --- a/internal/pkg/input/record_reader.go +++ b/internal/pkg/input/record_reader.go @@ -1,10 +1,17 @@ +// This file contains the interface for file-format-specific record-readers, as +// well as a collection of utility functions. + package input import ( "bufio" "container/list" "io" + "regexp" + "strings" + "github.com/johnkerl/miller/internal/pkg/cli" + "github.com/johnkerl/miller/internal/pkg/lib" "github.com/johnkerl/miller/internal/pkg/types" ) @@ -123,3 +130,68 @@ func channelizedLineScanner( linesChannel <- lines close(linesChannel) // end-of-stream marker } + +// IPairSplitter splits a string into left and right, e.g. for IPS. +// This helps us reuse code for splitting by IPS string, or IPS regex. +type iPairSplitter interface { + Split(input string) []string +} + +func newPairSplitter(options *cli.TReaderOptions) iPairSplitter { + if options.IPSRegex == nil { + return &tIPSSplitter{ips: options.IPS} + } else { + return &tIPSRegexSplitter{ipsRegex: options.IPSRegex} + } +} + +type tIPSSplitter struct { + ips string +} + +func (s *tIPSSplitter) Split(input string) []string { + return strings.SplitN(input, s.ips, 2) +} + +type tIPSRegexSplitter struct { + ipsRegex *regexp.Regexp +} + +func (s *tIPSRegexSplitter) Split(input string) []string { + return lib.RegexSplitString(s.ipsRegex, input, 2) +} + +// IFieldSplitter splits a string into pieces, e.g. for IFS. +// This helps us reuse code for splitting by IFS string, or IFS regex. 
+type iFieldSplitter interface { + Split(input string) []string +} + +func newFieldSplitter(options *cli.TReaderOptions) iFieldSplitter { + if options.IFSRegex == nil { + return &tIFSSplitter{ifs: options.IFS, allowRepeatIFS: options.AllowRepeatIFS} + } else { + return &tIFSRegexSplitter{ifsRegex: options.IFSRegex} + } +} + +type tIFSSplitter struct { + ifs string + allowRepeatIFS bool +} + +func (s *tIFSSplitter) Split(input string) []string { + fields := lib.SplitString(input, s.ifs) + if s.allowRepeatIFS { + fields = lib.StripEmpties(fields) // left/right trim + } + return fields +} + +type tIFSRegexSplitter struct { + ifsRegex *regexp.Regexp +} + +func (s *tIFSRegexSplitter) Split(input string) []string { + return lib.RegexSplitString(s.ifsRegex, input, -1) +} diff --git a/internal/pkg/input/record_reader_csvlite.go b/internal/pkg/input/record_reader_csvlite.go index 759363386..aaca53239 100644 --- a/internal/pkg/input/record_reader_csvlite.go +++ b/internal/pkg/input/record_reader_csvlite.go @@ -49,6 +49,7 @@ type RecordReaderCSVLite struct { readerOptions *cli.TReaderOptions recordsPerBatch int // distinct from readerOptions.RecordsPerBatch for join/repl + fieldSplitter iFieldSplitter recordBatchGetter recordBatchGetterCSV inputLineNumber int @@ -62,6 +63,7 @@ func NewRecordReaderCSVLite( reader := &RecordReaderCSVLite{ readerOptions: readerOptions, recordsPerBatch: recordsPerBatch, + fieldSplitter: newFieldSplitter(readerOptions), } if reader.readerOptions.UseImplicitCSVHeader { reader.recordBatchGetter = getRecordBatchImplicitCSVHeader @@ -78,6 +80,7 @@ func NewRecordReaderPPRINT( reader := &RecordReaderCSVLite{ readerOptions: readerOptions, recordsPerBatch: recordsPerBatch, + fieldSplitter: newFieldSplitter(readerOptions), } if reader.readerOptions.UseImplicitCSVHeader { reader.recordBatchGetter = getRecordBatchImplicitCSVHeader @@ -218,15 +221,7 @@ func getRecordBatchExplicitCSVHeader( continue } - var fields []string - if reader.readerOptions.IFSRegex == 
nil { // e.g. --no-ifs-regex - fields = lib.SplitString(line, reader.readerOptions.IFS) - } else { - fields = lib.RegexSplitString(reader.readerOptions.IFSRegex, line, -1) - } - if reader.readerOptions.AllowRepeatIFS { - fields = lib.StripEmpties(fields) // left/right trim - } + fields := reader.fieldSplitter.Split(line) if reader.headerStrings == nil { reader.headerStrings = fields @@ -343,16 +338,7 @@ func getRecordBatchImplicitCSVHeader( continue } - var fields []string - // TODO: function-pointer this - if reader.readerOptions.IFSRegex == nil { // e.g. --no-ifs-regex - fields = lib.SplitString(line, reader.readerOptions.IFS) - } else { - fields = lib.RegexSplitString(reader.readerOptions.IFSRegex, line, -1) - } - if reader.readerOptions.AllowRepeatIFS { - fields = lib.StripEmpties(fields) // left/right trim - } + fields := reader.fieldSplitter.Split(line) if reader.headerStrings == nil { n := len(fields) diff --git a/internal/pkg/input/record_reader_dkvp_nidx.go b/internal/pkg/input/record_reader_dkvp_nidx.go index ebcbff987..b1b4cd97c 100644 --- a/internal/pkg/input/record_reader_dkvp_nidx.go +++ b/internal/pkg/input/record_reader_dkvp_nidx.go @@ -16,12 +16,14 @@ import ( // splitter_DKVP_NIDX is a function type for the one bit of code differing // between the DKVP reader and the NIDX reader, namely, how it splits lines. 
-type splitter_DKVP_NIDX func(reader *RecordReaderDKVPNIDX, line string) (*mlrval.Mlrmap, error) +type line_splitter_DKVP_NIDX func(reader *RecordReaderDKVPNIDX, line string) (*mlrval.Mlrmap, error) type RecordReaderDKVPNIDX struct { readerOptions *cli.TReaderOptions recordsPerBatch int // distinct from readerOptions.RecordsPerBatch for join/repl - splitter splitter_DKVP_NIDX + lineSplitter line_splitter_DKVP_NIDX + fieldSplitter iFieldSplitter + pairSplitter iPairSplitter } func NewRecordReaderDKVP( @@ -31,7 +33,9 @@ func NewRecordReaderDKVP( return &RecordReaderDKVPNIDX{ readerOptions: readerOptions, recordsPerBatch: recordsPerBatch, - splitter: recordFromDKVPLine, + lineSplitter: recordFromDKVPLine, + fieldSplitter: newFieldSplitter(readerOptions), + pairSplitter: newPairSplitter(readerOptions), }, nil } @@ -42,7 +46,9 @@ func NewRecordReaderNIDX( return &RecordReaderDKVPNIDX{ readerOptions: readerOptions, recordsPerBatch: recordsPerBatch, - splitter: recordFromNIDXLine, + lineSplitter: recordFromNIDXLine, + fieldSplitter: newFieldSplitter(readerOptions), + pairSplitter: newPairSplitter(readerOptions), }, nil } @@ -143,7 +149,7 @@ func (reader *RecordReaderDKVPNIDX) getRecordBatch( } } - record, err := reader.splitter(reader, line) + record, err := reader.lineSplitter(reader, line) if err != nil { errorChannel <- err return @@ -160,24 +166,10 @@ func recordFromDKVPLine(reader *RecordReaderDKVPNIDX, line string) (*mlrval.Mlrm record := mlrval.NewMlrmapAsRecord() dedupeFieldNames := reader.readerOptions.DedupeFieldNames - var pairs []string - // TODO: func-pointer this away - if reader.readerOptions.IFSRegex == nil { // e.g. 
--no-ifs-regex - pairs = lib.SplitString(line, reader.readerOptions.IFS) - } else { - pairs = lib.RegexSplitString(reader.readerOptions.IFSRegex, line, -1) - } - if reader.readerOptions.AllowRepeatIFS { - pairs = lib.StripEmpties(pairs) // left/right trim - } + pairs := reader.fieldSplitter.Split(line) for i, pair := range pairs { - var kv []string - if reader.readerOptions.IPSRegex == nil { // e.g. --no-ips-regex - kv = strings.SplitN(pair, reader.readerOptions.IPS, 2) - } else { - kv = lib.RegexSplitString(reader.readerOptions.IPSRegex, pair, 2) - } + kv := reader.pairSplitter.Split(pair) if len(kv) == 0 || (len(kv) == 1 && kv[0] == "") { // Ignore. This is expected when splitting with repeated IFS. @@ -206,16 +198,7 @@ func recordFromDKVPLine(reader *RecordReaderDKVPNIDX, line string) (*mlrval.Mlrm func recordFromNIDXLine(reader *RecordReaderDKVPNIDX, line string) (*mlrval.Mlrmap, error) { record := mlrval.NewMlrmapAsRecord() - var values []string - // TODO: func-pointer this away - if reader.readerOptions.IFSRegex == nil { // e.g. 
--no-ifs-regex - values = lib.SplitString(line, reader.readerOptions.IFS) - } else { - values = lib.RegexSplitString(reader.readerOptions.IFSRegex, line, -1) - } - if reader.readerOptions.AllowRepeatIFS { - values = lib.StripEmpties(values) // left/right trim - } + values := reader.fieldSplitter.Split(line) var i int = 0 for _, value := range values { diff --git a/internal/pkg/input/record_reader_xtab.go b/internal/pkg/input/record_reader_xtab.go index 65ceabefb..c2ddc7e68 100644 --- a/internal/pkg/input/record_reader_xtab.go +++ b/internal/pkg/input/record_reader_xtab.go @@ -5,6 +5,7 @@ import ( "container/list" "errors" "io" + "regexp" "strings" "github.com/johnkerl/miller/internal/pkg/cli" @@ -13,9 +14,14 @@ import ( "github.com/johnkerl/miller/internal/pkg/types" ) +type iXTABPairSplitter interface { + Split(input string) (key, value string, err error) +} + type RecordReaderXTAB struct { readerOptions *cli.TReaderOptions recordsPerBatch int // distinct from readerOptions.RecordsPerBatch for join/repl + pairSplitter iXTABPairSplitter // Note: XTAB uses two consecutive IFS in place of an IRS; IRS is ignored } @@ -45,6 +51,7 @@ func NewRecordReaderXTAB( return &RecordReaderXTAB{ readerOptions: readerOptions, recordsPerBatch: recordsPerBatch, + pairSplitter: newXTABPairSplitter(readerOptions), }, nil } @@ -262,31 +269,103 @@ func (reader *RecordReaderXTAB) recordFromXTABLines( for e := stanza.Front(); e != nil; e = e.Next() { line := e.Value.(string) - var kv []string - if reader.readerOptions.IPSRegex == nil { // e.g. 
--no-ips-regex - kv = strings.SplitN(line, reader.readerOptions.IPS, 2) - } else { - kv = lib.RegexSplitString(reader.readerOptions.IPSRegex, line, 2) - } - if len(kv) < 1 { - return nil, errors.New("mlr: internal coding error in XTAB reader") + key, value, err := reader.pairSplitter.Split(line) + if err != nil { + return nil, err } - key := kv[0] - if len(kv) == 1 { - value := mlrval.VOID - _, err := record.PutReferenceMaybeDedupe(key, value, dedupeFieldNames) - if err != nil { - return nil, err - } - } else { - value := mlrval.FromDeferredType(kv[1]) - _, err := record.PutReferenceMaybeDedupe(key, value, dedupeFieldNames) - if err != nil { - return nil, err - } + _, err = record.PutReferenceMaybeDedupe(key, mlrval.FromDeferredType(value), dedupeFieldNames) + if err != nil { + return nil, err } } return record, nil } + +// IPairSplitter splits a string into left and right, e.g. for IPS. +// This is similar to the general one for multiple formats; the exception +// is that for XTAB we always allow repeat IPS. +func newXTABPairSplitter(options *cli.TReaderOptions) iXTABPairSplitter { + if options.IPSRegex == nil { + return &tXTABIPSSplitter{ips: options.IPS, ipslen: len(options.IPS)} + } else { + return &tXTABIPSRegexSplitter{ipsRegex: options.IPSRegex} + } +} + +type tXTABIPSSplitter struct { + ips string + ipslen int +} + +// This is a splitter for XTAB lines, like 'abc 123'. It's not quite the same as the +// field/pair-splitter functions shared by DKVP, NIDX, and CSV-lite. XTAB is the only format for +// which we need to produce just a pair of items -- a key and a value -- delimited by one or more +// IPS. For example, with IPS being a space, in 'abc 123' we need to get key 'abc' and value +// '123'; for 'abc 123 456' we need key 'abc' and value '123 456'. It's super-elegant to simply +// regex-split the line like 'kv = lib.RegexSplitString(reader.readerOptions.IPSRegex, line, 2)' -- +// however, that's 3x slower than the current implementation.
It turns out regexes are great +// but we should use them only when we must, since they are expensive. +func (s *tXTABIPSSplitter) Split(input string) (key, value string, err error) { + // Empty string is a length-0 return value. + n := len(input) + if n == 0 { + return "", "", errors.New("mlr: internal coding error in XTAB reader") + } + + // ' abc 123' splits as key '', value 'abc 123'. + if strings.HasPrefix(input, s.ips) { + keyStart := 0 + for keyStart < n && strings.HasPrefix(input[keyStart:], s.ips) { + keyStart += s.ipslen + } + return "", input[keyStart:n], nil + } + + // Find the first IPS, if any. If there isn't any in the input line then there is no value, only key: + // e.g. the line is 'abc'. + var keyEnd, valueStart int + foundIPS := false + for keyEnd = 1; keyEnd <= n; keyEnd++ { + if strings.HasPrefix(input[keyEnd:], s.ips) { + foundIPS = true + break + } + } + if !foundIPS { + return input, "", nil + } + + // Find the first non-IPS character after last-found IPS, if any. If there isn't any in the input + // line then there is no value, only key: e.g. the line is 'abc '. 
+ foundValue := false + for valueStart = keyEnd + s.ipslen; valueStart <= n; valueStart++ { + if !strings.HasPrefix(input[valueStart:], s.ips) { + foundValue = true + break + } + } + if !foundValue { + return input[0:keyEnd], "", nil + } + + return input[0:keyEnd], input[valueStart:n], nil +} + +type tXTABIPSRegexSplitter struct { + ipsRegex *regexp.Regexp +} + +func (s *tXTABIPSRegexSplitter) Split(input string) (key, value string, err error) { + kv := lib.RegexSplitString(s.ipsRegex, input, 2) + if len(kv) == 0 { + return "", "", errors.New("mlr: internal coding error in XTAB reader") + } else if len(kv) == 1 { + return kv[0], "", nil + } else if len(kv) == 2 { + return kv[0], kv[1], nil + } else { + return "", "", errors.New("mlr: internal coding error in XTAB reader") + } +} diff --git a/internal/pkg/lib/regex.go b/internal/pkg/lib/regex.go index 85946d828..6f241f3ee 100644 --- a/internal/pkg/lib/regex.go +++ b/internal/pkg/lib/regex.go @@ -44,16 +44,6 @@ var captureDetector = regexp.MustCompile("\\\\[0-9]") // "\2:\1" so they don't need to be recomputed on every record. var captureSplitter = regexp.MustCompile("(\\\\[0-9])") -// IsRegexString is for the IFS/IPS-as-regex feature. -// TODO: probably put this entirely under user control, so people can explicitly say '--ifs-regex something'. -func IsRegexString(s string) bool { - if len(s) == 1 { // Unfortunately, '|' and '.' qualify as "regex metacharacters". - return false - } else { - return regexp.QuoteMeta(s) != s - } -} - // CompileMillerRegex wraps Go regex-compile with some Miller-specific syntax // which predate the port of Miller from C to Go. Miller regexes use a final // 'i' to indicate case-insensitivity; Go regexes use an initial "(?i)". 
diff --git a/internal/pkg/transformers/join.go b/internal/pkg/transformers/join.go index 3b6147fa5..d407054a5 100644 --- a/internal/pkg/transformers/join.go +++ b/internal/pkg/transformers/join.go @@ -115,7 +115,6 @@ func transformerJoinUsage( fmt.Fprintf(o, " --ifs {field-separator character}\n") fmt.Fprintf(o, " --ips {pair-separator character}\n") fmt.Fprintf(o, " --repifs\n") - fmt.Fprintf(o, " --repips\n") fmt.Fprintf(o, " --implicit-csv-header\n") fmt.Fprintf(o, " --no-implicit-csv-header\n") fmt.Fprintf(o, "For example, if you have 'mlr --csv ... join -l foo ... ' then the left-file format will\n") diff --git a/man/manpage.txt b/man/manpage.txt index bc31f7d73..77f07daf2 100644 --- a/man/manpage.txt +++ b/man/manpage.txt @@ -111,6 +111,7 @@ HELP OPTIONS Flags: mlr help flags mlr help list-separator-aliases + mlr help list-separator-regex-aliases mlr help comments-in-data-flags mlr help compressed-data-flags mlr help csv-only-flags @@ -688,11 +689,14 @@ SEPARATOR FLAGS semicolon = ";" slash = "/" space = " " - spaces = "( )+" tab = "\t" - tabs = "(\t)+" usv_fs = "\xe2\x90\x9f" usv_rs = "\xe2\x90\x9e" + + - Similarly, you can use the following for `--ifs-regex` and `--ips-regex`: + + spaces = "( )+" + tabs = "(\t)+" whitespace = "([ \t])+" * Default separators by format: @@ -709,12 +713,10 @@ SEPARATOR FLAGS --fs {string} Specify FS for input and output. --ifs {string} Specify FS for input. + --ifs-regex {string} Specify FS for input as a regular expression. --ips {string} Specify PS for input. + --ips-regex {string} Specify PS for input as a regular expression. --irs {string} Specify RS for input. - --no-ifs-regex Don't treat IFS value as a regular expression. Useful - if your IFS is ".". - --no-ips-regex Don't treat IPS value as a regular expression. Useful - if your IPS is ".". --ofs {string} Specify FS for output. --ops {string} Specify PS for output. --ors {string} Specify RS for output. 
@@ -1229,7 +1231,6 @@ VERBS --ifs {field-separator character} --ips {pair-separator character} --repifs - --repips --implicit-csv-header --no-implicit-csv-header For example, if you have 'mlr --csv ... join -l foo ... ' then the left-file format will @@ -2996,4 +2997,4 @@ SEE ALSO - 2021-12-23 MILLER(1) + 2021-12-25 MILLER(1) diff --git a/man/mlr.1 b/man/mlr.1 index 685f3e824..df93c3401 100644 --- a/man/mlr.1 +++ b/man/mlr.1 @@ -2,12 +2,12 @@ .\" Title: mlr .\" Author: [see the "AUTHOR" section] .\" Generator: ./mkman.rb -.\" Date: 2021-12-23 +.\" Date: 2021-12-25 .\" Manual: \ \& .\" Source: \ \& .\" Language: English .\" -.TH "MILLER" "1" "2021-12-23" "\ \&" "\ \&" +.TH "MILLER" "1" "2021-12-25" "\ \&" "\ \&" .\" ----------------------------------------------------------------- .\" * Portability definitions .\" ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -146,6 +146,7 @@ Essentials: Flags: mlr help flags mlr help list-separator-aliases + mlr help list-separator-regex-aliases mlr help comments-in-data-flags mlr help compressed-data-flags mlr help csv-only-flags @@ -831,11 +832,14 @@ Notes about all other separators: semicolon = ";" slash = "/" space = " " - spaces = "( )+" tab = "\et" - tabs = "(\et)+" usv_fs = "\exe2\ex90\ex9f" usv_rs = "\exe2\ex90\ex9e" + + - Similarly, you can use the following for `--ifs-regex` and `--ips-regex`: + + spaces = "( )+" + tabs = "(\et)+" whitespace = "([ \et])+" * Default separators by format: @@ -852,12 +856,10 @@ Notes about all other separators: --fs {string} Specify FS for input and output. --ifs {string} Specify FS for input. +--ifs-regex {string} Specify FS for input as a regular expression. --ips {string} Specify PS for input. +--ips-regex {string} Specify PS for input as a regular expression. --irs {string} Specify RS for input. ---no-ifs-regex Don't treat IFS value as a regular expression. Useful - if your IFS is ".". ---no-ips-regex Don't treat IPS value as a regular expression. 
Useful - if your IPS is ".". --ofs {string} Specify FS for output. --ops {string} Specify PS for output. --ors {string} Specify RS for output. @@ -1554,7 +1556,6 @@ the main "mlr --help" for more information on syntax for these arguments: --ifs {field-separator character} --ips {pair-separator character} --repifs - --repips --implicit-csv-header --no-implicit-csv-header For example, if you have 'mlr --csv ... join -l foo ... ' then the left-file format will diff --git a/test/cases/cli-help/0001/expout b/test/cases/cli-help/0001/expout index b2c0331ea..bf89a1ec3 100644 --- a/test/cases/cli-help/0001/expout +++ b/test/cases/cli-help/0001/expout @@ -455,7 +455,6 @@ the main "mlr --help" for more information on syntax for these arguments: --ifs {field-separator character} --ips {pair-separator character} --repifs - --repips --implicit-csv-header --no-implicit-csv-header For example, if you have 'mlr --csv ... join -l foo ... ' then the left-file format will diff --git a/test/cases/io-multi-character-ixs/0012/cmd b/test/cases/io-multi-character-ixs/0012/cmd index 2641d0110..80040b284 100644 --- a/test/cases/io-multi-character-ixs/0012/cmd +++ b/test/cases/io-multi-character-ixs/0012/cmd @@ -1 +1 @@ -mlr --xtab --ips '\.' --ops @ cut -x -f b test/input/dots.xtab +mlr --xtab --ips '.' 
--ops @ cut -x -f b test/input/dots.xtab diff --git a/test/cases/io-separator-aliases/0003/cmd b/test/cases/io-separator-aliases/0003/cmd index 32f9f822e..2aa373988 100644 --- a/test/cases/io-separator-aliases/0003/cmd +++ b/test/cases/io-separator-aliases/0003/cmd @@ -1 +1 @@ -mlr --from ${CASEDIR}/input --n2j --ifs spaces cat +mlr --from ${CASEDIR}/input --n2j --ifs-regex spaces cat diff --git a/test/cases/io-separator-aliases/0005/cmd b/test/cases/io-separator-aliases/0005/cmd index e8a1ca17d..97bb2d7d4 100644 --- a/test/cases/io-separator-aliases/0005/cmd +++ b/test/cases/io-separator-aliases/0005/cmd @@ -1 +1 @@ -mlr --from ${CASEDIR}/input --n2j --ifs tabs cat +mlr --from ${CASEDIR}/input --n2j --ifs-regex tabs cat diff --git a/test/cases/io-separator-aliases/0006/cmd b/test/cases/io-separator-aliases/0006/cmd index 74823216a..96dcc89e8 100644 --- a/test/cases/io-separator-aliases/0006/cmd +++ b/test/cases/io-separator-aliases/0006/cmd @@ -1 +1 @@ -mlr --from ${CASEDIR}/input --n2j --ifs whitespace cat +mlr --from ${CASEDIR}/input --n2j --ifs-regex whitespace cat diff --git a/todo.txt b/todo.txt index 5469ef66d..f4c41456e 100644 --- a/todo.txt +++ b/todo.txt @@ -2,7 +2,6 @@ PUNCHDOWN LIST * blockers: - ! --ifs-regex & --ips-regex -- guessing is not safe as evidence by '.' and '|' - allow-repeat-ixs nidx perf mod w/o regex split string ... - fractional-strptime @@ -107,6 +106,8 @@ PUNCHDOWN LIST ================================================================ NON-BLOCKERS +* xtab splitter UT; nidx too + * integrate: o https://www.libhunt.com/r/miller o https://repology.org/project/miller/information