From e10fee07242e15e796578ba0c842402996570ca5 Mon Sep 17 00:00:00 2001 From: John Kerl Date: Mon, 27 Dec 2021 00:54:21 -0500 Subject: [PATCH] Improve type-inference performance (#809) * To-do items for broader platform/go-version benchmarking * neaten inferrer API * extend type-inference unit-test cases * Add benchmark scripts for comparing compiler versions * mlr version in addition to mlr --version * some go-benchmark files for Mac/Linux perf comparisons * neaten perf-scripts * merge * type-scan optimization tests * type-scan optimization infra * test new inferrer * mlr --time option * include --cpuprofile and --traceprofile in on-line help * sharpen inferred/deferred-type API distinction * replace old inferrer with newer/faster * update docs for new type-inferrer --- .vimrc | 1 + Makefile | 30 +- cmd/mlr/main.go | 31 +- cmd/scan/main.go | 19 + docs/src/manpage.md | 18 +- docs/src/manpage.txt | 18 +- docs/src/mk-flag-info.rb | 2 +- docs/src/new-in-miller-6.md | 11 +- docs/src/new-in-miller-6.md.in | 11 +- docs/src/online-help.md | 1 + docs/src/reference-main-arithmetic.md | 17 +- docs/src/reference-main-arithmetic.md.in | 17 +- docs/src/reference-main-auxiliary-commands.md | 1 + docs/src/reference-main-flag-list.md | 382 ++++++------------ go.mod | 2 +- internal/pkg/auxents/auxents.go | 8 + internal/pkg/auxents/repl/prompt.go | 2 +- internal/pkg/cli/option_parse.go | 46 +++ internal/pkg/cli/option_types.go | 2 + internal/pkg/climain/mlrcli_parse.go | 12 +- internal/pkg/entrypoint/entrypoint.go | 11 +- .../pkg/input/record_reader_benchmark_test.go | 71 ++++ internal/pkg/lib/util.go | 2 +- internal/pkg/mlrval/mlrval_benchmark_test.go | 34 ++ internal/pkg/mlrval/mlrval_infer.go | 248 +++++++++--- internal/pkg/mlrval/mlrval_infer_test.go | 252 ++++++++++++ internal/pkg/mlrval/mlrval_new.go | 12 +- internal/pkg/scan/digits.go | 90 +++++ internal/pkg/scan/digits_test.go | 57 +++ internal/pkg/scan/doc.go | 3 + internal/pkg/scan/find.go | 182 +++++++++ 
internal/pkg/scan/find_benchmark_test.go | 68 ++++ internal/pkg/scan/find_test.go | 114 ++++++ internal/pkg/scan/type.go | 36 ++ internal/pkg/scan/type_test.go | 18 + man/manpage.txt | 18 +- man/mlr.1 | 28 +- scripts/chain-1.mlr | 2 +- scripts/chain-cmps.sh | 19 +- scripts/chain-lengths.sh | 18 +- scripts/compiler-versions-build | 7 + scripts/compiler-versions-install | 13 + scripts/compiler-versions-time | 9 + scripts/time-big-file | 2 +- scripts/time-big-files | 16 +- test/cases/io-infer-flags/dash-O/expout | 4 +- todo.txt | 11 + 47 files changed, 1595 insertions(+), 381 deletions(-) create mode 100644 cmd/scan/main.go create mode 100644 internal/pkg/input/record_reader_benchmark_test.go create mode 100644 internal/pkg/mlrval/mlrval_benchmark_test.go create mode 100644 internal/pkg/mlrval/mlrval_infer_test.go create mode 100644 internal/pkg/scan/digits.go create mode 100644 internal/pkg/scan/digits_test.go create mode 100644 internal/pkg/scan/doc.go create mode 100644 internal/pkg/scan/find.go create mode 100644 internal/pkg/scan/find_benchmark_test.go create mode 100644 internal/pkg/scan/find_test.go create mode 100644 internal/pkg/scan/type.go create mode 100644 internal/pkg/scan/type_test.go create mode 100755 scripts/compiler-versions-build create mode 100755 scripts/compiler-versions-install create mode 100755 scripts/compiler-versions-time diff --git a/.vimrc b/.vimrc index 7d420eb5a..d3d35005f 100644 --- a/.vimrc +++ b/.vimrc @@ -1,2 +1,3 @@ map \d :w:!clear;echo Building ...; echo; make mlr map \f :w:!clear;echo Building ...; echo; make ut +map \r :w:!clear;echo Building ...; echo; make ut-scan ut-mlv diff --git a/Makefile b/Makefile index 29c5cd5ab..263ace6c8 100644 --- a/Makefile +++ b/Makefile @@ -31,6 +31,24 @@ install: build unit-test ut: go test github.com/johnkerl/miller/internal/pkg/... +ut-lib: + go test github.com/johnkerl/miller/internal/pkg/lib... +ut-scan: + go test github.com/johnkerl/miller/internal/pkg/scan/... 
+ut-mlv: + go test github.com/johnkerl/miller/internal/pkg/mlrval/... +ut-bifs: + go test github.com/johnkerl/miller/internal/pkg/bifs/... +ut-input: + go test github.com/johnkerl/miller/internal/pkg/input/... + +bench: + go test -run=nonesuch -bench=. github.com/johnkerl/miller/internal/pkg/... +bench-mlv: + go test -run=nonesuch -bench=. github.com/johnkerl/miller/internal/pkg/mlrval/... +bench-input: + go test -run=nonesuch -bench=. github.com/johnkerl/miller/internal/pkg/input/... + # ---------------------------------------------------------------- # Regression tests (large number) # @@ -41,12 +59,22 @@ unit-test ut: regression-test: go test -v regression_test.go +# ---------------------------------------------------------------- +# Experimental executables: +scan: + go build github.com/johnkerl/miller/cmd/scan + +# ---------------------------------------------------------------- +# Formatting # go fmt ./... finds experimental C files which we want to ignore. fmt: -go fmt ./cmd/... -go fmt ./internal/pkg/... -go fmt ./regression_test.go +# ---------------------------------------------------------------- +# Static analysis + # Needs first: go install honnef.co/go/tools/cmd/staticcheck@latest # See also: https://staticcheck.io staticcheck: @@ -93,4 +121,4 @@ release_tarball: build check # ================================================================ # Go does its own dependency management, outside of make. 
-.PHONY: build mlr check unit_test regression_test fmt staticcheck dev docs +.PHONY: build mlr scan check unit_test regression_test bench fmt staticcheck dev docs diff --git a/cmd/mlr/main.go b/cmd/mlr/main.go index 0e8c9f3e4..23ed5b5b6 100644 --- a/cmd/mlr/main.go +++ b/cmd/mlr/main.go @@ -8,12 +8,16 @@ import ( "runtime/debug" "runtime/pprof" "strconv" + "strings" + "time" "github.com/johnkerl/miller/internal/pkg/entrypoint" "github.com/pkg/profile" // for trace.out ) func main() { + // For mlr --time + startTime := time.Now() // Respect env $GOMAXPROCS, if provided, else set default. haveSetGoMaxProcs := false @@ -63,12 +67,35 @@ func main() { defer fmt.Fprintf(os.Stderr, "CPU profile finished.\ngo tool pprof -http=:8080 %s\n", profFilename) } - if len(os.Args) >= 3 && os.Args[1] == "--traceprofile" { + if len(os.Args) >= 2 && os.Args[1] == "--traceprofile" { defer profile.Start(profile.TraceProfile, profile.ProfilePath(".")).Stop() defer fmt.Fprintf(os.Stderr, "go tool trace trace.out\n") } + // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - // This will obtain os.Args and go from there. All the usual contents of // main() are put into this package for ease of testing. - entrypoint.Main() + mainReturn := entrypoint.Main() + + // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + // Timing + // + // The system 'time' command is built-in, of course but it's nice to have + // simply wall-time without the real/user/sys distinction. Also, making + // this a Miller built-in is nice for Windows. 
+ if mainReturn.PrintElapsedTime { + endTime := time.Now() + startNanos := startTime.UnixNano() + endNanos := endTime.UnixNano() + seconds := float64(endNanos-startNanos) / 1e9 + fmt.Fprintf(os.Stderr, "%.6f", seconds) + for _, arg := range os.Args { + if strings.Contains(arg, " ") || strings.Contains(arg, "\t") { + fmt.Fprintf(os.Stderr, " '%s'", arg) + } else { + fmt.Fprintf(os.Stderr, " %s", arg) + } + } + fmt.Fprintf(os.Stderr, "\n") + } } diff --git a/cmd/scan/main.go b/cmd/scan/main.go new file mode 100644 index 000000000..c185b8752 --- /dev/null +++ b/cmd/scan/main.go @@ -0,0 +1,19 @@ +// ================================================================ +// Experiments for type-inference performance optimization +// ================================================================ + +package main + +import ( + "fmt" + "os" + + "github.com/johnkerl/miller/internal/pkg/scan" +) + +func main() { + for _, arg := range os.Args[1:] { + scanType := scan.FindScanType(arg) + fmt.Printf("%-10s -> %s\n", arg, scan.TypeNames[scanType]) + } +} diff --git a/docs/src/manpage.md b/docs/src/manpage.md index 91d703d3c..4a591235a 100644 --- a/docs/src/manpage.md +++ b/docs/src/manpage.md @@ -144,6 +144,7 @@ HELP OPTIONS mlr help miscellaneous-flags mlr help output-colorization-flags mlr help pprint-only-flags + mlr help profiling-flags mlr help separator-flags Verbs: mlr help list-verbs @@ -637,6 +638,20 @@ PPRINT-ONLY FLAGS for input). --right Right-justifies all fields for PPRINT output. +PROFILING FLAGS + These are flags for profiling Miller performance. + --cpuprofile {CPU-profile file name} + Create a CPU-profile file for performance analysis. + Instructions will be printed to stderr. This flag + must be the very first thing after 'mlr' on the + command line. + --time Print elapsed execution time in seconds to stderr at + the end of the execution of the program. + --traceprofile Create a trace-profile file for performance analysis. + Instructions will be printed to stderr. 
This flag + must be the very first thing after 'mlr' on the + command line. + SEPARATOR FLAGS See the Separators doc page for more about record separators, field separators, and pair separators. Also see the File formats doc page, or @@ -756,6 +771,7 @@ AUXILIARY COMMANDS help regtest repl + version For more information, please invoke mlr {subcommand} --help. MLRRC @@ -3024,5 +3040,5 @@ SEE ALSO - 2021-12-25 MILLER(1) + 2021-12-27 MILLER(1) diff --git a/docs/src/manpage.txt b/docs/src/manpage.txt index 07154c73b..0eb839907 100644 --- a/docs/src/manpage.txt +++ b/docs/src/manpage.txt @@ -123,6 +123,7 @@ HELP OPTIONS mlr help miscellaneous-flags mlr help output-colorization-flags mlr help pprint-only-flags + mlr help profiling-flags mlr help separator-flags Verbs: mlr help list-verbs @@ -616,6 +617,20 @@ PPRINT-ONLY FLAGS for input). --right Right-justifies all fields for PPRINT output. +PROFILING FLAGS + These are flags for profiling Miller performance. + --cpuprofile {CPU-profile file name} + Create a CPU-profile file for performance analysis. + Instructions will be printed to stderr. This flag + must be the very first thing after 'mlr' on the + command line. + --time Print elapsed execution time in seconds to stderr at + the end of the execution of the program. + --traceprofile Create a trace-profile file for performance analysis. + Instructions will be printed to stderr. This flag + must be the very first thing after 'mlr' on the + command line. + SEPARATOR FLAGS See the Separators doc page for more about record separators, field separators, and pair separators. Also see the File formats doc page, or @@ -735,6 +750,7 @@ AUXILIARY COMMANDS help regtest repl + version For more information, please invoke mlr {subcommand} --help. 
MLRRC @@ -3003,4 +3019,4 @@ SEE ALSO - 2021-12-25 MILLER(1) + 2021-12-27 MILLER(1) diff --git a/docs/src/mk-flag-info.rb b/docs/src/mk-flag-info.rb index c6b07cf7a..3f54fe08a 100755 --- a/docs/src/mk-flag-info.rb +++ b/docs/src/mk-flag-info.rb @@ -46,7 +46,7 @@ EOF for flag in flags headline = `mlr help show-headline-for-flag '#{flag}'` help = `mlr help show-help-for-flag '#{flag}'` - puts "* `#{headline}`: #{help}" + puts "* `#{headline.chomp}`: #{help}" end end diff --git a/docs/src/new-in-miller-6.md b/docs/src/new-in-miller-6.md index 22e3a9974..20e59c955 100644 --- a/docs/src/new-in-miller-6.md +++ b/docs/src/new-in-miller-6.md @@ -255,7 +255,8 @@ The following differences are rather technical. If they don't sound familiar to * See also `mlr help legacy-flags` or the [legacy-flags reference](reference-main-flag-list.md#legacy-flags). * Type-inference: * The `-S` and `-F` flags to `mlr put` and `mlr filter` are ignored, since type-inference is no longer done in `mlr put` and `mlr filter`, but rather, when records are first read. You can use `mlr -S` and `mlr -A`, respectively, instead to control type-inference within the record-readers. - * Octal numbers like `0123` and `07` are type-inferred as string. Use `mlr -O` to infer them as octal integers. Note that `08` and `09` will then infer as float. + * Octal numbers like `0123` and `07` are type-inferred as string. Use `mlr -O` to infer them as octal integers. Note that `08` and `09` will then infer as decimal integers. + * Any numbers prefixed with `0o`, e.g. `0o377`, are already treated as octal regardless of `mlr -O` -- `mlr -O` only affects how leading-zero integers are handled. * See also the [miscellaneous-flags reference](reference-main-flag-list.md#miscellaneous-flags). * Emitting a map-valued expression now requires either a temporary variable or the new `emit1` keyword. Please see the [page on emit statements](reference-dsl-output-statements.md#emit1-and-emitemitpemitf) for more information. 
@@ -270,7 +271,13 @@ The following differences are rather technical. If they don't sound familiar to As a benchmark, the [example.csv](https://github.com/johnkerl/miller/blob/main/docs/src/example.csv) file [was expanded](https://github.com/johnkerl/miller/blob/main/scripts/make-big-files) into a million-line CSV file, -then converted to DKVP, JSON, etc. These were run on a commodity Mac laptop with four CPUs. +then converted to DKVP, JSON, etc. + +Notes: + +* These were run on a commodity Mac laptop with four CPUs, on MacOS Monterey, using `go1.16.5 darwin/amd64`. +* Linux benchmarks are pending. +* As of late 2021, Miller has been benchmarked using Go compiler versions 1.15.15, 1.16.12, 1.17.5, and 1.18beta1, with no significant performance changes attributable to compiler versions. For the [first benchmark](https://github.com/johnkerl/miller/blob/main/scripts/time-big-files), we have `mlr cat` of those files, with processing times shown: diff --git a/docs/src/new-in-miller-6.md.in b/docs/src/new-in-miller-6.md.in index 000763f52..321f08e85 100644 --- a/docs/src/new-in-miller-6.md.in +++ b/docs/src/new-in-miller-6.md.in @@ -213,7 +213,8 @@ The following differences are rather technical. If they don't sound familiar to * See also `mlr help legacy-flags` or the [legacy-flags reference](reference-main-flag-list.md#legacy-flags). * Type-inference: * The `-S` and `-F` flags to `mlr put` and `mlr filter` are ignored, since type-inference is no longer done in `mlr put` and `mlr filter`, but rather, when records are first read. You can use `mlr -S` and `mlr -A`, respectively, instead to control type-inference within the record-readers. - * Octal numbers like `0123` and `07` are type-inferred as string. Use `mlr -O` to infer them as octal integers. Note that `08` and `09` will then infer as float. + * Octal numbers like `0123` and `07` are type-inferred as string. Use `mlr -O` to infer them as octal integers. Note that `08` and `09` will then infer as decimal integers. 
+ * Any numbers prefixed with `0o`, e.g. `0o377`, are already treated as octal regardless of `mlr -O` -- `mlr -O` only affects how leading-zero integers are handled. * See also the [miscellaneous-flags reference](reference-main-flag-list.md#miscellaneous-flags). * Emitting a map-valued expression now requires either a temporary variable or the new `emit1` keyword. Please see the [page on emit statements](reference-dsl-output-statements.md#emit1-and-emitemitpemitf) for more information. @@ -228,7 +229,13 @@ The following differences are rather technical. If they don't sound familiar to As a benchmark, the [example.csv](https://github.com/johnkerl/miller/blob/main/docs/src/example.csv) file [was expanded](https://github.com/johnkerl/miller/blob/main/scripts/make-big-files) into a million-line CSV file, -then converted to DKVP, JSON, etc. These were run on a commodity Mac laptop with four CPUs. +then converted to DKVP, JSON, etc. + +Notes: + +* These were run on a commodity Mac laptop with four CPUs, on MacOS Monterey, using `go1.16.5 darwin/amd64`. +* Linux benchmarks are pending. +* As of late 2021, Miller has been benchmarked using Go compiler versions 1.15.15, 1.16.12, 1.17.5, and 1.18beta1, with no significant performance changes attributable to compiler versions. 
For the [first benchmark](https://github.com/johnkerl/miller/blob/main/scripts/time-big-files), we have `mlr cat` of those files, with processing times shown: diff --git a/docs/src/online-help.md b/docs/src/online-help.md index d74921e90..0216e0cda 100644 --- a/docs/src/online-help.md +++ b/docs/src/online-help.md @@ -62,6 +62,7 @@ Flags: mlr help miscellaneous-flags mlr help output-colorization-flags mlr help pprint-only-flags + mlr help profiling-flags mlr help separator-flags Verbs: mlr help list-verbs diff --git a/docs/src/reference-main-arithmetic.md b/docs/src/reference-main-arithmetic.md index bd8b797eb..2c42aa605 100644 --- a/docs/src/reference-main-arithmetic.md +++ b/docs/src/reference-main-arithmetic.md @@ -20,11 +20,20 @@ Quick links: Numbers in Miller are double-precision float or 64-bit signed integers. Anything scannable as int, e.g `123` or `0xabcd`, is treated as an integer; otherwise, input scannable as float (`4.56` or `8e9`) is treated as float; everything else is a string. -If you want all numbers to be treated as floats, then you may use `float()` in your filter/put expressions (e.g. replacing `$c = $a * $b` with `$c = float($a) * float($b)`). +Three flags control input-scanning for numbers: `mlr -O`, `mlr -A`, and `mlr -S`. - +Prefix `0x` means hexadecimal, e.g. `0xcafe`; prefix `0b` means binary, e.g. +`0b1011`; prefix `0o` means octal, e.g. `0o377`. Numbers in data files with +leading zeroes, e.g. `0377` or `06789`, are treated as strings in Miller, +unless you specify `mlr -O`: then `0377` will scan as an octal integer (with +value 255), and `06789` will scan as a decimal integer (with value 6789). + +If you want all numbers from data files to be treated as floats, then you may +use `float()` in your filter/put expressions (e.g. replacing `$c = $a * $b` +with `$c = float($a) * float($b)`). Or, use `mlr -A`. + +If you use `mlr -S` then all field values from data files are read in as +strings; you can cast them using `int()` or `float()`. 
## Conversion by math routines diff --git a/docs/src/reference-main-arithmetic.md.in b/docs/src/reference-main-arithmetic.md.in index 6e481f736..cbf584643 100644 --- a/docs/src/reference-main-arithmetic.md.in +++ b/docs/src/reference-main-arithmetic.md.in @@ -4,11 +4,20 @@ Numbers in Miller are double-precision float or 64-bit signed integers. Anything scannable as int, e.g `123` or `0xabcd`, is treated as an integer; otherwise, input scannable as float (`4.56` or `8e9`) is treated as float; everything else is a string. -If you want all numbers to be treated as floats, then you may use `float()` in your filter/put expressions (e.g. replacing `$c = $a * $b` with `$c = float($a) * float($b)`). +Three flags control input-scanning for numbers: `mlr -O`, `mlr -A`, and `mlr -S`. - +Prefix `0x` means hexadecimal, e.g. `0xcafe`; prefix `0b` means binary, e.g. +`0b1011`; prefix `0o` means octal, e.g. `0o377`. Numbers in data files with +leading zeroes, e.g. `0377` or `06789`, are treated as strings in Miller, +unless you specify `mlr -O`: then `0377` will scan as an octal integer (with +value 255), and `06789` will scan as a decimal integer (with value 6789). + +If you want all numbers from data files to be treated as floats, then you may +use `float()` in your filter/put expressions (e.g. replacing `$c = $a * $b` +with `$c = float($a) * float($b)`). Or, use `mlr -A`. + +If you use `mlr -S` then all field values from data files are read in as +strings; you can cast them using `int()` or `float()`. ## Conversion by math routines diff --git a/docs/src/reference-main-auxiliary-commands.md b/docs/src/reference-main-auxiliary-commands.md index 7ee241ac2..a55ce0361 100644 --- a/docs/src/reference-main-auxiliary-commands.md +++ b/docs/src/reference-main-auxiliary-commands.md @@ -31,6 +31,7 @@ Available subcommands: help regtest repl + version For more information, please invoke mlr {subcommand} --help. 
diff --git a/docs/src/reference-main-flag-list.md b/docs/src/reference-main-flag-list.md index 124b1f027..270457798 100644 --- a/docs/src/reference-main-flag-list.md +++ b/docs/src/reference-main-flag-list.md @@ -60,14 +60,10 @@ Notes: **Flags:** -* `--pass-comments -`: Immediately print commented lines (prefixed by `#`) within the input. -* `--pass-comments-with {string} -`: Immediately print commented lines within input, with specified prefix. -* `--skip-comments -`: Ignore commented lines (prefixed by `#`) within the input. -* `--skip-comments-with {string} -`: Ignore commented lines within input, with specified prefix. +* `--pass-comments`: Immediately print commented lines (prefixed by `#`) within the input. +* `--pass-comments-with {string}`: Immediately print commented lines within input, with specified prefix. +* `--skip-comments`: Ignore commented lines (prefixed by `#`) within the input. +* `--skip-comments-with {string}`: Ignore commented lines within input, with specified prefix. ## Compressed-data flags @@ -102,22 +98,14 @@ decisions that might have been made based on the file suffix. Likewise, **Flags:** -* `--bz2in -`: Uncompress bzip2 within the Miller process. Done by default if file ends in `.bz2`. -* `--gzin -`: Uncompress gzip within the Miller process. Done by default if file ends in `.gz`. -* `--prepipe {decompression command} -`: You can, of course, already do without this for single input files, e.g. `gunzip < myfile.csv.gz | mlr ...`. Allowed at the command line, but not in `.mlrrc` to avoid unexpected code execution. -* `--prepipe-bz2 -`: Same as `--prepipe bz2`, except this is allowed in `.mlrrc`. -* `--prepipe-gunzip -`: Same as `--prepipe gunzip`, except this is allowed in `.mlrrc`. -* `--prepipe-zcat -`: Same as `--prepipe zcat`, except this is allowed in `.mlrrc`. -* `--prepipex {decompression command} -`: Like `--prepipe` with one exception: doesn't insert `<` between command and filename at runtime. 
Useful for some commands like `unzip -qc` which don't read standard input. Allowed at the command line, but not in `.mlrrc` to avoid unexpected code execution. -* `--zin -`: Uncompress zlib within the Miller process. Done by default if file ends in `.z`. +* `--bz2in`: Uncompress bzip2 within the Miller process. Done by default if file ends in `.bz2`. +* `--gzin`: Uncompress gzip within the Miller process. Done by default if file ends in `.gz`. +* `--prepipe {decompression command}`: You can, of course, already do without this for single input files, e.g. `gunzip < myfile.csv.gz | mlr ...`. Allowed at the command line, but not in `.mlrrc` to avoid unexpected code execution. +* `--prepipe-bz2`: Same as `--prepipe bz2`, except this is allowed in `.mlrrc`. +* `--prepipe-gunzip`: Same as `--prepipe gunzip`, except this is allowed in `.mlrrc`. +* `--prepipe-zcat`: Same as `--prepipe zcat`, except this is allowed in `.mlrrc`. +* `--prepipex {decompression command}`: Like `--prepipe` with one exception: doesn't insert `<` between command and filename at runtime. Useful for some commands like `unzip -qc` which don't read standard input. Allowed at the command line, but not in `.mlrrc` to avoid unexpected code execution. +* `--zin`: Uncompress zlib within the Miller process. Done by default if file ends in `.z`. ## CSV-only flags @@ -126,16 +114,11 @@ These are flags which are applicable to CSV format. **Flags:** -* `--allow-ragged-csv-input or --ragged -`: If a data line has fewer fields than the header line, fill remaining keys with empty string. If a data line has more fields than the header line, use integer field labels as in the implicit-header case. -* `--headerless-csv-output or --ho -`: Print only CSV data lines; do not print CSV header lines. -* `--implicit-csv-header or --headerless-csv-input or --hi -`: Use 1,2,3,... as field labels, rather than from line 1 of input files. Tip: combine with `label` to recreate missing headers. 
-* `--no-implicit-csv-header -`: Opposite of `--implicit-csv-header`. This is the default anyway -- the main use is for the flags to `mlr join` if you have main file(s) which are headerless but you want to join in on a file which does have a CSV header. Then you could use `mlr --csv --implicit-csv-header join --no-implicit-csv-header -l your-join-in-with-header.csv ... your-headerless.csv`. -* `-N -`: Keystroke-saver for `--implicit-csv-header --headerless-csv-output`. +* `--allow-ragged-csv-input or --ragged`: If a data line has fewer fields than the header line, fill remaining keys with empty string. If a data line has more fields than the header line, use integer field labels as in the implicit-header case. +* `--headerless-csv-output or --ho`: Print only CSV data lines; do not print CSV header lines. +* `--implicit-csv-header or --headerless-csv-input or --hi`: Use 1,2,3,... as field labels, rather than from line 1 of input files. Tip: combine with `label` to recreate missing headers. +* `--no-implicit-csv-header`: Opposite of `--implicit-csv-header`. This is the default anyway -- the main use is for the flags to `mlr join` if you have main file(s) which are headerless but you want to join in on a file which does have a CSV header. Then you could use `mlr --csv --implicit-csv-header join --no-implicit-csv-header -l your-join-in-with-header.csv ... your-headerless.csv`. +* `-N`: Keystroke-saver for `--implicit-csv-header --headerless-csv-output`. ## File-format flags @@ -152,90 +135,48 @@ are overridden in all cases by setting output format to `format2`. **Flags:** -* `--asv or --asvlite -`: Use ASV format for input and output data. -* `--csv or -c -`: Use CSV format for input and output data. -* `--csvlite -`: Use CSV-lite format for input and output data. -* `--dkvp -`: Use DKVP format for input and output data. -* `--gen-field-name -`: Specify field name for --igen. Defaults to "i". -* `--gen-start -`: Specify start value for --igen. Defaults to 1. 
-* `--gen-step -`: Specify step value for --igen. Defaults to 1. -* `--gen-stop -`: Specify stop value for --igen. Defaults to 100. -* `--iasv or --iasvlite -`: Use ASV format for input data. -* `--icsv -`: Use CSV format for input data. -* `--icsvlite -`: Use CSV-lite format for input data. -* `--idkvp -`: Use DKVP format for input data. -* `--igen -`: Ignore input files and instead generate sequential numeric input using --gen-field-name, --gen-start, --gen-step, and --gen-stop values. See also the seqgen verb, which is more useful/intuitive. -* `--ijson -`: Use JSON format for input data. -* `--inidx -`: Use NIDX format for input data. -* `--io {format name} -`: Use format name for input and output data. For example: `--io csv` is the same as `--csv`. -* `--ipprint -`: Use PPRINT format for input data. -* `--itsv -`: Use TSV format for input data. -* `--itsvlite -`: Use TSV-lite format for input data. -* `--iusv or --iusvlite -`: Use USV format for input data. -* `--ixtab -`: Use XTAB format for input data. -* `--json or -j -`: Use JSON format for input and output data. -* `--nidx -`: Use NIDX format for input and output data. -* `--oasv or --oasvlite -`: Use ASV format for output data. -* `--ocsv -`: Use CSV format for output data. -* `--ocsvlite -`: Use CSV-lite format for output data. -* `--odkvp -`: Use DKVP format for output data. -* `--ojson -`: Use JSON format for output data. -* `--omd -`: Use markdown-tabular format for output data. -* `--onidx -`: Use NIDX format for output data. -* `--opprint -`: Use PPRINT format for output data. -* `--otsv -`: Use TSV format for output data. -* `--otsvlite -`: Use TSV-lite format for output data. -* `--ousv or --ousvlite -`: Use USV format for output data. -* `--oxtab -`: Use XTAB format for output data. -* `--pprint -`: Use PPRINT format for input and output data. -* `--tsv -`: Use TSV format for input and output data. -* `--tsvlite or -t -`: Use TSV-lite format for input and output data. 
-* `--usv or --usvlite -`: Use USV format for input and output data. -* `--xtab -`: Use XTAB format for input and output data. -* `-i {format name} -`: Use format name for input data. For example: `-i csv` is the same as `--icsv`. -* `-o {format name} -`: Use format name for output data. For example: `-o csv` is the same as `--ocsv`. +* `--asv or --asvlite`: Use ASV format for input and output data. +* `--csv or -c`: Use CSV format for input and output data. +* `--csvlite`: Use CSV-lite format for input and output data. +* `--dkvp`: Use DKVP format for input and output data. +* `--gen-field-name`: Specify field name for --igen. Defaults to "i". +* `--gen-start`: Specify start value for --igen. Defaults to 1. +* `--gen-step`: Specify step value for --igen. Defaults to 1. +* `--gen-stop`: Specify stop value for --igen. Defaults to 100. +* `--iasv or --iasvlite`: Use ASV format for input data. +* `--icsv`: Use CSV format for input data. +* `--icsvlite`: Use CSV-lite format for input data. +* `--idkvp`: Use DKVP format for input data. +* `--igen`: Ignore input files and instead generate sequential numeric input using --gen-field-name, --gen-start, --gen-step, and --gen-stop values. See also the seqgen verb, which is more useful/intuitive. +* `--ijson`: Use JSON format for input data. +* `--inidx`: Use NIDX format for input data. +* `--io {format name}`: Use format name for input and output data. For example: `--io csv` is the same as `--csv`. +* `--ipprint`: Use PPRINT format for input data. +* `--itsv`: Use TSV format for input data. +* `--itsvlite`: Use TSV-lite format for input data. +* `--iusv or --iusvlite`: Use USV format for input data. +* `--ixtab`: Use XTAB format for input data. +* `--json or -j`: Use JSON format for input and output data. +* `--nidx`: Use NIDX format for input and output data. +* `--oasv or --oasvlite`: Use ASV format for output data. +* `--ocsv`: Use CSV format for output data. +* `--ocsvlite`: Use CSV-lite format for output data. 
+* `--odkvp`: Use DKVP format for output data. +* `--ojson`: Use JSON format for output data. +* `--omd`: Use markdown-tabular format for output data. +* `--onidx`: Use NIDX format for output data. +* `--opprint`: Use PPRINT format for output data. +* `--otsv`: Use TSV format for output data. +* `--otsvlite`: Use TSV-lite format for output data. +* `--ousv or --ousvlite`: Use USV format for output data. +* `--oxtab`: Use XTAB format for output data. +* `--pprint`: Use PPRINT format for input and output data. +* `--tsv`: Use TSV format for input and output data. +* `--tsvlite or -t`: Use TSV-lite format for input and output data. +* `--usv or --usvlite`: Use USV format for input and output data. +* `--xtab`: Use XTAB format for input and output data. +* `-i {format name}`: Use format name for input data. For example: `-i csv` is the same as `--icsv`. +* `-o {format name}`: Use format name for output data. For example: `-o csv` is the same as `--ocsv`. ## Flatten-unflatten flags @@ -246,14 +187,10 @@ See the Flatten/unflatten doc page for more information. **Flags:** -* `--flatsep or --jflatsep {string} -`: Separator for flattening multi-level JSON keys, e.g. `{"a":{"b":3}}` becomes `a:b => 3` for non-JSON formats. Defaults to `.`. -* `--no-auto-flatten -`: When output is non-JSON, suppress the default auto-flatten behavior. Default: if `$y = [7,8,9]` then this flattens to `y.1=7,y.2=8,y.3=9, and similarly for maps. With `--no-auto-flatten`, instead we get `$y=[1, 2, 3]`. -* `--no-auto-unflatten -`: When input non-JSON and output is JSON, suppress the default auto-unflatten behavior. Default: if the input has `y.1=7,y.2=8,y.3=9` then this unflattens to `$y=[7,8,9]`. flattens to `y.1=7,y.2=8,y.3=9. With `--no-auto-flatten`, instead we get `${y.1}=7,${y.2}=8,${y.3}=9`. -* `--xvright -`: Right-justify values for XTAB format. +* `--flatsep or --jflatsep {string}`: Separator for flattening multi-level JSON keys, e.g. 
`{"a":{"b":3}}` becomes `a:b => 3` for non-JSON formats. Defaults to `.`. +* `--no-auto-flatten`: When output is non-JSON, suppress the default auto-flatten behavior. Default: if `$y = [7,8,9]` then this flattens to `y.1=7,y.2=8,y.3=9, and similarly for maps. With `--no-auto-flatten`, instead we get `$y=[1, 2, 3]`. +* `--no-auto-unflatten`: When input non-JSON and output is JSON, suppress the default auto-unflatten behavior. Default: if the input has `y.1=7,y.2=8,y.3=9` then this unflattens to `$y=[7,8,9]`. flattens to `y.1=7,y.2=8,y.3=9. With `--no-auto-flatten`, instead we get `${y.1}=7,${y.2}=8,${y.3}=9`. +* `--xvright`: Right-justify values for XTAB format. ## Format-conversion keystroke-saver flags @@ -283,12 +220,9 @@ These are flags which are applicable to JSON format. **Flags:** -* `--jlistwrap or --jl -`: Wrap JSON output in outermost `[ ]`. -* `--jvstack -`: Put one key-value pair per line for JSON output (multi-line output). -* `--no-jvstack -`: Put objects/arrays all on one line for JSON output. +* `--jlistwrap or --jl`: Wrap JSON output in outermost `[ ]`. +* `--jvstack`: Put one key-value pair per line for JSON output (multi-line output). +* `--no-jvstack`: Put objects/arrays all on one line for JSON output. ## Legacy flags @@ -298,38 +232,22 @@ They are accepted as no-op flags in order to keep old scripts from breaking. **Flags:** -* `--jknquoteint -`: Type information from JSON input files is now preserved throughout the processing stream. -* `--jquoteall -`: Type information from JSON input files is now preserved throughout the processing stream. -* `--json-fatal-arrays-on-input -`: Miller now supports arrays as of version 6. -* `--json-map-arrays-on-input -`: Miller now supports arrays as of version 6. -* `--json-skip-arrays-on-input -`: Miller now supports arrays as of version 6. -* `--jsonx -`: The `--jvstack` flag is now default true in Miller 6. 
-* `--jvquoteall -`: Type information from JSON input files is now preserved throughout the processing stream. -* `--mmap -`: Miller no longer uses memory-mapping to access data files. -* `--no-mmap -`: Miller no longer uses memory-mapping to access data files. -* `--ojsonx -`: The `--jvstack` flag is now default true in Miller 6. -* `--quote-all -`: Ignored as of version 6. Types are inferred/retained through the processing flow now. -* `--quote-minimal -`: Ignored as of version 6. Types are inferred/retained through the processing flow now. -* `--quote-none -`: Ignored as of version 6. Types are inferred/retained through the processing flow now. -* `--quote-numeric -`: Ignored as of version 6. Types are inferred/retained through the processing flow now. -* `--quote-original -`: Ignored as of version 6. Types are inferred/retained through the processing flow now. -* `--vflatsep -`: Ignored as of version 6. This functionality is subsumed into JSON formatting. +* `--jknquoteint`: Type information from JSON input files is now preserved throughout the processing stream. +* `--jquoteall`: Type information from JSON input files is now preserved throughout the processing stream. +* `--json-fatal-arrays-on-input`: Miller now supports arrays as of version 6. +* `--json-map-arrays-on-input`: Miller now supports arrays as of version 6. +* `--json-skip-arrays-on-input`: Miller now supports arrays as of version 6. +* `--jsonx`: The `--jvstack` flag is now default true in Miller 6. +* `--jvquoteall`: Type information from JSON input files is now preserved throughout the processing stream. +* `--mmap`: Miller no longer uses memory-mapping to access data files. +* `--no-mmap`: Miller no longer uses memory-mapping to access data files. +* `--ojsonx`: The `--jvstack` flag is now default true in Miller 6. +* `--quote-all`: Ignored as of version 6. Types are inferred/retained through the processing flow now. +* `--quote-minimal`: Ignored as of version 6. 
Types are inferred/retained through the processing flow now. +* `--quote-none`: Ignored as of version 6. Types are inferred/retained through the processing flow now. +* `--quote-numeric`: Ignored as of version 6. Types are inferred/retained through the processing flow now. +* `--quote-original`: Ignored as of version 6. Types are inferred/retained through the processing flow now. +* `--vflatsep`: Ignored as of version 6. This functionality is subsumed into JSON formatting. ## Miscellaneous flags @@ -337,44 +255,25 @@ These are flags which don't fit into any other category. **Flags:** -* `--fflush -`: Force buffered output to be written after every output record. The default is flush output after every record if the output is to the terminal, or less often if the output is to a file or a pipe. The default is a significant performance optimization for large files. Use this flag to force frequent updates even when output is to a pipe or file, at a performance cost. -* `--from {filename} -`: Use this to specify an input file before the verb(s), rather than after. May be used more than once. Example: `mlr --from a.dat --from b.dat cat` is the same as `mlr cat a.dat b.dat`. -* `--hash-records -`: This is an internal parameter which normally does not need to be modified. It controls the mechanism by which Miller accesses fields within records. In general --no-hash-records is faster, and is the default. For specific use-cases involving data having many fields, and many of them being processed during a given processing run, --hash-records might offer a slight performance benefit. -* `--infer-int-as-float or -A -`: Cast all integers in data files to floats. -* `--infer-none or -S -`: Don't treat values like 123 or 456.7 in data files as int/float; leave them as strings. -* `--infer-octal or -O -`: Treat numbers like 0123 in data files as numeric; default is string. Note that 00--07 etc scan as int; 08-09 scan as float. 
-* `--load {filename} -`: Load DSL script file for all put/filter operations on the command line. If the name following `--load` is a directory, load all `*.mlr` files in that directory. This is just like `put -f` and `filter -f` except it's up-front on the command line, so you can do something like `alias mlr='mlr --load ~/myscripts'` if you like. -* `--mfrom {filenames} -`: Use this to specify one of more input files before the verb(s), rather than after. May be used more than once. The list of filename must end with `--`. This is useful for example since `--from *.csv` doesn't do what you might hope but `--mfrom *.csv --` does. -* `--mload {filenames} -`: Like `--load` but works with more than one filename, e.g. `--mload *.mlr --`. -* `--no-dedupe-field-names -`: By default, if an input record has a field named `x` and another also named `x`, the second will be renamed `x_2`, and so on. With this flag provided, the second `x`'s value will replace the first `x`'s value when the record is read. This flag has no effect on JSON input records, where duplicate keys always result in the last one's value being retained. -* `--no-fflush -`: Let buffered output not be written after every output record. The default is flush output after every record if the output is to the terminal, or less often if the output is to a file or a pipe. The default is a significant performance optimization for large files. Use this flag to allow less-frequent updates when output is to the terminal. This is unlikely to be a noticeable performance improvement, since direct-to-screen output for large files has its own overhead. -* `--no-hash-records -`: See --hash-records. -* `--nr-progress-mod {m} -`: With m a positive integer: print filename and record count to os.Stderr every m input records. -* `--ofmt {format} -`: E.g. `%.18f`, `%.0f`, `%9.6e`. Please use sprintf-style codes for floating-point nummbers. If not specified, default formatting is used. 
See also the `fmtnum` function and the `format-values` verb. -* `--records-per-batch {n} -`: This is an internal parameter for maximum number of records in a batch size. Normally this does not need to be modified. -* `--seed {n} -`: with `n` of the form `12345678` or `0xcafefeed`. For `put`/`filter` `urand`, `urandint`, and `urand32`. -* `--tz {timezone} -`: Specify timezone, overriding `$TZ` environment variable (if any). -* `-I -`: Process files in-place. For each file name on the command line, output is written to a temp file in the same directory, which is then renamed over the original. Each file is processed in isolation: if the output format is CSV, CSV headers will be present in each output file, statistics are only over each file's own records; and so on. -* `-n -`: Process no input files, nor standard input either. Useful for `mlr put` with `begin`/`end` statements only. (Same as `--from /dev/null`.) Also useful in `mlr -n put -v '...'` for analyzing abstract syntax trees (if that's your thing). +* `--fflush`: Force buffered output to be written after every output record. The default is flush output after every record if the output is to the terminal, or less often if the output is to a file or a pipe. The default is a significant performance optimization for large files. Use this flag to force frequent updates even when output is to a pipe or file, at a performance cost. +* `--from {filename}`: Use this to specify an input file before the verb(s), rather than after. May be used more than once. Example: `mlr --from a.dat --from b.dat cat` is the same as `mlr cat a.dat b.dat`. +* `--hash-records`: This is an internal parameter which normally does not need to be modified. It controls the mechanism by which Miller accesses fields within records. In general --no-hash-records is faster, and is the default. 
For specific use-cases involving data having many fields, and many of them being processed during a given processing run, --hash-records might offer a slight performance benefit. +* `--infer-int-as-float or -A`: Cast all integers in data files to floats. +* `--infer-none or -S`: Don't treat values like 123 or 456.7 in data files as int/float; leave them as strings. +* `--infer-octal or -O`: Treat numbers like 0123 in data files as numeric; default is string. Note that 00--07 etc scan as int; 08-09 scan as float. +* `--load {filename}`: Load DSL script file for all put/filter operations on the command line. If the name following `--load` is a directory, load all `*.mlr` files in that directory. This is just like `put -f` and `filter -f` except it's up-front on the command line, so you can do something like `alias mlr='mlr --load ~/myscripts'` if you like. +* `--mfrom {filenames}`: Use this to specify one of more input files before the verb(s), rather than after. May be used more than once. The list of filename must end with `--`. This is useful for example since `--from *.csv` doesn't do what you might hope but `--mfrom *.csv --` does. +* `--mload {filenames}`: Like `--load` but works with more than one filename, e.g. `--mload *.mlr --`. +* `--no-dedupe-field-names`: By default, if an input record has a field named `x` and another also named `x`, the second will be renamed `x_2`, and so on. With this flag provided, the second `x`'s value will replace the first `x`'s value when the record is read. This flag has no effect on JSON input records, where duplicate keys always result in the last one's value being retained. +* `--no-fflush`: Let buffered output not be written after every output record. The default is flush output after every record if the output is to the terminal, or less often if the output is to a file or a pipe. The default is a significant performance optimization for large files. 
Use this flag to allow less-frequent updates when output is to the terminal. This is unlikely to be a noticeable performance improvement, since direct-to-screen output for large files has its own overhead. +* `--no-hash-records`: See --hash-records. +* `--nr-progress-mod {m}`: With m a positive integer: print filename and record count to os.Stderr every m input records. +* `--ofmt {format}`: E.g. `%.18f`, `%.0f`, `%9.6e`. Please use sprintf-style codes for floating-point nummbers. If not specified, default formatting is used. See also the `fmtnum` function and the `format-values` verb. +* `--records-per-batch {n}`: This is an internal parameter for maximum number of records in a batch size. Normally this does not need to be modified. +* `--seed {n}`: with `n` of the form `12345678` or `0xcafefeed`. For `put`/`filter` `urand`, `urandint`, and `urand32`. +* `--tz {timezone}`: Specify timezone, overriding `$TZ` environment variable (if any). +* `-I`: Process files in-place. For each file name on the command line, output is written to a temp file in the same directory, which is then renamed over the original. Each file is processed in isolation: if the output format is CSV, CSV headers will be present in each output file, statistics are only over each file's own records; and so on. +* `-n`: Process no input files, nor standard input either. Useful for `mlr put` with `begin`/`end` statements only. (Same as `--from /dev/null`.) Also useful in `mlr -n put -v '...'` for analyzing abstract syntax trees (if that's your thing). ## Output-colorization flags @@ -436,24 +335,15 @@ and `mlr --list-color-names` to see available names (like `orchid`). **Flags:** -* `--always-color or -C -`: Instructs Miller to colorize output even when it normally would not. Useful for piping output to `less -r`. -* `--fail-color -`: Specify the color (see `--list-color-codes` and `--list-color-names`) for failing cases in `mlr regtest`. 
-* `--help-color -`: Specify the color (see `--list-color-codes` and `--list-color-names`) for highlights in `mlr help` output. -* `--key-color -`: Specify the color (see `--list-color-codes` and `--list-color-names`) for record keys. -* `--list-color-codes -`: Show the available color codes in the range 0..255, such as 170 for example. -* `--list-color-names -`: Show the names for the available color codes, such as `orchid` for example. -* `--no-color or -M -`: Instructs Miller to not colorize any output. -* `--pass-color -`: Specify the color (see `--list-color-codes` and `--list-color-names`) for passing cases in `mlr regtest`. -* `--value-color -`: Specify the color (see `--list-color-codes` and `--list-color-names`) for record values. +* `--always-color or -C`: Instructs Miller to colorize output even when it normally would not. Useful for piping output to `less -r`. +* `--fail-color`: Specify the color (see `--list-color-codes` and `--list-color-names`) for failing cases in `mlr regtest`. +* `--help-color`: Specify the color (see `--list-color-codes` and `--list-color-names`) for highlights in `mlr help` output. +* `--key-color`: Specify the color (see `--list-color-codes` and `--list-color-names`) for record keys. +* `--list-color-codes`: Show the available color codes in the range 0..255, such as 170 for example. +* `--list-color-names`: Show the names for the available color codes, such as `orchid` for example. +* `--no-color or -M`: Instructs Miller to not colorize any output. +* `--pass-color`: Specify the color (see `--list-color-codes` and `--list-color-names`) for passing cases in `mlr regtest`. +* `--value-color`: Specify the color (see `--list-color-codes` and `--list-color-names`) for record values. ## PPRINT-only flags @@ -462,10 +352,18 @@ These are flags which are applicable to PPRINT output format. **Flags:** -* `--barred -`: Prints a border around PPRINT output (not available for input). 
-* `--right -`: Right-justifies all fields for PPRINT output. +* `--barred`: Prints a border around PPRINT output (not available for input). +* `--right`: Right-justifies all fields for PPRINT output. + +## Profiling flags + +These are flags for profiling Miller performance. + +**Flags:** + +* `--cpuprofile {CPU-profile file name}`: Create a CPU-profile file for performance analysis. Instructions will be printed to stderr. This flag must be the very first thing after 'mlr' on the command line. +* `--time`: Print elapsed execution time in seconds to stderr at the end of the execution of the program. +* `--traceprofile`: Create a trace-profile file for performance analysis. Instructions will be printed to stderr. This flag must be the very first thing after 'mlr' on the command line. ## Separator flags @@ -566,28 +464,16 @@ Notes about all other separators: **Flags:** -* `--fs {string} -`: Specify FS for input and output. -* `--ifs {string} -`: Specify FS for input. -* `--ifs-regex {string} -`: Specify FS for input as a regular expression. -* `--ips {string} -`: Specify PS for input. -* `--ips-regex {string} -`: Specify PS for input as a regular expression. -* `--irs {string} -`: Specify RS for input. -* `--ofs {string} -`: Specify FS for output. -* `--ops {string} -`: Specify PS for output. -* `--ors {string} -`: Specify RS for output. -* `--ps {string} -`: Specify PS for input and output. -* `--repifs -`: Let IFS be repeated: e.g. for splitting on multiple spaces. -* `--rs {string} -`: Specify RS for input and output. +* `--fs {string}`: Specify FS for input and output. +* `--ifs {string}`: Specify FS for input. +* `--ifs-regex {string}`: Specify FS for input as a regular expression. +* `--ips {string}`: Specify PS for input. +* `--ips-regex {string}`: Specify PS for input as a regular expression. +* `--irs {string}`: Specify RS for input. +* `--ofs {string}`: Specify FS for output. +* `--ops {string}`: Specify PS for output. 
+* `--ors {string}`: Specify RS for output. +* `--ps {string}`: Specify PS for input and output. +* `--repifs`: Let IFS be repeated: e.g. for splitting on multiple spaces. +* `--rs {string}`: Specify RS for input and output. diff --git a/go.mod b/go.mod index 65acb8d29..48a652e94 100644 --- a/go.mod +++ b/go.mod @@ -17,7 +17,7 @@ require ( github.com/kballard/go-shellquote v0.0.0-20180428030007-95032a82bc51 github.com/lestrrat-go/strftime v1.0.4 github.com/mattn/go-isatty v0.0.12 - github.com/pkg/profile v1.6.0 // indirect + github.com/pkg/profile v1.6.0 github.com/stretchr/testify v1.7.0 // indirect golang.org/x/sys v0.0.0-20210326220804-49726bf1d181 golang.org/x/term v0.0.0-20201210144234-2321bbc49cbf diff --git a/internal/pkg/auxents/auxents.go b/internal/pkg/auxents/auxents.go index 231a16899..ef87d510e 100644 --- a/internal/pkg/auxents/auxents.go +++ b/internal/pkg/auxents/auxents.go @@ -8,10 +8,12 @@ package auxents import ( "fmt" "os" + "runtime" "github.com/johnkerl/miller/internal/pkg/auxents/help" "github.com/johnkerl/miller/internal/pkg/auxents/regtest" "github.com/johnkerl/miller/internal/pkg/auxents/repl" + "github.com/johnkerl/miller/internal/pkg/version" ) // tAuxMain is a function-pointer type for the entrypoint handler for a given auxent, @@ -38,6 +40,7 @@ func init() { {"help", help.HelpMain}, {"regtest", regtest.RegTestMain}, {"repl", repl.ReplMain}, + {"version", showVersion}, } } @@ -82,3 +85,8 @@ func ShowAuxEntries(o *os.File) { fmt.Fprintf(o, "For more information, please invoke mlr {subcommand} --help.\n") } + +func showVersion(args []string) int { + fmt.Printf("mlr version %s for %s/%s/%s\n", version.STRING, runtime.GOOS, runtime.GOARCH, runtime.Version()) + return 0 +} diff --git a/internal/pkg/auxents/repl/prompt.go b/internal/pkg/auxents/repl/prompt.go index 350bf849f..c4c4a4a76 100644 --- a/internal/pkg/auxents/repl/prompt.go +++ b/internal/pkg/auxents/repl/prompt.go @@ -50,7 +50,7 @@ func getPrompt2() string { func (repl *Repl) 
printStartupBanner() { if repl.inputIsTerminal { - fmt.Printf("Miller %s REPL for %s:%s:%s\n", version.STRING, runtime.GOOS, runtime.GOARCH, runtime.Version()) + fmt.Printf("Miller %s REPL for %s/%s/%s\n", version.STRING, runtime.GOOS, runtime.GOARCH, runtime.Version()) fmt.Printf("Docs: %s\n", lib.DOC_URL) fmt.Printf("Type ':h' or ':help' for online help; ':q' or ':quit' to quit.\n") } diff --git a/internal/pkg/cli/option_parse.go b/internal/pkg/cli/option_parse.go index 95cddff89..7ed501f36 100644 --- a/internal/pkg/cli/option_parse.go +++ b/internal/pkg/cli/option_parse.go @@ -98,6 +98,7 @@ var FLAG_TABLE = FlagTable{ &CommentsInDataFlagSection, &OutputColorizationFlagSection, &FlattenUnflattenFlagSection, + &ProfilingFlagSection, &MiscFlagSection, }, } @@ -2410,6 +2411,51 @@ var FlattenUnflattenFlagSection = FlagSection{ }, } +// ================================================================ +// PROFILING FLAGS + +func ProfilingPrintInfo() { + fmt.Print("These are flags for profiling Miller performance.") +} + +func init() { ProfilingFlagSection.Sort() } + +var ProfilingFlagSection = FlagSection{ + name: "Profiling flags", + infoPrinter: ProfilingPrintInfo, + flags: []Flag{ + { + name: "--cpuprofile", + arg: "{CPU-profile file name}", + help: `Create a CPU-profile file for performance analysis. Instructions will be printed to stderr. +This flag must be the very first thing after 'mlr' on the command line.`, + parser: func(args []string, argc int, pargi *int, options *TOptions) { + // Already handled in main(). Nothing to do here except to accept this as valid syntax. + *pargi += 2 + }, + }, + + { + name: "--traceprofile", + help: `Create a trace-profile file for performance analysis. Instructions will be printed to stderr. +This flag must be the very first thing after 'mlr' on the command line.`, + parser: func(args []string, argc int, pargi *int, options *TOptions) { + // Already handled in main(). Nothing to do here except to accept this as valid syntax. 
+ *pargi += 1 + }, + }, + + { + name: "--time", + help: "Print elapsed execution time in seconds to stderr at the end of the execution of the program.", + parser: func(args []string, argc int, pargi *int, options *TOptions) { + options.PrintElapsedTime = true + *pargi += 1 + }, + }, + }, +} + // ================================================================ // MISC FLAGS diff --git a/internal/pkg/cli/option_types.go b/internal/pkg/cli/option_types.go index 68c08fc8a..7b366a5cd 100644 --- a/internal/pkg/cli/option_types.go +++ b/internal/pkg/cli/option_types.go @@ -153,6 +153,8 @@ type TOptions struct { HaveRandSeed bool RandSeed int + + PrintElapsedTime bool // mlr --time } // Not usable until FinalizeReaderOptions and FinalizeWriterOptions are called. diff --git a/internal/pkg/climain/mlrcli_parse.go b/internal/pkg/climain/mlrcli_parse.go index 2a2cb3bb3..d0eb2c649 100644 --- a/internal/pkg/climain/mlrcli_parse.go +++ b/internal/pkg/climain/mlrcli_parse.go @@ -1,3 +1,4 @@ +// ================================================================ // Miller main command-line parsing. // // Before Miller 6 the ordering was: @@ -65,6 +66,7 @@ // foo.csv' the '--csv' looks like it belongs to the 'head' verb. When people // use '#!/bin/sh' scripts they need to insert the '--' in 'mlr head -n 10 -- // --csv foo.csv'; for 'mlr -s' we insert the '--' for them. +// ================================================================ package climain @@ -128,15 +130,7 @@ func parseCommandLinePassOne( oargi := argi if args[argi][0] == '-' { - - if args[argi] == "--cpuprofile" { - // Already handled in main(); ignore here, and don't send it to pass two. - cli.CheckArgCount(args, argi, argc, 1) - argi += 2 - } else if args[argi] == "--traceprofile" { - // Already handled in main(); ignore here, and don't send it to pass two. - argi += 1 - } else if args[argi] == "--version" { + if args[argi] == "--version" { // Exiting flag: handle it immediately. 
fmt.Printf("mlr %s\n", version.STRING) os.Exit(0) diff --git a/internal/pkg/entrypoint/entrypoint.go b/internal/pkg/entrypoint/entrypoint.go index 26fb34565..6445b8733 100644 --- a/internal/pkg/entrypoint/entrypoint.go +++ b/internal/pkg/entrypoint/entrypoint.go @@ -20,8 +20,11 @@ import ( "github.com/johnkerl/miller/internal/pkg/transformers" ) -// ---------------------------------------------------------------- -func Main() { +type MainReturn struct { + PrintElapsedTime bool +} + +func Main() MainReturn { // Special handling for Windows so we can do things like: // // mlr put '$a = $b . "cd \"efg\" hi"' foo.dat @@ -55,6 +58,10 @@ func Main() { } else { processInPlace(options) } + + return MainReturn{ + PrintElapsedTime: options.PrintElapsedTime, + } } // ---------------------------------------------------------------- diff --git a/internal/pkg/input/record_reader_benchmark_test.go b/internal/pkg/input/record_reader_benchmark_test.go new file mode 100644 index 000000000..93ce89857 --- /dev/null +++ b/internal/pkg/input/record_reader_benchmark_test.go @@ -0,0 +1,71 @@ +package input + +import ( + "testing" + + "github.com/stretchr/testify/assert" + + "github.com/johnkerl/miller/internal/pkg/cli" +) + +// go test -run=nonesuch -bench=. github.com/johnkerl/miller/internal/pkg/input/... 
+ +func BenchmarkDKVPParse(b *testing.B) { + readerOptions := &cli.TReaderOptions{ + InputFileFormat: "dkvp", + IFS: ",", + IPS: "=", + IRS: "\n", + } + reader, err := NewRecordReaderDKVP(readerOptions, 1) + assert.Nil(b, err) + + for i := 0; i < b.N; i++ { + _, _ = recordFromDKVPLine( + reader, + "color=yellow,shape=triangle,flag=true,k=1,index=11,quantity=43.6498,rate=9.8870", + ) + } +} + +func BenchmarkNIDXParse(b *testing.B) { + readerOptions := &cli.TReaderOptions{ + InputFileFormat: "nidx", + IFS: " ", + AllowRepeatIFS: true, + IRS: "\n", + } + reader, err := NewRecordReaderNIDX(readerOptions, 1) + assert.Nil(b, err) + + for i := 0; i < b.N; i++ { + _, _ = recordFromDKVPLine( + reader, + "yellow triangle true 1 11 43.6498 9.8870", + ) + } +} + +func BenchmarkXTABParse(b *testing.B) { + readerOptions := &cli.TReaderOptions{ + InputFileFormat: "xtab", + IPS: " ", + IFS: "\n", + IRS: "\n", + } + reader, err := NewRecordReaderXTAB(readerOptions, 1) + assert.Nil(b, err) + + stanza := newStanza() + stanza.dataLines.PushBack("color yellow") + stanza.dataLines.PushBack("shape triangle") + stanza.dataLines.PushBack("flag true") + stanza.dataLines.PushBack("k 1") + stanza.dataLines.PushBack("index 11") + stanza.dataLines.PushBack("quantity 43.6498") + stanza.dataLines.PushBack("rate 9.8870") + + for i := 0; i < b.N; i++ { + _, _ = reader.recordFromXTABLines(stanza.dataLines) + } +} diff --git a/internal/pkg/lib/util.go b/internal/pkg/lib/util.go index bee04660a..ff4c692cb 100644 --- a/internal/pkg/lib/util.go +++ b/internal/pkg/lib/util.go @@ -101,7 +101,7 @@ func TryIntFromString(input string) (int, bool) { } } - // Following twos-complement formatting familiar from all manners of + // Following twos-complement formatting familiar from all manner of // languages, including C which was Miller's original implementation // language, we want to allow 0x00....00 through 0x7f....ff as positive // 64-bit integers and 0x80....00 through 0xff....ff as negative ones. 
Go's diff --git a/internal/pkg/mlrval/mlrval_benchmark_test.go b/internal/pkg/mlrval/mlrval_benchmark_test.go new file mode 100644 index 000000000..cb8a1f6ee --- /dev/null +++ b/internal/pkg/mlrval/mlrval_benchmark_test.go @@ -0,0 +1,34 @@ +package mlrval + +import ( + "testing" +) + +// go test -run=nonesuch -bench=. github.com/johnkerl/miller/internal/pkg/mlrval/... + +func BenchmarkFromDeferredType(b *testing.B) { + for i := 0; i < b.N; i++ { + _ = FromDeferredType("123") + } +} + +func BenchmarkInferIntFromDeferredType(b *testing.B) { + for i := 0; i < b.N; i++ { + mv := FromDeferredType("123") + mv.Type() + } +} + +func BenchmarkInferFloatFromDeferredType(b *testing.B) { + for i := 0; i < b.N; i++ { + mv := FromDeferredType("123.4") + mv.Type() + } +} + +func BenchmarkInferStringFromDeferredType(b *testing.B) { + for i := 0; i < b.N; i++ { + mv := FromDeferredType("abc") + mv.Type() + } +} diff --git a/internal/pkg/mlrval/mlrval_infer.go b/internal/pkg/mlrval/mlrval_infer.go index bdfb10a2b..68d88763c 100644 --- a/internal/pkg/mlrval/mlrval_infer.go +++ b/internal/pkg/mlrval/mlrval_infer.go @@ -1,10 +1,9 @@ package mlrval import ( - "regexp" - "strings" + "strconv" - "github.com/johnkerl/miller/internal/pkg/lib" + "github.com/johnkerl/miller/internal/pkg/scan" ) // TODO: comment no infer-bool from data files. Always false in this path. @@ -15,19 +14,19 @@ import ( func (mv *Mlrval) Type() MVType { if mv.mvtype == MT_PENDING { - packageLevelInferrer(mv, mv.printrep, false) + packageLevelInferrer(mv) } return mv.mvtype } // Support for mlr -S, mlr -A, mlr -O. -type tInferrer func(mv *Mlrval, input string, inferBool bool) *Mlrval +type tInferrer func(mv *Mlrval) *Mlrval -var packageLevelInferrer tInferrer = inferWithOctalAsString +var packageLevelInferrer tInferrer = inferNormally -// SetInferrerOctalAsInt is for default behavior. -func SetInferrerOctalAsString() { - packageLevelInferrer = inferWithOctalAsString +// SetInferNormally is the default behavior. 
+func SetInferNormally() { + packageLevelInferrer = inferNormally } // SetInferrerOctalAsInt is for mlr -O. @@ -42,67 +41,25 @@ func SetInferrerIntAsFloat() { // SetInferrerStringOnly is for mlr -S. func SetInferrerStringOnly() { - packageLevelInferrer = inferStringOnly + packageLevelInferrer = inferString } -// When loading data files, don't scan these words into floats -- even though -// the Go library is willing to do so. -var downcasedFloatNamesToNotInfer = map[string]bool{ - "inf": true, - "+inf": true, - "-inf": true, - "infinity": true, - "+infinity": true, - "-infinity": true, - "nan": true, +// ---------------------------------------------------------------- + +func inferNormally(mv *Mlrval) *Mlrval { + scanType := scan.FindScanType(mv.printrep) + return normalInferrerTable[scanType](mv) } -var octalDetector = regexp.MustCompile("^-?0[0-9]+") - -// inferWithOctalAsString is for default behavior. -func inferWithOctalAsString(mv *Mlrval, input string, inferBool bool) *Mlrval { - inferWithOctalAsInt(mv, input, inferBool) - if mv.mvtype != MT_INT && mv.mvtype != MT_FLOAT { - return mv - } - - if octalDetector.MatchString(mv.printrep) { - return mv.SetFromString(input) - } else { - return mv - } -} - -// inferWithOctalAsInt is for mlr -O. 
-func inferWithOctalAsInt(mv *Mlrval, input string, inferBool bool) *Mlrval { - if input == "" { - return mv.SetFromVoid() - } - - intval, iok := lib.TryIntFromString(input) - if iok { - return mv.SetFromPrevalidatedIntString(input, intval) - } - - if downcasedFloatNamesToNotInfer[strings.ToLower(input)] == false { - floatval, fok := lib.TryFloatFromString(input) - if fok { - return mv.SetFromPrevalidatedFloatString(input, floatval) - } - } - - if inferBool { - boolval, bok := lib.TryBoolFromBoolString(input) - if bok { - return mv.SetFromPrevalidatedBoolString(input, boolval) - } - } - return mv.SetFromString(input) +// xxx temp +func inferWithOctalAsInt(mv *Mlrval) *Mlrval { + scanType := scan.FindScanType(mv.printrep) + return leadingZeroAsIntInferrerTable[scanType](mv) } // inferWithIntAsFloat is for mlr -A. -func inferWithIntAsFloat(mv *Mlrval, input string, inferBool bool) *Mlrval { - inferWithOctalAsString(mv, input, inferBool) +func inferWithIntAsFloat(mv *Mlrval) *Mlrval { + inferNormally(mv) if mv.Type() == MT_INT { mv.floatval = float64(mv.intval) mv.mvtype = MT_FLOAT @@ -110,7 +67,166 @@ func inferWithIntAsFloat(mv *Mlrval, input string, inferBool bool) *Mlrval { return mv } -// inferStringOnly is for mlr -S. -func inferStringOnly(mv *Mlrval, input string, inferBool bool) *Mlrval { - return mv.SetFromString(input) +// inferString is for mlr -S. +func inferString(mv *Mlrval) *Mlrval { + return mv.SetFromString(mv.printrep) +} + +// ---------------------------------------------------------------- + +// Important: synchronize this with the type-ordering in the scan package. +var normalInferrerTable []tInferrer = []tInferrer{ + inferString, + inferDecimalInt, + inferString, // inferLeadingZeroDecimalIntAsInt, + inferOctalInt, + inferString, // inferFromLeadingZeroOctalIntAsInt, + inferHexInt, + inferBinaryInt, + inferMaybeFloat, +} + +// Important: synchronize this with the type-ordering in the scan package. 
+var leadingZeroAsIntInferrerTable []tInferrer = []tInferrer{ + inferString, + inferDecimalInt, + inferLeadingZeroDecimalIntAsInt, + inferOctalInt, + inferFromLeadingZeroOctalIntAsInt, + inferHexInt, + inferBinaryInt, + inferMaybeFloat, +} + +// TODO: comment +func inferDecimalInt(mv *Mlrval) *Mlrval { + intval, err := strconv.ParseInt(mv.printrep, 10, 64) + if err == nil { + return mv.SetFromPrevalidatedIntString(mv.printrep, int(intval)) + } else { + return mv.SetFromString(mv.printrep) + } +} + +// TODO: comment +func inferLeadingZeroDecimalIntAsInt(mv *Mlrval) *Mlrval { + intval, err := strconv.ParseInt(mv.printrep, 10, 64) + if err == nil { + return mv.SetFromPrevalidatedIntString(mv.printrep, int(intval)) + } else { + return mv.SetFromString(mv.printrep) + } +} + +// TODO: comment +// E.g. explicit 0o377, not 0377 +func inferOctalInt(mv *Mlrval) *Mlrval { + return inferBaseInt(mv, 8) +} + +// TODO: comment +func inferFromLeadingZeroOctalIntAsInt(mv *Mlrval) *Mlrval { + intval, err := strconv.ParseInt(mv.printrep, 8, 64) + if err == nil { + return mv.SetFromPrevalidatedIntString(mv.printrep, int(intval)) + } else { + return mv.SetFromString(mv.printrep) + } +} + +// TODO: comment +func inferHexInt(mv *Mlrval) *Mlrval { + var input string + var negate bool + // Skip known leading 0x or -0x prefix + if mv.printrep[0] == '-' { + input = mv.printrep[3:] + negate = true + } else { + input = mv.printrep[2:] + negate = false + } + + // Following twos-complement formatting familiar from all manner of + // languages, including C which was Miller's original implementation + // language, we want to allow 0x00....00 through 0x7f....ff as positive + // 64-bit integers and 0x80....00 through 0xff....ff as negative ones. Go's + // signed-int parsing explicitly doesn't allow that, but we don't want Go + // semantics to dictate Miller semantics. So, we try signed-int parsing + // for 0x00....00 through 0x7f....ff, as well as positive or negative + // decimal. 
Failing that, we try unsigned-int parsing for 0x80....00 + // through 0xff....ff. + + i0 := input[0] + if len(input) == 16 && ('8' <= i0 && i0 <= 'f') { + uintval, err := strconv.ParseUint(input, 16, 64) + intval := int(uintval) + if negate { + intval = -intval + } + if err == nil { + return mv.SetFromPrevalidatedIntString(mv.printrep, intval) + } else { + return mv.SetFromString(mv.printrep) + } + } else { + intval, err := strconv.ParseInt(input, 16, 64) + if negate { + intval = -intval + } + if err == nil { + return mv.SetFromPrevalidatedIntString(mv.printrep, int(intval)) + } else { + return mv.SetFromString(mv.printrep) + } + } + +} + +// TODO: comment +func inferBinaryInt(mv *Mlrval) *Mlrval { + return inferBaseInt(mv, 2) +} + +// TODO: comment +func inferMaybeFloat(mv *Mlrval) *Mlrval { + floatval, err := strconv.ParseFloat(mv.printrep, 64) + if err == nil { + return mv.SetFromPrevalidatedFloatString(mv.printrep, floatval) + } else { + return mv.SetFromString(mv.printrep) + } +} + +// TODO: comment +func inferFromBool(mv *Mlrval) *Mlrval { + if mv.printrep == "true" { + return mv.SetFromPrevalidatedBoolString(mv.printrep, true) + } else { + return mv.SetFromPrevalidatedBoolString(mv.printrep, false) + } +} + +// TODO: comment +// Shared code for 0o/0b integers +func inferBaseInt(mv *Mlrval, base int) *Mlrval { + var input string + var negate bool + // Skip known leading 0x or -0x prefix + if mv.printrep[0] == '-' { + input = mv.printrep[3:] + negate = true + } else { + input = mv.printrep[2:] + negate = false + } + intval, err := strconv.ParseInt(input, base, 64) + if err == nil { + if negate { + intval = -intval + } + return mv.SetFromPrevalidatedIntString(mv.printrep, int(intval)) + } else { + return mv.SetFromString(mv.printrep) + } } diff --git a/internal/pkg/mlrval/mlrval_infer_test.go b/internal/pkg/mlrval/mlrval_infer_test.go new file mode 100644 index 000000000..dcb4b01b9 --- /dev/null +++ b/internal/pkg/mlrval/mlrval_infer_test.go @@ -0,0 +1,252 @@ 
+// ================================================================ +// Tests mlrval constructors. +// ================================================================ + +package mlrval + +import ( + "testing" + + "github.com/stretchr/testify/assert" +) + +func TestInferNormally(t *testing.T) { + assert.True(t, inferNormally(FromDeferredType("")).IsVoid()) + + assert.True(t, inferNormally(FromDeferredType("true")).IsString()) + assert.True(t, inferNormally(FromDeferredType("false")).IsString()) + + assert.True(t, inferNormally(FromDeferredType("abc")).IsString()) + + assert.True(t, inferNormally(FromDeferredType("0123")).IsString()) + assert.True(t, inferNormally(FromDeferredType("-0123")).IsString()) + assert.True(t, inferNormally(FromDeferredType("0377")).IsString()) + assert.True(t, inferNormally(FromDeferredType("-0377")).IsString()) + assert.True(t, inferNormally(FromDeferredType("0923")).IsString()) + assert.True(t, inferNormally(FromDeferredType("-0923")).IsString()) + + assert.True(t, inferNormally(FromDeferredType("123")).IsInt()) + assert.True(t, inferNormally(FromDeferredType("-123")).IsInt()) + + assert.True(t, inferNormally(FromDeferredType("0xff")).IsInt()) + assert.True(t, inferNormally(FromDeferredType("-0xff")).IsInt()) + assert.True(t, inferNormally(FromDeferredType("0b1011")).IsInt()) + assert.True(t, inferNormally(FromDeferredType("-0b1011")).IsInt()) + assert.True(t, inferNormally(FromDeferredType("0x7fffffffffffffff")).IsInt()) + assert.True(t, inferNormally(FromDeferredType("0x8000000000000000")).IsInt()) + assert.True(t, inferNormally(FromDeferredType("0xffffffffffffffff")).IsInt()) + + assert.True(t, inferNormally(FromDeferredType("12_3")).IsString()) + assert.True(t, inferNormally(FromDeferredType("-12_3")).IsString()) + assert.True(t, inferNormally(FromDeferredType("1_2.3_4")).IsString()) + assert.True(t, inferNormally(FromDeferredType("-1_2.3_4")).IsString()) + assert.True(t, inferNormally(FromDeferredType("0xca_fe")).IsString()) + 
assert.True(t, inferNormally(FromDeferredType("-0xca_fe")).IsString()) + assert.True(t, inferNormally(FromDeferredType("0b1011_1101")).IsString()) + assert.True(t, inferNormally(FromDeferredType("-0b1011_1101")).IsString()) + + assert.True(t, inferNormally(FromDeferredType(".")).IsString()) + assert.True(t, inferNormally(FromDeferredType("-.")).IsString()) + assert.True(t, inferNormally(FromDeferredType("123.")).IsFloat()) + assert.True(t, inferNormally(FromDeferredType("-123.")).IsFloat()) + assert.True(t, inferNormally(FromDeferredType(".123")).IsFloat()) + assert.True(t, inferNormally(FromDeferredType("-.123")).IsFloat()) + assert.True(t, inferNormally(FromDeferredType("123.456")).IsFloat()) + assert.True(t, inferNormally(FromDeferredType("-123.456")).IsFloat()) + assert.True(t, inferNormally(FromDeferredType("1e2.")).IsString()) + assert.True(t, inferNormally(FromDeferredType("-1e2.")).IsString()) + assert.True(t, inferNormally(FromDeferredType("1e-2.")).IsString()) + assert.True(t, inferNormally(FromDeferredType("-1e-2.")).IsString()) + assert.True(t, inferNormally(FromDeferredType("1.2e3")).IsFloat()) + assert.True(t, inferNormally(FromDeferredType("-1.2e3")).IsFloat()) + assert.True(t, inferNormally(FromDeferredType("1.2e-3")).IsFloat()) + assert.True(t, inferNormally(FromDeferredType("-1.2e-3")).IsFloat()) + assert.True(t, inferNormally(FromDeferredType("1.e3")).IsFloat()) + assert.True(t, inferNormally(FromDeferredType("-1.e3")).IsFloat()) + assert.True(t, inferNormally(FromDeferredType("1.e-3")).IsFloat()) + assert.True(t, inferNormally(FromDeferredType("-1.e-3")).IsFloat()) + assert.True(t, inferNormally(FromDeferredType(".2e3")).IsFloat()) + assert.True(t, inferNormally(FromDeferredType("-.2e3")).IsFloat()) + assert.True(t, inferNormally(FromDeferredType(".2e-3")).IsFloat()) + assert.True(t, inferNormally(FromDeferredType("-.2e-3")).IsFloat()) +} + +func TestInferWithOctalAsInt(t *testing.T) { + assert.True(t, 
inferWithOctalAsInt(FromDeferredType("")).IsVoid()) + + assert.True(t, inferWithOctalAsInt(FromDeferredType("true")).IsString()) + assert.True(t, inferWithOctalAsInt(FromDeferredType("false")).IsString()) + + assert.True(t, inferWithOctalAsInt(FromDeferredType("abc")).IsString()) + + assert.True(t, inferWithOctalAsInt(FromDeferredType("0123")).IsInt()) + assert.True(t, inferWithOctalAsInt(FromDeferredType("-0123")).IsInt()) + assert.True(t, inferWithOctalAsInt(FromDeferredType("0377")).IsInt()) + assert.True(t, inferWithOctalAsInt(FromDeferredType("-0377")).IsInt()) + assert.True(t, inferWithOctalAsInt(FromDeferredType("0923")).IsInt()) + assert.True(t, inferWithOctalAsInt(FromDeferredType("-0923")).IsInt()) + + assert.True(t, inferWithOctalAsInt(FromDeferredType("123")).IsInt()) + assert.True(t, inferWithOctalAsInt(FromDeferredType("-123")).IsInt()) + assert.True(t, inferWithOctalAsInt(FromDeferredType("0xff")).IsInt()) + assert.True(t, inferWithOctalAsInt(FromDeferredType("-0xff")).IsInt()) + assert.True(t, inferWithOctalAsInt(FromDeferredType("0b1011")).IsInt()) + assert.True(t, inferWithOctalAsInt(FromDeferredType("-0b1011")).IsInt()) + assert.True(t, inferWithOctalAsInt(FromDeferredType("0x7fffffffffffffff")).IsInt()) + assert.True(t, inferWithOctalAsInt(FromDeferredType("0x8000000000000000")).IsInt()) + assert.True(t, inferWithOctalAsInt(FromDeferredType("0xffffffffffffffff")).IsInt()) + + assert.True(t, inferWithOctalAsInt(FromDeferredType("12_3")).IsString()) + assert.True(t, inferWithOctalAsInt(FromDeferredType("-12_3")).IsString()) + assert.True(t, inferWithOctalAsInt(FromDeferredType("1_2.3_4")).IsString()) + assert.True(t, inferWithOctalAsInt(FromDeferredType("-1_2.3_4")).IsString()) + assert.True(t, inferWithOctalAsInt(FromDeferredType("0xca_fe")).IsString()) + assert.True(t, inferWithOctalAsInt(FromDeferredType("-0xca_fe")).IsString()) + assert.True(t, inferWithOctalAsInt(FromDeferredType("0b1011_1101")).IsString()) + assert.True(t, 
inferWithOctalAsInt(FromDeferredType("-0b1011_1101")).IsString()) + + assert.True(t, inferWithOctalAsInt(FromDeferredType(".")).IsString()) + assert.True(t, inferWithOctalAsInt(FromDeferredType("-.")).IsString()) + assert.True(t, inferWithOctalAsInt(FromDeferredType("123.")).IsFloat()) + assert.True(t, inferWithOctalAsInt(FromDeferredType("-123.")).IsFloat()) + assert.True(t, inferWithOctalAsInt(FromDeferredType(".123")).IsFloat()) + assert.True(t, inferWithOctalAsInt(FromDeferredType("-.123")).IsFloat()) + assert.True(t, inferWithOctalAsInt(FromDeferredType("123.456")).IsFloat()) + assert.True(t, inferWithOctalAsInt(FromDeferredType("-123.456")).IsFloat()) + assert.True(t, inferWithOctalAsInt(FromDeferredType("1e2.")).IsString()) + assert.True(t, inferWithOctalAsInt(FromDeferredType("-1e2.")).IsString()) + assert.True(t, inferWithOctalAsInt(FromDeferredType("1e-2.")).IsString()) + assert.True(t, inferWithOctalAsInt(FromDeferredType("-1e-2.")).IsString()) + assert.True(t, inferWithOctalAsInt(FromDeferredType("1.2e3")).IsFloat()) + assert.True(t, inferWithOctalAsInt(FromDeferredType("-1.2e3")).IsFloat()) + assert.True(t, inferWithOctalAsInt(FromDeferredType("1.2e-3")).IsFloat()) + assert.True(t, inferWithOctalAsInt(FromDeferredType("-1.2e-3")).IsFloat()) + assert.True(t, inferWithOctalAsInt(FromDeferredType("1.e3")).IsFloat()) + assert.True(t, inferWithOctalAsInt(FromDeferredType("-1.e3")).IsFloat()) + assert.True(t, inferWithOctalAsInt(FromDeferredType("1.e-3")).IsFloat()) + assert.True(t, inferWithOctalAsInt(FromDeferredType("-1.e-3")).IsFloat()) + assert.True(t, inferWithOctalAsInt(FromDeferredType(".2e3")).IsFloat()) + assert.True(t, inferWithOctalAsInt(FromDeferredType("-.2e3")).IsFloat()) + assert.True(t, inferWithOctalAsInt(FromDeferredType(".2e-3")).IsFloat()) + assert.True(t, inferWithOctalAsInt(FromDeferredType("-.2e-3")).IsFloat()) +} + +func TestInferWithIntAsFloat(t *testing.T) { + assert.True(t, inferWithIntAsFloat(FromDeferredType("")).IsVoid()) + + 
assert.True(t, inferWithIntAsFloat(FromDeferredType("true")).IsString()) + assert.True(t, inferWithIntAsFloat(FromDeferredType("false")).IsString()) + + assert.True(t, inferWithIntAsFloat(FromDeferredType("abc")).IsString()) + + assert.True(t, inferWithIntAsFloat(FromDeferredType("0123")).IsString()) + assert.True(t, inferWithIntAsFloat(FromDeferredType("-0123")).IsString()) + assert.True(t, inferWithIntAsFloat(FromDeferredType("0377")).IsString()) + assert.True(t, inferWithIntAsFloat(FromDeferredType("-0377")).IsString()) + assert.True(t, inferWithIntAsFloat(FromDeferredType("0923")).IsString()) + assert.True(t, inferWithIntAsFloat(FromDeferredType("-0923")).IsString()) + + assert.True(t, inferWithIntAsFloat(FromDeferredType("123")).IsFloat()) + assert.True(t, inferWithIntAsFloat(FromDeferredType("-123")).IsFloat()) + assert.True(t, inferWithIntAsFloat(FromDeferredType("0xff")).IsFloat()) + assert.True(t, inferWithIntAsFloat(FromDeferredType("-0xff")).IsFloat()) + assert.True(t, inferWithIntAsFloat(FromDeferredType("0b1011")).IsFloat()) + assert.True(t, inferWithIntAsFloat(FromDeferredType("-0b1011")).IsFloat()) + assert.True(t, inferWithIntAsFloat(FromDeferredType("0x7fffffffffffffff")).IsFloat()) + assert.True(t, inferWithIntAsFloat(FromDeferredType("0x8000000000000000")).IsFloat()) + assert.True(t, inferWithIntAsFloat(FromDeferredType("0xffffffffffffffff")).IsFloat()) + + assert.True(t, inferWithIntAsFloat(FromDeferredType("12_3")).IsString()) + assert.True(t, inferWithIntAsFloat(FromDeferredType("-12_3")).IsString()) + assert.True(t, inferWithIntAsFloat(FromDeferredType("1_2.3_4")).IsString()) + assert.True(t, inferWithIntAsFloat(FromDeferredType("-1_2.3_4")).IsString()) + assert.True(t, inferWithIntAsFloat(FromDeferredType("0xca_fe")).IsString()) + assert.True(t, inferWithIntAsFloat(FromDeferredType("-0xca_fe")).IsString()) + assert.True(t, inferWithIntAsFloat(FromDeferredType("0b1011_1101")).IsString()) + assert.True(t, 
inferWithIntAsFloat(FromDeferredType("-0b1011_1101")).IsString()) + + assert.True(t, inferWithIntAsFloat(FromDeferredType(".")).IsString()) + assert.True(t, inferWithIntAsFloat(FromDeferredType("-.")).IsString()) + assert.True(t, inferWithIntAsFloat(FromDeferredType("123.")).IsFloat()) + assert.True(t, inferWithIntAsFloat(FromDeferredType("-123.")).IsFloat()) + assert.True(t, inferWithIntAsFloat(FromDeferredType(".123")).IsFloat()) + assert.True(t, inferWithIntAsFloat(FromDeferredType("-.123")).IsFloat()) + assert.True(t, inferWithIntAsFloat(FromDeferredType("123.456")).IsFloat()) + assert.True(t, inferWithIntAsFloat(FromDeferredType("-123.456")).IsFloat()) + assert.True(t, inferWithIntAsFloat(FromDeferredType("1e2.")).IsString()) + assert.True(t, inferWithIntAsFloat(FromDeferredType("-1e2.")).IsString()) + assert.True(t, inferWithIntAsFloat(FromDeferredType("1e-2.")).IsString()) + assert.True(t, inferWithIntAsFloat(FromDeferredType("-1e-2.")).IsString()) + assert.True(t, inferWithIntAsFloat(FromDeferredType("1.2e3")).IsFloat()) + assert.True(t, inferWithIntAsFloat(FromDeferredType("-1.2e3")).IsFloat()) + assert.True(t, inferWithIntAsFloat(FromDeferredType("1.2e-3")).IsFloat()) + assert.True(t, inferWithIntAsFloat(FromDeferredType("-1.2e-3")).IsFloat()) + assert.True(t, inferWithIntAsFloat(FromDeferredType("1.e3")).IsFloat()) + assert.True(t, inferWithIntAsFloat(FromDeferredType("-1.e3")).IsFloat()) + assert.True(t, inferWithIntAsFloat(FromDeferredType("1.e-3")).IsFloat()) + assert.True(t, inferWithIntAsFloat(FromDeferredType("-1.e-3")).IsFloat()) + assert.True(t, inferWithIntAsFloat(FromDeferredType(".2e3")).IsFloat()) + assert.True(t, inferWithIntAsFloat(FromDeferredType("-.2e3")).IsFloat()) + assert.True(t, inferWithIntAsFloat(FromDeferredType(".2e-3")).IsFloat()) + assert.True(t, inferWithOctalAsInt(FromDeferredType("-.2e-3")).IsFloat()) +} + +func TestInferString(t *testing.T) { + assert.True(t, inferString(FromDeferredType("")).IsVoid()) + + assert.True(t, 
inferString(FromDeferredType("true")).IsString()) + assert.True(t, inferString(FromDeferredType("false")).IsString()) + + assert.True(t, inferString(FromDeferredType("abc")).IsString()) + + assert.True(t, inferString(FromDeferredType("0123")).IsString()) + assert.True(t, inferString(FromDeferredType("-0123")).IsString()) + assert.True(t, inferString(FromDeferredType("0377")).IsString()) + assert.True(t, inferString(FromDeferredType("-0377")).IsString()) + assert.True(t, inferString(FromDeferredType("0923")).IsString()) + assert.True(t, inferString(FromDeferredType("-0923")).IsString()) + + assert.True(t, inferString(FromDeferredType("123")).IsString()) + assert.True(t, inferString(FromDeferredType("-123")).IsString()) + assert.True(t, inferString(FromDeferredType("0xff")).IsString()) + assert.True(t, inferString(FromDeferredType("-0xff")).IsString()) + assert.True(t, inferString(FromDeferredType("0b1011")).IsString()) + assert.True(t, inferString(FromDeferredType("-0b1011")).IsString()) + assert.True(t, inferString(FromDeferredType("0x7fffffffffffffff")).IsString()) + assert.True(t, inferString(FromDeferredType("0x8000000000000000")).IsString()) + assert.True(t, inferString(FromDeferredType("0xffffffffffffffff")).IsString()) + + assert.True(t, inferString(FromDeferredType("12_3")).IsString()) + assert.True(t, inferString(FromDeferredType("-12_3")).IsString()) + assert.True(t, inferString(FromDeferredType("1_2.3_4")).IsString()) + assert.True(t, inferString(FromDeferredType("-1_2.3_4")).IsString()) + assert.True(t, inferString(FromDeferredType("0xca_fe")).IsString()) + assert.True(t, inferString(FromDeferredType("-0xca_fe")).IsString()) + assert.True(t, inferString(FromDeferredType("0b1011_1101")).IsString()) + assert.True(t, inferString(FromDeferredType("-0b1011_1101")).IsString()) + + assert.True(t, inferString(FromDeferredType(".")).IsString()) + assert.True(t, inferString(FromDeferredType("-.")).IsString()) + assert.True(t, 
inferString(FromDeferredType("123.")).IsString()) + assert.True(t, inferString(FromDeferredType("-123.")).IsString()) + assert.True(t, inferString(FromDeferredType(".123")).IsString()) + assert.True(t, inferString(FromDeferredType("-.123")).IsString()) + assert.True(t, inferString(FromDeferredType("123.456")).IsString()) + assert.True(t, inferString(FromDeferredType("-123.456")).IsString()) + assert.True(t, inferString(FromDeferredType("1e2.")).IsString()) + assert.True(t, inferString(FromDeferredType("-1e2.")).IsString()) + assert.True(t, inferString(FromDeferredType("1e-2.")).IsString()) + assert.True(t, inferString(FromDeferredType("-1e-2.")).IsString()) + assert.True(t, inferString(FromDeferredType("1.2e3")).IsString()) + assert.True(t, inferString(FromDeferredType("-1.2e3")).IsString()) + assert.True(t, inferString(FromDeferredType("1.2e-3")).IsString()) + assert.True(t, inferString(FromDeferredType("-1.2e-3")).IsString()) + assert.True(t, inferString(FromDeferredType("1.e3")).IsString()) + assert.True(t, inferString(FromDeferredType("-1.e3")).IsString()) + assert.True(t, inferString(FromDeferredType("1.e-3")).IsString()) + assert.True(t, inferString(FromDeferredType("-1.e-3")).IsString()) + assert.True(t, inferString(FromDeferredType(".2e3")).IsString()) + assert.True(t, inferString(FromDeferredType("-.2e3")).IsString()) + assert.True(t, inferString(FromDeferredType(".2e-3")).IsString()) + assert.True(t, inferString(FromDeferredType("-.2e-3")).IsString()) +} diff --git a/internal/pkg/mlrval/mlrval_new.go b/internal/pkg/mlrval/mlrval_new.go index 507f5ad53..8bf3a6dc2 100644 --- a/internal/pkg/mlrval/mlrval_new.go +++ b/internal/pkg/mlrval/mlrval_new.go @@ -39,9 +39,15 @@ func FromInferredType(input string) *Mlrval { printrep: input, printrepValid: true, } - // TODO: comment re inferBool arg - packageLevelInferrer(mv, mv.printrep, true) - return mv + // TODO: comment re data files vs literals context -- this is for the latter + if input == "true" { + return 
TRUE + } else if input == "false" { + return FALSE + } else { + packageLevelInferrer(mv) + return mv + } } func FromString(input string) *Mlrval { diff --git a/internal/pkg/scan/digits.go b/internal/pkg/scan/digits.go new file mode 100644 index 000000000..92f69894d --- /dev/null +++ b/internal/pkg/scan/digits.go @@ -0,0 +1,90 @@ +package scan + +// TODO: comment re context + +// 00000000: 00 01 02 03 04 05 06 07 08 09 0a 0b 0c 0d 0e 0f |................| +// 00000010: 10 11 12 13 14 15 16 17 18 19 1a 1b 1c 1d 1e 1f |................| +// 00000020: 20 21 22 23 24 25 26 27 28 29 2a 2b 2c 2d 2e 2f | !"#$%&'()*+,-./| +// 00000030: 30 31 32 33 34 35 36 37 38 39 3a 3b 3c 3d 3e 3f |0123456789:;<=>?| +// 00000040: 40 41 42 43 44 45 46 47 48 49 4a 4b 4c 4d 4e 4f |@ABCDEFGHIJKLMNO| +// 00000050: 50 51 52 53 54 55 56 57 58 59 5a 5b 5c 5d 5e 5f |PQRSTUVWXYZ[\]^_| +// 00000060: 60 61 62 63 64 65 66 67 68 69 6a 6b 6c 6d 6e 6f |`abcdefghijklmno| +// 00000070: 70 71 72 73 74 75 76 77 78 79 7a 7b 7c 7d 7e 7f |pqrstuvwxyz{|}~.| + +var isDecimalDigitTable = []bool{ + false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, // 00-0f + false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, // 10-1f + false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, // 20-2f + true, true, true, true, true, true, true, true, true, true, false, false, false, false, false, false, // 30-3f + false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, // 40-4f + false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, // 50-5f + false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, // 60-6f + false, false, false, false, false, false, false, false, false, false, false, false, 
false, false, false, false, // 70-7f +} + +var isOctalDigitTable = []bool{ + false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, // 00-0f + false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, // 10-1f + false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, // 20-2f + true, true, true, true, true, true, true, true, false, false, false, false, false, false, false, false, // 30-3f + false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, // 40-4f + false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, // 50-5f + false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, // 60-6f + false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, // 70-7f +} + +var isHexDigitTable = []bool{ + false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, // 00-0f + false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, // 10-1f + false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, // 20-2f + true, true, true, true, true, true, true, true, true, true, false, false, false, false, false, false, // 30-3f + false, true, true, true, true, true, true, false, false, false, false, false, false, false, false, false, // 40-4f + false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, // 50-5f + false, true, true, true, true, true, true, false, false, false, false, false, false, false, false, false, // 60-6f + false, false, false, false, false, false, false, false, false, false, 
false, false, false, false, false, false, // 70-7f +} + +// Possible character in floats include '.', 0-9, [eE], [-+] -- the latter two for things like 1.2e-8. +// Miller intentionally does not accept 'inf' or 'NaN' as float numbers in file-input data. +var isFloatDigitTable = []bool{ + false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, // 00-0f + false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, // 10-1f + false, false, false, false, false, false, false, false, false, false, false, true, false, true, true, false, // 20-2f + true, true, true, true, true, true, true, true, true, true, false, false, false, false, false, false, // 30-3f + false, false, false, false, false, true, false, false, false, false, false, false, false, false, false, false, // 40-4f + false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, // 50-5f + false, false, false, false, false, true, false, false, false, false, false, false, false, false, false, false, // 60-6f + false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, // 70-7f +} + +func isDecimalDigit(c byte) bool { + if c < 128 { // byte is unsigned in Go + return isDecimalDigitTable[c] + } else { + return false + } +} + +func isOctalDigit(c byte) bool { + if c < 128 { // byte is unsigned in Go + return isOctalDigitTable[c] + } else { + return false + } +} + +func isHexDigit(c byte) bool { + if c < 128 { // byte is unsigned in Go + return isHexDigitTable[c] + } else { + return false + } +} + +func isFloatDigit(c byte) bool { + if c < 128 { // byte is unsigned in Go + return isFloatDigitTable[c] + } else { + return false + } +} diff --git a/internal/pkg/scan/digits_test.go b/internal/pkg/scan/digits_test.go new file mode 100644 index 000000000..d305e1bee --- /dev/null +++ b/internal/pkg/scan/digits_test.go 
@@ -0,0 +1,57 @@ +package scan + +import ( + "testing" + + "github.com/stretchr/testify/assert" +) + +func TestIsDecimalDigit(t *testing.T) { + var c byte + for c = 0x00; c < 0xff; c++ { + if c >= '0' && c <= '9' { + assert.True(t, isDecimalDigit(c)) + } else { + assert.False(t, isDecimalDigit(c)) + } + } +} + +func TestIsOctalDigit(t *testing.T) { + var c byte + for c = 0x00; c < 0xff; c++ { + if c >= '0' && c <= '7' { + assert.True(t, isOctalDigit(c)) + } else { + assert.False(t, isOctalDigit(c)) + } + } +} + +func TestIsHexDigit(t *testing.T) { + var c byte + for c = 0x00; c < 0xff; c++ { + if c >= '0' && c <= '9' { + assert.True(t, isHexDigit(c)) + } else if c >= 'a' && c <= 'f' { + assert.True(t, isHexDigit(c)) + } else if c >= 'A' && c <= 'F' { + assert.True(t, isHexDigit(c)) + } else { + assert.False(t, isHexDigit(c)) + } + } +} + +func TestIsFloatDigit(t *testing.T) { + var c byte + for c = 0x00; c < 0xff; c++ { + if c >= '0' && c <= '9' { + assert.True(t, isFloatDigit(c)) + } else if c == '.' || c == '-' || c == '+' || c == 'e' || c == 'E' { + assert.True(t, isFloatDigit(c)) + } else { + assert.False(t, isFloatDigit(c)) + } + } +} diff --git a/internal/pkg/scan/doc.go b/internal/pkg/scan/doc.go new file mode 100644 index 000000000..67737c3c3 --- /dev/null +++ b/internal/pkg/scan/doc.go @@ -0,0 +1,3 @@ +// Package scan contains low-level logic for efficient type-inference of string +// to int/float/bool/string. +package scan diff --git a/internal/pkg/scan/find.go b/internal/pkg/scan/find.go new file mode 100644 index 000000000..d3a911205 --- /dev/null +++ b/internal/pkg/scan/find.go @@ -0,0 +1,182 @@ +package scan + +import () + +// TODO: comment re context + +// o grammar for numbers & case-through +// k len 0 +// - len 1 +// k has leading minus; strip & rest +// - 0x, 0b, 0[0-9] +// - decimal: leading minus; [0-9]+ +// - octal: leading minus; 0[0-7]+ +// - hex: leading minus; 0[xX][0-9a-fA-F]+ +// - float: leading minus; [0-9] or '.' 
+// +// o float literals: +// 123 123. 123.4 .234 +// 1e2 1e-2 1.2e3 1.e3 1.2e-3 1.e-3 +// .2e3 .2e-3 1.e-3 +// +// ?- [0-9]+ +// ?- [0-9]+ '.' [0-9]* +// ?- [0-9]* '.' [0-9]+ +// ?- [0-9]+ [eE] ?- [0-9]+ +// ?- [0-9]+ '.' [0-9]* [eE] ?- [0-9]+ +// ?- [0-9]* '.' [0-9]+ [eE] ?- [0-9]+ + +func FindScanType(sinput string) ScanType { + input := []byte(sinput) + + if len(input) == 0 { + return scanTypeString + } + + i0 := input[0] + if i0 == '-' { + return findScanTypePositiveNumberOrString(input[1:]) + } + if i0 >= '0' && i0 <= '9' { + return findScanTypePositiveNumberOrString(input) + } + if i0 == '.' { + if len(input) == 1 { + return scanTypeString + } else { + return findScanTypePositiveDecimalOrFloatOrString(input) + } + } + + return scanTypeString +} + +// Convenience function for unit test +func findScanTypeName(sinput string) string { + return TypeNames[FindScanType(sinput)] +} + +func findScanTypePositiveNumberOrString(input []byte) ScanType { + if len(input) == 0 { + return scanTypeString + } + i0 := input[0] + + if i0 == '.' 
{ + return findScanTypePositiveFloatOrString(input) + } + + if isDecimalDigit(i0) { + if len(input) == 1 { + return scanTypeDecimalInt + } + if i0 == '0' { + i1 := input[1] + if i1 == 'x' || i1 == 'X' { + if len(input) == 2 { + return scanTypeString + } else { + return findScanTypePositiveHexOrString(input[2:]) + } + } + if i1 == 'o' || i1 == 'O' { + if len(input) == 2 { + return scanTypeString + } else { + return findScanTypePositiveOctalOrString(input[2:]) + } + } + if i1 == 'b' || i1 == 'B' { + if len(input) == 2 { + return scanTypeString + } else { + return findScanTypePositiveBinaryOrString(input[2:]) + } + } + + allOctal := true + allDecimal := true + for _, c := range input[1:] { + if !isOctalDigit(c) { + allOctal = false + } + if !isDecimalDigit(c) { + allDecimal = false + break + } + } + if allOctal { + return scanTypeLeadingZeroOctalInt + } + if allDecimal { + return scanTypeLeadingZeroDecimalInt + } + // else fall through + } + + return findScanTypePositiveDecimalOrFloatOrString(input) + } + + return scanTypeString +} + +func findScanTypePositiveFloatOrString(input []byte) ScanType { + for _, c := range []byte(input) { + if !isFloatDigit(c) { + return scanTypeString + } + } + return scanTypeMaybeFloat +} + +func findScanTypePositiveDecimalOrFloatOrString(input []byte) ScanType { + maybeInt := true + for _, c := range []byte(input) { + // All decimal-int digits are float digits so if the current character + // is not a float digit, this can't be either a float or a decimal int. + // Example: "1x2" + if !isFloatDigit(c) { + return scanTypeString + } + + // Examples: "1e2" or "1.2". 
+ if !isDecimalDigit(c) { + maybeInt = false + } + } + if maybeInt { + return scanTypeDecimalInt + } else { + return scanTypeMaybeFloat + } +} + +// Leading 0o has already been stripped +func findScanTypePositiveOctalOrString(input []byte) ScanType { + for _, c := range []byte(input) { + if !isOctalDigit(c) { + return scanTypeString + } + } + return scanTypeOctalInt +} + +// Leading 0x has already been stripped +func findScanTypePositiveHexOrString(input []byte) ScanType { + for _, c := range []byte(input) { + if !isHexDigit(c) { + return scanTypeString + } + } + return scanTypeHexInt +} + +// Leading 0b has already been stripped +func findScanTypePositiveBinaryOrString(input []byte) ScanType { + for _, c := range []byte(input) { + if c < '0' || c > '1' { + return scanTypeString + } + } + return scanTypeBinaryInt +} diff --git a/internal/pkg/scan/find_benchmark_test.go b/internal/pkg/scan/find_benchmark_test.go new file mode 100644 index 000000000..0d023a25f --- /dev/null +++ b/internal/pkg/scan/find_benchmark_test.go @@ -0,0 +1,68 @@ +package scan + +import ( + "testing" +) + +// go test -run=nonesuch -bench=. github.com/johnkerl/miller/internal/pkg/scan/... 
+ +func BenchmarkFromNormalCases(b *testing.B) { + + data := []string{ + "yellow", "triangle", "true", "1", "11", "43.6498", "9.8870", + "red", "square", "true", "2", "15", "79.2778", "0.0130", + "red", "circle", "true", "3", "16", "13.8103", "2.9010", + "red", "square", "false", "4", "48", "77.5542", "7.4670", + "purple", "triangle", "false", "5", "51", "81.2290", "8.5910", + "red", "square", "false", "6", "64", "77.1991", "9.5310", + "purple", "triangle", "false", "7", "65", "80.1405", "5.8240", + "yellow", "circle", "true", "8", "73", "63.9785", "4.2370", + "yellow", "circle", "true", "9", "87", "63.5058", "8.3350", + "purple", "square", "false", "10", "91", "72.3735", "8.2430", + } + ndata := len(data) + + for i := 0; i < b.N; i++ { + _ = FindScanType(data[i%ndata]) + } +} + +func BenchmarkFromAbnormalCases(b *testing.B) { + + data := []string{ + "", "-", + "abc", "-abc", + "0", "-0", + "1", "-1", + "2", "-2", + "123", "-123", + "1.", "-1.", + ".2", "-.2", + ".", "-.", + "1.2", "-1.2", + "1.2.3", "-1.2.3", + "1e2e3", "-1e2e3", + "12e-2", "-12e-2", + "1e2x3", "-1e2x3", + "0x", "-0x", + "0x0", "-0x0", + "0xcafe", "-0xcafe", + "0xcape", "-0xcape", + "0o", "-0o", + "0o0", "-0o0", + "0o1234", "-0o1234", + "0b", "-0b", + "0b0", "-0b0", + "0b1011", "-0b1011", + "0b1021", "-0b1021", + "true", "true", + "false", "false", + "True", "True", + "False", "False", + } + ndata := len(data) + + for i := 0; i < b.N; i++ { + _ = FindScanType(data[i%ndata]) + } +} diff --git a/internal/pkg/scan/find_test.go b/internal/pkg/scan/find_test.go new file mode 100644 index 000000000..e1eba0437 --- /dev/null +++ b/internal/pkg/scan/find_test.go @@ -0,0 +1,114 @@ +package scan + +import ( + "testing" + + "github.com/stretchr/testify/assert" +) + +func TestFindScanTypeNameStrings(t *testing.T) { + assert.Equal(t, typeNameString, findScanTypeName("")) + assert.Equal(t, typeNameString, findScanTypeName("-")) + assert.Equal(t, typeNameString, findScanTypeName("abc")) + assert.Equal(t, 
typeNameString, findScanTypeName("-abc")) +} + +func TestFindScanTypeNameDecimals(t *testing.T) { + assert.Equal(t, typeNameDecimalInt, findScanTypeName("0")) + assert.Equal(t, typeNameDecimalInt, findScanTypeName("-0")) + assert.Equal(t, typeNameDecimalInt, findScanTypeName("1")) + assert.Equal(t, typeNameDecimalInt, findScanTypeName("-1")) + assert.Equal(t, typeNameDecimalInt, findScanTypeName("2")) + assert.Equal(t, typeNameDecimalInt, findScanTypeName("-2")) + assert.Equal(t, typeNameDecimalInt, findScanTypeName("123")) + assert.Equal(t, typeNameDecimalInt, findScanTypeName("-123")) +} + +func TestFindScanTypeNameFloats(t *testing.T) { + assert.Equal(t, typeNameMaybeFloat, findScanTypeName("1.")) + assert.Equal(t, typeNameMaybeFloat, findScanTypeName("-1.")) + assert.Equal(t, typeNameMaybeFloat, findScanTypeName(".2")) + assert.Equal(t, typeNameMaybeFloat, findScanTypeName("-.2")) + assert.Equal(t, typeNameMaybeFloat, findScanTypeName("-.")) + assert.Equal(t, typeNameMaybeFloat, findScanTypeName("1.2")) + assert.Equal(t, typeNameMaybeFloat, findScanTypeName("-1.2")) + assert.Equal(t, typeNameMaybeFloat, findScanTypeName("12e-2")) + assert.Equal(t, typeNameMaybeFloat, findScanTypeName("-12e-2")) + + assert.Equal(t, typeNameMaybeFloat, findScanTypeName("1.2.3")) + assert.Equal(t, typeNameMaybeFloat, findScanTypeName("-1.2.3")) + assert.Equal(t, typeNameMaybeFloat, findScanTypeName("1e2e3")) + assert.Equal(t, typeNameMaybeFloat, findScanTypeName("-1e2e3")) + + assert.Equal(t, typeNameString, findScanTypeName(".")) + assert.Equal(t, typeNameString, findScanTypeName("1e2x3")) + assert.Equal(t, typeNameString, findScanTypeName("-1e2x3")) + + assert.Equal(t, typeNameString, findScanTypeName("inf")) + assert.Equal(t, typeNameString, findScanTypeName("infinity")) + assert.Equal(t, typeNameString, findScanTypeName("NaN")) + assert.Equal(t, typeNameString, findScanTypeName("-inf")) + assert.Equal(t, typeNameString, findScanTypeName("-infinity")) + assert.Equal(t, 
typeNameString, findScanTypeName("-NaN")) +} + +func TestFindScanTypeNameHexes(t *testing.T) { + assert.Equal(t, typeNameHexInt, findScanTypeName("0x0")) + assert.Equal(t, typeNameHexInt, findScanTypeName("-0x0")) + assert.Equal(t, typeNameHexInt, findScanTypeName("0xf")) + assert.Equal(t, typeNameHexInt, findScanTypeName("-0xf")) + assert.Equal(t, typeNameHexInt, findScanTypeName("0xcafe")) + assert.Equal(t, typeNameHexInt, findScanTypeName("-0xcafe")) + + assert.Equal(t, typeNameHexInt, findScanTypeName("0x7ffffffffffffffe")) + assert.Equal(t, typeNameHexInt, findScanTypeName("0x7fffffffffffffff")) + assert.Equal(t, typeNameHexInt, findScanTypeName("0x8000000000000000")) + assert.Equal(t, typeNameHexInt, findScanTypeName("0x8000000000000001")) + assert.Equal(t, typeNameHexInt, findScanTypeName("0xfffffffffffffffe")) + assert.Equal(t, typeNameHexInt, findScanTypeName("0xffffffffffffffff")) + + assert.Equal(t, typeNameString, findScanTypeName("0x")) + assert.Equal(t, typeNameString, findScanTypeName("-0x")) + assert.Equal(t, typeNameString, findScanTypeName("0xcape")) + assert.Equal(t, typeNameString, findScanTypeName("-0xcape")) +} + +func TestFindScanTypeNameOctals(t *testing.T) { + assert.Equal(t, typeNameLeadingZeroOctalInt, findScanTypeName("00")) + assert.Equal(t, typeNameLeadingZeroOctalInt, findScanTypeName("-00")) + assert.Equal(t, typeNameLeadingZeroOctalInt, findScanTypeName("01")) + assert.Equal(t, typeNameLeadingZeroOctalInt, findScanTypeName("-01")) + assert.Equal(t, typeNameLeadingZeroOctalInt, findScanTypeName("0377")) + assert.Equal(t, typeNameLeadingZeroOctalInt, findScanTypeName("-0377")) + + assert.Equal(t, typeNameLeadingZeroDecimalInt, findScanTypeName("08")) + assert.Equal(t, typeNameLeadingZeroDecimalInt, findScanTypeName("-08")) + + assert.Equal(t, typeNameLeadingZeroDecimalInt, findScanTypeName("06789")) + assert.Equal(t, typeNameLeadingZeroDecimalInt, findScanTypeName("-06789")) + + assert.Equal(t, typeNameOctalInt, 
findScanTypeName("0o377")) + assert.Equal(t, typeNameOctalInt, findScanTypeName("-0o377")) + + assert.Equal(t, typeNameString, findScanTypeName("0o6789")) + assert.Equal(t, typeNameString, findScanTypeName("-0o6789")) +} + +func TestFindScanTypeNameBinaries(t *testing.T) { + assert.Equal(t, typeNameBinaryInt, findScanTypeName("0b0")) + assert.Equal(t, typeNameBinaryInt, findScanTypeName("-0b0")) + assert.Equal(t, typeNameBinaryInt, findScanTypeName("0b1011")) + assert.Equal(t, typeNameBinaryInt, findScanTypeName("-0b1011")) + + assert.Equal(t, typeNameString, findScanTypeName("0b")) + assert.Equal(t, typeNameString, findScanTypeName("-0b")) + assert.Equal(t, typeNameString, findScanTypeName("0b1021")) + assert.Equal(t, typeNameString, findScanTypeName("-0b1021")) +} + +func TestFindScanTypeNameBooleans(t *testing.T) { + assert.Equal(t, typeNameString, findScanTypeName("true")) + assert.Equal(t, typeNameString, findScanTypeName("True")) + assert.Equal(t, typeNameString, findScanTypeName("false")) + assert.Equal(t, typeNameString, findScanTypeName("False")) +} diff --git a/internal/pkg/scan/type.go b/internal/pkg/scan/type.go new file mode 100644 index 000000000..674969ee4 --- /dev/null +++ b/internal/pkg/scan/type.go @@ -0,0 +1,36 @@ +package scan + +// TODO: comment re context + +type ScanType int + +const ( + scanTypeString ScanType = 0 + scanTypeDecimalInt = 1 + scanTypeLeadingZeroDecimalInt = 2 + scanTypeOctalInt = 3 + scanTypeLeadingZeroOctalInt = 4 + scanTypeHexInt = 5 + scanTypeBinaryInt = 6 + scanTypeMaybeFloat = 7 +) + +const typeNameString = "string" +const typeNameDecimalInt = "decint" // e.g. 123 +const typeNameLeadingZeroDecimalInt = "lzdecint" // e.g. 0899 +const typeNameOctalInt = "octint" // e.g. 0o377 +const typeNameLeadingZeroOctalInt = "lzoctint" // e.g. 0377 +const typeNameHexInt = "hexint" // e.g. 0xcafe +const typeNameBinaryInt = "binint" // e.g. 0b1011 +const typeNameMaybeFloat = "float?" 
// characters in [0-9\.-+eE] but needs parse to be sure + +var TypeNames = []string{ + typeNameString, + typeNameDecimalInt, + typeNameLeadingZeroDecimalInt, + typeNameOctalInt, + typeNameLeadingZeroOctalInt, + typeNameHexInt, + typeNameBinaryInt, + typeNameMaybeFloat, +} diff --git a/internal/pkg/scan/type_test.go b/internal/pkg/scan/type_test.go new file mode 100644 index 000000000..4960963e6 --- /dev/null +++ b/internal/pkg/scan/type_test.go @@ -0,0 +1,18 @@ +package scan + +import ( + "testing" + + "github.com/stretchr/testify/assert" +) + +func TestTypeNames(t *testing.T) { + assert.Equal(t, TypeNames[scanTypeString], "string") + assert.Equal(t, TypeNames[scanTypeDecimalInt], "decint") + assert.Equal(t, TypeNames[scanTypeLeadingZeroDecimalInt], "lzdecint") // e.g. 0899 + assert.Equal(t, TypeNames[scanTypeOctalInt], "octint") // e.g. 0o377 + assert.Equal(t, TypeNames[scanTypeLeadingZeroOctalInt], "lzoctint") // e.g. 0377 + assert.Equal(t, TypeNames[scanTypeHexInt], "hexint") // e.g. 0xcafe + assert.Equal(t, TypeNames[scanTypeBinaryInt], "binint") // e.g. 0b1011 + assert.Equal(t, TypeNames[scanTypeMaybeFloat], "float?") // characters in [0-9\.-+eE] but needs parse to be sure +} diff --git a/man/manpage.txt b/man/manpage.txt index 07154c73b..0eb839907 100644 --- a/man/manpage.txt +++ b/man/manpage.txt @@ -123,6 +123,7 @@ HELP OPTIONS mlr help miscellaneous-flags mlr help output-colorization-flags mlr help pprint-only-flags + mlr help profiling-flags mlr help separator-flags Verbs: mlr help list-verbs @@ -616,6 +617,20 @@ PPRINT-ONLY FLAGS for input). --right Right-justifies all fields for PPRINT output. +PROFILING FLAGS + These are flags for profiling Miller performance. + --cpuprofile {CPU-profile file name} + Create a CPU-profile file for performance analysis. + Instructions will be printed to stderr. This flag + must be the very first thing after 'mlr' on the + command line. 
+ --time Print elapsed execution time in seconds to stderr at + the end of the execution of the program. + --traceprofile Create a trace-profile file for performance analysis. + Instructions will be printed to stderr. This flag + must be the very first thing after 'mlr' on the + command line. + SEPARATOR FLAGS See the Separators doc page for more about record separators, field separators, and pair separators. Also see the File formats doc page, or @@ -735,6 +750,7 @@ AUXILIARY COMMANDS help regtest repl + version For more information, please invoke mlr {subcommand} --help. MLRRC @@ -3003,4 +3019,4 @@ SEE ALSO - 2021-12-25 MILLER(1) + 2021-12-27 MILLER(1) diff --git a/man/mlr.1 b/man/mlr.1 index 468512bf1..a60a32b46 100644 --- a/man/mlr.1 +++ b/man/mlr.1 @@ -2,12 +2,12 @@ .\" Title: mlr .\" Author: [see the "AUTHOR" section] .\" Generator: ./mkman.rb -.\" Date: 2021-12-25 +.\" Date: 2021-12-27 .\" Manual: \ \& .\" Source: \ \& .\" Language: English .\" -.TH "MILLER" "1" "2021-12-25" "\ \&" "\ \&" +.TH "MILLER" "1" "2021-12-27" "\ \&" "\ \&" .\" ----------------------------------------------------------------- .\" * Portability definitions .\" ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -158,6 +158,7 @@ Flags: mlr help miscellaneous-flags mlr help output-colorization-flags mlr help pprint-only-flags + mlr help profiling-flags mlr help separator-flags Verbs: mlr help list-verbs @@ -753,6 +754,28 @@ These are flags which are applicable to PPRINT output format. .fi .if n \{\ .RE +.SH "PROFILING FLAGS" +.sp + +.if n \{\ +.RS 0 +.\} +.nf +These are flags for profiling Miller performance. +--cpuprofile {CPU-profile file name} + Create a CPU-profile file for performance analysis. + Instructions will be printed to stderr. This flag + must be the very first thing after 'mlr' on the + command line. +--time Print elapsed execution time in seconds to stderr at + the end of the execution of the program. 
+--traceprofile Create a trace-profile file for performance analysis. + Instructions will be printed to stderr. This flag + must be the very first thing after 'mlr' on the + command line. +.fi +.if n \{\ +.RE .SH "SEPARATOR FLAGS" .sp @@ -884,6 +907,7 @@ Available subcommands: help regtest repl + version For more information, please invoke mlr {subcommand} --help. .fi .if n \{\ diff --git a/scripts/chain-1.mlr b/scripts/chain-1.mlr index c2279799b..05fa01114 100644 --- a/scripts/chain-1.mlr +++ b/scripts/chain-1.mlr @@ -1,2 +1,2 @@ $color_shape = $color . $shape; -$y = int($k) + int($index) **3 + log10(float($quantity)/float($rate)); +$y = $k + $index **3 + log10($quantity/$rate); diff --git a/scripts/chain-cmps.sh b/scripts/chain-cmps.sh index de4bf262e..4ef6b814f 100755 --- a/scripts/chain-cmps.sh +++ b/scripts/chain-cmps.sh @@ -1,13 +1,12 @@ mlrs="mlr5 ~/tmp/miller/mlr ./mlr" -reps="1" - #mlrs="mlr5 ./mlr" -#reps="1 2 3" -echo; for mlr in $mlrs; do for k in $reps; do justtime $mlr --csv --from ~/tmp/big.csv check | md5sum; done; done -echo; for mlr in $mlrs; do for k in $reps; do justtime $mlr --csv --from ~/tmp/big.csv cat | md5sum; done; done -echo; for mlr in $mlrs; do for k in $reps; do justtime $mlr --csv --from ~/tmp/big.csv head | md5sum; done; done -echo; for mlr in $mlrs; do for k in $reps; do justtime $mlr --csv --from ~/tmp/big.csv tail | md5sum; done; done -echo; for mlr in $mlrs; do for k in $reps; do justtime $mlr --csv --from ~/tmp/big.csv tac | md5sum; done; done -echo; for mlr in $mlrs; do for k in $reps; do justtime $mlr --csv --from ~/tmp/big.csv sort -f shape | md5sum; done; done -echo; for mlr in $mlrs; do for k in $reps; do justtime $mlr --csv --from ~/tmp/big.csv sort -n quantity | md5sum; done; done +#reps="1" +reps="1 2 3" + +echo; for mlr in $mlrs; do for k in $reps; do justtime $mlr --csv --from ~/tmp/big.csv check > /dev/null; done; done +echo; for mlr in $mlrs; do for k in $reps; do justtime $mlr --csv --from ~/tmp/big.csv cat > 
/dev/null; done; done +echo; for mlr in $mlrs; do for k in $reps; do justtime $mlr --csv --from ~/tmp/big.csv tail > /dev/null; done; done +echo; for mlr in $mlrs; do for k in $reps; do justtime $mlr --csv --from ~/tmp/big.csv tac > /dev/null; done; done +echo; for mlr in $mlrs; do for k in $reps; do justtime $mlr --csv --from ~/tmp/big.csv sort -f shape > /dev/null; done; done +echo; for mlr in $mlrs; do for k in $reps; do justtime $mlr --csv --from ~/tmp/big.csv sort -n quantity > /dev/null; done; done diff --git a/scripts/chain-lengths.sh b/scripts/chain-lengths.sh index 4acc539e5..dd999c919 100755 --- a/scripts/chain-lengths.sh +++ b/scripts/chain-lengths.sh @@ -1,14 +1,14 @@ mlrs="mlr5 ~/tmp/miller/mlr ./mlr" -reps="1" - #mlrs="mlr5 ./mlr" -#reps="1 2 3" + +#reps="1" +reps="1 2 3" echo; for mlr in $mlrs; do for k in $reps; do justtime $mlr --csv --from ~/tmp/big.csv \ then put -f scripts/chain-1.mlr \ - | md5sum; + > /dev/null done done @@ -17,7 +17,7 @@ echo; for mlr in $mlrs; do justtime $mlr --csv --from ~/tmp/big.csv \ then put -f scripts/chain-1.mlr \ then put -f scripts/chain-1.mlr \ - | md5sum; + > /dev/null done done @@ -27,7 +27,7 @@ echo; for mlr in $mlrs; do then put -f scripts/chain-1.mlr \ then put -f scripts/chain-1.mlr \ then put -f scripts/chain-1.mlr \ - | md5sum; + > /dev/null done done @@ -38,7 +38,7 @@ echo; for mlr in $mlrs; do then put -f scripts/chain-1.mlr \ then put -f scripts/chain-1.mlr \ then put -f scripts/chain-1.mlr \ - | md5sum; + > /dev/null done done @@ -50,7 +50,7 @@ echo; for mlr in $mlrs; do then put -f scripts/chain-1.mlr \ then put -f scripts/chain-1.mlr \ then put -f scripts/chain-1.mlr \ - | md5sum; + > /dev/null done done @@ -63,6 +63,6 @@ echo; for mlr in $mlrs; do then put -f scripts/chain-1.mlr \ then put -f scripts/chain-1.mlr \ then put -f scripts/chain-1.mlr \ - | md5sum; + > /dev/null done done diff --git a/scripts/compiler-versions-build b/scripts/compiler-versions-build new file mode 100755 index 
000000000..49e2c2ad2 --- /dev/null +++ b/scripts/compiler-versions-build @@ -0,0 +1,7 @@ +#!/bin/sh + +for go in go1.15.15 go1.16.12 go1.17.5 go1.18beta1; do + $go clean github.com/johnkerl/miller/cmd/mlr + $go build github.com/johnkerl/miller/cmd/mlr + mv mlr mlr-$go +done diff --git a/scripts/compiler-versions-install b/scripts/compiler-versions-install new file mode 100755 index 000000000..873e8857f --- /dev/null +++ b/scripts/compiler-versions-install @@ -0,0 +1,13 @@ +#!/bin/sh + +# https://go.dev/doc/manage-install + +go install golang.org/dl/go1.18beta1@latest +go install golang.org/dl/go1.17.5@latest +go install golang.org/dl/go1.16.12@latest +go install golang.org/dl/go1.15.15@latest + +go1.15.15 download +go1.16.12 download +go1.17.5 download +go1.18beta1 download diff --git a/scripts/compiler-versions-time b/scripts/compiler-versions-time new file mode 100755 index 000000000..03ed64965 --- /dev/null +++ b/scripts/compiler-versions-time @@ -0,0 +1,9 @@ +#!/bin/sh + +for mlr in mlr5 mlr-go1.1*; do justtime $mlr --csv check ~/tmp/big.csv > /dev/null; done +echo + +for mlr in mlr5 mlr-go1.1*; do justtime $mlr --csv cat ~/tmp/big.csv > /dev/null; done +echo + +for mlr in mlr5 mlr-go1.1*; do justtime $mlr --csv --from ~/tmp/big.csv put -f ./scripts/chain-1.mlr > /dev/null; done diff --git a/scripts/time-big-file b/scripts/time-big-file index 5da24aa78..f660391a4 100755 --- a/scripts/time-big-file +++ b/scripts/time-big-file @@ -15,4 +15,4 @@ fi if [ $# -eq 2 ]; then mlr="$2" fi -justtime $mlr $iflag cat ~/tmp/big.$suffix | md5sum - +justtime $mlr $iflag cat ~/tmp/big.$suffix > /dev/null diff --git a/scripts/time-big-files b/scripts/time-big-files index 135fd4184..2e2d3917b 100755 --- a/scripts/time-big-files +++ b/scripts/time-big-files @@ -3,11 +3,15 @@ ourdir=$(dirname $0) mlrs="mlr5 ~/tmp/miller/mlr ./mlr" +#mlrs="mlr5 ./mlr" -echo; for mlr in $mlrs; do $ourdir/time-big-file csv $mlr; done -echo; for mlr in $mlrs; do $ourdir/time-big-file csvlite $mlr; done 
-echo; for mlr in $mlrs; do $ourdir/time-big-file dkvp $mlr; done -echo; for mlr in $mlrs; do $ourdir/time-big-file nidx $mlr; done -echo; for mlr in $mlrs; do $ourdir/time-big-file xtab $mlr; done -echo; for mlr in $mlrs; do $ourdir/time-big-file json $mlr; done +#reps="1" +reps="1 2 3" + +echo; for mlr in $mlrs; do for k in $reps; do $ourdir/time-big-file csv $mlr; done; done +echo; for mlr in $mlrs; do for k in $reps; do $ourdir/time-big-file csvlite $mlr; done; done +echo; for mlr in $mlrs; do for k in $reps; do $ourdir/time-big-file dkvp $mlr; done; done +echo; for mlr in $mlrs; do for k in $reps; do $ourdir/time-big-file nidx $mlr; done; done +echo; for mlr in $mlrs; do for k in $reps; do $ourdir/time-big-file xtab $mlr; done; done +echo; for mlr in $mlrs; do for k in $reps; do $ourdir/time-big-file json $mlr; done; done diff --git a/test/cases/io-infer-flags/dash-O/expout b/test/cases/io-infer-flags/dash-O/expout index 2ecaa2dab..55e06d2b1 100644 --- a/test/cases/io-infer-flags/dash-O/expout +++ b/test/cases/io-infer-flags/dash-O/expout @@ -3,7 +3,7 @@ x t y z 123.45 float 124.45 123.95 0123 int 84 83.5 07 int 8 7.5 -08 float 9 8.5 +08 int 9 8.5 0 int 1 0.5 0. float 1 0.5 0.0 float 1 0.5 @@ -16,7 +16,7 @@ x t y z -0b0100 int -3 -3.5 -0x1000 int -4095 -4095.5 -07 int -6 -6.5 --08 float -7 -7.5 +-08 int -7 -7.5 -0 int 1 0.5 -0. 
float 1 0.5 -0.0 float 1 0.5 diff --git a/todo.txt b/todo.txt index ace2f5e36..fede517f1 100644 --- a/todo.txt +++ b/todo.txt @@ -1,7 +1,14 @@ ================================================================ PUNCHDOWN LIST +* numeric-inference perf + o README-profiling.md re various scripts + o README-profiling.md re this PR + o update mac numbers; type up linux numbers + o webdoc re on-battery anecdote + * blockers: + - linux/1.17 perf checks - fractional-strptime - improved regex doc w/ lots of examples - cmp-matrices @@ -71,6 +78,10 @@ PUNCHDOWN LIST ================================================================ NON-BLOCKERS +* pos/neg 0x/0b/0o UTs + +* 0o into BNF + ? BIFs as FCFs? * pv: 'mlr --prepipex pv --gzin tail -n 10 ~/tmp/zhuge.gz' needs --gzin & --prepipex both