Improve type-inference performance (#809)

* To-do items for broader platform/go-version benchmarking

* neaten inferrer API

* extend type-inference unit-test cases

* Add benchmark scripts for comparing compiler versions

* mlr version in addition to mlr --version

* some go-benchmark files for Mac/Linux perf comparisons

* neaten perf-scripts

* merge

* type-scan optimization tests

* type-scan optimization infra

* test new inferrer

* mlr --time option

* include --cpuprofile and --traceprofile in on-line help

* sharpen inferred/deferred-type API distinction

* replace old inferrer with newer/faster

* update docs for new type-inferrer
This commit is contained in:
John Kerl 2021-12-27 00:54:21 -05:00 committed by GitHub
parent 5e8d3fddd0
commit e10fee0724
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
47 changed files with 1595 additions and 381 deletions

1
.vimrc
View file

@ -1,2 +1,3 @@
map \d :w<C-m>:!clear;echo Building ...; echo; make mlr<C-m>
map \f :w<C-m>:!clear;echo Building ...; echo; make ut<C-m>
map \r :w<C-m>:!clear;echo Building ...; echo; make ut-scan ut-mlv<C-m>

View file

@ -31,6 +31,24 @@ install: build
unit-test ut:
go test github.com/johnkerl/miller/internal/pkg/...
ut-lib:
go test github.com/johnkerl/miller/internal/pkg/lib...
ut-scan:
go test github.com/johnkerl/miller/internal/pkg/scan/...
ut-mlv:
go test github.com/johnkerl/miller/internal/pkg/mlrval/...
ut-bifs:
go test github.com/johnkerl/miller/internal/pkg/bifs/...
ut-input:
go test github.com/johnkerl/miller/internal/pkg/input/...
bench:
go test -run=nonesuch -bench=. github.com/johnkerl/miller/internal/pkg/...
bench-mlv:
go test -run=nonesuch -bench=. github.com/johnkerl/miller/internal/pkg/mlrval/...
bench-input:
go test -run=nonesuch -bench=. github.com/johnkerl/miller/internal/pkg/input/...
# ----------------------------------------------------------------
# Regression tests (large number)
#
@ -41,12 +59,22 @@ unit-test ut:
regression-test:
go test -v regression_test.go
# ----------------------------------------------------------------
# Experimental executables:
scan:
go build github.com/johnkerl/miller/cmd/scan
# ----------------------------------------------------------------
# Formatting
# go fmt ./... finds experimental C files which we want to ignore.
fmt:
-go fmt ./cmd/...
-go fmt ./internal/pkg/...
-go fmt ./regression_test.go
# ----------------------------------------------------------------
# Static analysis
# Needs first: go install honnef.co/go/tools/cmd/staticcheck@latest
# See also: https://staticcheck.io
staticcheck:
@ -93,4 +121,4 @@ release_tarball: build check
# ================================================================
# Go does its own dependency management, outside of make.
.PHONY: build mlr check unit_test regression_test fmt staticcheck dev docs
.PHONY: build mlr scan check unit_test regression_test bench fmt staticcheck dev docs

View file

@ -8,12 +8,16 @@ import (
"runtime/debug"
"runtime/pprof"
"strconv"
"strings"
"time"
"github.com/johnkerl/miller/internal/pkg/entrypoint"
"github.com/pkg/profile" // for trace.out
)
func main() {
// For mlr --time
startTime := time.Now()
// Respect env $GOMAXPROCS, if provided, else set default.
haveSetGoMaxProcs := false
@ -63,12 +67,35 @@ func main() {
defer fmt.Fprintf(os.Stderr, "CPU profile finished.\ngo tool pprof -http=:8080 %s\n", profFilename)
}
if len(os.Args) >= 3 && os.Args[1] == "--traceprofile" {
if len(os.Args) >= 2 && os.Args[1] == "--traceprofile" {
defer profile.Start(profile.TraceProfile, profile.ProfilePath(".")).Stop()
defer fmt.Fprintf(os.Stderr, "go tool trace trace.out\n")
}
// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
// This will obtain os.Args and go from there. All the usual contents of
// main() are put into this package for ease of testing.
entrypoint.Main()
mainReturn := entrypoint.Main()
// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
// Timing
//
// The system 'time' command is built-in, of course but it's nice to have
// simply wall-time without the real/user/sys distinction. Also, making
// this a Miller built-in is nice for Windows.
if mainReturn.PrintElapsedTime {
endTime := time.Now()
startNanos := startTime.UnixNano()
endNanos := endTime.UnixNano()
seconds := float64(endNanos-startNanos) / 1e9
fmt.Fprintf(os.Stderr, "%.6f", seconds)
for _, arg := range os.Args {
if strings.Contains(arg, " ") || strings.Contains(arg, "\t") {
fmt.Fprintf(os.Stderr, " '%s'", arg)
} else {
fmt.Fprintf(os.Stderr, " %s", arg)
}
}
fmt.Fprintf(os.Stderr, "\n")
}
}

19
cmd/scan/main.go Normal file
View file

@ -0,0 +1,19 @@
// ================================================================
// Experiments for type-inference performance optimization
// ================================================================
package main
import (
"fmt"
"os"
"github.com/johnkerl/miller/internal/pkg/scan"
)
// main walks the command-line arguments (excluding the program name) and, for
// each one, prints the argument left-justified in a 10-character column
// followed by the entry of scan.TypeNames indexed by the value that
// scan.FindScanType returns for that argument.
func main() {
	args := os.Args[1:]
	for i := range args {
		text := args[i]
		typeName := scan.TypeNames[scan.FindScanType(text)]
		fmt.Printf("%-10s -> %s\n", text, typeName)
	}
}

View file

@ -144,6 +144,7 @@ HELP OPTIONS
mlr help miscellaneous-flags
mlr help output-colorization-flags
mlr help pprint-only-flags
mlr help profiling-flags
mlr help separator-flags
Verbs:
mlr help list-verbs
@ -637,6 +638,20 @@ PPRINT-ONLY FLAGS
for input).
--right Right-justifies all fields for PPRINT output.
PROFILING FLAGS
These are flags for profiling Miller performance.
--cpuprofile {CPU-profile file name}
Create a CPU-profile file for performance analysis.
Instructions will be printed to stderr. This flag
must be the very first thing after 'mlr' on the
command line.
--time Print elapsed execution time in seconds to stderr at
the end of the execution of the program.
--traceprofile Create a trace-profile file for performance analysis.
Instructions will be printed to stderr. This flag
must be the very first thing after 'mlr' on the
command line.
SEPARATOR FLAGS
See the Separators doc page for more about record separators, field
separators, and pair separators. Also see the File formats doc page, or
@ -756,6 +771,7 @@ AUXILIARY COMMANDS
help
regtest
repl
version
For more information, please invoke mlr {subcommand} --help.
MLRRC
@ -3024,5 +3040,5 @@ SEE ALSO
2021-12-25 MILLER(1)
2021-12-27 MILLER(1)
</pre>

View file

@ -123,6 +123,7 @@ HELP OPTIONS
mlr help miscellaneous-flags
mlr help output-colorization-flags
mlr help pprint-only-flags
mlr help profiling-flags
mlr help separator-flags
Verbs:
mlr help list-verbs
@ -616,6 +617,20 @@ PPRINT-ONLY FLAGS
for input).
--right Right-justifies all fields for PPRINT output.
PROFILING FLAGS
These are flags for profiling Miller performance.
--cpuprofile {CPU-profile file name}
Create a CPU-profile file for performance analysis.
Instructions will be printed to stderr. This flag
must be the very first thing after 'mlr' on the
command line.
--time Print elapsed execution time in seconds to stderr at
the end of the execution of the program.
--traceprofile Create a trace-profile file for performance analysis.
Instructions will be printed to stderr. This flag
must be the very first thing after 'mlr' on the
command line.
SEPARATOR FLAGS
See the Separators doc page for more about record separators, field
separators, and pair separators. Also see the File formats doc page, or
@ -735,6 +750,7 @@ AUXILIARY COMMANDS
help
regtest
repl
version
For more information, please invoke mlr {subcommand} --help.
MLRRC
@ -3003,4 +3019,4 @@ SEE ALSO
2021-12-25 MILLER(1)
2021-12-27 MILLER(1)

View file

@ -46,7 +46,7 @@ EOF
for flag in flags
headline = `mlr help show-headline-for-flag '#{flag}'`
help = `mlr help show-help-for-flag '#{flag}'`
puts "* `#{headline}`: #{help}"
puts "* `#{headline.chomp}`: #{help}"
end
end

View file

@ -255,7 +255,8 @@ The following differences are rather technical. If they don't sound familiar to
* See also `mlr help legacy-flags` or the [legacy-flags reference](reference-main-flag-list.md#legacy-flags).
* Type-inference:
* The `-S` and `-F` flags to `mlr put` and `mlr filter` are ignored, since type-inference is no longer done in `mlr put` and `mlr filter`, but rather, when records are first read. You can use `mlr -S` and `mlr -A`, respectively, instead to control type-inference within the record-readers.
* Octal numbers like `0123` and `07` are type-inferred as string. Use `mlr -O` to infer them as octal integers. Note that `08` and `09` will then infer as float.
* Octal numbers like `0123` and `07` are type-inferred as string. Use `mlr -O` to infer them as octal integers. Note that `08` and `09` will then infer as decimal integers.
* Any numbers prefixed with `0o`, e.g. `0o377`, are already treated as octal regardless of `mlr -O` -- `mlr -O` only affects how leading-zero integers are handled.
* See also the [miscellaneous-flags reference](reference-main-flag-list.md#miscellaneous-flags).
* Emitting a map-valued expression now requires either a temporary variable or the new `emit1` keyword. Please see the
[page on emit statements](reference-dsl-output-statements.md#emit1-and-emitemitpemitf) for more information.
@ -270,7 +271,13 @@ The following differences are rather technical. If they don't sound familiar to
As a benchmark, the [example.csv](https://github.com/johnkerl/miller/blob/main/docs/src/example.csv) file
[was expanded](https://github.com/johnkerl/miller/blob/main/scripts/make-big-files) into a million-line CSV file,
then converted to DKVP, JSON, etc. These were run on a commodity Mac laptop with four CPUs.
then converted to DKVP, JSON, etc.
Notes:
* These were run on a commodity Mac laptop with four CPUs, on MacOS Monterey, using `go1.16.5 darwin/amd64`.
* Linux benchmarks are pending.
* As of late 2021, Miller has been benchmarked using Go compiler versions 1.15.15, 1.16.12, 1.17.5, and 1.18beta1, with no significant performance changes attributable to compiler versions.
For the [first benchmark](https://github.com/johnkerl/miller/blob/main/scripts/time-big-files), we have `mlr cat` of those files, with processing times shown:

View file

@ -213,7 +213,8 @@ The following differences are rather technical. If they don't sound familiar to
* See also `mlr help legacy-flags` or the [legacy-flags reference](reference-main-flag-list.md#legacy-flags).
* Type-inference:
* The `-S` and `-F` flags to `mlr put` and `mlr filter` are ignored, since type-inference is no longer done in `mlr put` and `mlr filter`, but rather, when records are first read. You can use `mlr -S` and `mlr -A`, respectively, instead to control type-inference within the record-readers.
* Octal numbers like `0123` and `07` are type-inferred as string. Use `mlr -O` to infer them as octal integers. Note that `08` and `09` will then infer as float.
* Octal numbers like `0123` and `07` are type-inferred as string. Use `mlr -O` to infer them as octal integers. Note that `08` and `09` will then infer as decimal integers.
* Any numbers prefixed with `0o`, e.g. `0o377`, are already treated as octal regardless of `mlr -O` -- `mlr -O` only affects how leading-zero integers are handled.
* See also the [miscellaneous-flags reference](reference-main-flag-list.md#miscellaneous-flags).
* Emitting a map-valued expression now requires either a temporary variable or the new `emit1` keyword. Please see the
[page on emit statements](reference-dsl-output-statements.md#emit1-and-emitemitpemitf) for more information.
@ -228,7 +229,13 @@ The following differences are rather technical. If they don't sound familiar to
As a benchmark, the [example.csv](https://github.com/johnkerl/miller/blob/main/docs/src/example.csv) file
[was expanded](https://github.com/johnkerl/miller/blob/main/scripts/make-big-files) into a million-line CSV file,
then converted to DKVP, JSON, etc. These were run on a commodity Mac laptop with four CPUs.
then converted to DKVP, JSON, etc.
Notes:
* These were run on a commodity Mac laptop with four CPUs, on MacOS Monterey, using `go1.16.5 darwin/amd64`.
* Linux benchmarks are pending.
* As of late 2021, Miller has been benchmarked using Go compiler versions 1.15.15, 1.16.12, 1.17.5, and 1.18beta1, with no significant performance changes attributable to compiler versions.
For the [first benchmark](https://github.com/johnkerl/miller/blob/main/scripts/time-big-files), we have `mlr cat` of those files, with processing times shown:

View file

@ -62,6 +62,7 @@ Flags:
mlr help miscellaneous-flags
mlr help output-colorization-flags
mlr help pprint-only-flags
mlr help profiling-flags
mlr help separator-flags
Verbs:
mlr help list-verbs

View file

@ -20,11 +20,20 @@ Quick links:
Numbers in Miller are double-precision float or 64-bit signed integers. Anything scannable as int, e.g. `123` or `0xabcd`, is treated as an integer; otherwise, input scannable as float (`4.56` or `8e9`) is treated as float; everything else is a string.
If you want all numbers to be treated as floats, then you may use `float()` in your filter/put expressions (e.g. replacing `$c = $a * $b` with `$c = float($a) * float($b)`).
Three flags control input-scanning for numbers: `mlr -O`, `mlr -A`, and `mlr -S`.
<!--- TODO: probably remove this entirely for Miller6.
Or, more simply, use `mlr filter -F` and `mlr put -F` which forces all numeric input, whether from expression literals or field values, to float. Likewise `mlr stats1 -F` and `mlr step -F` force integerable accumulators (such as `count`) to be done in floating-point.
-->
Prefix `0x` means hexadecimal, e.g. `0xcafe`; prefix `0b` means binary, e.g.
`0b1011`; prefix `0o` means octal, e.g. `0o377`. Numbers in data files with
leading zeroes, e.g. `0377` or `06789`, are treated as strings in Miller,
unless you specify `mlr -O`: then `0377` will scan as an octal integer (with
value 255), and `06789` will scan as a decimal integer (with value 6789).
If you want all numbers from data files to be treated as floats, then you may
use `float()` in your filter/put expressions (e.g. replacing `$c = $a * $b`
with `$c = float($a) * float($b)`). Or, use `mlr -A`.
If you use `mlr -S` then all field values from data files are read in as
strings; you can cast them using `int()` or `float()`.
## Conversion by math routines

View file

@ -4,11 +4,20 @@
Numbers in Miller are double-precision float or 64-bit signed integers. Anything scannable as int, e.g. `123` or `0xabcd`, is treated as an integer; otherwise, input scannable as float (`4.56` or `8e9`) is treated as float; everything else is a string.
If you want all numbers to be treated as floats, then you may use `float()` in your filter/put expressions (e.g. replacing `$c = $a * $b` with `$c = float($a) * float($b)`).
Three flags control input-scanning for numbers: `mlr -O`, `mlr -A`, and `mlr -S`.
<!--- TODO: probably remove this entirely for Miller6.
Or, more simply, use `mlr filter -F` and `mlr put -F` which forces all numeric input, whether from expression literals or field values, to float. Likewise `mlr stats1 -F` and `mlr step -F` force integerable accumulators (such as `count`) to be done in floating-point.
-->
Prefix `0x` means hexadecimal, e.g. `0xcafe`; prefix `0b` means binary, e.g.
`0b1011`; prefix `0o` means octal, e.g. `0o377`. Numbers in data files with
leading zeroes, e.g. `0377` or `06789`, are treated as strings in Miller,
unless you specify `mlr -O`: then `0377` will scan as an octal integer (with
value 255), and `06789` will scan as a decimal integer (with value 6789).
If you want all numbers from data files to be treated as floats, then you may
use `float()` in your filter/put expressions (e.g. replacing `$c = $a * $b`
with `$c = float($a) * float($b)`). Or, use `mlr -A`.
If you use `mlr -S` then all field values from data files are read in as
strings; you can cast them using `int()` or `float()`.
## Conversion by math routines

View file

@ -31,6 +31,7 @@ Available subcommands:
help
regtest
repl
version
For more information, please invoke mlr {subcommand} --help.
</pre>

View file

@ -60,14 +60,10 @@ Notes:
**Flags:**
* `--pass-comments
`: Immediately print commented lines (prefixed by `#`) within the input.
* `--pass-comments-with {string}
`: Immediately print commented lines within input, with specified prefix.
* `--skip-comments
`: Ignore commented lines (prefixed by `#`) within the input.
* `--skip-comments-with {string}
`: Ignore commented lines within input, with specified prefix.
* `--pass-comments`: Immediately print commented lines (prefixed by `#`) within the input.
* `--pass-comments-with {string}`: Immediately print commented lines within input, with specified prefix.
* `--skip-comments`: Ignore commented lines (prefixed by `#`) within the input.
* `--skip-comments-with {string}`: Ignore commented lines within input, with specified prefix.
## Compressed-data flags
@ -102,22 +98,14 @@ decisions that might have been made based on the file suffix. Likewise,
**Flags:**
* `--bz2in
`: Uncompress bzip2 within the Miller process. Done by default if file ends in `.bz2`.
* `--gzin
`: Uncompress gzip within the Miller process. Done by default if file ends in `.gz`.
* `--prepipe {decompression command}
`: You can, of course, already do without this for single input files, e.g. `gunzip < myfile.csv.gz | mlr ...`. Allowed at the command line, but not in `.mlrrc` to avoid unexpected code execution.
* `--prepipe-bz2
`: Same as `--prepipe bz2`, except this is allowed in `.mlrrc`.
* `--prepipe-gunzip
`: Same as `--prepipe gunzip`, except this is allowed in `.mlrrc`.
* `--prepipe-zcat
`: Same as `--prepipe zcat`, except this is allowed in `.mlrrc`.
* `--prepipex {decompression command}
`: Like `--prepipe` with one exception: doesn't insert `<` between command and filename at runtime. Useful for some commands like `unzip -qc` which don't read standard input. Allowed at the command line, but not in `.mlrrc` to avoid unexpected code execution.
* `--zin
`: Uncompress zlib within the Miller process. Done by default if file ends in `.z`.
* `--bz2in`: Uncompress bzip2 within the Miller process. Done by default if file ends in `.bz2`.
* `--gzin`: Uncompress gzip within the Miller process. Done by default if file ends in `.gz`.
* `--prepipe {decompression command}`: You can, of course, already do without this for single input files, e.g. `gunzip < myfile.csv.gz | mlr ...`. Allowed at the command line, but not in `.mlrrc` to avoid unexpected code execution.
* `--prepipe-bz2`: Same as `--prepipe bz2`, except this is allowed in `.mlrrc`.
* `--prepipe-gunzip`: Same as `--prepipe gunzip`, except this is allowed in `.mlrrc`.
* `--prepipe-zcat`: Same as `--prepipe zcat`, except this is allowed in `.mlrrc`.
* `--prepipex {decompression command}`: Like `--prepipe` with one exception: doesn't insert `<` between command and filename at runtime. Useful for some commands like `unzip -qc` which don't read standard input. Allowed at the command line, but not in `.mlrrc` to avoid unexpected code execution.
* `--zin`: Uncompress zlib within the Miller process. Done by default if file ends in `.z`.
## CSV-only flags
@ -126,16 +114,11 @@ These are flags which are applicable to CSV format.
**Flags:**
* `--allow-ragged-csv-input or --ragged
`: If a data line has fewer fields than the header line, fill remaining keys with empty string. If a data line has more fields than the header line, use integer field labels as in the implicit-header case.
* `--headerless-csv-output or --ho
`: Print only CSV data lines; do not print CSV header lines.
* `--implicit-csv-header or --headerless-csv-input or --hi
`: Use 1,2,3,... as field labels, rather than from line 1 of input files. Tip: combine with `label` to recreate missing headers.
* `--no-implicit-csv-header
`: Opposite of `--implicit-csv-header`. This is the default anyway -- the main use is for the flags to `mlr join` if you have main file(s) which are headerless but you want to join in on a file which does have a CSV header. Then you could use `mlr --csv --implicit-csv-header join --no-implicit-csv-header -l your-join-in-with-header.csv ... your-headerless.csv`.
* `-N
`: Keystroke-saver for `--implicit-csv-header --headerless-csv-output`.
* `--allow-ragged-csv-input or --ragged`: If a data line has fewer fields than the header line, fill remaining keys with empty string. If a data line has more fields than the header line, use integer field labels as in the implicit-header case.
* `--headerless-csv-output or --ho`: Print only CSV data lines; do not print CSV header lines.
* `--implicit-csv-header or --headerless-csv-input or --hi`: Use 1,2,3,... as field labels, rather than from line 1 of input files. Tip: combine with `label` to recreate missing headers.
* `--no-implicit-csv-header`: Opposite of `--implicit-csv-header`. This is the default anyway -- the main use is for the flags to `mlr join` if you have main file(s) which are headerless but you want to join in on a file which does have a CSV header. Then you could use `mlr --csv --implicit-csv-header join --no-implicit-csv-header -l your-join-in-with-header.csv ... your-headerless.csv`.
* `-N`: Keystroke-saver for `--implicit-csv-header --headerless-csv-output`.
## File-format flags
@ -152,90 +135,48 @@ are overridden in all cases by setting output format to `format2`.
**Flags:**
* `--asv or --asvlite
`: Use ASV format for input and output data.
* `--csv or -c
`: Use CSV format for input and output data.
* `--csvlite
`: Use CSV-lite format for input and output data.
* `--dkvp
`: Use DKVP format for input and output data.
* `--gen-field-name
`: Specify field name for --igen. Defaults to "i".
* `--gen-start
`: Specify start value for --igen. Defaults to 1.
* `--gen-step
`: Specify step value for --igen. Defaults to 1.
* `--gen-stop
`: Specify stop value for --igen. Defaults to 100.
* `--iasv or --iasvlite
`: Use ASV format for input data.
* `--icsv
`: Use CSV format for input data.
* `--icsvlite
`: Use CSV-lite format for input data.
* `--idkvp
`: Use DKVP format for input data.
* `--igen
`: Ignore input files and instead generate sequential numeric input using --gen-field-name, --gen-start, --gen-step, and --gen-stop values. See also the seqgen verb, which is more useful/intuitive.
* `--ijson
`: Use JSON format for input data.
* `--inidx
`: Use NIDX format for input data.
* `--io {format name}
`: Use format name for input and output data. For example: `--io csv` is the same as `--csv`.
* `--ipprint
`: Use PPRINT format for input data.
* `--itsv
`: Use TSV format for input data.
* `--itsvlite
`: Use TSV-lite format for input data.
* `--iusv or --iusvlite
`: Use USV format for input data.
* `--ixtab
`: Use XTAB format for input data.
* `--json or -j
`: Use JSON format for input and output data.
* `--nidx
`: Use NIDX format for input and output data.
* `--oasv or --oasvlite
`: Use ASV format for output data.
* `--ocsv
`: Use CSV format for output data.
* `--ocsvlite
`: Use CSV-lite format for output data.
* `--odkvp
`: Use DKVP format for output data.
* `--ojson
`: Use JSON format for output data.
* `--omd
`: Use markdown-tabular format for output data.
* `--onidx
`: Use NIDX format for output data.
* `--opprint
`: Use PPRINT format for output data.
* `--otsv
`: Use TSV format for output data.
* `--otsvlite
`: Use TSV-lite format for output data.
* `--ousv or --ousvlite
`: Use USV format for output data.
* `--oxtab
`: Use XTAB format for output data.
* `--pprint
`: Use PPRINT format for input and output data.
* `--tsv
`: Use TSV format for input and output data.
* `--tsvlite or -t
`: Use TSV-lite format for input and output data.
* `--usv or --usvlite
`: Use USV format for input and output data.
* `--xtab
`: Use XTAB format for input and output data.
* `-i {format name}
`: Use format name for input data. For example: `-i csv` is the same as `--icsv`.
* `-o {format name}
`: Use format name for output data. For example: `-o csv` is the same as `--ocsv`.
* `--asv or --asvlite`: Use ASV format for input and output data.
* `--csv or -c`: Use CSV format for input and output data.
* `--csvlite`: Use CSV-lite format for input and output data.
* `--dkvp`: Use DKVP format for input and output data.
* `--gen-field-name`: Specify field name for --igen. Defaults to "i".
* `--gen-start`: Specify start value for --igen. Defaults to 1.
* `--gen-step`: Specify step value for --igen. Defaults to 1.
* `--gen-stop`: Specify stop value for --igen. Defaults to 100.
* `--iasv or --iasvlite`: Use ASV format for input data.
* `--icsv`: Use CSV format for input data.
* `--icsvlite`: Use CSV-lite format for input data.
* `--idkvp`: Use DKVP format for input data.
* `--igen`: Ignore input files and instead generate sequential numeric input using --gen-field-name, --gen-start, --gen-step, and --gen-stop values. See also the seqgen verb, which is more useful/intuitive.
* `--ijson`: Use JSON format for input data.
* `--inidx`: Use NIDX format for input data.
* `--io {format name}`: Use format name for input and output data. For example: `--io csv` is the same as `--csv`.
* `--ipprint`: Use PPRINT format for input data.
* `--itsv`: Use TSV format for input data.
* `--itsvlite`: Use TSV-lite format for input data.
* `--iusv or --iusvlite`: Use USV format for input data.
* `--ixtab`: Use XTAB format for input data.
* `--json or -j`: Use JSON format for input and output data.
* `--nidx`: Use NIDX format for input and output data.
* `--oasv or --oasvlite`: Use ASV format for output data.
* `--ocsv`: Use CSV format for output data.
* `--ocsvlite`: Use CSV-lite format for output data.
* `--odkvp`: Use DKVP format for output data.
* `--ojson`: Use JSON format for output data.
* `--omd`: Use markdown-tabular format for output data.
* `--onidx`: Use NIDX format for output data.
* `--opprint`: Use PPRINT format for output data.
* `--otsv`: Use TSV format for output data.
* `--otsvlite`: Use TSV-lite format for output data.
* `--ousv or --ousvlite`: Use USV format for output data.
* `--oxtab`: Use XTAB format for output data.
* `--pprint`: Use PPRINT format for input and output data.
* `--tsv`: Use TSV format for input and output data.
* `--tsvlite or -t`: Use TSV-lite format for input and output data.
* `--usv or --usvlite`: Use USV format for input and output data.
* `--xtab`: Use XTAB format for input and output data.
* `-i {format name}`: Use format name for input data. For example: `-i csv` is the same as `--icsv`.
* `-o {format name}`: Use format name for output data. For example: `-o csv` is the same as `--ocsv`.
## Flatten-unflatten flags
@ -246,14 +187,10 @@ See the Flatten/unflatten doc page for more information.
**Flags:**
* `--flatsep or --jflatsep {string}
`: Separator for flattening multi-level JSON keys, e.g. `{"a":{"b":3}}` becomes `a:b => 3` for non-JSON formats. Defaults to `.`.
* `--no-auto-flatten
`: When output is non-JSON, suppress the default auto-flatten behavior. Default: if `$y = [7,8,9]` then this flattens to `y.1=7,y.2=8,y.3=9, and similarly for maps. With `--no-auto-flatten`, instead we get `$y=[1, 2, 3]`.
* `--no-auto-unflatten
`: When input non-JSON and output is JSON, suppress the default auto-unflatten behavior. Default: if the input has `y.1=7,y.2=8,y.3=9` then this unflattens to `$y=[7,8,9]`. flattens to `y.1=7,y.2=8,y.3=9. With `--no-auto-flatten`, instead we get `${y.1}=7,${y.2}=8,${y.3}=9`.
* `--xvright
`: Right-justify values for XTAB format.
* `--flatsep or --jflatsep {string}`: Separator for flattening multi-level JSON keys, e.g. `{"a":{"b":3}}` becomes `a:b => 3` for non-JSON formats. Defaults to `.`.
* `--no-auto-flatten`: When output is non-JSON, suppress the default auto-flatten behavior. Default: if `$y = [7,8,9]` then this flattens to `y.1=7,y.2=8,y.3=9`, and similarly for maps. With `--no-auto-flatten`, instead we get `$y=[7, 8, 9]`.
* `--no-auto-unflatten`: When input is non-JSON and output is JSON, suppress the default auto-unflatten behavior. Default: if the input has `y.1=7,y.2=8,y.3=9` then this unflattens to `$y=[7,8,9]`. With `--no-auto-unflatten`, instead we get `${y.1}=7,${y.2}=8,${y.3}=9`.
* `--xvright`: Right-justify values for XTAB format.
## Format-conversion keystroke-saver flags
@ -283,12 +220,9 @@ These are flags which are applicable to JSON format.
**Flags:**
* `--jlistwrap or --jl
`: Wrap JSON output in outermost `[ ]`.
* `--jvstack
`: Put one key-value pair per line for JSON output (multi-line output).
* `--no-jvstack
`: Put objects/arrays all on one line for JSON output.
* `--jlistwrap or --jl`: Wrap JSON output in outermost `[ ]`.
* `--jvstack`: Put one key-value pair per line for JSON output (multi-line output).
* `--no-jvstack`: Put objects/arrays all on one line for JSON output.
## Legacy flags
@ -298,38 +232,22 @@ They are accepted as no-op flags in order to keep old scripts from breaking.
**Flags:**
* `--jknquoteint
`: Type information from JSON input files is now preserved throughout the processing stream.
* `--jquoteall
`: Type information from JSON input files is now preserved throughout the processing stream.
* `--json-fatal-arrays-on-input
`: Miller now supports arrays as of version 6.
* `--json-map-arrays-on-input
`: Miller now supports arrays as of version 6.
* `--json-skip-arrays-on-input
`: Miller now supports arrays as of version 6.
* `--jsonx
`: The `--jvstack` flag is now default true in Miller 6.
* `--jvquoteall
`: Type information from JSON input files is now preserved throughout the processing stream.
* `--mmap
`: Miller no longer uses memory-mapping to access data files.
* `--no-mmap
`: Miller no longer uses memory-mapping to access data files.
* `--ojsonx
`: The `--jvstack` flag is now default true in Miller 6.
* `--quote-all
`: Ignored as of version 6. Types are inferred/retained through the processing flow now.
* `--quote-minimal
`: Ignored as of version 6. Types are inferred/retained through the processing flow now.
* `--quote-none
`: Ignored as of version 6. Types are inferred/retained through the processing flow now.
* `--quote-numeric
`: Ignored as of version 6. Types are inferred/retained through the processing flow now.
* `--quote-original
`: Ignored as of version 6. Types are inferred/retained through the processing flow now.
* `--vflatsep
`: Ignored as of version 6. This functionality is subsumed into JSON formatting.
* `--jknquoteint`: Type information from JSON input files is now preserved throughout the processing stream.
* `--jquoteall`: Type information from JSON input files is now preserved throughout the processing stream.
* `--json-fatal-arrays-on-input`: Miller now supports arrays as of version 6.
* `--json-map-arrays-on-input`: Miller now supports arrays as of version 6.
* `--json-skip-arrays-on-input`: Miller now supports arrays as of version 6.
* `--jsonx`: The `--jvstack` flag is now default true in Miller 6.
* `--jvquoteall`: Type information from JSON input files is now preserved throughout the processing stream.
* `--mmap`: Miller no longer uses memory-mapping to access data files.
* `--no-mmap`: Miller no longer uses memory-mapping to access data files.
* `--ojsonx`: The `--jvstack` flag is now default true in Miller 6.
* `--quote-all`: Ignored as of version 6. Types are inferred/retained through the processing flow now.
* `--quote-minimal`: Ignored as of version 6. Types are inferred/retained through the processing flow now.
* `--quote-none`: Ignored as of version 6. Types are inferred/retained through the processing flow now.
* `--quote-numeric`: Ignored as of version 6. Types are inferred/retained through the processing flow now.
* `--quote-original`: Ignored as of version 6. Types are inferred/retained through the processing flow now.
* `--vflatsep`: Ignored as of version 6. This functionality is subsumed into JSON formatting.
## Miscellaneous flags
@ -337,44 +255,25 @@ These are flags which don't fit into any other category.
**Flags:**
* `--fflush
`: Force buffered output to be written after every output record. The default is flush output after every record if the output is to the terminal, or less often if the output is to a file or a pipe. The default is a significant performance optimization for large files. Use this flag to force frequent updates even when output is to a pipe or file, at a performance cost.
* `--from {filename}
`: Use this to specify an input file before the verb(s), rather than after. May be used more than once. Example: `mlr --from a.dat --from b.dat cat` is the same as `mlr cat a.dat b.dat`.
* `--hash-records
`: This is an internal parameter which normally does not need to be modified. It controls the mechanism by which Miller accesses fields within records. In general --no-hash-records is faster, and is the default. For specific use-cases involving data having many fields, and many of them being processed during a given processing run, --hash-records might offer a slight performance benefit.
* `--infer-int-as-float or -A
`: Cast all integers in data files to floats.
* `--infer-none or -S
`: Don't treat values like 123 or 456.7 in data files as int/float; leave them as strings.
* `--infer-octal or -O
`: Treat numbers like 0123 in data files as numeric; default is string. Note that 00--07 etc scan as int; 08-09 scan as float.
* `--load {filename}
`: Load DSL script file for all put/filter operations on the command line. If the name following `--load` is a directory, load all `*.mlr` files in that directory. This is just like `put -f` and `filter -f` except it's up-front on the command line, so you can do something like `alias mlr='mlr --load ~/myscripts'` if you like.
* `--mfrom {filenames}
`: Use this to specify one or more input files before the verb(s), rather than after. May be used more than once. The list of filenames must end with `--`. This is useful for example since `--from *.csv` doesn't do what you might hope but `--mfrom *.csv --` does.
* `--mload {filenames}
`: Like `--load` but works with more than one filename, e.g. `--mload *.mlr --`.
* `--no-dedupe-field-names
`: By default, if an input record has a field named `x` and another also named `x`, the second will be renamed `x_2`, and so on. With this flag provided, the second `x`'s value will replace the first `x`'s value when the record is read. This flag has no effect on JSON input records, where duplicate keys always result in the last one's value being retained.
* `--no-fflush
`: Let buffered output not be written after every output record. The default is flush output after every record if the output is to the terminal, or less often if the output is to a file or a pipe. The default is a significant performance optimization for large files. Use this flag to allow less-frequent updates when output is to the terminal. This is unlikely to be a noticeable performance improvement, since direct-to-screen output for large files has its own overhead.
* `--no-hash-records
`: See --hash-records.
* `--nr-progress-mod {m}
`: With m a positive integer: print filename and record count to os.Stderr every m input records.
* `--ofmt {format}
`: E.g. `%.18f`, `%.0f`, `%9.6e`. Please use sprintf-style codes for floating-point numbers. If not specified, default formatting is used. See also the `fmtnum` function and the `format-values` verb.
* `--records-per-batch {n}
`: This is an internal parameter for maximum number of records in a batch size. Normally this does not need to be modified.
* `--seed {n}
`: with `n` of the form `12345678` or `0xcafefeed`. For `put`/`filter` `urand`, `urandint`, and `urand32`.
* `--tz {timezone}
`: Specify timezone, overriding `$TZ` environment variable (if any).
* `-I
`: Process files in-place. For each file name on the command line, output is written to a temp file in the same directory, which is then renamed over the original. Each file is processed in isolation: if the output format is CSV, CSV headers will be present in each output file, statistics are only over each file's own records; and so on.
* `-n
`: Process no input files, nor standard input either. Useful for `mlr put` with `begin`/`end` statements only. (Same as `--from /dev/null`.) Also useful in `mlr -n put -v '...'` for analyzing abstract syntax trees (if that's your thing).
* `--fflush`: Force buffered output to be written after every output record. The default is flush output after every record if the output is to the terminal, or less often if the output is to a file or a pipe. The default is a significant performance optimization for large files. Use this flag to force frequent updates even when output is to a pipe or file, at a performance cost.
* `--from {filename}`: Use this to specify an input file before the verb(s), rather than after. May be used more than once. Example: `mlr --from a.dat --from b.dat cat` is the same as `mlr cat a.dat b.dat`.
* `--hash-records`: This is an internal parameter which normally does not need to be modified. It controls the mechanism by which Miller accesses fields within records. In general --no-hash-records is faster, and is the default. For specific use-cases involving data having many fields, and many of them being processed during a given processing run, --hash-records might offer a slight performance benefit.
* `--infer-int-as-float or -A`: Cast all integers in data files to floats.
* `--infer-none or -S`: Don't treat values like 123 or 456.7 in data files as int/float; leave them as strings.
* `--infer-octal or -O`: Treat numbers like 0123 in data files as numeric; default is string. Note that 00--07 etc scan as int; 08-09 scan as float.
* `--load {filename}`: Load DSL script file for all put/filter operations on the command line. If the name following `--load` is a directory, load all `*.mlr` files in that directory. This is just like `put -f` and `filter -f` except it's up-front on the command line, so you can do something like `alias mlr='mlr --load ~/myscripts'` if you like.
* `--mfrom {filenames}`: Use this to specify one or more input files before the verb(s), rather than after. May be used more than once. The list of filenames must end with `--`. This is useful for example since `--from *.csv` doesn't do what you might hope but `--mfrom *.csv --` does.
* `--mload {filenames}`: Like `--load` but works with more than one filename, e.g. `--mload *.mlr --`.
* `--no-dedupe-field-names`: By default, if an input record has a field named `x` and another also named `x`, the second will be renamed `x_2`, and so on. With this flag provided, the second `x`'s value will replace the first `x`'s value when the record is read. This flag has no effect on JSON input records, where duplicate keys always result in the last one's value being retained.
* `--no-fflush`: Let buffered output not be written after every output record. The default is flush output after every record if the output is to the terminal, or less often if the output is to a file or a pipe. The default is a significant performance optimization for large files. Use this flag to allow less-frequent updates when output is to the terminal. This is unlikely to be a noticeable performance improvement, since direct-to-screen output for large files has its own overhead.
* `--no-hash-records`: See --hash-records.
* `--nr-progress-mod {m}`: With m a positive integer: print filename and record count to os.Stderr every m input records.
* `--ofmt {format}`: E.g. `%.18f`, `%.0f`, `%9.6e`. Please use sprintf-style codes for floating-point numbers. If not specified, default formatting is used. See also the `fmtnum` function and the `format-values` verb.
* `--records-per-batch {n}`: This is an internal parameter for maximum number of records in a batch size. Normally this does not need to be modified.
* `--seed {n}`: with `n` of the form `12345678` or `0xcafefeed`. For `put`/`filter` `urand`, `urandint`, and `urand32`.
* `--tz {timezone}`: Specify timezone, overriding `$TZ` environment variable (if any).
* `-I`: Process files in-place. For each file name on the command line, output is written to a temp file in the same directory, which is then renamed over the original. Each file is processed in isolation: if the output format is CSV, CSV headers will be present in each output file, statistics are only over each file's own records; and so on.
* `-n`: Process no input files, nor standard input either. Useful for `mlr put` with `begin`/`end` statements only. (Same as `--from /dev/null`.) Also useful in `mlr -n put -v '...'` for analyzing abstract syntax trees (if that's your thing).
## Output-colorization flags
@ -436,24 +335,15 @@ and `mlr --list-color-names` to see available names (like `orchid`).
**Flags:**
* `--always-color or -C
`: Instructs Miller to colorize output even when it normally would not. Useful for piping output to `less -r`.
* `--fail-color
`: Specify the color (see `--list-color-codes` and `--list-color-names`) for failing cases in `mlr regtest`.
* `--help-color
`: Specify the color (see `--list-color-codes` and `--list-color-names`) for highlights in `mlr help` output.
* `--key-color
`: Specify the color (see `--list-color-codes` and `--list-color-names`) for record keys.
* `--list-color-codes
`: Show the available color codes in the range 0..255, such as 170 for example.
* `--list-color-names
`: Show the names for the available color codes, such as `orchid` for example.
* `--no-color or -M
`: Instructs Miller to not colorize any output.
* `--pass-color
`: Specify the color (see `--list-color-codes` and `--list-color-names`) for passing cases in `mlr regtest`.
* `--value-color
`: Specify the color (see `--list-color-codes` and `--list-color-names`) for record values.
* `--always-color or -C`: Instructs Miller to colorize output even when it normally would not. Useful for piping output to `less -r`.
* `--fail-color`: Specify the color (see `--list-color-codes` and `--list-color-names`) for failing cases in `mlr regtest`.
* `--help-color`: Specify the color (see `--list-color-codes` and `--list-color-names`) for highlights in `mlr help` output.
* `--key-color`: Specify the color (see `--list-color-codes` and `--list-color-names`) for record keys.
* `--list-color-codes`: Show the available color codes in the range 0..255, such as 170 for example.
* `--list-color-names`: Show the names for the available color codes, such as `orchid` for example.
* `--no-color or -M`: Instructs Miller to not colorize any output.
* `--pass-color`: Specify the color (see `--list-color-codes` and `--list-color-names`) for passing cases in `mlr regtest`.
* `--value-color`: Specify the color (see `--list-color-codes` and `--list-color-names`) for record values.
## PPRINT-only flags
@ -462,10 +352,18 @@ These are flags which are applicable to PPRINT output format.
**Flags:**
* `--barred
`: Prints a border around PPRINT output (not available for input).
* `--right
`: Right-justifies all fields for PPRINT output.
* `--barred`: Prints a border around PPRINT output (not available for input).
* `--right`: Right-justifies all fields for PPRINT output.
## Profiling flags
These are flags for profiling Miller performance.
**Flags:**
* `--cpuprofile {CPU-profile file name}`: Create a CPU-profile file for performance analysis. Instructions will be printed to stderr. This flag must be the very first thing after 'mlr' on the command line.
* `--time`: Print elapsed execution time in seconds to stderr at the end of the execution of the program.
* `--traceprofile`: Create a trace-profile file for performance analysis. Instructions will be printed to stderr. This flag must be the very first thing after 'mlr' on the command line.
## Separator flags
@ -566,28 +464,16 @@ Notes about all other separators:
**Flags:**
* `--fs {string}
`: Specify FS for input and output.
* `--ifs {string}
`: Specify FS for input.
* `--ifs-regex {string}
`: Specify FS for input as a regular expression.
* `--ips {string}
`: Specify PS for input.
* `--ips-regex {string}
`: Specify PS for input as a regular expression.
* `--irs {string}
`: Specify RS for input.
* `--ofs {string}
`: Specify FS for output.
* `--ops {string}
`: Specify PS for output.
* `--ors {string}
`: Specify RS for output.
* `--ps {string}
`: Specify PS for input and output.
* `--repifs
`: Let IFS be repeated: e.g. for splitting on multiple spaces.
* `--rs {string}
`: Specify RS for input and output.
* `--fs {string}`: Specify FS for input and output.
* `--ifs {string}`: Specify FS for input.
* `--ifs-regex {string}`: Specify FS for input as a regular expression.
* `--ips {string}`: Specify PS for input.
* `--ips-regex {string}`: Specify PS for input as a regular expression.
* `--irs {string}`: Specify RS for input.
* `--ofs {string}`: Specify FS for output.
* `--ops {string}`: Specify PS for output.
* `--ors {string}`: Specify RS for output.
* `--ps {string}`: Specify PS for input and output.
* `--repifs`: Let IFS be repeated: e.g. for splitting on multiple spaces.
* `--rs {string}`: Specify RS for input and output.

2
go.mod
View file

@ -17,7 +17,7 @@ require (
github.com/kballard/go-shellquote v0.0.0-20180428030007-95032a82bc51
github.com/lestrrat-go/strftime v1.0.4
github.com/mattn/go-isatty v0.0.12
github.com/pkg/profile v1.6.0 // indirect
github.com/pkg/profile v1.6.0
github.com/stretchr/testify v1.7.0 // indirect
golang.org/x/sys v0.0.0-20210326220804-49726bf1d181
golang.org/x/term v0.0.0-20201210144234-2321bbc49cbf

View file

@ -8,10 +8,12 @@ package auxents
import (
"fmt"
"os"
"runtime"
"github.com/johnkerl/miller/internal/pkg/auxents/help"
"github.com/johnkerl/miller/internal/pkg/auxents/regtest"
"github.com/johnkerl/miller/internal/pkg/auxents/repl"
"github.com/johnkerl/miller/internal/pkg/version"
)
// tAuxMain is a function-pointer type for the entrypoint handler for a given auxent,
@ -38,6 +40,7 @@ func init() {
{"help", help.HelpMain},
{"regtest", regtest.RegTestMain},
{"repl", repl.ReplMain},
{"version", showVersion},
}
}
@ -82,3 +85,8 @@ func ShowAuxEntries(o *os.File) {
fmt.Fprintf(o, "For more information, please invoke mlr {subcommand} --help.\n")
}
// showVersion is the entrypoint handler registered under the "version"
// auxent name. It writes the Miller version string, together with the OS,
// architecture, and Go runtime version of this build, to standard output.
// The args parameter is accepted to satisfy the handler signature and is not
// consulted. The return value is always 0.
func showVersion(args []string) int {
	fmt.Fprintf(
		os.Stdout,
		"mlr version %s for %s/%s/%s\n",
		version.STRING,
		runtime.GOOS,
		runtime.GOARCH,
		runtime.Version(),
	)
	return 0
}

View file

@ -50,7 +50,7 @@ func getPrompt2() string {
func (repl *Repl) printStartupBanner() {
if repl.inputIsTerminal {
fmt.Printf("Miller %s REPL for %s:%s:%s\n", version.STRING, runtime.GOOS, runtime.GOARCH, runtime.Version())
fmt.Printf("Miller %s REPL for %s/%s/%s\n", version.STRING, runtime.GOOS, runtime.GOARCH, runtime.Version())
fmt.Printf("Docs: %s\n", lib.DOC_URL)
fmt.Printf("Type ':h' or ':help' for online help; ':q' or ':quit' to quit.\n")
}

View file

@ -98,6 +98,7 @@ var FLAG_TABLE = FlagTable{
&CommentsInDataFlagSection,
&OutputColorizationFlagSection,
&FlattenUnflattenFlagSection,
&ProfilingFlagSection,
&MiscFlagSection,
},
}
@ -2410,6 +2411,51 @@ var FlattenUnflattenFlagSection = FlagSection{
},
}
// ================================================================
// PROFILING FLAGS
// ProfilingPrintInfo prints the one-line description of the profiling-flags
// section to standard output, with no trailing newline. It is installed as
// the infoPrinter of ProfilingFlagSection.
func ProfilingPrintInfo() {
	const sectionInfo = "These are flags for profiling Miller performance."
	fmt.Print(sectionInfo)
}
// Sort the profiling flags once at package-initialization time, so the order
// in which they are written in the ProfilingFlagSection literal does not matter.
func init() { ProfilingFlagSection.Sort() }
// ProfilingFlagSection describes the command-line flags used for profiling
// and timing Miller itself. Each parser advances *pargi past the flag and any
// argument it takes.
var ProfilingFlagSection = FlagSection{
name: "Profiling flags",
infoPrinter: ProfilingPrintInfo,
flags: []Flag{
{
name: "--cpuprofile",
arg: "{CPU-profile file name}",
help: `Create a CPU-profile file for performance analysis. Instructions will be printed to stderr.
This flag must be the very first thing after 'mlr' on the command line.`,
parser: func(args []string, argc int, pargi *int, options *TOptions) {
// Already handled in main(). Nothing to do here except to accept this as valid syntax.
// Skip over both the flag and its file-name argument.
*pargi += 2
},
},
{
name: "--traceprofile",
help: `Create a trace-profile file for performance analysis. Instructions will be printed to stderr.
This flag must be the very first thing after 'mlr' on the command line.`,
parser: func(args []string, argc int, pargi *int, options *TOptions) {
// Already handled in main(). Nothing to do here except to accept this as valid syntax.
// This flag takes no argument, so skip only the flag itself.
*pargi += 1
},
},
{
name: "--time",
help: "Print elapsed execution time in seconds to stderr at the end of the execution of the program.",
parser: func(args []string, argc int, pargi *int, options *TOptions) {
// Only record the request here; the printing itself is not done in this parser.
options.PrintElapsedTime = true
*pargi += 1
},
},
},
}
// ================================================================
// MISC FLAGS

View file

@ -153,6 +153,8 @@ type TOptions struct {
HaveRandSeed bool
RandSeed int
PrintElapsedTime bool // mlr --time
}
// Not usable until FinalizeReaderOptions and FinalizeWriterOptions are called.

View file

@ -1,3 +1,4 @@
// ================================================================
// Miller main command-line parsing.
//
// Before Miller 6 the ordering was:
@ -65,6 +66,7 @@
// foo.csv' the '--csv' looks like it belongs to the 'head' verb. When people
// use '#!/bin/sh' scripts they need to insert the '--' in 'mlr head -n 10 --
// --csv foo.csv'; for 'mlr -s' we insert the '--' for them.
// ================================================================
package climain
@ -128,15 +130,7 @@ func parseCommandLinePassOne(
oargi := argi
if args[argi][0] == '-' {
if args[argi] == "--cpuprofile" {
// Already handled in main(); ignore here, and don't send it to pass two.
cli.CheckArgCount(args, argi, argc, 1)
argi += 2
} else if args[argi] == "--traceprofile" {
// Already handled in main(); ignore here, and don't send it to pass two.
argi += 1
} else if args[argi] == "--version" {
if args[argi] == "--version" {
// Exiting flag: handle it immediately.
fmt.Printf("mlr %s\n", version.STRING)
os.Exit(0)

View file

@ -20,8 +20,11 @@ import (
"github.com/johnkerl/miller/internal/pkg/transformers"
)
// ----------------------------------------------------------------
func Main() {
// MainReturn carries information from Main back to its caller.
type MainReturn struct {
// PrintElapsedTime is filled in from the parsed options' PrintElapsedTime
// field, which the --time flag sets.
PrintElapsedTime bool
}
func Main() MainReturn {
// Special handling for Windows so we can do things like:
//
// mlr put '$a = $b . "cd \"efg\" hi"' foo.dat
@ -55,6 +58,10 @@ func Main() {
} else {
processInPlace(options)
}
return MainReturn{
PrintElapsedTime: options.PrintElapsedTime,
}
}
// ----------------------------------------------------------------

View file

@ -0,0 +1,71 @@
package input
import (
"testing"
"github.com/stretchr/testify/assert"
"github.com/johnkerl/miller/internal/pkg/cli"
)
// go test -run=nonesuch -bench=. github.com/johnkerl/miller/internal/pkg/input/...
// BenchmarkDKVPParse measures the cost of turning one DKVP-formatted line
// into a record via recordFromDKVPLine. The reader is constructed once, and
// the timer is reset afterward so that only the parsing loop is measured.
func BenchmarkDKVPParse(b *testing.B) {
	readerOptions := &cli.TReaderOptions{
		InputFileFormat: "dkvp",
		IFS:             ",",
		IPS:             "=",
		IRS:             "\n",
	}
	reader, err := NewRecordReaderDKVP(readerOptions, 1)
	assert.Nil(b, err)

	const line = "color=yellow,shape=triangle,flag=true,k=1,index=11,quantity=43.6498,rate=9.8870"

	// Exclude reader construction from the timed region.
	b.ResetTimer()
	for i := 0; i < b.N; i++ {
		// Results are deliberately discarded: only parse time is of interest.
		_, _ = recordFromDKVPLine(reader, line)
	}
}
// BenchmarkNIDXParse measures the cost of parsing one space-separated line
// using a reader built from NIDX options with repeated-IFS handling enabled.
// The reader is constructed once, and the timer is reset afterward so that
// only the parsing loop is measured.
//
// NOTE(review): the loop calls recordFromDKVPLine even though the reader comes
// from NewRecordReaderNIDX -- this looks like it may be a copy-paste from the
// DKVP benchmark; confirm whether an NIDX-specific line parser was intended.
func BenchmarkNIDXParse(b *testing.B) {
	readerOptions := &cli.TReaderOptions{
		InputFileFormat: "nidx",
		IFS:             " ",
		AllowRepeatIFS:  true,
		IRS:             "\n",
	}
	reader, err := NewRecordReaderNIDX(readerOptions, 1)
	assert.Nil(b, err)

	const line = "yellow triangle true 1 11 43.6498 9.8870"

	// Exclude reader construction from the timed region.
	b.ResetTimer()
	for i := 0; i < b.N; i++ {
		// Results are deliberately discarded: only parse time is of interest.
		_, _ = recordFromDKVPLine(reader, line)
	}
}
// BenchmarkXTABParse measures the cost of turning one seven-line XTAB stanza
// into a record via the reader's recordFromXTABLines method. The reader and
// the stanza are built once, and the timer is reset afterward so that only
// the parsing loop is measured.
//
// NOTE(review): the same stanza.dataLines list is passed on every iteration;
// this presumably relies on recordFromXTABLines not consuming the list --
// confirm against its implementation.
func BenchmarkXTABParse(b *testing.B) {
	readerOptions := &cli.TReaderOptions{
		InputFileFormat: "xtab",
		IPS:             " ",
		IFS:             "\n",
		IRS:             "\n",
	}
	reader, err := NewRecordReaderXTAB(readerOptions, 1)
	assert.Nil(b, err)

	stanza := newStanza()
	stanza.dataLines.PushBack("color yellow")
	stanza.dataLines.PushBack("shape triangle")
	stanza.dataLines.PushBack("flag true")
	stanza.dataLines.PushBack("k 1")
	stanza.dataLines.PushBack("index 11")
	stanza.dataLines.PushBack("quantity 43.6498")
	stanza.dataLines.PushBack("rate 9.8870")

	// Exclude reader and stanza construction from the timed region.
	b.ResetTimer()
	for i := 0; i < b.N; i++ {
		// Results are deliberately discarded: only parse time is of interest.
		_, _ = reader.recordFromXTABLines(stanza.dataLines)
	}
}

View file

@ -101,7 +101,7 @@ func TryIntFromString(input string) (int, bool) {
}
}
// Following twos-complement formatting familiar from all manners of
// Following twos-complement formatting familiar from all manner of
// languages, including C which was Miller's original implementation
// language, we want to allow 0x00....00 through 0x7f....ff as positive
// 64-bit integers and 0x80....00 through 0xff....ff as negative ones. Go's

View file

@ -0,0 +1,34 @@
package mlrval
import (
"testing"
)
// go test -run=nonesuch -bench=. github.com/johnkerl/miller/internal/pkg/mlrval/...
// BenchmarkFromDeferredType measures construction of a Mlrval from a string
// via FromDeferredType alone; Type() is never called on the result.
func BenchmarkFromDeferredType(b *testing.B) {
	const input = "123"
	for n := 0; n < b.N; n++ {
		_ = FromDeferredType(input)
	}
}
// BenchmarkInferIntFromDeferredType measures construction via
// FromDeferredType followed by a Type() call, for integer-looking text.
func BenchmarkInferIntFromDeferredType(b *testing.B) {
	const input = "123"
	for n := 0; n < b.N; n++ {
		value := FromDeferredType(input)
		value.Type()
	}
}
// BenchmarkInferFloatFromDeferredType measures construction via
// FromDeferredType followed by a Type() call, for float-looking text.
func BenchmarkInferFloatFromDeferredType(b *testing.B) {
	const input = "123.4"
	for n := 0; n < b.N; n++ {
		value := FromDeferredType(input)
		value.Type()
	}
}
// BenchmarkInferStringFromDeferredType measures construction via
// FromDeferredType followed by a Type() call, for non-numeric text.
func BenchmarkInferStringFromDeferredType(b *testing.B) {
	const input = "abc"
	for n := 0; n < b.N; n++ {
		value := FromDeferredType(input)
		value.Type()
	}
}

View file

@ -1,10 +1,9 @@
package mlrval
import (
"regexp"
"strings"
"strconv"
"github.com/johnkerl/miller/internal/pkg/lib"
"github.com/johnkerl/miller/internal/pkg/scan"
)
// TODO: comment no infer-bool from data files. Always false in this path.
@ -15,19 +14,19 @@ import (
func (mv *Mlrval) Type() MVType {
if mv.mvtype == MT_PENDING {
packageLevelInferrer(mv, mv.printrep, false)
packageLevelInferrer(mv)
}
return mv.mvtype
}
// Support for mlr -S, mlr -A, mlr -O.
type tInferrer func(mv *Mlrval, input string, inferBool bool) *Mlrval
type tInferrer func(mv *Mlrval) *Mlrval
var packageLevelInferrer tInferrer = inferWithOctalAsString
var packageLevelInferrer tInferrer = inferNormally
// SetInferrerOctalAsInt is for default behavior.
func SetInferrerOctalAsString() {
packageLevelInferrer = inferWithOctalAsString
// SetInferNormally is the default behavior.
func SetInferNormally() {
packageLevelInferrer = inferNormally
}
// SetInferrerOctalAsInt is for mlr -O.
@ -42,67 +41,25 @@ func SetInferrerIntAsFloat() {
// SetInferrerStringOnly is for mlr -S.
func SetInferrerStringOnly() {
packageLevelInferrer = inferStringOnly
packageLevelInferrer = inferString
}
// When loading data files, don't scan these words into floats -- even though
// the Go library is willing to do so.
var downcasedFloatNamesToNotInfer = map[string]bool{
"inf": true,
"+inf": true,
"-inf": true,
"infinity": true,
"+infinity": true,
"-infinity": true,
"nan": true,
// ----------------------------------------------------------------
// inferNormally is the default inferrer. It classifies the Mlrval's original
// string with scan.FindScanType and then dispatches to the handler at that
// index in normalInferrerTable.
func inferNormally(mv *Mlrval) *Mlrval {
	handler := normalInferrerTable[scan.FindScanType(mv.printrep)]
	return handler(mv)
}
var octalDetector = regexp.MustCompile("^-?0[0-9]+")
// inferWithOctalAsString is for default behavior.
func inferWithOctalAsString(mv *Mlrval, input string, inferBool bool) *Mlrval {
inferWithOctalAsInt(mv, input, inferBool)
if mv.mvtype != MT_INT && mv.mvtype != MT_FLOAT {
return mv
}
if octalDetector.MatchString(mv.printrep) {
return mv.SetFromString(input)
} else {
return mv
}
}
// inferWithOctalAsInt is for mlr -O.
func inferWithOctalAsInt(mv *Mlrval, input string, inferBool bool) *Mlrval {
if input == "" {
return mv.SetFromVoid()
}
intval, iok := lib.TryIntFromString(input)
if iok {
return mv.SetFromPrevalidatedIntString(input, intval)
}
if downcasedFloatNamesToNotInfer[strings.ToLower(input)] == false {
floatval, fok := lib.TryFloatFromString(input)
if fok {
return mv.SetFromPrevalidatedFloatString(input, floatval)
}
}
if inferBool {
boolval, bok := lib.TryBoolFromBoolString(input)
if bok {
return mv.SetFromPrevalidatedBoolString(input, boolval)
}
}
return mv.SetFromString(input)
// xxx temp
// inferWithOctalAsInt classifies the Mlrval's original string with
// scan.FindScanType and then dispatches to the handler at that index in
// leadingZeroAsIntInferrerTable, in which leading-zero numbers are handled
// as integers rather than as strings.
func inferWithOctalAsInt(mv *Mlrval) *Mlrval {
	handler := leadingZeroAsIntInferrerTable[scan.FindScanType(mv.printrep)]
	return handler(mv)
}
// inferWithIntAsFloat is for mlr -A.
func inferWithIntAsFloat(mv *Mlrval, input string, inferBool bool) *Mlrval {
inferWithOctalAsString(mv, input, inferBool)
func inferWithIntAsFloat(mv *Mlrval) *Mlrval {
inferNormally(mv)
if mv.Type() == MT_INT {
mv.floatval = float64(mv.intval)
mv.mvtype = MT_FLOAT
@ -110,7 +67,166 @@ func inferWithIntAsFloat(mv *Mlrval, input string, inferBool bool) *Mlrval {
return mv
}
// inferStringOnly is for mlr -S.
func inferStringOnly(mv *Mlrval, input string, inferBool bool) *Mlrval {
return mv.SetFromString(input)
// inferString is for mlr -S. It is also the entry used in the inferrer tables
// for scan types which are to be left as strings. It does no numeric parsing:
// the Mlrval is set from its own original string via SetFromString.
func inferString(mv *Mlrval) *Mlrval {
return mv.SetFromString(mv.printrep)
}
// ----------------------------------------------------------------
// normalInferrerTable is the dispatch table for default type inference,
// indexed by the value returned from scan.FindScanType.
//
// Important: synchronize this with the type-ordering in the scan package.
var normalInferrerTable = []tInferrer{
	inferString,     // string
	inferDecimalInt, // decimal int
	inferString,     // leading-zero decimal int: left as string here (else inferLeadingZeroDecimalIntAsInt)
	inferOctalInt,   // explicit octal int
	inferString,     // leading-zero octal int: left as string here (else inferFromLeadingZeroOctalIntAsInt)
	inferHexInt,     // hex int
	inferBinaryInt,  // binary int
	inferMaybeFloat, // possibly a float
}
// leadingZeroAsIntInferrerTable is the dispatch table used by
// inferWithOctalAsInt, indexed by the value returned from scan.FindScanType.
// It differs from the default table only in the two leading-zero entries,
// which are handled as integers here.
//
// Important: synchronize this with the type-ordering in the scan package.
var leadingZeroAsIntInferrerTable = []tInferrer{
	inferString,                       // string
	inferDecimalInt,                   // decimal int
	inferLeadingZeroDecimalIntAsInt,   // leading-zero decimal int
	inferOctalInt,                     // explicit octal int
	inferFromLeadingZeroOctalIntAsInt, // leading-zero octal int
	inferHexInt,                       // hex int
	inferBinaryInt,                    // binary int
	inferMaybeFloat,                   // possibly a float
}
// inferDecimalInt handles text which has been scanned as a decimal integer.
//
// If the text parses as a signed 64-bit integer, the Mlrval becomes an int.
// If the only problem is that the digits are out of int64 range, float
// parsing is attempted so that very large digit strings remain numeric
// rather than silently becoming strings. Anything else is left as a string.
func inferDecimalInt(mv *Mlrval) *Mlrval {
	intval, err := strconv.ParseInt(mv.printrep, 10, 64)
	if err == nil {
		return mv.SetFromPrevalidatedIntString(mv.printrep, int(intval))
	}

	// strconv.ParseInt reports failures as *strconv.NumError; a range error
	// means the syntax was fine but the value does not fit in 64 bits.
	if numErr, ok := err.(*strconv.NumError); ok && numErr.Err == strconv.ErrRange {
		floatval, ferr := strconv.ParseFloat(mv.printrep, 64)
		if ferr == nil {
			return mv.SetFromPrevalidatedFloatString(mv.printrep, floatval)
		}
	}

	return mv.SetFromString(mv.printrep)
}
// inferLeadingZeroDecimalIntAsInt handles text which has been scanned as a
// decimal integer with leading zeroes, in the mode where such text is to be
// treated as an int. The text is parsed in base 10; if that fails, the Mlrval
// is left as a string.
func inferLeadingZeroDecimalIntAsInt(mv *Mlrval) *Mlrval {
	text := mv.printrep
	parsed, parseErr := strconv.ParseInt(text, 10, 64)
	if parseErr != nil {
		return mv.SetFromString(text)
	}
	return mv.SetFromPrevalidatedIntString(text, int(parsed))
}
// inferOctalInt handles text scanned as an explicitly prefixed octal integer.
// E.g. explicit 0o377, not 0377
// The prefix handling and base-8 parsing are delegated to inferBaseInt.
func inferOctalInt(mv *Mlrval) *Mlrval {
return inferBaseInt(mv, 8)
}
// inferFromLeadingZeroOctalIntAsInt handles text which has been scanned as a
// leading-zero octal integer, in the mode where such text is to be treated as
// an int. The whole text, including any sign and the leading zero, is parsed
// in base 8; if that fails, the Mlrval is left as a string.
func inferFromLeadingZeroOctalIntAsInt(mv *Mlrval) *Mlrval {
	text := mv.printrep
	parsed, parseErr := strconv.ParseInt(text, 8, 64)
	if parseErr != nil {
		return mv.SetFromString(text)
	}
	return mv.SetFromPrevalidatedIntString(text, int(parsed))
}
// inferHexInt handles text which has been scanned as a hexadecimal integer
// with a leading 0x or -0x prefix.
//
// On a successful parse the Mlrval becomes an int which retains its original
// string formatting. If there are no digits after the prefix, or the digits
// do not parse, the Mlrval is left as a string.
func inferHexInt(mv *Mlrval) *Mlrval {
	digits := mv.printrep
	negate := false
	if len(digits) > 0 && digits[0] == '-' {
		negate = true
		digits = digits[1:]
	}

	// There must be a two-character prefix and at least one digit after it.
	// Without this check, a bare prefix would cause an out-of-range index below.
	if len(digits) < 3 {
		return mv.SetFromString(mv.printrep)
	}
	// Skip the known 0x prefix.
	digits = digits[2:]

	// Following twos-complement formatting familiar from all manner of
	// languages, including C which was Miller's original implementation
	// language, we want to allow 0x00....00 through 0x7f....ff as positive
	// 64-bit integers and 0x80....00 through 0xff....ff as negative ones. Go's
	// signed-int parsing explicitly doesn't allow that, but we don't want Go
	// semantics to dictate Miller semantics. So, we use unsigned-int parsing
	// for full-width values whose leading digit is 8 or above, and signed-int
	// parsing for everything else.
	leading := digits[0]
	if len(digits) == 16 && '8' <= leading && leading <= 'f' {
		uintval, err := strconv.ParseUint(digits, 16, 64)
		if err != nil {
			return mv.SetFromString(mv.printrep)
		}
		// Reinterpret the 64 bits as signed.
		intval := int(uintval)
		if negate {
			intval = -intval
		}
		return mv.SetFromPrevalidatedIntString(mv.printrep, intval)
	}

	intval, err := strconv.ParseInt(digits, 16, 64)
	if err != nil {
		return mv.SetFromString(mv.printrep)
	}
	if negate {
		intval = -intval
	}
	return mv.SetFromPrevalidatedIntString(mv.printrep, int(intval))
}
// inferBinaryInt handles text scanned as a prefixed binary integer. The
// prefix handling and base-2 parsing are delegated to inferBaseInt.
func inferBinaryInt(mv *Mlrval) *Mlrval {
return inferBaseInt(mv, 2)
}
// inferMaybeFloat handles text which has been scanned as possibly being a
// floating-point number. If strconv.ParseFloat accepts the text, the Mlrval
// becomes a float which retains its original string formatting; otherwise
// the Mlrval is left as a string.
func inferMaybeFloat(mv *Mlrval) *Mlrval {
	text := mv.printrep
	parsed, parseErr := strconv.ParseFloat(text, 64)
	if parseErr != nil {
		return mv.SetFromString(text)
	}
	return mv.SetFromPrevalidatedFloatString(text, parsed)
}
// inferFromBool sets the Mlrval to a boolean: true when the original string
// is exactly "true", and false for any other text. No validation is done
// here, so callers should only use this on text already known to be a
// boolean string.
func inferFromBool(mv *Mlrval) *Mlrval {
	isTrue := mv.printrep == "true"
	return mv.SetFromPrevalidatedBoolString(mv.printrep, isTrue)
}
// inferBaseInt is shared code for integers having a two-character base
// prefix: 0o (base 8) and 0b (base 2), each optionally preceded by a minus
// sign.
//
// The digits after the prefix are parsed in the given base. On success the
// Mlrval becomes an int which retains its original string formatting;
// otherwise, including when the text is too short to hold a prefix, the
// Mlrval is left as a string.
func inferBaseInt(mv *Mlrval, base int) *Mlrval {
	digits := mv.printrep
	negate := false
	if len(digits) > 0 && digits[0] == '-' {
		negate = true
		digits = digits[1:]
	}

	// Without this check, slicing off the prefix of too-short text would
	// cause an out-of-range panic.
	if len(digits) < 2 {
		return mv.SetFromString(mv.printrep)
	}
	// Skip the known two-character base prefix.
	digits = digits[2:]

	intval, err := strconv.ParseInt(digits, base, 64)
	if err != nil {
		return mv.SetFromString(mv.printrep)
	}
	if negate {
		intval = -intval
	}
	return mv.SetFromPrevalidatedIntString(mv.printrep, int(intval))
}

View file

@ -0,0 +1,252 @@
// ================================================================
// Tests mlrval constructors.
// ================================================================
package mlrval
import (
"testing"
"github.com/stretchr/testify/assert"
)
func TestInferNormally(t *testing.T) {
assert.True(t, inferNormally(FromDeferredType("")).IsVoid())
assert.True(t, inferNormally(FromDeferredType("true")).IsString())
assert.True(t, inferNormally(FromDeferredType("false")).IsString())
assert.True(t, inferNormally(FromDeferredType("abc")).IsString())
assert.True(t, inferNormally(FromDeferredType("0123")).IsString())
assert.True(t, inferNormally(FromDeferredType("-0123")).IsString())
assert.True(t, inferNormally(FromDeferredType("0377")).IsString())
assert.True(t, inferNormally(FromDeferredType("-0377")).IsString())
assert.True(t, inferNormally(FromDeferredType("0923")).IsString())
assert.True(t, inferNormally(FromDeferredType("-0923")).IsString())
assert.True(t, inferNormally(FromDeferredType("123")).IsInt())
assert.True(t, inferNormally(FromDeferredType("-123")).IsInt())
assert.True(t, inferNormally(FromDeferredType("0xff")).IsInt())
assert.True(t, inferNormally(FromDeferredType("-0xff")).IsInt())
assert.True(t, inferNormally(FromDeferredType("0b1011")).IsInt())
assert.True(t, inferNormally(FromDeferredType("-0b1011")).IsInt())
assert.True(t, inferNormally(FromDeferredType("0x7fffffffffffffff")).IsInt())
assert.True(t, inferNormally(FromDeferredType("0x8000000000000000")).IsInt())
assert.True(t, inferNormally(FromDeferredType("0xffffffffffffffff")).IsInt())
assert.True(t, inferNormally(FromDeferredType("12_3")).IsString())
assert.True(t, inferNormally(FromDeferredType("-12_3")).IsString())
assert.True(t, inferNormally(FromDeferredType("1_2.3_4")).IsString())
assert.True(t, inferNormally(FromDeferredType("-1_2.3_4")).IsString())
assert.True(t, inferNormally(FromDeferredType("0xca_fe")).IsString())
assert.True(t, inferNormally(FromDeferredType("-0xca_fe")).IsString())
assert.True(t, inferNormally(FromDeferredType("0b1011_1101")).IsString())
assert.True(t, inferNormally(FromDeferredType("-0b1011_1101")).IsString())
assert.True(t, inferNormally(FromDeferredType(".")).IsString())
assert.True(t, inferNormally(FromDeferredType("-.")).IsString())
assert.True(t, inferNormally(FromDeferredType("123.")).IsFloat())
assert.True(t, inferNormally(FromDeferredType("-123.")).IsFloat())
assert.True(t, inferNormally(FromDeferredType(".123")).IsFloat())
assert.True(t, inferNormally(FromDeferredType("-.123")).IsFloat())
assert.True(t, inferNormally(FromDeferredType("123.456")).IsFloat())
assert.True(t, inferNormally(FromDeferredType("-123.456")).IsFloat())
assert.True(t, inferNormally(FromDeferredType("1e2.")).IsString())
assert.True(t, inferNormally(FromDeferredType("-1e2.")).IsString())
assert.True(t, inferNormally(FromDeferredType("1e-2.")).IsString())
assert.True(t, inferNormally(FromDeferredType("-1e-2.")).IsString())
assert.True(t, inferNormally(FromDeferredType("1.2e3")).IsFloat())
assert.True(t, inferNormally(FromDeferredType("-1.2e3")).IsFloat())
assert.True(t, inferNormally(FromDeferredType("1.2e-3")).IsFloat())
assert.True(t, inferNormally(FromDeferredType("-1.2e-3")).IsFloat())
assert.True(t, inferNormally(FromDeferredType("1.e3")).IsFloat())
assert.True(t, inferNormally(FromDeferredType("-1.e3")).IsFloat())
assert.True(t, inferNormally(FromDeferredType("1.e-3")).IsFloat())
assert.True(t, inferNormally(FromDeferredType("-1.e-3")).IsFloat())
assert.True(t, inferNormally(FromDeferredType(".2e3")).IsFloat())
assert.True(t, inferNormally(FromDeferredType("-.2e3")).IsFloat())
assert.True(t, inferNormally(FromDeferredType(".2e-3")).IsFloat())
assert.True(t, inferNormally(FromDeferredType("-.2e-3")).IsFloat())
}
// TestInferWithOctalAsInt pins the type that inferWithOctalAsInt assigns to a
// range of inputs. Inputs are grouped by the type they are expected to infer
// to; note that leading-zero inputs such as "0123" and "0923" are expected to
// be ints here.
func TestInferWithOctalAsInt(t *testing.T) {
	wantVoid := []string{
		"",
	}
	wantString := []string{
		"true", "false", "abc",
		"12_3", "-12_3",
		"1_2.3_4", "-1_2.3_4",
		"0xca_fe", "-0xca_fe",
		"0b1011_1101", "-0b1011_1101",
		".", "-.",
		"1e2.", "-1e2.",
		"1e-2.", "-1e-2.",
	}
	wantInt := []string{
		"0123", "-0123",
		"0377", "-0377",
		"0923", "-0923",
		"123", "-123",
		"0xff", "-0xff",
		"0b1011", "-0b1011",
		"0x7fffffffffffffff",
		"0x8000000000000000",
		"0xffffffffffffffff",
	}
	wantFloat := []string{
		"123.", "-123.",
		".123", "-.123",
		"123.456", "-123.456",
		"1.2e3", "-1.2e3",
		"1.2e-3", "-1.2e-3",
		"1.e3", "-1.e3",
		"1.e-3", "-1.e-3",
		".2e3", "-.2e3",
		".2e-3", "-.2e-3",
	}

	for _, input := range wantVoid {
		assert.True(t, inferWithOctalAsInt(FromDeferredType(input)).IsVoid(), "input %q", input)
	}
	for _, input := range wantString {
		assert.True(t, inferWithOctalAsInt(FromDeferredType(input)).IsString(), "input %q", input)
	}
	for _, input := range wantInt {
		assert.True(t, inferWithOctalAsInt(FromDeferredType(input)).IsInt(), "input %q", input)
	}
	for _, input := range wantFloat {
		assert.True(t, inferWithOctalAsInt(FromDeferredType(input)).IsFloat(), "input %q", input)
	}
}
// TestInferWithIntAsFloat pins the type that inferWithIntAsFloat assigns to a
// range of inputs: everything asserted as an int-looking input here ("123",
// "0xff", "0b1011", ...) is expected to come back as a float, while
// leading-zero inputs such as "0123" are expected to stay strings.
func TestInferWithIntAsFloat(t *testing.T) {
	assert.True(t, inferWithIntAsFloat(FromDeferredType("")).IsVoid())
	assert.True(t, inferWithIntAsFloat(FromDeferredType("true")).IsString())
	assert.True(t, inferWithIntAsFloat(FromDeferredType("false")).IsString())
	assert.True(t, inferWithIntAsFloat(FromDeferredType("abc")).IsString())
	assert.True(t, inferWithIntAsFloat(FromDeferredType("0123")).IsString())
	assert.True(t, inferWithIntAsFloat(FromDeferredType("-0123")).IsString())
	assert.True(t, inferWithIntAsFloat(FromDeferredType("0377")).IsString())
	assert.True(t, inferWithIntAsFloat(FromDeferredType("-0377")).IsString())
	assert.True(t, inferWithIntAsFloat(FromDeferredType("0923")).IsString())
	assert.True(t, inferWithIntAsFloat(FromDeferredType("-0923")).IsString())
	assert.True(t, inferWithIntAsFloat(FromDeferredType("123")).IsFloat())
	assert.True(t, inferWithIntAsFloat(FromDeferredType("-123")).IsFloat())
	assert.True(t, inferWithIntAsFloat(FromDeferredType("0xff")).IsFloat())
	assert.True(t, inferWithIntAsFloat(FromDeferredType("-0xff")).IsFloat())
	assert.True(t, inferWithIntAsFloat(FromDeferredType("0b1011")).IsFloat())
	assert.True(t, inferWithIntAsFloat(FromDeferredType("-0b1011")).IsFloat())
	assert.True(t, inferWithIntAsFloat(FromDeferredType("0x7fffffffffffffff")).IsFloat())
	assert.True(t, inferWithIntAsFloat(FromDeferredType("0x8000000000000000")).IsFloat())
	assert.True(t, inferWithIntAsFloat(FromDeferredType("0xffffffffffffffff")).IsFloat())
	assert.True(t, inferWithIntAsFloat(FromDeferredType("12_3")).IsString())
	assert.True(t, inferWithIntAsFloat(FromDeferredType("-12_3")).IsString())
	assert.True(t, inferWithIntAsFloat(FromDeferredType("1_2.3_4")).IsString())
	assert.True(t, inferWithIntAsFloat(FromDeferredType("-1_2.3_4")).IsString())
	assert.True(t, inferWithIntAsFloat(FromDeferredType("0xca_fe")).IsString())
	assert.True(t, inferWithIntAsFloat(FromDeferredType("-0xca_fe")).IsString())
	assert.True(t, inferWithIntAsFloat(FromDeferredType("0b1011_1101")).IsString())
	assert.True(t, inferWithIntAsFloat(FromDeferredType("-0b1011_1101")).IsString())
	assert.True(t, inferWithIntAsFloat(FromDeferredType(".")).IsString())
	assert.True(t, inferWithIntAsFloat(FromDeferredType("-.")).IsString())
	assert.True(t, inferWithIntAsFloat(FromDeferredType("123.")).IsFloat())
	assert.True(t, inferWithIntAsFloat(FromDeferredType("-123.")).IsFloat())
	assert.True(t, inferWithIntAsFloat(FromDeferredType(".123")).IsFloat())
	assert.True(t, inferWithIntAsFloat(FromDeferredType("-.123")).IsFloat())
	assert.True(t, inferWithIntAsFloat(FromDeferredType("123.456")).IsFloat())
	assert.True(t, inferWithIntAsFloat(FromDeferredType("-123.456")).IsFloat())
	assert.True(t, inferWithIntAsFloat(FromDeferredType("1e2.")).IsString())
	assert.True(t, inferWithIntAsFloat(FromDeferredType("-1e2.")).IsString())
	assert.True(t, inferWithIntAsFloat(FromDeferredType("1e-2.")).IsString())
	assert.True(t, inferWithIntAsFloat(FromDeferredType("-1e-2.")).IsString())
	assert.True(t, inferWithIntAsFloat(FromDeferredType("1.2e3")).IsFloat())
	assert.True(t, inferWithIntAsFloat(FromDeferredType("-1.2e3")).IsFloat())
	assert.True(t, inferWithIntAsFloat(FromDeferredType("1.2e-3")).IsFloat())
	assert.True(t, inferWithIntAsFloat(FromDeferredType("-1.2e-3")).IsFloat())
	assert.True(t, inferWithIntAsFloat(FromDeferredType("1.e3")).IsFloat())
	assert.True(t, inferWithIntAsFloat(FromDeferredType("-1.e3")).IsFloat())
	assert.True(t, inferWithIntAsFloat(FromDeferredType("1.e-3")).IsFloat())
	assert.True(t, inferWithIntAsFloat(FromDeferredType("-1.e-3")).IsFloat())
	assert.True(t, inferWithIntAsFloat(FromDeferredType(".2e3")).IsFloat())
	assert.True(t, inferWithIntAsFloat(FromDeferredType("-.2e3")).IsFloat())
	assert.True(t, inferWithIntAsFloat(FromDeferredType(".2e-3")).IsFloat())
	// This case previously called inferWithOctalAsInt, so the inferrer under
	// test was never exercised with "-.2e-3".
	assert.True(t, inferWithIntAsFloat(FromDeferredType("-.2e-3")).IsFloat())
}
// TestInferString pins the behavior of inferString: the empty input is
// expected to be void, and every other input -- including ones that look
// like ints or floats -- is expected to be a string.
func TestInferString(t *testing.T) {
	assert.True(t, inferString(FromDeferredType("")).IsVoid())

	nonEmptyInputs := []string{
		"true", "false", "abc",
		"0123", "-0123",
		"0377", "-0377",
		"0923", "-0923",
		"123", "-123",
		"0xff", "-0xff",
		"0b1011", "-0b1011",
		"0x7fffffffffffffff",
		"0x8000000000000000",
		"0xffffffffffffffff",
		"12_3", "-12_3",
		"1_2.3_4", "-1_2.3_4",
		"0xca_fe", "-0xca_fe",
		"0b1011_1101", "-0b1011_1101",
		".", "-.",
		"123.", "-123.",
		".123", "-.123",
		"123.456", "-123.456",
		"1e2.", "-1e2.",
		"1e-2.", "-1e-2.",
		"1.2e3", "-1.2e3",
		"1.2e-3", "-1.2e-3",
		"1.e3", "-1.e3",
		"1.e-3", "-1.e-3",
		".2e3", "-.2e3",
		".2e-3", "-.2e-3",
	}
	for _, input := range nonEmptyInputs {
		assert.True(t, inferString(FromDeferredType(input)).IsString(), "input %q", input)
	}
}

View file

@ -39,9 +39,15 @@ func FromInferredType(input string) *Mlrval {
printrep: input,
printrepValid: true,
}
// TODO: comment re inferBool arg
packageLevelInferrer(mv, mv.printrep, true)
return mv
// TODO: comment re data files vs literals context -- this is for the latter
if input == "true" {
return TRUE
} else if input == "false" {
return FALSE
} else {
packageLevelInferrer(mv)
return mv
}
}
func FromString(input string) *Mlrval {

View file

@ -0,0 +1,90 @@
package scan
// TODO: comment re context
// 00000000: 00 01 02 03 04 05 06 07 08 09 0a 0b 0c 0d 0e 0f |................|
// 00000010: 10 11 12 13 14 15 16 17 18 19 1a 1b 1c 1d 1e 1f |................|
// 00000020: 20 21 22 23 24 25 26 27 28 29 2a 2b 2c 2d 2e 2f | !"#$%&'()*+,-./|
// 00000030: 30 31 32 33 34 35 36 37 38 39 3a 3b 3c 3d 3e 3f |0123456789:;<=>?|
// 00000040: 40 41 42 43 44 45 46 47 48 49 4a 4b 4c 4d 4e 4f |@ABCDEFGHIJKLMNO|
// 00000050: 50 51 52 53 54 55 56 57 58 59 5a 5b 5c 5d 5e 5f |PQRSTUVWXYZ[\]^_|
// 00000060: 60 61 62 63 64 65 66 67 68 69 6a 6b 6c 6d 6e 6f |`abcdefghijklmno|
// 00000070: 70 71 72 73 74 75 76 77 78 79 7a 7b 7c 7d 7e 7f |pqrstuvwxyz{|}~.|
// isDecimalDigitTable is a 128-entry lookup table indexed by ASCII code. An
// entry is true exactly when the code is one of '0' through '9'.
var isDecimalDigitTable = []bool{
	'0': true, '1': true, '2': true, '3': true, '4': true,
	'5': true, '6': true, '7': true, '8': true, '9': true,

	0x7f: false, // highest index: sizes the table to cover 0x00-0x7f
}
// isOctalDigitTable is a 128-entry lookup table indexed by ASCII code. An
// entry is true exactly when the code is one of '0' through '7'.
var isOctalDigitTable = []bool{
	'0': true, '1': true, '2': true, '3': true,
	'4': true, '5': true, '6': true, '7': true,

	0x7f: false, // highest index: sizes the table to cover 0x00-0x7f
}
// isHexDigitTable is a 128-entry lookup table indexed by ASCII code. An entry
// is true exactly when the code is one of '0'-'9', 'A'-'F', or 'a'-'f'.
var isHexDigitTable = []bool{
	'0': true, '1': true, '2': true, '3': true, '4': true,
	'5': true, '6': true, '7': true, '8': true, '9': true,

	'A': true, 'B': true, 'C': true, 'D': true, 'E': true, 'F': true,

	'a': true, 'b': true, 'c': true, 'd': true, 'e': true, 'f': true,

	0x7f: false, // highest index: sizes the table to cover 0x00-0x7f
}
// Possible characters in floats include '.', 0-9, [eE], [-+] -- the latter two for things like 1.2e-8.
// Miller intentionally does not accept 'inf' or 'NaN' as float numbers in file-input data.
// isFloatDigitTable is a 128-entry lookup table indexed by ASCII code. An
// entry is true exactly when the code is one of '0'-'9', '.', '+', '-', 'e',
// or 'E'.
var isFloatDigitTable = []bool{
	'+': true, '-': true, '.': true,

	'0': true, '1': true, '2': true, '3': true, '4': true,
	'5': true, '6': true, '7': true, '8': true, '9': true,

	'E': true,
	'e': true,

	0x7f: false, // highest index: sizes the table to cover 0x00-0x7f
}
// isDecimalDigit reports whether c has a true entry in isDecimalDigitTable.
// Bytes of 128 and above are never looked up and always yield false.
func isDecimalDigit(c byte) bool {
	// byte is unsigned in Go, so only the upper bound needs checking.
	return c < 128 && isDecimalDigitTable[c]
}
// isOctalDigit reports whether c has a true entry in isOctalDigitTable.
// Bytes of 128 and above are never looked up and always yield false.
func isOctalDigit(c byte) bool {
	// byte is unsigned in Go, so only the upper bound needs checking.
	return c < 128 && isOctalDigitTable[c]
}
// isHexDigit reports whether c has a true entry in isHexDigitTable.
// Bytes of 128 and above are never looked up and always yield false.
func isHexDigit(c byte) bool {
	// byte is unsigned in Go, so only the upper bound needs checking.
	return c < 128 && isHexDigitTable[c]
}
// isFloatDigit reports whether c has a true entry in isFloatDigitTable.
// Bytes of 128 and above are never looked up and always yield false.
func isFloatDigit(c byte) bool {
	// byte is unsigned in Go, so only the upper bound needs checking.
	return c < 128 && isFloatDigitTable[c]
}

View file

@ -0,0 +1,57 @@
package scan
import (
"testing"
"github.com/stretchr/testify/assert"
)
// TestIsDecimalDigit checks isDecimalDigit against every possible byte value,
// 0x00 through 0xff inclusive.
func TestIsDecimalDigit(t *testing.T) {
	// Count with an int rather than a byte: a byte counter cannot represent
	// "one past 0xff", so a byte loop must either stop short of 0xff (as the
	// previous "c < 0xff" did) or wrap around and never terminate.
	for i := 0x00; i <= 0xff; i++ {
		c := byte(i)
		want := c >= '0' && c <= '9'
		assert.Equal(t, want, isDecimalDigit(c), "byte 0x%02x", c)
	}
}
// TestIsOctalDigit checks isOctalDigit against every possible byte value,
// 0x00 through 0xff inclusive.
func TestIsOctalDigit(t *testing.T) {
	// Count with an int rather than a byte: a byte counter cannot represent
	// "one past 0xff", so a byte loop must either stop short of 0xff (as the
	// previous "c < 0xff" did) or wrap around and never terminate.
	for i := 0x00; i <= 0xff; i++ {
		c := byte(i)
		want := c >= '0' && c <= '7'
		assert.Equal(t, want, isOctalDigit(c), "byte 0x%02x", c)
	}
}
// TestIsHexDigit checks isHexDigit against every possible byte value, 0x00
// through 0xff inclusive.
func TestIsHexDigit(t *testing.T) {
	// Count with an int rather than a byte: a byte counter cannot represent
	// "one past 0xff", so a byte loop must either stop short of 0xff (as the
	// previous "c < 0xff" did) or wrap around and never terminate.
	for i := 0x00; i <= 0xff; i++ {
		c := byte(i)
		want := (c >= '0' && c <= '9') ||
			(c >= 'a' && c <= 'f') ||
			(c >= 'A' && c <= 'F')
		assert.Equal(t, want, isHexDigit(c), "byte 0x%02x", c)
	}
}
// TestIsFloatDigit checks isFloatDigit against every possible byte value,
// 0x00 through 0xff inclusive.
func TestIsFloatDigit(t *testing.T) {
	// Count with an int rather than a byte: a byte counter cannot represent
	// "one past 0xff", so a byte loop must either stop short of 0xff (as the
	// previous "c < 0xff" did) or wrap around and never terminate.
	for i := 0x00; i <= 0xff; i++ {
		c := byte(i)
		want := (c >= '0' && c <= '9') ||
			c == '.' || c == '-' || c == '+' || c == 'e' || c == 'E'
		assert.Equal(t, want, isFloatDigit(c), "byte 0x%02x", c)
	}
}

3
internal/pkg/scan/doc.go Normal file
View file

@ -0,0 +1,3 @@
// Package scan contains low-level logic for efficient type-inference of string
// to int/float/bool/string.
package scan

182
internal/pkg/scan/find.go Normal file
View file

@ -0,0 +1,182 @@
package scan
import ()
// TODO: comment re context
// o grammar for numbers & case-through
// k len 0
// - len 1
// k has leading minus; strip & rest
// - 0x, 0b, 0[0-9]
// - decimal: leading minus; [0-9]+
// - octal: leading minus; 0[0-7]+
// - hex: leading minus; 0[xX][0-9a-fA-F]+
// - float: leading minus; [0-9] or '.'
//
// o float literals:
// 123 123. 123.4 .234
// 1e2 1e-2 1.2e3 1.e3 1.2e-3 1.e-3
// .2e3 .2e-3 1.e-3
//
// ?- [0-9]+
// ?- [0-9]+ '.' [0-9]*
// ?- [0-9]* '.' [0-9]+
// ?- [0-9]+ [eE] ?- [0-9]+
// ?- [0-9]+ '.' [0-9]* [eE] ?- [0-9]+
// ?- [0-9]* '.' [0-9]+ [eE] ?- [0-9]+
// FindScanType classifies sinput by inspecting its leading byte and handing
// the rest of the work to a more specific scanner:
//
//   - empty input is a string;
//   - after a leading '-', the remainder is classified by
//     findScanTypePositiveNumberOrString;
//   - input starting with a decimal digit is classified by
//     findScanTypePositiveNumberOrString;
//   - input starting with '.' is a string if that is all there is, and is
//     otherwise classified by findScanTypePositiveDecimalOrFloatOrString;
//   - anything else is a string.
func FindScanType(sinput string) ScanType {
	input := []byte(sinput)
	if len(input) == 0 {
		return scanTypeString
	}

	switch first := input[0]; {
	case first == '-':
		// Strip the sign and classify what follows it.
		return findScanTypePositiveNumberOrString(input[1:])
	case first >= '0' && first <= '9':
		return findScanTypePositiveNumberOrString(input)
	case first == '.' && len(input) > 1:
		return findScanTypePositiveDecimalOrFloatOrString(input)
	default:
		// Includes a lone ".".
		return scanTypeString
	}
}
// Convenience function for unit test
func findScanTypeName(sinput string) string {
return TypeNames[FindScanType(sinput)]
}
// findScanTypePositiveNumberOrString classifies input, from which any leading
// minus sign has already been removed by the caller.
//
//   - Empty input, or input not starting with '.' or a decimal digit, is a
//     string.
//   - Input starting with '.' goes to findScanTypePositiveFloatOrString.
//   - A single decimal digit is a decimal int.
//   - "0x"/"0o"/"0b" prefixes (either letter case) go to the hex, octal, and
//     binary scanners respectively, with the prefix removed; a bare prefix
//     with nothing after it is a string.
//   - Other input starting with '0' whose remaining bytes are all decimal
//     digits is a leading-zero octal int if those bytes are all octal
//     digits, else a leading-zero decimal int.
//   - Everything else goes to findScanTypePositiveDecimalOrFloatOrString.
func findScanTypePositiveNumberOrString(input []byte) ScanType {
	if len(input) == 0 {
		return scanTypeString
	}

	first := input[0]
	if first == '.' {
		return findScanTypePositiveFloatOrString(input)
	}
	if !isDecimalDigit(first) {
		return scanTypeString
	}
	if len(input) == 1 {
		return scanTypeDecimalInt
	}
	if first != '0' {
		return findScanTypePositiveDecimalOrFloatOrString(input)
	}

	// From here on the input is '0' followed by at least one more byte.
	afterPrefix := input[2:]
	switch input[1] {
	case 'x', 'X':
		if len(afterPrefix) == 0 {
			return scanTypeString
		}
		return findScanTypePositiveHexOrString(afterPrefix)
	case 'o', 'O':
		if len(afterPrefix) == 0 {
			return scanTypeString
		}
		return findScanTypePositiveOctalOrString(afterPrefix)
	case 'b', 'B':
		if len(afterPrefix) == 0 {
			return scanTypeString
		}
		return findScanTypePositiveBinaryOrString(afterPrefix)
	}

	// Leading zero without a base prefix, e.g. "0377", "0899", "0.5". Every
	// octal digit is also a decimal digit, so a byte that is not a decimal
	// digit rules out both leading-zero int types at once.
	sawNonOctal := false
	for _, c := range input[1:] {
		if !isDecimalDigit(c) {
			return findScanTypePositiveDecimalOrFloatOrString(input)
		}
		if !isOctalDigit(c) {
			sawNonOctal = true
		}
	}
	if sawNonOctal {
		return scanTypeLeadingZeroDecimalInt
	}
	return scanTypeLeadingZeroOctalInt
}
// findScanTypePositiveFloatOrString returns scanTypeString as soon as it
// finds a byte that isFloatDigit rejects, and scanTypeMaybeFloat if there is
// no such byte.
func findScanTypePositiveFloatOrString(input []byte) ScanType {
	for i := range input {
		if !isFloatDigit(input[i]) {
			return scanTypeString
		}
	}
	return scanTypeMaybeFloat
}
// findScanTypePositiveDecimalOrFloatOrString scans every byte of input and
// returns:
//
//   - scanTypeString as soon as a byte is not a float digit (e.g. "1x2");
//   - scanTypeMaybeFloat if all bytes are float digits but at least one is
//     not a decimal digit (e.g. "1e2", "1.5");
//   - scanTypeDecimalInt otherwise.
func findScanTypePositiveDecimalOrFloatOrString(input []byte) ScanType {
	sawNonDecimal := false
	for i := 0; i < len(input); i++ {
		c := input[i]
		// Every decimal digit is also a float digit, so a byte that is not a
		// float digit rules out both numeric interpretations.
		if !isFloatDigit(c) {
			return scanTypeString
		}
		// A float-only byte such as '.', 'e', or '-'.
		if !isDecimalDigit(c) {
			sawNonDecimal = true
		}
	}

	if sawNonDecimal {
		return scanTypeMaybeFloat
	}
	return scanTypeDecimalInt
}
// findScanTypePositiveOctalOrString expects input with the leading "0o"
// already stripped. It returns scanTypeString as soon as it finds a byte
// that is not an octal digit, and scanTypeOctalInt if there is none.
func findScanTypePositiveOctalOrString(input []byte) ScanType {
	for i := range input {
		if !isOctalDigit(input[i]) {
			return scanTypeString
		}
	}
	return scanTypeOctalInt
}
// findScanTypePositiveHexOrString expects input with the leading "0x"
// already stripped. It returns scanTypeString as soon as it finds a byte
// that is not a hex digit, and scanTypeHexInt if there is none.
func findScanTypePositiveHexOrString(input []byte) ScanType {
	for i := range input {
		if !isHexDigit(input[i]) {
			return scanTypeString
		}
	}
	return scanTypeHexInt
}
// findScanTypePositiveBinaryOrString expects input with the leading "0b"
// already stripped. It returns scanTypeString as soon as it finds a byte
// other than '0' or '1', and scanTypeBinaryInt if there is none.
func findScanTypePositiveBinaryOrString(input []byte) ScanType {
	for i := range input {
		if bit := input[i]; bit != '0' && bit != '1' {
			return scanTypeString
		}
	}
	return scanTypeBinaryInt
}

View file

@ -0,0 +1,68 @@
package scan
import (
"testing"
)
// go test -run=nonesuch -bench=. github.com/johnkerl/miller/internal/pkg/scan/...

// BenchmarkFromNormalCases times FindScanType while cycling through a fixed
// list of field values: short words, "true"/"false", small ints, and
// four-decimal-place floats.
func BenchmarkFromNormalCases(b *testing.B) {
	inputs := []string{
		"yellow", "triangle", "true", "1", "11", "43.6498", "9.8870",
		"red", "square", "true", "2", "15", "79.2778", "0.0130",
		"red", "circle", "true", "3", "16", "13.8103", "2.9010",
		"red", "square", "false", "4", "48", "77.5542", "7.4670",
		"purple", "triangle", "false", "5", "51", "81.2290", "8.5910",
		"red", "square", "false", "6", "64", "77.1991", "9.5310",
		"purple", "triangle", "false", "7", "65", "80.1405", "5.8240",
		"yellow", "circle", "true", "8", "73", "63.9785", "4.2370",
		"yellow", "circle", "true", "9", "87", "63.5058", "8.3350",
		"purple", "square", "false", "10", "91", "72.3735", "8.2430",
	}

	for i, n := 0, len(inputs); i < b.N; i++ {
		_ = FindScanType(inputs[i%n])
	}
}
// BenchmarkFromAbnormalCases times FindScanType while cycling through a fixed
// list of edge-case inputs: empty and sign-only strings, malformed numbers,
// bare and invalid base prefixes, and boolean-looking words.
func BenchmarkFromAbnormalCases(b *testing.B) {
	inputs := []string{
		"", "-",
		"abc", "-abc",
		"0", "-0",
		"1", "-1",
		"2", "-2",
		"123", "-123",
		"1.", "-1.",
		".2", "-.2",
		".", "-.",
		"1.2", "-1.2",
		"1.2.3", "-1.2.3",
		"1e2e3", "-1e2e3",
		"12e-2", "-12e-2",
		"1e2x3", "-1e2x3",
		"0x", "-0x",
		"0x0", "-0x0",
		"0xcafe", "-0xcafe",
		"0xcape", "-0xcape",
		"0o", "-0o",
		"0o0", "-0o0",
		"0o1234", "-0o1234",
		"0b", "-0b",
		"0b0", "-0b0",
		"0b1011", "-0b1011",
		"0b1021", "-0b1021",
		"true", "true",
		"false", "false",
		"True", "True",
		"False", "False",
	}

	for i, n := 0, len(inputs); i < b.N; i++ {
		_ = FindScanType(inputs[i%n])
	}
}

View file

@ -0,0 +1,114 @@
package scan
import (
"testing"
"github.com/stretchr/testify/assert"
)
// TestFindScanTypeNameStrings checks inputs that must scan as plain strings:
// the empty string, a lone minus sign, and alphabetic text with and without
// a leading minus.
func TestFindScanTypeNameStrings(t *testing.T) {
	for _, input := range []string{"", "-", "abc", "-abc"} {
		assert.Equal(t, typeNameString, findScanTypeName(input), "input %q", input)
	}
}
// TestFindScanTypeNameDecimals checks that plain decimal integers, with and
// without a leading minus, scan as decimal ints.
func TestFindScanTypeNameDecimals(t *testing.T) {
	inputs := []string{
		"0", "-0",
		"1", "-1",
		"2", "-2",
		"123", "-123",
	}
	for _, input := range inputs {
		assert.Equal(t, typeNameDecimalInt, findScanTypeName(input), "input %q", input)
	}
}
func TestFindScanTypeNameFloats(t *testing.T) {
assert.Equal(t, typeNameMaybeFloat, findScanTypeName("1."))
assert.Equal(t, typeNameMaybeFloat, findScanTypeName("-1."))
assert.Equal(t, typeNameMaybeFloat, findScanTypeName(".2"))
assert.Equal(t, typeNameMaybeFloat, findScanTypeName("-.2"))
assert.Equal(t, typeNameMaybeFloat, findScanTypeName("-."))
assert.Equal(t, typeNameMaybeFloat, findScanTypeName("1.2"))
assert.Equal(t, typeNameMaybeFloat, findScanTypeName("-1.2"))
assert.Equal(t, typeNameMaybeFloat, findScanTypeName("12e-2"))
assert.Equal(t, typeNameMaybeFloat, findScanTypeName("-12e-2"))
assert.Equal(t, typeNameMaybeFloat, findScanTypeName("1.2.3"))
assert.Equal(t, typeNameMaybeFloat, findScanTypeName("-1.2.3"))
assert.Equal(t, typeNameMaybeFloat, findScanTypeName("1e2e3"))
assert.Equal(t, typeNameMaybeFloat, findScanTypeName("-1e2e3"))
assert.Equal(t, typeNameString, findScanTypeName("."))
assert.Equal(t, typeNameString, findScanTypeName("1e2x3"))
assert.Equal(t, typeNameString, findScanTypeName("-1e2x3"))
assert.Equal(t, typeNameString, findScanTypeName("inf"))
assert.Equal(t, typeNameString, findScanTypeName("infinity"))
assert.Equal(t, typeNameString, findScanTypeName("NaN"))
assert.Equal(t, typeNameString, findScanTypeName("-inf"))
assert.Equal(t, typeNameString, findScanTypeName("-infinity"))
assert.Equal(t, typeNameString, findScanTypeName("-NaN"))
}
func TestFindScanTypeNameHexes(t *testing.T) {
assert.Equal(t, typeNameHexInt, findScanTypeName("0x0"))
assert.Equal(t, typeNameHexInt, findScanTypeName("-0x0"))
assert.Equal(t, typeNameHexInt, findScanTypeName("0xf"))
assert.Equal(t, typeNameHexInt, findScanTypeName("-0xf"))
assert.Equal(t, typeNameHexInt, findScanTypeName("0xcafe"))
assert.Equal(t, typeNameHexInt, findScanTypeName("-0xcafe"))
assert.Equal(t, typeNameHexInt, findScanTypeName("0x7ffffffffffffffe"))
assert.Equal(t, typeNameHexInt, findScanTypeName("0x7fffffffffffffff"))
assert.Equal(t, typeNameHexInt, findScanTypeName("0x8000000000000000"))
assert.Equal(t, typeNameHexInt, findScanTypeName("0x8000000000000001"))
assert.Equal(t, typeNameHexInt, findScanTypeName("0xfffffffffffffffe"))
assert.Equal(t, typeNameHexInt, findScanTypeName("0xffffffffffffffff"))
assert.Equal(t, typeNameString, findScanTypeName("0x"))
assert.Equal(t, typeNameString, findScanTypeName("-0x"))
assert.Equal(t, typeNameString, findScanTypeName("0xcape"))
assert.Equal(t, typeNameString, findScanTypeName("-0xcape"))
}
func TestFindScanTypeNameOctals(t *testing.T) {
assert.Equal(t, typeNameLeadingZeroOctalInt, findScanTypeName("00"))
assert.Equal(t, typeNameLeadingZeroOctalInt, findScanTypeName("-00"))
assert.Equal(t, typeNameLeadingZeroOctalInt, findScanTypeName("01"))
assert.Equal(t, typeNameLeadingZeroOctalInt, findScanTypeName("-01"))
assert.Equal(t, typeNameLeadingZeroOctalInt, findScanTypeName("0377"))
assert.Equal(t, typeNameLeadingZeroOctalInt, findScanTypeName("-0377"))
assert.Equal(t, typeNameLeadingZeroDecimalInt, findScanTypeName("08"))
assert.Equal(t, typeNameLeadingZeroDecimalInt, findScanTypeName("-08"))
assert.Equal(t, typeNameLeadingZeroDecimalInt, findScanTypeName("06789"))
assert.Equal(t, typeNameLeadingZeroDecimalInt, findScanTypeName("-06789"))
assert.Equal(t, typeNameOctalInt, findScanTypeName("0o377"))
assert.Equal(t, typeNameOctalInt, findScanTypeName("-0o377"))
assert.Equal(t, typeNameString, findScanTypeName("0o6789"))
assert.Equal(t, typeNameString, findScanTypeName("-0o6789"))
}
func TestFindScanTypeNameBinaries(t *testing.T) {
assert.Equal(t, typeNameBinaryInt, findScanTypeName("0b0"))
assert.Equal(t, typeNameBinaryInt, findScanTypeName("-0b0"))
assert.Equal(t, typeNameBinaryInt, findScanTypeName("0b1011"))
assert.Equal(t, typeNameBinaryInt, findScanTypeName("-0b1011"))
assert.Equal(t, typeNameString, findScanTypeName("0b"))
assert.Equal(t, typeNameString, findScanTypeName("-0b"))
assert.Equal(t, typeNameString, findScanTypeName("0b1021"))
assert.Equal(t, typeNameString, findScanTypeName("-0b1021"))
}
// TestFindScanTypeNameBooleans checks that boolean-looking words scan as
// plain strings.
func TestFindScanTypeNameBooleans(t *testing.T) {
	for _, input := range []string{"true", "True", "false", "False"} {
		assert.Equal(t, typeNameString, findScanTypeName(input), "input %q", input)
	}
}

36
internal/pkg/scan/type.go Normal file
View file

@ -0,0 +1,36 @@
package scan
// ScanType is the classification that the scanner assigns to an input
// string. Its values are also the indices into TypeNames.
type ScanType int

// The scan types. Every constant is explicitly a ScanType: in a const block,
// a line with its own "= N" does not inherit the type of the line above it
// and would otherwise be an untyped integer constant.
const (
	scanTypeString                ScanType = 0
	scanTypeDecimalInt            ScanType = 1
	scanTypeLeadingZeroDecimalInt ScanType = 2
	scanTypeOctalInt              ScanType = 3
	scanTypeLeadingZeroOctalInt   ScanType = 4
	scanTypeHexInt                ScanType = 5
	scanTypeBinaryInt             ScanType = 6
	scanTypeMaybeFloat            ScanType = 7
)
// Human-readable names for the scan types.
const (
	typeNameString                = "string"
	typeNameDecimalInt            = "decint"   // e.g. 123
	typeNameLeadingZeroDecimalInt = "lzdecint" // e.g. 0899
	typeNameOctalInt              = "octint"   // e.g. 0o377
	typeNameLeadingZeroOctalInt   = "lzoctint" // e.g. 0377
	typeNameHexInt                = "hexint"   // e.g. 0xcafe
	typeNameBinaryInt             = "binint"   // e.g. 0b1011
	typeNameMaybeFloat            = "float?"   // characters in [0-9\.-+eE] but needs parse to be sure
)
var TypeNames = []string{
typeNameString,
typeNameDecimalInt,
typeNameLeadingZeroDecimalInt,
typeNameOctalInt,
typeNameLeadingZeroOctalInt,
typeNameHexInt,
typeNameBinaryInt,
typeNameMaybeFloat,
}

View file

@ -0,0 +1,18 @@
package scan
import (
"testing"
"github.com/stretchr/testify/assert"
)
// TestTypeNames checks that indexing TypeNames by each scan-type constant
// yields that type's name.
func TestTypeNames(t *testing.T) {
	// assert.Equal takes (expected, actual); keeping that order makes a
	// failure report label the two values correctly.
	assert.Equal(t, "string", TypeNames[scanTypeString])
	assert.Equal(t, "decint", TypeNames[scanTypeDecimalInt])
	assert.Equal(t, "lzdecint", TypeNames[scanTypeLeadingZeroDecimalInt]) // e.g. 0899
	assert.Equal(t, "octint", TypeNames[scanTypeOctalInt])                // e.g. 0o377
	assert.Equal(t, "lzoctint", TypeNames[scanTypeLeadingZeroOctalInt])   // e.g. 0377
	assert.Equal(t, "hexint", TypeNames[scanTypeHexInt])                  // e.g. 0xcafe
	assert.Equal(t, "binint", TypeNames[scanTypeBinaryInt])               // e.g. 0b1011
	assert.Equal(t, "float?", TypeNames[scanTypeMaybeFloat])              // characters in [0-9\.-+eE] but needs parse to be sure
}

View file

@ -123,6 +123,7 @@ HELP OPTIONS
mlr help miscellaneous-flags
mlr help output-colorization-flags
mlr help pprint-only-flags
mlr help profiling-flags
mlr help separator-flags
Verbs:
mlr help list-verbs
@ -616,6 +617,20 @@ PPRINT-ONLY FLAGS
for input).
--right Right-justifies all fields for PPRINT output.
PROFILING FLAGS
These are flags for profiling Miller performance.
--cpuprofile {CPU-profile file name}
Create a CPU-profile file for performance analysis.
Instructions will be printed to stderr. This flag
must be the very first thing after 'mlr' on the
command line.
--time Print elapsed execution time in seconds to stderr at
the end of the execution of the program.
--traceprofile Create a trace-profile file for performance analysis.
Instructions will be printed to stderr. This flag
must be the very first thing after 'mlr' on the
command line.
SEPARATOR FLAGS
See the Separators doc page for more about record separators, field
separators, and pair separators. Also see the File formats doc page, or
@ -735,6 +750,7 @@ AUXILIARY COMMANDS
help
regtest
repl
version
For more information, please invoke mlr {subcommand} --help.
MLRRC
@ -3003,4 +3019,4 @@ SEE ALSO
2021-12-25 MILLER(1)
2021-12-27 MILLER(1)

View file

@ -2,12 +2,12 @@
.\" Title: mlr
.\" Author: [see the "AUTHOR" section]
.\" Generator: ./mkman.rb
.\" Date: 2021-12-25
.\" Date: 2021-12-27
.\" Manual: \ \&
.\" Source: \ \&
.\" Language: English
.\"
.TH "MILLER" "1" "2021-12-25" "\ \&" "\ \&"
.TH "MILLER" "1" "2021-12-27" "\ \&" "\ \&"
.\" -----------------------------------------------------------------
.\" * Portability definitions
.\" ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@ -158,6 +158,7 @@ Flags:
mlr help miscellaneous-flags
mlr help output-colorization-flags
mlr help pprint-only-flags
mlr help profiling-flags
mlr help separator-flags
Verbs:
mlr help list-verbs
@ -753,6 +754,28 @@ These are flags which are applicable to PPRINT output format.
.fi
.if n \{\
.RE
.SH "PROFILING FLAGS"
.sp
.if n \{\
.RS 0
.\}
.nf
These are flags for profiling Miller performance.
--cpuprofile {CPU-profile file name}
Create a CPU-profile file for performance analysis.
Instructions will be printed to stderr. This flag
must be the very first thing after 'mlr' on the
command line.
--time Print elapsed execution time in seconds to stderr at
the end of the execution of the program.
--traceprofile Create a trace-profile file for performance analysis.
Instructions will be printed to stderr. This flag
must be the very first thing after 'mlr' on the
command line.
.fi
.if n \{\
.RE
.SH "SEPARATOR FLAGS"
.sp
@ -884,6 +907,7 @@ Available subcommands:
help
regtest
repl
version
For more information, please invoke mlr {subcommand} --help.
.fi
.if n \{\

View file

@ -1,2 +1,2 @@
$color_shape = $color . $shape;
$y = int($k) + int($index) **3 + log10(float($quantity)/float($rate));
$y = $k + $index **3 + log10($quantity/$rate);

View file

@ -1,13 +1,12 @@
mlrs="mlr5 ~/tmp/miller/mlr ./mlr"
reps="1"
#mlrs="mlr5 ./mlr"
#reps="1 2 3"
echo; for mlr in $mlrs; do for k in $reps; do justtime $mlr --csv --from ~/tmp/big.csv check | md5sum; done; done
echo; for mlr in $mlrs; do for k in $reps; do justtime $mlr --csv --from ~/tmp/big.csv cat | md5sum; done; done
echo; for mlr in $mlrs; do for k in $reps; do justtime $mlr --csv --from ~/tmp/big.csv head | md5sum; done; done
echo; for mlr in $mlrs; do for k in $reps; do justtime $mlr --csv --from ~/tmp/big.csv tail | md5sum; done; done
echo; for mlr in $mlrs; do for k in $reps; do justtime $mlr --csv --from ~/tmp/big.csv tac | md5sum; done; done
echo; for mlr in $mlrs; do for k in $reps; do justtime $mlr --csv --from ~/tmp/big.csv sort -f shape | md5sum; done; done
echo; for mlr in $mlrs; do for k in $reps; do justtime $mlr --csv --from ~/tmp/big.csv sort -n quantity | md5sum; done; done
#reps="1"
reps="1 2 3"
echo; for mlr in $mlrs; do for k in $reps; do justtime $mlr --csv --from ~/tmp/big.csv check > /dev/null; done; done
echo; for mlr in $mlrs; do for k in $reps; do justtime $mlr --csv --from ~/tmp/big.csv cat > /dev/null; done; done
echo; for mlr in $mlrs; do for k in $reps; do justtime $mlr --csv --from ~/tmp/big.csv tail > /dev/null; done; done
echo; for mlr in $mlrs; do for k in $reps; do justtime $mlr --csv --from ~/tmp/big.csv tac > /dev/null; done; done
echo; for mlr in $mlrs; do for k in $reps; do justtime $mlr --csv --from ~/tmp/big.csv sort -f shape > /dev/null; done; done
echo; for mlr in $mlrs; do for k in $reps; do justtime $mlr --csv --from ~/tmp/big.csv sort -n quantity > /dev/null; done; done

View file

@ -1,14 +1,14 @@
mlrs="mlr5 ~/tmp/miller/mlr ./mlr"
reps="1"
#mlrs="mlr5 ./mlr"
#reps="1 2 3"
#reps="1"
reps="1 2 3"
echo; for mlr in $mlrs; do
for k in $reps; do
justtime $mlr --csv --from ~/tmp/big.csv \
then put -f scripts/chain-1.mlr \
| md5sum;
> /dev/null
done
done
@ -17,7 +17,7 @@ echo; for mlr in $mlrs; do
justtime $mlr --csv --from ~/tmp/big.csv \
then put -f scripts/chain-1.mlr \
then put -f scripts/chain-1.mlr \
| md5sum;
> /dev/null
done
done
@ -27,7 +27,7 @@ echo; for mlr in $mlrs; do
then put -f scripts/chain-1.mlr \
then put -f scripts/chain-1.mlr \
then put -f scripts/chain-1.mlr \
| md5sum;
> /dev/null
done
done
@ -38,7 +38,7 @@ echo; for mlr in $mlrs; do
then put -f scripts/chain-1.mlr \
then put -f scripts/chain-1.mlr \
then put -f scripts/chain-1.mlr \
| md5sum;
> /dev/null
done
done
@ -50,7 +50,7 @@ echo; for mlr in $mlrs; do
then put -f scripts/chain-1.mlr \
then put -f scripts/chain-1.mlr \
then put -f scripts/chain-1.mlr \
| md5sum;
> /dev/null
done
done
@ -63,6 +63,6 @@ echo; for mlr in $mlrs; do
then put -f scripts/chain-1.mlr \
then put -f scripts/chain-1.mlr \
then put -f scripts/chain-1.mlr \
| md5sum;
> /dev/null
done
done

View file

@ -0,0 +1,7 @@
#!/bin/sh
# Builds the Miller executable once per Go toolchain listed below, leaving one
# binary per compiler version, named mlr-<toolchain>, in the current directory.
# Each toolchain name is invoked as a command, so it must be on the PATH.
for go in go1.15.15 go1.16.12 go1.17.5 go1.18beta1; do
  $go clean github.com/johnkerl/miller/cmd/mlr
  # Stop on a failed build: otherwise the mv below could rename a stale ./mlr
  # left over from some other build and mislabel it as this compiler's output.
  $go build github.com/johnkerl/miller/cmd/mlr || exit 1
  mv mlr "mlr-$go"
done

View file

@ -0,0 +1,13 @@
#!/bin/sh
# Sets up several Go toolchain versions side by side.
# Background: https://go.dev/doc/manage-install

# First install the per-version wrapper commands from golang.org/dl ...
for version in go1.18beta1 go1.17.5 go1.16.12 go1.15.15; do
  go install golang.org/dl/$version@latest
done

# ... then run each wrapper's download subcommand.
for version in go1.15.15 go1.16.12 go1.17.5 go1.18beta1; do
  $version download
done

9
scripts/compiler-versions-time Executable file
View file

@ -0,0 +1,9 @@
#!/bin/sh
# Times three Miller workloads on ~/tmp/big.csv -- check, cat, and a put of
# ./scripts/chain-1.mlr -- for mlr5 and for every mlr-go1.1* executable matched
# in the current directory. Standard output of each run is discarded.

# Runs justtime on each executable in turn, passing along the given Miller
# arguments unchanged.
time_each() {
  for mlr in mlr5 mlr-go1.1*; do
    justtime $mlr "$@" > /dev/null
  done
}

time_each --csv check ~/tmp/big.csv
echo
time_each --csv cat ~/tmp/big.csv
echo
time_each --csv --from ~/tmp/big.csv put -f ./scripts/chain-1.mlr

View file

@ -15,4 +15,4 @@ fi
if [ $# -eq 2 ]; then
mlr="$2"
fi
justtime $mlr $iflag cat ~/tmp/big.$suffix | md5sum -
justtime $mlr $iflag cat ~/tmp/big.$suffix > /dev/null

View file

@ -3,11 +3,15 @@
ourdir=$(dirname $0)
mlrs="mlr5 ~/tmp/miller/mlr ./mlr"
#mlrs="mlr5 ./mlr"
echo; for mlr in $mlrs; do $ourdir/time-big-file csv $mlr; done
echo; for mlr in $mlrs; do $ourdir/time-big-file csvlite $mlr; done
echo; for mlr in $mlrs; do $ourdir/time-big-file dkvp $mlr; done
echo; for mlr in $mlrs; do $ourdir/time-big-file nidx $mlr; done
echo; for mlr in $mlrs; do $ourdir/time-big-file xtab $mlr; done
echo; for mlr in $mlrs; do $ourdir/time-big-file json $mlr; done
#reps="1"
reps="1 2 3"
echo; for mlr in $mlrs; do for k in $reps; do $ourdir/time-big-file csv $mlr; done; done
echo; for mlr in $mlrs; do for k in $reps; do $ourdir/time-big-file csvlite $mlr; done; done
echo; for mlr in $mlrs; do for k in $reps; do $ourdir/time-big-file dkvp $mlr; done; done
echo; for mlr in $mlrs; do for k in $reps; do $ourdir/time-big-file nidx $mlr; done; done
echo; for mlr in $mlrs; do for k in $reps; do $ourdir/time-big-file xtab $mlr; done; done
echo; for mlr in $mlrs; do for k in $reps; do $ourdir/time-big-file json $mlr; done; done

View file

@ -3,7 +3,7 @@ x t y z
123.45 float 124.45 123.95
0123 int 84 83.5
07 int 8 7.5
08 float 9 8.5
08 int 9 8.5
0 int 1 0.5
0. float 1 0.5
0.0 float 1 0.5
@ -16,7 +16,7 @@ x t y z
-0b0100 int -3 -3.5
-0x1000 int -4095 -4095.5
-07 int -6 -6.5
-08 float -7 -7.5
-08 int -7 -7.5
-0 int 1 0.5
-0. float 1 0.5
-0.0 float 1 0.5

View file

@ -1,7 +1,14 @@
================================================================
PUNCHDOWN LIST
* numeric-inference perf
o README-profiling.md re various scripts
o README-profiling.md re this PR
o update mac numbers; type up linux numbers
o webdoc re on-battery anecdote
* blockers:
- linux/1.17 perf checks
- fractional-strptime
- improved regex doc w/ lots of examples
- cmp-matrices
@ -71,6 +78,10 @@ PUNCHDOWN LIST
================================================================
NON-BLOCKERS
* pos/neg 0x/0b/0o UTs
* 0o into BNF
? BIFs as FCFs?
* pv: 'mlr --prepipex pv --gzin tail -n 10 ~/tmp/zhuge.gz' needs --gzin & --prepipex both