diff --git a/.github/workflows/codeql-analysis.yml b/.github/workflows/codeql-analysis.yml
index 71448c3a7..ca5ad7259 100644
--- a/.github/workflows/codeql-analysis.yml
+++ b/.github/workflows/codeql-analysis.yml
@@ -36,11 +36,11 @@ jobs:
steps:
- name: Checkout repository
- uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11
+ uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
# Initializes the CodeQL tools for scanning.
- name: Initialize CodeQL
- uses: github/codeql-action/init@cdcdbb579706841c47f7063dda365e292e5cad7a
+ uses: github/codeql-action/init@cdefb33c0f6224e58673d9004f47f7cb3e328b89
with:
languages: ${{ matrix.language }}
# If you wish to specify custom queries, you can do so here or in a config file.
@@ -51,7 +51,7 @@ jobs:
# Autobuild attempts to build any compiled languages (C/C++, C#, or Java).
# If this step fails, then you should remove it and run the build manually (see below)
- name: Autobuild
- uses: github/codeql-action/autobuild@cdcdbb579706841c47f7063dda365e292e5cad7a
+ uses: github/codeql-action/autobuild@cdefb33c0f6224e58673d9004f47f7cb3e328b89
# ℹ️ Command-line programs to run using the OS shell.
# 📚 https://git.io/JvXDl
@@ -65,4 +65,4 @@ jobs:
# make release
- name: Perform CodeQL Analysis
- uses: github/codeql-action/analyze@cdcdbb579706841c47f7063dda365e292e5cad7a
+ uses: github/codeql-action/analyze@cdefb33c0f6224e58673d9004f47f7cb3e328b89
diff --git a/.github/workflows/codespell.yml b/.github/workflows/codespell.yml
index 967a6cbcc..839eeb43f 100644
--- a/.github/workflows/codespell.yml
+++ b/.github/workflows/codespell.yml
@@ -21,7 +21,7 @@ jobs:
steps:
# Check out the code base
- name: Check out code
- uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11
+ uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
with:
# Full git history is needed to get a proper list of changed files within `super-linter`
fetch-depth: 0
@@ -29,7 +29,7 @@ jobs:
# Run linter against code base
# https://github.com/codespell-project/codespell
- name: Codespell
- uses: codespell-project/actions-codespell@94259cd8be02ad2903ba34a22d9c13de21a74461
+ uses: codespell-project/actions-codespell@8f01853be192eb0f849a5c7d721450e7a467c579
with:
check_filenames: true
ignore_words_file: .codespellignore
diff --git a/.github/workflows/go.yml b/.github/workflows/go.yml
index 4bc73d4a4..98d170d1d 100644
--- a/.github/workflows/go.yml
+++ b/.github/workflows/go.yml
@@ -15,18 +15,18 @@ jobs:
os: [ubuntu-latest, macos-latest, windows-latest]
steps:
- - uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11
+ - uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
- name: Set up Go
- uses: actions/setup-go@0c52d547c9bc32b1aa3301fd7a9cb496313a4491
+ uses: actions/setup-go@7a3fe6cf4cb3a834922a1244abfce67bcef6a0c5
with:
- go-version: 1.19
+ go-version: 1.24
- name: Build
run: make build
- - name: Test
- run: make check
+ - name: Unit tests
+ run: make unit-test
- name: Regression tests
# We run these with a convoluted path to ensure the tests don't
@@ -41,7 +41,7 @@ jobs:
if: matrix.os == 'windows-latest'
run: mkdir -p bin/${{matrix.os}} && cp mlr.exe bin/${{matrix.os}}
- - uses: actions/upload-artifact@a8a3f3ad30e3422c9c7b888a15615d19a852ae32
+ - uses: actions/upload-artifact@b7c566a772e6b6bfb58ed0dc250532a479d7789f
with:
name: mlr-${{matrix.os}}
path: bin/${{matrix.os}}/*
diff --git a/.github/workflows/release-snap.yaml b/.github/workflows/release-snap.yaml
new file mode 100644
index 000000000..d0dfada19
--- /dev/null
+++ b/.github/workflows/release-snap.yaml
@@ -0,0 +1,29 @@
+name: Release for Snap
+on:
+  push:
+    tags:
+      - v*
+  workflow_dispatch:
+
+jobs:
+  snap:
+    strategy:
+      matrix:
+        os: [ubuntu-latest, ubuntu-24.04-arm]  # one job per runner image, including an ARM runner
+    runs-on: ${{ matrix.os }}
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8  # full-SHA pin, same commit as the other workflows
+
+      - name: Build snap
+        uses: snapcore/action-build@v1
+        id: build
+
+      - name: Publish to Snap Store
+        uses: snapcore/action-publish@v1
+        env:
+          SNAPCRAFT_STORE_CREDENTIALS: ${{ secrets.SNAPCRAFT_TOKEN }}
+        with:
+          snap: ${{ steps.build.outputs.snap }}
+          # Channel to publish to: stable, candidate, beta, or edge.
+          release: stable
diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml
index 3e69dee50..fa2b59ec5 100644
--- a/.github/workflows/release.yml
+++ b/.github/workflows/release.yml
@@ -1,4 +1,4 @@
-name: Release
+name: Release for GitHub
on:
push:
tags:
@@ -6,7 +6,7 @@ on:
workflow_dispatch:
env:
- GO_VERSION: 1.19.13
+ GO_VERSION: 1.24.5
jobs:
release:
@@ -17,19 +17,19 @@ jobs:
runs-on: ${{ matrix.platform }}
steps:
- name: Set up Go
- uses: actions/setup-go@0c52d547c9bc32b1aa3301fd7a9cb496313a4491
+ uses: actions/setup-go@7a3fe6cf4cb3a834922a1244abfce67bcef6a0c5
with:
go-version: ${{ env.GO_VERSION }}
id: go
- name: Check out code into the Go module directory
- uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11
+ uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
with:
fetch-depth: 0
# https://github.com/marketplace/actions/cache
- name: Cache Go modules
- uses: actions/cache@704facf57e6136b1bc63b828d79edcd491f0ee84
+ uses: actions/cache@8b402f58fbc84540c8b491a91e594a4576fec3d7
with:
path: |
~/.cache/go-build
@@ -40,7 +40,7 @@ jobs:
# https://goreleaser.com/ci/actions/
- name: Run GoReleaser
- uses: goreleaser/goreleaser-action@7ec5c2b0c6cdda6e8bbb49444bc797dd33d74dd8
+ uses: goreleaser/goreleaser-action@e435ccd777264be153ace6237001ef4d979d3a7a
#if: startsWith(github.ref, 'refs/tags/v')
with:
version: latest
diff --git a/.github/workflows/test-snap-can-build.yml b/.github/workflows/test-snap-can-build.yml
new file mode 100644
index 000000000..c6c197de9
--- /dev/null
+++ b/.github/workflows/test-snap-can-build.yml
@@ -0,0 +1,28 @@
+name: 🧪 Snap Builds
+
+# Builds the snap on every push and pull request, then runs the snapcraft
+# review action on the resulting artifact.
+on:
+  push:
+    # '**' (unlike '*') also matches branch names that contain a slash.
+    branches: '**'
+  pull_request:
+    branches: '**'
+
+jobs:
+  build:
+    runs-on: ubuntu-latest
+
+    steps:
+      - uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8  # full-SHA pin, same commit as the other workflows
+
+      - uses: snapcore/action-build@v1
+        id: build
+
+      - uses: diddlesnaps/snapcraft-review-action@v1
+        with:
+          snap: ${{ steps.build.outputs.snap }}
+          isClassic: 'false'
+          # Plugs and Slots declarations to override default denial (requires store assertion to publish)
+          # plugs: ./plug-declaration.json
+          # slots: ./slot-declaration.json
diff --git a/.readthedocs.yaml b/.readthedocs.yaml
index 8ad896314..57e91bb94 100644
--- a/.readthedocs.yaml
+++ b/.readthedocs.yaml
@@ -17,3 +17,5 @@ python:
mkdocs:
configuration: docs/mkdocs.yml
+
+formats: all
diff --git a/Makefile b/Makefile
index fe27b8dd3..ec2b817cb 100644
--- a/Makefile
+++ b/Makefile
@@ -7,12 +7,12 @@ INSTALLDIR=$(PREFIX)/bin
# This must remain the first target in this file, which is what 'make' with no
# arguments will run.
build:
- go build github.com/johnkerl/miller/cmd/mlr
+ go build github.com/johnkerl/miller/v6/cmd/mlr
@echo "Build complete. The Miller executable is ./mlr (or .\mlr.exe on Windows)."
@echo "You can use 'make check' to run tests".
quiet:
- @go build github.com/johnkerl/miller/cmd/mlr
+ @go build github.com/johnkerl/miller/v6/cmd/mlr
# For interactive use, 'mlr regtest' offers more options and transparency.
check: unit-test regression-test
@@ -33,25 +33,25 @@ install: build
# ----------------------------------------------------------------
# Unit tests (small number)
unit-test ut: build
- go test github.com/johnkerl/miller/pkg/...
+ go test github.com/johnkerl/miller/v6/pkg/...
ut-lib:build
- go test github.com/johnkerl/miller/pkg/lib...
+ go test github.com/johnkerl/miller/v6/pkg/lib...
ut-scan:build
- go test github.com/johnkerl/miller/pkg/scan/...
+ go test github.com/johnkerl/miller/v6/pkg/scan/...
ut-mlv:build
- go test github.com/johnkerl/miller/pkg/mlrval/...
+ go test github.com/johnkerl/miller/v6/pkg/mlrval/...
ut-bifs:build
- go test github.com/johnkerl/miller/pkg/bifs/...
+ go test github.com/johnkerl/miller/v6/pkg/bifs/...
ut-input:build
- go test github.com/johnkerl/miller/pkg/input/...
+ go test github.com/johnkerl/miller/v6/pkg/input/...
bench:build
- go test -run=nonesuch -bench=. github.com/johnkerl/miller/pkg/...
+ go test -run=nonesuch -bench=. github.com/johnkerl/miller/v6/pkg/...
bench-mlv:build
- go test -run=nonesuch -bench=. github.com/johnkerl/miller/pkg/mlrval/...
+ go test -run=nonesuch -bench=. github.com/johnkerl/miller/v6/pkg/mlrval/...
bench-input:build
- go test -run=nonesuch -bench=. github.com/johnkerl/miller/pkg/input/...
+ go test -run=nonesuch -bench=. github.com/johnkerl/miller/v6/pkg/input/...
# ----------------------------------------------------------------
# Regression tests (large number)
@@ -114,7 +114,7 @@ it: build check
so: install
mlr:
- go build github.com/johnkerl/miller/cmd/mlr
+ go build github.com/johnkerl/miller/v6/cmd/mlr
# ----------------------------------------------------------------
# Please see comments in ./create-release-tarball as well as
diff --git a/README-dev.md b/README-dev.md
index 0e363db5c..6dd708f95 100644
--- a/README-dev.md
+++ b/README-dev.md
@@ -95,13 +95,14 @@ So, in broad overview, the key packages are:
* Miller dependencies are all in the Go standard library, except two:
* GOCC lexer/parser code-generator from [github.com/goccmack/gocc](https://github.com/goccmack/gocc):
+ * Forked at [github.com/johnkerl/gocc](https://github.com/johnkerl/gocc).
* This package defines the grammar for Miller's domain-specific language (DSL) for the Miller `put` and `filter` verbs. And, GOCC is a joy to use. :)
* It is used on the terms of its open-source license.
* [golang.org/x/term](https://pkg.go.dev/golang.org/x/term):
* Just a one-line Miller callsite for is-a-terminal checking for the [Miller REPL](./pkg/terminals/repl/README.md).
* It is used on the terms of its open-source license.
* See also [./go.mod](go.mod). Setup:
- * `go get github.com/goccmack/gocc`
+ * `go get github.com/johnkerl/gocc`
* `go get golang.org/x/term`
### Miller per se
diff --git a/README.md b/README.md
index be095ed66..73d788982 100644
--- a/README.md
+++ b/README.md
@@ -29,6 +29,7 @@ key-value-pair data in a variety of data formats.
* [Miller in 10 minutes](https://miller.readthedocs.io/en/latest/10min)
* [A Guide To Command-Line Data Manipulation](https://www.smashingmagazine.com/2022/12/guide-command-line-data-manipulation-cli-miller)
* [A quick tutorial on Miller](https://www.ict4g.net/adolfo/notes/data-analysis/miller-quick-tutorial.html)
+* [Miller Exercises](https://github.com/GuilloteauQ/miller-exercises)
* [Tools to manipulate CSV files from the Command Line](https://www.ict4g.net/adolfo/notes/data-analysis/tools-to-manipulate-csv.html)
* [www.togaware.com/linux/survivor/CSV_Files.html](https://www.togaware.com/linux/survivor/CSV_Files.html)
* [MLR for CSV manipulation](https://guillim.github.io/terminal/2018/06/19/MLR-for-CSV-manipulation.html)
@@ -45,22 +46,18 @@ key-value-pair data in a variety of data formats.
* [Active issues](https://github.com/johnkerl/miller/issues?q=is%3Aissue+is%3Aopen+sort%3Aupdated-desc)
# Installing
-
There's a good chance you can get Miller pre-built for your system:
-
[](https://launchpad.net/ubuntu/+source/miller)
[](https://launchpad.net/ubuntu/xenial/+package/miller)
[](https://packages.fedoraproject.org/pkgs/miller/miller/)
[](https://packages.debian.org/stable/miller)
[](https://packages.gentoo.org/packages/sys-apps/miller)
-
[](http://www.pro-linux.de/cgi-bin/DBApp/check.cgi?ShowApp..20427.100)
[](https://aur.archlinux.org/packages/miller-git)
-
[](http://pkgsrc.se/textproc/miller)
[](https://www.freshports.org/textproc/miller/)
-
[](https://anaconda.org/conda-forge/miller/)
+[](https://snapcraft.io/miller)
[](https://formulae.brew.sh/formula/miller)
[](https://www.macports.org/ports.php?by=name&substr=miller)
[](https://chocolatey.org/packages/miller)
@@ -68,9 +65,9 @@ There's a good chance you can get Miller pre-built for your system:
|OS|Installation command|
|---|---|
-|Linux|`yum install miller`<br/>`apt-get install miller`|
+|Linux|`yum install miller`<br/>`apt-get install miller`<br/>`snap install miller`|
|Mac|`brew install miller`<br/>`port install miller`|
-|Windows|`choco install miller`<br/>`winget install Miller.Miller`|
+|Windows|`choco install miller`<br/>`winget install Miller.Miller`<br/>`scoop install main/miller`|
See also [README-versions.md](./README-versions.md) for a full list of package versions. Note that long-term-support (LtS) releases will likely be on older versions.
@@ -94,6 +91,7 @@ See also [building from source](https://miller.readthedocs.io/en/latest/build.ht
[](https://github.com/johnkerl/miller/actions/workflows/go.yml)
[](https://github.com/johnkerl/miller/actions/workflows/codeql-analysis.yml)
[](https://github.com/johnkerl/miller/actions/workflows/codespell.yml)
+[](https://github.com/johnkerl/miller/actions/workflows/test-snap-can-build.yml)
@@ -110,9 +108,9 @@ See also [building from source](https://miller.readthedocs.io/en/latest/build.ht
* To install: `make install`. This installs the executable `/usr/local/bin/mlr` and manual page `/usr/local/share/man/man1/mlr.1` (so you can do `man mlr`).
* You can do `./configure --prefix=/some/install/path` before `make install` if you want to install somewhere other than `/usr/local`.
* Without `make`:
- * To build: `go build github.com/johnkerl/miller/cmd/mlr`.
- * To run tests: `go test github.com/johnkerl/miller/pkg/...` and `mlr regtest`.
- * To install: `go install github.com/johnkerl/miller/cmd/mlr` will install to _GOPATH_`/bin/mlr`.
+ * To build: `go build github.com/johnkerl/miller/v6/cmd/mlr`.
+ * To run tests: `go test github.com/johnkerl/miller/v6/pkg/...` and `mlr regtest`.
+ * To install: `go install github.com/johnkerl/miller/v6/cmd/mlr@latest` will install to _GOPATH_`/bin/mlr`.
* See also the doc page on [building from source](https://miller.readthedocs.io/en/latest/build).
* For more developer information please see [README-dev.md](./README-dev.md).
diff --git a/cmd/experiments/colors/main.go b/cmd/experiments/colors/main.go
index 5f5093eee..2e41124d9 100644
--- a/cmd/experiments/colors/main.go
+++ b/cmd/experiments/colors/main.go
@@ -3,15 +3,18 @@ package main
import (
"fmt"
- "github.com/johnkerl/miller/pkg/colorizer"
+ "github.com/johnkerl/miller/v6/pkg/colorizer"
)
-const boldString = "\u001b[1m"
-const underlineString = "\u001b[4m"
-const reversedString = "\u001b[7m"
-const redString = "\u001b[1;31m"
-const blueString = "\u001b[1;34m"
-const defaultString = "\u001b[0m"
+const ( // ANSI escape sequences (ESC '[' <params> 'm') for terminal text attributes
+ boldString = "\u001b[1m" // attribute 1: bold
+ reversedString = "\u001b[7m" // attribute 7: reverse video
+ redString = "\u001b[1;31m" // attributes 1;31: bold + red foreground
+ blueString = "\u001b[1;34m" // attributes 1;34: bold + blue foreground
+ defaultString = "\u001b[0m" // attribute 0: reset all attributes
+
+ // underlineString = "\u001b[4m" // attribute 4: underline; commented out, presumably because nothing references it
+)
func main() {
fmt.Printf("Hello, world!\n")
diff --git a/cmd/experiments/dsl_parser/one/build b/cmd/experiments/dsl_parser/one/build
index 373184a92..b43d4bc26 100755
--- a/cmd/experiments/dsl_parser/one/build
+++ b/cmd/experiments/dsl_parser/one/build
@@ -28,9 +28,9 @@ mkdir -p $dir
# ----------------------------------------------------------------
# Run the parser-generator
-# Build the bin/gocc executable:
-go get github.com/goccmack/gocc
-#go get github.com/johnkerl/gocc
+# Build the bin/gocc executable (use my fork for performance):
+# go get github.com/goccmack/gocc
+go get github.com/johnkerl/gocc
bingocc="$GOPATH/bin/gocc"
if [ ! -x "$bingocc" ]; then
diff --git a/cmd/experiments/dsl_parser/one/go.mod b/cmd/experiments/dsl_parser/one/go.mod
index e4f49daf8..4e81172d6 100644
--- a/cmd/experiments/dsl_parser/one/go.mod
+++ b/cmd/experiments/dsl_parser/one/go.mod
@@ -1,5 +1,5 @@
module one
-go 1.16
+go 1.24
-require github.com/goccmack/gocc v0.0.0-20210322175033-34358ebe5808 // indirect
+toolchain go1.24.5
diff --git a/cmd/experiments/dsl_parser/one/go.sum b/cmd/experiments/dsl_parser/one/go.sum
index dfc52feaf..e69de29bb 100644
--- a/cmd/experiments/dsl_parser/one/go.sum
+++ b/cmd/experiments/dsl_parser/one/go.sum
@@ -1,26 +0,0 @@
-github.com/goccmack/gocc v0.0.0-20210322175033-34358ebe5808 h1:MBgZdx/wBJWTR2Q79mQfP6c8uXdQiu5JowfEz3KhFac=
-github.com/goccmack/gocc v0.0.0-20210322175033-34358ebe5808/go.mod h1:dWhnuKE5wcnGTExA2DH6Iicu21YnWwOPMrc/GyhtbCk=
-github.com/yuin/goldmark v1.2.1/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74=
-golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w=
-golang.org/x/crypto v0.0.0-20191011191535-87dc89f01550/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI=
-golang.org/x/crypto v0.0.0-20200622213623-75b288015ac9/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto=
-golang.org/x/mod v0.3.0 h1:RM4zey1++hCTbCVQfnWeKs9/IEsaBLA8vTkd0WVtmH4=
-golang.org/x/mod v0.3.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA=
-golang.org/x/net v0.0.0-20190404232315-eb5bcb51f2a3/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg=
-golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s=
-golang.org/x/net v0.0.0-20201021035429-f5854403a974/go.mod h1:sp8m0HH+o8qH0wwXwYZr8TS3Oi6o0r6Gce1SSxlDquU=
-golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
-golang.org/x/sync v0.0.0-20201020160332-67f06af15bc9/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
-golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
-golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
-golang.org/x/sys v0.0.0-20200930185726-fdedc70b468f/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
-golang.org/x/sys v0.0.0-20210119212857-b64e53b001e4/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
-golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ=
-golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ=
-golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
-golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo=
-golang.org/x/tools v0.1.0/go.mod h1:xkSsbof2nBLbhDlRMhhhyNLN/zl3eTqcnHD5viDpcZ0=
-golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
-golang.org/x/xerrors v0.0.0-20191011141410-1b5146add898/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
-golang.org/x/xerrors v0.0.0-20200804184101-5ec99f83aff1 h1:go1bK/D/BFZV2I8cIQd1NKEZ+0owSTG1fDTci4IqFcE=
-golang.org/x/xerrors v0.0.0-20200804184101-5ec99f83aff1/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
diff --git a/cmd/experiments/dsl_parser/two/build b/cmd/experiments/dsl_parser/two/build
index 2cb7893d3..1ea06c916 100755
--- a/cmd/experiments/dsl_parser/two/build
+++ b/cmd/experiments/dsl_parser/two/build
@@ -28,9 +28,9 @@ mkdir -p $dir
# ----------------------------------------------------------------
# Run the parser-generator
-# Build the bin/gocc executable:
-go get github.com/goccmack/gocc
-#go get github.com/johnkerl/gocc
+# Build the bin/gocc executable (use my fork for performance):
+# go get github.com/goccmack/gocc
+go get github.com/johnkerl/gocc
bingocc="$GOPATH/bin/gocc"
if [ ! -x "$bingocc" ]; then
exit 1
diff --git a/cmd/experiments/dsl_parser/two/go.mod b/cmd/experiments/dsl_parser/two/go.mod
index be38de9a3..81c05ea5e 100644
--- a/cmd/experiments/dsl_parser/two/go.mod
+++ b/cmd/experiments/dsl_parser/two/go.mod
@@ -1,5 +1,5 @@
module two
-go 1.16
+go 1.24
-require github.com/goccmack/gocc v0.0.0-20210322175033-34358ebe5808 // indirect
+toolchain go1.24.5
diff --git a/cmd/experiments/dsl_parser/two/go.sum b/cmd/experiments/dsl_parser/two/go.sum
index dfc52feaf..e69de29bb 100644
--- a/cmd/experiments/dsl_parser/two/go.sum
+++ b/cmd/experiments/dsl_parser/two/go.sum
@@ -1,26 +0,0 @@
-github.com/goccmack/gocc v0.0.0-20210322175033-34358ebe5808 h1:MBgZdx/wBJWTR2Q79mQfP6c8uXdQiu5JowfEz3KhFac=
-github.com/goccmack/gocc v0.0.0-20210322175033-34358ebe5808/go.mod h1:dWhnuKE5wcnGTExA2DH6Iicu21YnWwOPMrc/GyhtbCk=
-github.com/yuin/goldmark v1.2.1/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74=
-golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w=
-golang.org/x/crypto v0.0.0-20191011191535-87dc89f01550/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI=
-golang.org/x/crypto v0.0.0-20200622213623-75b288015ac9/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto=
-golang.org/x/mod v0.3.0 h1:RM4zey1++hCTbCVQfnWeKs9/IEsaBLA8vTkd0WVtmH4=
-golang.org/x/mod v0.3.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA=
-golang.org/x/net v0.0.0-20190404232315-eb5bcb51f2a3/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg=
-golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s=
-golang.org/x/net v0.0.0-20201021035429-f5854403a974/go.mod h1:sp8m0HH+o8qH0wwXwYZr8TS3Oi6o0r6Gce1SSxlDquU=
-golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
-golang.org/x/sync v0.0.0-20201020160332-67f06af15bc9/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
-golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
-golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
-golang.org/x/sys v0.0.0-20200930185726-fdedc70b468f/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
-golang.org/x/sys v0.0.0-20210119212857-b64e53b001e4/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
-golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ=
-golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ=
-golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
-golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo=
-golang.org/x/tools v0.1.0/go.mod h1:xkSsbof2nBLbhDlRMhhhyNLN/zl3eTqcnHD5viDpcZ0=
-golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
-golang.org/x/xerrors v0.0.0-20191011141410-1b5146add898/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
-golang.org/x/xerrors v0.0.0-20200804184101-5ec99f83aff1 h1:go1bK/D/BFZV2I8cIQd1NKEZ+0owSTG1fDTci4IqFcE=
-golang.org/x/xerrors v0.0.0-20200804184101-5ec99f83aff1/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
diff --git a/cmd/mlr/main.go b/cmd/mlr/main.go
index 3e37bdca1..dc2b1f8b4 100644
--- a/cmd/mlr/main.go
+++ b/cmd/mlr/main.go
@@ -11,7 +11,7 @@ import (
"strings"
"time"
- "github.com/johnkerl/miller/pkg/entrypoint"
+ "github.com/johnkerl/miller/v6/pkg/entrypoint"
"github.com/pkg/profile" // for trace.out
)
diff --git a/cmd/scan/main.go b/cmd/scan/main.go
index f93e0226e..d42b08115 100644
--- a/cmd/scan/main.go
+++ b/cmd/scan/main.go
@@ -8,7 +8,7 @@ import (
"fmt"
"os"
- "github.com/johnkerl/miller/pkg/scan"
+ "github.com/johnkerl/miller/v6/pkg/scan"
)
func main() {
diff --git a/cmd/sizes/main.go b/cmd/sizes/main.go
index 5ae6209cc..8e06398fe 100644
--- a/cmd/sizes/main.go
+++ b/cmd/sizes/main.go
@@ -3,7 +3,7 @@
// ================================================================
/*
-go build github.com/johnkerl/miller/cmd/sizes
+go build github.com/johnkerl/miller/v6/cmd/sizes
*/
package main
@@ -11,7 +11,7 @@ package main
import (
"fmt"
- "github.com/johnkerl/miller/pkg/mlrval"
+ "github.com/johnkerl/miller/v6/pkg/mlrval"
)
func main() {
diff --git a/delve.txt b/delve.txt
new file mode 100644
index 000000000..a34052ee1
--- /dev/null
+++ b/delve.txt
@@ -0,0 +1,5 @@
+dlv exec ./mlr -- --csv --from x.csv sub -a def ghi
+break main.main
+ # or wherever
+restart
+continue
diff --git a/docs/mkdocs.yml b/docs/mkdocs.yml
index 287d929c7..6b36e5a94 100644
--- a/docs/mkdocs.yml
+++ b/docs/mkdocs.yml
@@ -8,6 +8,8 @@ theme:
code: Lato Mono
features:
- navigation.top
+ - content.action.edit
+ - content.action.view
custom_dir: overrides
repo_url: https://github.com/johnkerl/miller
repo_name: miller
@@ -116,5 +118,9 @@ nav:
- "What's new in Miller 6": "new-in-miller-6.md"
markdown_extensions:
-- toc:
+ - toc:
permalink: true
+ - admonition
+ - pymdownx.details
+ - pymdownx.superfences
+
diff --git a/docs/src/10min.md b/docs/src/10min.md
index d9e4d2416..eaec2be05 100644
--- a/docs/src/10min.md
+++ b/docs/src/10min.md
@@ -20,7 +20,7 @@ Quick links:
Let's take a quick look at some of the most useful Miller verbs -- file-format-aware, name-index-empowered equivalents of standard system commands.
-For most of this section we'll use our [example.csv](./example.csv).
+For most of this section, we'll use our [example.csv](./example.csv).
`mlr cat` is like system `cat` (or `type` on Windows) -- it passes the data through unmodified:
diff --git a/docs/src/10min.md.in b/docs/src/10min.md.in
index 0fdc94bf1..32f06d7d7 100644
--- a/docs/src/10min.md.in
+++ b/docs/src/10min.md.in
@@ -4,7 +4,7 @@
Let's take a quick look at some of the most useful Miller verbs -- file-format-aware, name-index-empowered equivalents of standard system commands.
-For most of this section we'll use our [example.csv](./example.csv).
+For most of this section, we'll use our [example.csv](./example.csv).
`mlr cat` is like system `cat` (or `type` on Windows) -- it passes the data through unmodified:
diff --git a/docs/src/build.md b/docs/src/build.md
index 0b6787898..b6678282f 100644
--- a/docs/src/build.md
+++ b/docs/src/build.md
@@ -18,7 +18,7 @@ Quick links:
Please also see [Installation](installing-miller.md) for information about pre-built executables.
-You will need to first install Go version 1.15 or higher: please see [https://go.dev](https://go.dev).
+You will need to first install Go ([this version](https://github.com/johnkerl/miller/blob/main/go.mod#L17)): please see [https://go.dev](https://go.dev).
## Miller license
@@ -31,16 +31,16 @@ Two-clause BSD license [https://github.com/johnkerl/miller/blob/master/LICENSE.t
* `cd mlr-i.j.k`
* `cd go`
* `make` creates the `./mlr` (or `.\mlr.exe` on Windows) executable
- * Without `make`: `go build github.com/johnkerl/miller/cmd/mlr`
+ * Without `make`: `go build github.com/johnkerl/miller/v6/cmd/mlr`
* `make check` runs tests
- * Without `make`: `go test github.com/johnkerl/miller/pkg/...` and `mlr regtest`
+ * Without `make`: `go test github.com/johnkerl/miller/v6/pkg/...` and `mlr regtest`
* `make install` installs the `mlr` executable and the `mlr` manpage
- * Without make: `go install github.com/johnkerl/miller/cmd/mlr` will install to _GOPATH_`/bin/mlr`
+ * Without make: `go install github.com/johnkerl/miller/v6/cmd/mlr` will install to _GOPATH_`/bin/mlr`
## From git clone
* `git clone https://github.com/johnkerl/miller`
-* `make`/`go build github.com/johnkerl/miller/cmd/mlr` as above
+* `make`/`go build github.com/johnkerl/miller/v6/cmd/mlr` as above
## In case of problems
diff --git a/docs/src/build.md.in b/docs/src/build.md.in
index 5138c9b8f..3d35ee560 100644
--- a/docs/src/build.md.in
+++ b/docs/src/build.md.in
@@ -2,7 +2,7 @@
Please also see [Installation](installing-miller.md) for information about pre-built executables.
-You will need to first install Go version 1.15 or higher: please see [https://go.dev](https://go.dev).
+You will need to first install Go ([this version](https://github.com/johnkerl/miller/blob/main/go.mod#L17)): please see [https://go.dev](https://go.dev).
## Miller license
@@ -15,16 +15,16 @@ Two-clause BSD license [https://github.com/johnkerl/miller/blob/master/LICENSE.t
* `cd mlr-i.j.k`
* `cd go`
* `make` creates the `./mlr` (or `.\mlr.exe` on Windows) executable
- * Without `make`: `go build github.com/johnkerl/miller/cmd/mlr`
+ * Without `make`: `go build github.com/johnkerl/miller/v6/cmd/mlr`
* `make check` runs tests
- * Without `make`: `go test github.com/johnkerl/miller/pkg/...` and `mlr regtest`
+ * Without `make`: `go test github.com/johnkerl/miller/v6/pkg/...` and `mlr regtest`
* `make install` installs the `mlr` executable and the `mlr` manpage
- * Without make: `go install github.com/johnkerl/miller/cmd/mlr` will install to _GOPATH_`/bin/mlr`
+ * Without make: `go install github.com/johnkerl/miller/v6/cmd/mlr` will install to _GOPATH_`/bin/mlr`
## From git clone
* `git clone https://github.com/johnkerl/miller`
-* `make`/`go build github.com/johnkerl/miller/cmd/mlr` as above
+* `make`/`go build github.com/johnkerl/miller/v6/cmd/mlr` as above
## In case of problems
diff --git a/docs/src/data-diving-examples.md b/docs/src/data-diving-examples.md
index 39738f193..297eca211 100644
--- a/docs/src/data-diving-examples.md
+++ b/docs/src/data-diving-examples.md
@@ -26,7 +26,7 @@ Vertical-tabular format is good for a quick look at CSV data layout -- seeing wh
wc -l data/flins.csv
- 36635 data/flins.csv +36635 data/flins.csv
@@ -160,11 +160,11 @@ CITRUS COUNTY 1332.9 79974.9 483785.1 stats2 -a corr,linreg-ols,r2 -f tiv_2011,tiv_2012
-tiv_2011_tiv_2012_corr 0.9730497632351692 -tiv_2011_tiv_2012_ols_m 0.9835583980337723 -tiv_2011_tiv_2012_ols_b 433854.6428968317 +tiv_2011_tiv_2012_corr 0.9730497632351701 +tiv_2011_tiv_2012_ols_m 0.9835583980337732 +tiv_2011_tiv_2012_ols_b 433854.6428968301 tiv_2011_tiv_2012_ols_n 36634 -tiv_2011_tiv_2012_r2 0.9468258417320189 +tiv_2011_tiv_2012_r2 0.9468258417320204
@@ -227,7 +227,7 @@ Peek at the data: wc -l data/colored-shapes.dkvp
- 10078 data/colored-shapes.dkvp +10078 data/colored-shapes.dkvp
@@ -322,7 +322,7 @@ Look at bivariate stats by color and shape. In particular, `u,v` pairwise correl
u_v_corr w_x_corr
-0.1334180491027861 -0.011319841199866178
+0.1334180491027861 -0.011319841199852926
@@ -332,22 +332,22 @@ Look at bivariate stats by color and shape. In particular, `u,v` pairwise correl
color shape u_v_corr w_x_corr - red circle 0.9807984401887236 -0.01856553658708754 -orange square 0.17685855992752927 -0.07104431573806054 - green circle 0.05764419437577255 0.01179572988801509 - red square 0.05574477124893523 -0.0006801456507510942 -yellow triangle 0.04457273771962798 0.024604310103081825 -yellow square 0.04379172927296089 -0.04462197201631237 -purple circle 0.03587354936895086 0.1341133954140899 - blue square 0.03241153095761164 -0.053507648119643196 - blue triangle 0.015356427073158766 -0.0006089997461435399 -orange circle 0.010518953877704048 -0.16279397329279383 - red triangle 0.00809782571528034 0.012486621357942596 -purple triangle 0.005155190909099334 -0.045057909256220656 -purple square -0.025680276963377404 0.05769429647930396 - green square -0.0257760734502851 -0.003265173252087127 -orange triangle -0.030456661186085785 -0.1318699981926352 -yellow circle -0.06477331572781474 0.07369449819706045 - blue circle -0.10234761901929677 -0.030528539069837757 - green triangle -0.10901825107358765 -0.04848782060162929 + red circle 0.9807984401887242 -0.018565536587084836 +orange square 0.17685855992752933 -0.07104431573805543 + green circle 0.05764419437577257 0.011795729888018455 + red square 0.0557447712489348 -0.0006801456507506415 +yellow triangle 0.0445727377196281 0.024604310103079844 +yellow square 0.0437917292729612 -0.044621972016306265 +purple circle 0.03587354936895115 0.13411339541407613 + blue square 0.03241153095761152 -0.05350764811965621 + blue triangle 0.015356427073158612 -0.0006089997461408209 +orange circle 0.010518953877704181 -0.1627939732927932 + red triangle 0.00809782571528054 0.01248662135795501 +purple triangle 0.005155190909099739 -0.04505790925621933 +purple square -0.02568027696337717 0.057694296479293694 + green square -0.025776073450284875 -0.0032651732520739014 +orange triangle -0.030456661186085584 -0.13186999819263814 +yellow circle -0.06477331572781515 0.0736944981970553 + blue circle -0.1023476190192966 
-0.030528539069839333 + green triangle -0.10901825107358747 -0.04848782060162855diff --git a/docs/src/data/flatten-dots.csv b/docs/src/data/flatten-dots.csv new file mode 100644 index 000000000..6a7947149 --- /dev/null +++ b/docs/src/data/flatten-dots.csv @@ -0,0 +1,2 @@ +a,b.,.c,.,d..e,f.g +1,2,3,4,5,6 diff --git a/docs/src/data/key-change.json b/docs/src/data/key-change.json new file mode 100644 index 000000000..c2719c54f --- /dev/null +++ b/docs/src/data/key-change.json @@ -0,0 +1,5 @@ +[ + { "a": 1, "b": 2, "c": 3 }, + { "a": 4, "b": 5, "c": 6 }, + { "a": 7, "X": 8, "c": 9 } +] diff --git a/docs/src/data/under-over.json b/docs/src/data/under-over.json new file mode 100644 index 000000000..0de486a83 --- /dev/null +++ b/docs/src/data/under-over.json @@ -0,0 +1,6 @@ +[ + { "a": 1, "b": 2, "c": 3 }, + { "a": 4, "b": 5, "c": 6, "d": 7 }, + { "a": 7, "b": 8 }, + { "a": 9, "b": 10, "c": 11 } +] diff --git a/docs/src/date-time-examples.md b/docs/src/date-time-examples.md index 5bcbdac01..cab74de3c 100644 --- a/docs/src/date-time-examples.md +++ b/docs/src/date-time-examples.md @@ -68,7 +68,7 @@ date,qoh wc -l data/miss-date.csv
- 1372 data/miss-date.csv +1372 data/miss-date.csvSince there are 1372 lines in the data file, some automation is called for. To find the missing dates, you can convert the dates to seconds since the epoch using `strptime`, then compute adjacent differences (the `cat -n` simply inserts record-counters): diff --git a/docs/src/extra.css b/docs/src/extra.css index 2f25087f2..e1395aff3 100644 --- a/docs/src/extra.css +++ b/docs/src/extra.css @@ -236,3 +236,8 @@ img { --md-footer-fg-color: #800000; --md-footer-fg-color: #eae2cb; } + +.md-nav__link--active { + text-decoration: underline; +} + diff --git a/docs/src/features.md b/docs/src/features.md index 36d4f66ee..ae1222a3f 100644 --- a/docs/src/features.md +++ b/docs/src/features.md @@ -16,7 +16,7 @@ Quick links: # Features -Miller is like awk, sed, cut, join, and sort for **name-indexed data such as +Miller is like awk, sed, cut, join, and sort for **name-indexed data, such as CSV, TSV, JSON, and JSON Lines**. You get to work with your data using named fields, without needing to count positional column indices. @@ -36,9 +36,9 @@ including but not limited to the familiar CSV, TSV, JSON, and JSON Lines. * Miller complements SQL **databases**: you can slice, dice, and reformat data on the client side on its way into or out of a database. (See [SQL Examples](sql-examples.md).) You can also reap some of the benefits of databases for quick, setup-free one-off tasks when you just need to query some data in disk files in a hurry. -* Miller also goes beyond the classic Unix tools by stepping fully into our modern, **no-SQL** world: its essential record-heterogeneity property allows Miller to operate on data where records with different schema (field names) are interleaved. +* Miller also goes beyond the classic Unix tools by stepping fully into our modern, **no-SQL** world: its essential record-heterogeneity property allows Miller to operate on data where records with different schemas (field names) are interleaved. 
-* Miller is **streaming**: most operations need only a single record in memory at a time, rather than ingesting all input before producing any output. For those operations which require deeper retention (`sort`, `tac`, `stats1`), Miller retains only as much data as needed. This means that whenever functionally possible, you can operate on files which are larger than your system's available RAM, and you can use Miller in **tail -f** contexts. +* Miller is **streaming**: most operations need only a single record in memory at a time, rather than ingesting all input before producing any output. For those operations that require deeper retention (`sort`, `tac`, `stats1`), Miller retains only as much data as needed. This means that whenever functionally possible, you can operate on files that are larger than your system's available RAM, and you can use Miller in **tail -f** contexts. * Miller is **pipe-friendly** and interoperates with the Unix toolkit @@ -46,10 +46,10 @@ including but not limited to the familiar CSV, TSV, JSON, and JSON Lines. * Miller does **conversion** between formats -* Miller's **processing is format-aware**: e.g. CSV `sort` and `tac` keep header lines first +* Miller's **processing is format-aware**: e.g., CSV `sort` and `tac` keep header lines first * Miller has high-throughput **performance** on par with the Unix toolkit -* Not unlike [jq](https://stedolan.github.io/jq/) (for JSON), Miller is written in Go which is a portable, modern language, and Miller has no runtime dependencies. You can download or compile a single binary, `scp` it to a faraway machine, and expect it to work. +* Not unlike [jq](https://stedolan.github.io/jq/) (for JSON), Miller is written in Go, which is a portable, modern language, and Miller has no runtime dependencies. You can download or compile a single binary, `scp` it to a faraway machine, and expect it to work. 
Releases and release notes: [https://github.com/johnkerl/miller/releases](https://github.com/johnkerl/miller/releases). diff --git a/docs/src/features.md.in b/docs/src/features.md.in index 22b2c5378..13ea25bb2 100644 --- a/docs/src/features.md.in +++ b/docs/src/features.md.in @@ -1,6 +1,6 @@ # Features -Miller is like awk, sed, cut, join, and sort for **name-indexed data such as +Miller is like awk, sed, cut, join, and sort for **name-indexed data, such as CSV, TSV, JSON, and JSON Lines**. You get to work with your data using named fields, without needing to count positional column indices. @@ -20,9 +20,9 @@ including but not limited to the familiar CSV, TSV, JSON, and JSON Lines. * Miller complements SQL **databases**: you can slice, dice, and reformat data on the client side on its way into or out of a database. (See [SQL Examples](sql-examples.md).) You can also reap some of the benefits of databases for quick, setup-free one-off tasks when you just need to query some data in disk files in a hurry. -* Miller also goes beyond the classic Unix tools by stepping fully into our modern, **no-SQL** world: its essential record-heterogeneity property allows Miller to operate on data where records with different schema (field names) are interleaved. +* Miller also goes beyond the classic Unix tools by stepping fully into our modern, **no-SQL** world: its essential record-heterogeneity property allows Miller to operate on data where records with different schemas (field names) are interleaved. -* Miller is **streaming**: most operations need only a single record in memory at a time, rather than ingesting all input before producing any output. For those operations which require deeper retention (`sort`, `tac`, `stats1`), Miller retains only as much data as needed. This means that whenever functionally possible, you can operate on files which are larger than your system's available RAM, and you can use Miller in **tail -f** contexts. 
+* Miller is **streaming**: most operations need only a single record in memory at a time, rather than ingesting all input before producing any output. For those operations that require deeper retention (`sort`, `tac`, `stats1`), Miller retains only as much data as needed. This means that whenever functionally possible, you can operate on files that are larger than your system's available RAM, and you can use Miller in **tail -f** contexts. * Miller is **pipe-friendly** and interoperates with the Unix toolkit @@ -30,10 +30,10 @@ including but not limited to the familiar CSV, TSV, JSON, and JSON Lines. * Miller does **conversion** between formats -* Miller's **processing is format-aware**: e.g. CSV `sort` and `tac` keep header lines first +* Miller's **processing is format-aware**: e.g., CSV `sort` and `tac` keep header lines first * Miller has high-throughput **performance** on par with the Unix toolkit -* Not unlike [jq](https://stedolan.github.io/jq/) (for JSON), Miller is written in Go which is a portable, modern language, and Miller has no runtime dependencies. You can download or compile a single binary, `scp` it to a faraway machine, and expect it to work. +* Not unlike [jq](https://stedolan.github.io/jq/) (for JSON), Miller is written in Go, which is a portable, modern language, and Miller has no runtime dependencies. You can download or compile a single binary, `scp` it to a faraway machine, and expect it to work. Releases and release notes: [https://github.com/johnkerl/miller/releases](https://github.com/johnkerl/miller/releases). diff --git a/docs/src/file-formats.md b/docs/src/file-formats.md index 8611a7a22..8a09dac54 100644 --- a/docs/src/file-formats.md +++ b/docs/src/file-formats.md @@ -20,7 +20,7 @@ Miller handles name-indexed data using several formats: some you probably know by name, such as CSV, TSV, JSON, and JSON Lines -- and other formats you're likely already seeing and using in your structured data. 
-Additionally, Miller gives you the option of including comments within your data. +Additionally, Miller gives you the option to include comments within your data. ## Examples @@ -69,7 +69,7 @@ PPRINT: pretty-printed tabular | 4 5 6 | Record 2: "apple":"4", "bat":"5", "cog":"6" +---------------------+ -Markdown tabular (supported for output only): +Markdown tabular: +-----------------------+ | | apple | bat | cog | | | | --- | --- | --- | | @@ -102,21 +102,27 @@ NIDX: implicitly numerically indexed (Unix-toolkit style) ## CSV/TSV/ASV/USV/etc. -When `mlr` is invoked with the `--csv` or `--csvlite` option, key names are found on the first record and values are taken from subsequent records. This includes the case of CSV-formatted files. See [Record Heterogeneity](record-heterogeneity.md) for how Miller handles changes of field names within a single data stream. +When `mlr` is invoked with the `--csv` or `--csvlite` option, key names are found on the first record, and values are taken from subsequent records. This includes the case of CSV-formatted files. See [Record Heterogeneity](record-heterogeneity.md) for how Miller handles changes of field names within a single data stream. Miller has record separator `RS` and field separator `FS`, just as `awk` does. (See also the [separators page](reference-main-separators.md).) -**TSV (tab-separated values):** `FS` is tab and `RS` is newline (or carriage return + linefeed for -Windows). On input, if fields have `\r`, `\n`, `\t`, or `\\`, those are decoded as carriage return, -newline, tab, and backslash, respectively. On output, the reverse is done -- for example, if a field -has an embedded newline, that newline is replaced by `\n`. +**CSV (comma-separated values):** Miller's `--csv` flag supports [RFC-4180 CSV](https://tools.ietf.org/html/rfc4180). + +* This includes CRLF line terminators by default, regardless of platform. +* Any cell containing a comma or a carriage return within it must be double-quoted. 
+ +**TSV (tab-separated values):** Miller's `--tsv` supports [IANA TSV](https://www.iana.org/assignments/media-types/text/tab-separated-values). + +* `FS` is tab and `RS` is newline (or carriage return + linefeed for Windows). +* On input, if fields have `\r`, `\n`, `\t`, or `\\`, those are decoded as carriage return, newline, tab, and backslash, respectively. +* On output, the reverse is done -- for example, if a field has an embedded newline, that newline is replaced by `\n`. +* A tab within a cell must be encoded as `\t`. +* A carriage return within a cell must be encoded as `\n`. **ASV (ASCII-separated values):** the flags `--asv`, `--iasv`, `--oasv`, `--asvlite`, `--iasvlite`, and `--oasvlite` are analogous except they use ASCII FS and RS `0x1f` and `0x1e`, respectively. **USV (Unicode-separated values):** likewise, the flags `--usv`, `--iusv`, `--ousv`, `--usvlite`, `--iusvlite`, and `--ousvlite` use Unicode FS and RS `U+241F` (UTF-8 `0x0xe2909f`) and `U+241E` (UTF-8 `0xe2909e`), respectively. -Miller's `--csv` flag supports [RFC-4180 CSV](https://tools.ietf.org/html/rfc4180). This includes CRLF line-terminators by default, regardless of platform. - Here are the differences between CSV and CSV-lite: * CSV-lite naively splits lines on newline, and fields on comma -- embedded commas and newlines are not escaped in any way. @@ -125,30 +131,98 @@ Here are the differences between CSV and CSV-lite: * CSV does not allow heterogeneous data; CSV-lite does (see also [Record Heterogeneity](record-heterogeneity.md)). -* TSV-lite is simply CSV-lite with field separator set to tab instead of comma. -In particular, no encode/decode of `\r`, `\n`, `\t`, or `\\` is done. +* TSV-lite is simply CSV-lite with the field separator set to tab instead of a comma. +In particular, no encoding/decoding of `\r`, `\n`, `\t`, or `\\` is done. * CSV-lite allows changing FS and/or RS to any values, perhaps multi-character. 
+* CSV-lite and TSV-lite handle schema changes ("schema" meaning "ordered list of field names in a given record") by adding a newline and re-emitting the header. CSV and TSV, by contrast, do the following: + * If there are too few keys, but these match the header, empty fields are emitted. + * If there are too many keys, but these match the header up to the number of header fields, the extra fields are emitted. + * If keys don't match the header, this is an error. + +
+cat data/under-over.json ++
+[
+ { "a": 1, "b": 2, "c": 3 },
+ { "a": 4, "b": 5, "c": 6, "d": 7 },
+ { "a": 7, "b": 8 },
+ { "a": 9, "b": 10, "c": 11 }
+]
+
+
++mlr --ijson --ocsvlite cat data/under-over.json ++
+a,b,c +1,2,3 + +a,b,c,d +4,5,6,7 + +a,b +7,8 + +a,b,c +9,10,11 ++ +
+mlr --ijson --ocsvlite cat data/key-change.json ++
+a,b,c +1,2,3 +4,5,6 + +a,X,c +7,8,9 ++ +
+mlr --ijson --ocsv cat data/under-over.json ++
+a,b,c +1,2,3 +4,5,6,7 +7,8, +9,10,11 ++ +
+mlr --ijson --ocsv cat data/key-change.json ++
+a,b,c +1,2,3 +4,5,6 +mlr: CSV schema change: first keys "a,b,c"; current keys "a,X,c" +mlr: exiting due to data error. ++ * In short, use-cases for CSV-lite and TSV-lite are often found when dealing with CSV/TSV files which are formatted in some non-standard way -- you have a little more flexibility available to you. (As an example of this flexibility: ASV and USV are nothing more than CSV-lite with different values for FS and RS.) CSV, TSV, CSV-lite, and TSV-lite have in common the `--implicit-csv-header` flag for input and the `--headerless-csv-output` flag for output. -See also the [`--lazy-quotes` flag](reference-main-flag-list.md#csv-only-flags) which can help with CSV files which are not fully compliant with RFC-4180. +See also the [`--lazy-quotes` flag](reference-main-flag-list.md#csv-only-flags), which can help with CSV files that are not fully compliant with RFC-4180. ## JSON [JSON](https://json.org) is a format which supports scalars (numbers, strings, -boolean, etc.) as well as "objects" (maps) and "arrays" (lists), while Miller +booleans, etc.) as well as "objects" (maps) and "arrays" (lists), while Miller is a tool for handling **tabular data** only. By *tabular JSON* I mean the data is either a sequence of one or more objects, or an array consisting of one or more objects. Miller treats JSON objects as name-indexed records. This means Miller cannot (and should not) handle arbitrary JSON. In practice, -though, Miller can handle single JSON objects as well as list of them. The only -kinds of JSON that are unmillerable are single scalars (e.g. file contents `3`) -and arrays of non-object (e.g. file contents `[1,2,3,4,5]`). Check out -[jq](https://stedolan.github.io/jq/) for a tool which handles all valid JSON. +though, Miller can handle single JSON objects as well as lists of them. The only +kinds of JSON that are unmillerable are single scalars (e.g., file contents `3`) +and arrays of non-object (e.g., file contents `[1,2,3,4,5]`). 
Check out +[jq](https://stedolan.github.io/jq/) for a tool that handles all valid JSON. In short, if you have tabular data represented in JSON -- lists of objects, either with or without outermost `[...]` -- [then Miller can handle that for @@ -262,7 +336,7 @@ input as well as output in JSON format, JSON structure is preserved throughout t ] -But if the input format is JSON and the output format is not (or vice versa) then key-concatenation applies: +But if the input format is JSON and the output format is not (or vice versa), then key-concatenation applies:
mlr --ijson --opprint head -n 4 data/json-example-2.json @@ -281,7 +355,7 @@ Use `--jflatsep yourseparatorhere` to specify the string used for key concatenat ### JSON-in-CSV -It's quite common to have CSV data which contains stringified JSON as a column. +It's quite common to have CSV data that contains stringified JSON as a column. See the [JSON parse and stringify section](reference-main-data-types.md#json-parse-and-stringify) for ways to decode these in Miller. @@ -336,7 +410,7 @@ records; using `--ojsonl`, you get no outermost `[...]`, and one line per record ## PPRINT: Pretty-printed tabular -Miller's pretty-print format is like CSV, but column-aligned. For example, compare +Miller's pretty-print format is similar to CSV, but with column alignment. For example, compare-which renders like this when dropped into various web tools (e.g. github comments): +which renders like this when dropped into various web tools (e.g. github.comments):  -As of Miller 4.3.0, markdown format is supported only for output, not input. +As of Miller 4.3.0, markdown format is supported only for output, not input; as of Miller 6.11.0, markdown format +is supported for input as well. ## XTAB: Vertical tabular @@ -488,7 +594,7 @@ a=eks,b=wye,i=4,x=0.381399,y=0.134188 a=wye,b=pan,i=5,x=0.573288,y=0.863624 -Such data are easy to generate, e.g. in Ruby with +Such data is easy to generate, e.g., in Ruby withmlr --ocsv cat data/small @@ -362,11 +436,11 @@ eks wye 4 0.381399 0.134188 wye pan 5 0.573288 0.863624-Note that while Miller is a line-at-a-time processor and retains input lines in memory only where necessary (e.g. for sort), pretty-print output requires it to accumulate all input lines (so that it can compute maximum column widths) before producing any output. 
This has two consequences: (a) pretty-print output won't work on `tail -f` contexts, where Miller will be waiting for an end-of-file marker which never arrives; (b) pretty-print output for large files is constrained by available machine memory. +Note that while Miller is a line-at-a-time processor and retains input lines in memory only where necessary (e.g., for sort), pretty-print output requires it to accumulate all input lines (so that it can compute maximum column widths) before producing any output. This has two consequences: (a) Pretty-print output will not work in `tail -f` contexts, where Miller will be waiting for an end-of-file marker that never arrives; (b) Pretty-print output for large files is constrained by the available machine memory. See [Record Heterogeneity](record-heterogeneity.md) for how Miller handles changes of field names within a single data stream. -For output only (this isn't supported in the input-scanner as of 5.0.0) you can use `--barred` with pprint output format: +Since Miller 5.0.0, you can use `--barred` or `--barred-output` with pprint output format:mlr --opprint --barred cat data/small @@ -383,6 +457,37 @@ For output only (this isn't supported in the input-scanner as of 5.0.0) you can +-----+-----+---+----------+----------++Since Miller 6.11.0, you can use `--barred-input` with pprint input format: + ++mlr -o pprint --barred cat data/small | mlr -i pprint --barred-input -o json filter '$b == "pan"' +++[ +{ + "a": "pan", + "b": "pan", + "i": 1, + "x": 0.346791, + "y": 0.726802 +}, +{ + "a": "eks", + "b": "pan", + "i": 2, + "x": 0.758679, + "y": 0.522151 +}, +{ + "a": "wye", + "b": "pan", + "i": 5, + "x": 0.573288, + "y": 0.863624 +} +] ++ ## Markdown tabular Markdown format looks like this: @@ -400,11 +505,12 @@ Markdown format looks like this: | wye | pan | 5 | 0.573288 | 0.863624 |
puts "host=#{hostname},seconds=#{t2-t1},message=#{msg}"
@@ -510,7 +616,7 @@ logger.log("type=3,user=$USER,date=$date\n");
Fields lacking an IPS will have positional index (starting at 1) used as the key, as in NIDX format. For example, `dish=7,egg=8,flint` is parsed as `"dish" => "7", "egg" => "8", "3" => "flint"` and `dish,egg,flint` is parsed as `"1" => "dish", "2" => "egg", "3" => "flint"`.
-As discussed in [Record Heterogeneity](record-heterogeneity.md), Miller handles changes of field names within the same data stream. But using DKVP format this is particularly natural. One of my favorite use-cases for Miller is in application/server logs, where I log all sorts of lines such as
+As discussed in [Record Heterogeneity](record-heterogeneity.md), Miller handles changes of field names within the same data stream. But using DKVP format, this is particularly natural. One of my favorite use-cases for Miller is in application/server logs, where I log all sorts of lines such as
resource=/path/to/file,loadsec=0.45,ok=true
@@ -518,10 +624,9 @@ record_count=100, resource=/path/to/file
resource=/some/other/path,loadsec=0.97,ok=false
-etc. and I just log them as needed. Then later, I can use `grep`, `mlr --opprint group-like`, etc.
-to analyze my logs.
+etc., and I log them as needed. Then later, I can use `grep`, `mlr --opprint group-like`, etc. to analyze my logs.
-See the [separators page](reference-main-separators.md) regarding how to specify separators other than the default equals-sign and comma.
+See the [separators page](reference-main-separators.md) regarding how to specify separators other than the default equals sign and comma.
## NIDX: Index-numbered (toolkit style)
@@ -604,19 +709,19 @@ While you can do format conversion using `mlr --icsv --ojson cat myfile.csv`, th
FORMAT-CONVERSION KEYSTROKE-SAVER FLAGS
As keystroke-savers for format-conversion you may use the following.
The letters c, t, j, l, d, n, x, p, and m refer to formats CSV, TSV, DKVP, NIDX,
-JSON, JSON Lines, XTAB, PPRINT, and markdown, respectively. Note that markdown
-format is available for output only.
+JSON, JSON Lines, XTAB, PPRINT, and markdown, respectively.
-| In\out | CSV | TSV | JSON | JSONL | DKVP | NIDX | XTAB | PPRINT | Markdown |
-+--------+-------+-------+--------+--------+--------+--------+--------+----------+
-| CSV | | --c2t | --c2j | --c2l | --c2d | --c2n | --c2x | --c2p | --c2m |
-| TSV | --t2c | | --t2j | --t2l | --t2d | --t2n | --t2x | --t2p | --t2m |
-| JSON | --j2c | --j2t | | --j2l | --j2d | --j2n | --j2x | --j2p | --j2m |
-| JSONL | --l2c | --l2t | | | --l2d | --l2n | --l2x | --l2p | --l2m |
-| DKVP | --d2c | --d2t | --d2j | --d2l | | --d2n | --d2x | --d2p | --d2m |
-| NIDX | --n2c | --n2t | --n2j | --n2l | --n2d | | --n2x | --n2p | --n2m |
-| XTAB | --x2c | --x2t | --x2j | --x2l | --x2d | --x2n | | --x2p | --x2m |
-| PPRINT | --p2c | --p2t | --p2j | --p2l | --p2d | --p2n | --p2x | | --p2m |
+| In\out | CSV | TSV | JSON | JSONL | DKVP | NIDX | XTAB | PPRINT | Markdown |
++----------+----------+----------+----------+-------+-------+-------+-------+--------+----------|
+| CSV | --c2c,-c | --c2t | --c2j | --c2l | --c2d | --c2n | --c2x | --c2p | --c2m |
+| TSV | --t2c | --t2t,-t | --t2j | --t2l | --t2d | --t2n | --t2x | --t2p | --t2m |
+| JSON | --j2c | --j2t | --j2j,-j | --j2l | --j2d | --j2n | --j2x | --j2p | --j2m |
+| JSONL | --l2c | --l2t | --l2j | --l2l | --l2d | --l2n | --l2x | --l2p | --l2m |
+| DKVP | --d2c | --d2t | --d2j | --d2l | --d2d | --d2n | --d2x | --d2p | --d2m |
+| NIDX | --n2c | --n2t | --n2j | --n2l | --n2d | --n2n | --n2x | --n2p | --n2m |
+| XTAB | --x2c | --x2t | --x2j | --x2l | --x2d | --x2n | --x2x | --x2p | --x2m |
+| PPRINT | --p2c | --p2t | --p2j | --p2l | --p2d | --p2n | --p2x | --p2p | --p2m |
+| Markdown | --m2c | --m2t | --m2j | --m2l | --m2d | --m2n | --m2x | --m2p | |
-p Keystroke-saver for `--nidx --fs space --repifs`.
-T Keystroke-saver for `--nidx --fs tab`.
@@ -624,7 +729,7 @@ format is available for output only.
## Comments in data
-You can include comments within your data files, and either have them ignored, or passed directly through to the standard output as soon as they are encountered:
+You can include comments within your data files, and either have them ignored or passed directly through to the standard output as soon as they are encountered:
mlr help comments-in-data-flags
@@ -652,12 +757,14 @@ Notes:
within the input.
--pass-comments-with {string}
Immediately print commented lines within input, with
- specified prefix.
+ specified prefix. For CSV input format, the prefix
+ must be a single character.
--skip-comments Ignore commented lines (prefixed by `#`) within the
input.
--skip-comments-with {string}
Ignore commented lines within input, with specified
- prefix.
+ prefix. For CSV input format, the prefix must be a
+ single character.
Examples:
diff --git a/docs/src/file-formats.md.in b/docs/src/file-formats.md.in
index fd624a80e..2ed581b19 100644
--- a/docs/src/file-formats.md.in
+++ b/docs/src/file-formats.md.in
@@ -4,7 +4,7 @@ Miller handles name-indexed data using several formats: some you probably know
by name, such as CSV, TSV, JSON, and JSON Lines -- and other formats you're likely already
seeing and using in your structured data.
-Additionally, Miller gives you the option of including comments within your data.
+Additionally, Miller gives you the option to include comments within your data.
## Examples
@@ -14,21 +14,27 @@ GENMD-EOF
## CSV/TSV/ASV/USV/etc.
-When `mlr` is invoked with the `--csv` or `--csvlite` option, key names are found on the first record and values are taken from subsequent records. This includes the case of CSV-formatted files. See [Record Heterogeneity](record-heterogeneity.md) for how Miller handles changes of field names within a single data stream.
+When `mlr` is invoked with the `--csv` or `--csvlite` option, key names are found on the first record, and values are taken from subsequent records. This includes the case of CSV-formatted files. See [Record Heterogeneity](record-heterogeneity.md) for how Miller handles changes of field names within a single data stream.
Miller has record separator `RS` and field separator `FS`, just as `awk` does. (See also the [separators page](reference-main-separators.md).)
-**TSV (tab-separated values):** `FS` is tab and `RS` is newline (or carriage return + linefeed for
-Windows). On input, if fields have `\r`, `\n`, `\t`, or `\\`, those are decoded as carriage return,
-newline, tab, and backslash, respectively. On output, the reverse is done -- for example, if a field
-has an embedded newline, that newline is replaced by `\n`.
+**CSV (comma-separated values):** Miller's `--csv` flag supports [RFC-4180 CSV](https://tools.ietf.org/html/rfc4180).
+
+* This includes CRLF line terminators by default, regardless of platform.
+* Any cell containing a comma or a carriage return within it must be double-quoted.
+
+**TSV (tab-separated values):** Miller's `--tsv` flag supports [IANA TSV](https://www.iana.org/assignments/media-types/text/tab-separated-values).
+
+* `FS` is tab and `RS` is newline (or carriage return + linefeed for Windows).
+* On input, if fields have `\r`, `\n`, `\t`, or `\\`, those are decoded as carriage return, newline, tab, and backslash, respectively.
+* On output, the reverse is done -- for example, if a field has an embedded newline, that newline is replaced by `\n`.
+* A tab within a cell must be encoded as `\t`.
+* A newline within a cell must be encoded as `\n`, and a carriage return as `\r`.
**ASV (ASCII-separated values):** the flags `--asv`, `--iasv`, `--oasv`, `--asvlite`, `--iasvlite`, and `--oasvlite` are analogous except they use ASCII FS and RS `0x1f` and `0x1e`, respectively.
**USV (Unicode-separated values):** likewise, the flags `--usv`, `--iusv`, `--ousv`, `--usvlite`, `--iusvlite`, and `--ousvlite` use Unicode FS and RS `U+241F` (UTF-8 `0x0xe2909f`) and `U+241E` (UTF-8 `0xe2909e`), respectively.
-Miller's `--csv` flag supports [RFC-4180 CSV](https://tools.ietf.org/html/rfc4180). This includes CRLF line-terminators by default, regardless of platform.
-
Here are the differences between CSV and CSV-lite:
* CSV-lite naively splits lines on newline, and fields on comma -- embedded commas and newlines are not escaped in any way.
@@ -37,30 +43,55 @@ Here are the differences between CSV and CSV-lite:
* CSV does not allow heterogeneous data; CSV-lite does (see also [Record Heterogeneity](record-heterogeneity.md)).
-* TSV-lite is simply CSV-lite with field separator set to tab instead of comma.
-In particular, no encode/decode of `\r`, `\n`, `\t`, or `\\` is done.
+* TSV-lite is simply CSV-lite with the field separator set to tab instead of a comma.
+In particular, no encoding/decoding of `\r`, `\n`, `\t`, or `\\` is done.
* CSV-lite allows changing FS and/or RS to any values, perhaps multi-character.
+* CSV-lite and TSV-lite handle schema changes ("schema" meaning "ordered list of field names in a given record") by adding a newline and re-emitting the header. CSV and TSV, by contrast, do the following:
+ * If there are too few keys, but these match the header, empty fields are emitted.
+ * If there are too many keys, but these match the header up to the number of header fields, the extra fields are emitted.
+ * If keys don't match the header, this is an error.
+
+GENMD-RUN-COMMAND
+cat data/under-over.json
+GENMD-EOF
+
+GENMD-RUN-COMMAND
+mlr --ijson --ocsvlite cat data/under-over.json
+GENMD-EOF
+
+GENMD-RUN-COMMAND-TOLERATING-ERROR
+mlr --ijson --ocsvlite cat data/key-change.json
+GENMD-EOF
+
+GENMD-RUN-COMMAND
+mlr --ijson --ocsv cat data/under-over.json
+GENMD-EOF
+
+GENMD-RUN-COMMAND-TOLERATING-ERROR
+mlr --ijson --ocsv cat data/key-change.json
+GENMD-EOF
+
* In short, use-cases for CSV-lite and TSV-lite are often found when dealing with CSV/TSV files which are formatted in some non-standard way -- you have a little more flexibility available to you. (As an example of this flexibility: ASV and USV are nothing more than CSV-lite with different values for FS and RS.)
CSV, TSV, CSV-lite, and TSV-lite have in common the `--implicit-csv-header` flag for input and the `--headerless-csv-output` flag for output.
-See also the [`--lazy-quotes` flag](reference-main-flag-list.md#csv-only-flags) which can help with CSV files which are not fully compliant with RFC-4180.
+See also the [`--lazy-quotes` flag](reference-main-flag-list.md#csv-only-flags), which can help with CSV files that are not fully compliant with RFC-4180.
## JSON
[JSON](https://json.org) is a format which supports scalars (numbers, strings,
-boolean, etc.) as well as "objects" (maps) and "arrays" (lists), while Miller
+booleans, etc.) as well as "objects" (maps) and "arrays" (lists), while Miller
is a tool for handling **tabular data** only. By *tabular JSON* I mean the
data is either a sequence of one or more objects, or an array consisting of one
or more objects. Miller treats JSON objects as name-indexed records.
This means Miller cannot (and should not) handle arbitrary JSON. In practice,
-though, Miller can handle single JSON objects as well as list of them. The only
-kinds of JSON that are unmillerable are single scalars (e.g. file contents `3`)
-and arrays of non-object (e.g. file contents `[1,2,3,4,5]`). Check out
-[jq](https://stedolan.github.io/jq/) for a tool which handles all valid JSON.
+though, Miller can handle single JSON objects as well as lists of them. The only
+kinds of JSON that are unmillerable are single scalars (e.g., file contents `3`)
+and arrays of non-objects (e.g., file contents `[1,2,3,4,5]`). Check out
+[jq](https://stedolan.github.io/jq/) for a tool that handles all valid JSON.
In short, if you have tabular data represented in JSON -- lists of objects,
either with or without outermost `[...]` -- [then Miller can handle that for
@@ -98,7 +129,7 @@ GENMD-RUN-COMMAND
mlr --json head -n 2 data/json-example-2.json
GENMD-EOF
-But if the input format is JSON and the output format is not (or vice versa) then key-concatenation applies:
+But if the input format is JSON and the output format is not (or vice versa), then key-concatenation applies:
GENMD-RUN-COMMAND
mlr --ijson --opprint head -n 4 data/json-example-2.json
@@ -110,7 +141,7 @@ Use `--jflatsep yourseparatorhere` to specify the string used for key concatenat
### JSON-in-CSV
-It's quite common to have CSV data which contains stringified JSON as a column.
+It's quite common to have CSV data that contains stringified JSON as a column.
See the [JSON parse and stringify section](reference-main-data-types.md#json-parse-and-stringify) for ways to
decode these in Miller.
@@ -139,7 +170,7 @@ records; using `--ojsonl`, you get no outermost `[...]`, and one line per record
## PPRINT: Pretty-printed tabular
-Miller's pretty-print format is like CSV, but column-aligned. For example, compare
+Miller's pretty-print format is similar to CSV, but with column alignment. For example, compare
GENMD-RUN-COMMAND
mlr --ocsv cat data/small
@@ -149,16 +180,22 @@ GENMD-RUN-COMMAND
mlr --opprint cat data/small
GENMD-EOF
-Note that while Miller is a line-at-a-time processor and retains input lines in memory only where necessary (e.g. for sort), pretty-print output requires it to accumulate all input lines (so that it can compute maximum column widths) before producing any output. This has two consequences: (a) pretty-print output won't work on `tail -f` contexts, where Miller will be waiting for an end-of-file marker which never arrives; (b) pretty-print output for large files is constrained by available machine memory.
+Note that while Miller is a line-at-a-time processor and retains input lines in memory only where necessary (e.g., for sort), pretty-print output requires it to accumulate all input lines (so that it can compute maximum column widths) before producing any output. This has two consequences: (a) Pretty-print output will not work in `tail -f` contexts, where Miller will be waiting for an end-of-file marker that never arrives; (b) Pretty-print output for large files is constrained by the available machine memory.
See [Record Heterogeneity](record-heterogeneity.md) for how Miller handles changes of field names within a single data stream.
-For output only (this isn't supported in the input-scanner as of 5.0.0) you can use `--barred` with pprint output format:
+Since Miller 5.0.0, you can use `--barred` or `--barred-output` with pprint output format:
GENMD-RUN-COMMAND
mlr --opprint --barred cat data/small
GENMD-EOF
+Since Miller 6.11.0, you can use `--barred-input` with pprint input format:
+
+GENMD-RUN-COMMAND
+mlr -o pprint --barred cat data/small | mlr -i pprint --barred-input -o json filter '$b == "pan"'
+GENMD-EOF
+
## Markdown tabular
Markdown format looks like this:
@@ -167,11 +204,12 @@ GENMD-RUN-COMMAND
mlr --omd cat data/small
GENMD-EOF
-which renders like this when dropped into various web tools (e.g. github comments):
+which renders like this when dropped into various web tools (e.g., GitHub comments):

-As of Miller 4.3.0, markdown format is supported only for output, not input.
+As of Miller 4.3.0, markdown format is supported for output; as of Miller 6.11.0, markdown format
+is supported for input as well.
## XTAB: Vertical tabular
@@ -242,7 +280,7 @@ GENMD-RUN-COMMAND
mlr cat data/small
GENMD-EOF
-Such data are easy to generate, e.g. in Ruby with
+Such data is easy to generate, e.g., in Ruby with
GENMD-CARDIFY
puts "host=#{hostname},seconds=#{t2-t1},message=#{msg}"
@@ -264,7 +302,7 @@ GENMD-EOF
Fields lacking an IPS will have positional index (starting at 1) used as the key, as in NIDX format. For example, `dish=7,egg=8,flint` is parsed as `"dish" => "7", "egg" => "8", "3" => "flint"` and `dish,egg,flint` is parsed as `"1" => "dish", "2" => "egg", "3" => "flint"`.
-As discussed in [Record Heterogeneity](record-heterogeneity.md), Miller handles changes of field names within the same data stream. But using DKVP format this is particularly natural. One of my favorite use-cases for Miller is in application/server logs, where I log all sorts of lines such as
+As discussed in [Record Heterogeneity](record-heterogeneity.md), Miller handles changes of field names within the same data stream. But using DKVP format, this is particularly natural. One of my favorite use-cases for Miller is in application/server logs, where I log all sorts of lines such as
GENMD-CARDIFY
resource=/path/to/file,loadsec=0.45,ok=true
@@ -272,10 +310,9 @@ record_count=100, resource=/path/to/file
resource=/some/other/path,loadsec=0.97,ok=false
GENMD-EOF
-etc. and I just log them as needed. Then later, I can use `grep`, `mlr --opprint group-like`, etc.
-to analyze my logs.
+etc., and I log them as needed. Then later, I can use `grep`, `mlr --opprint group-like`, etc. to analyze my logs.
-See the [separators page](reference-main-separators.md) regarding how to specify separators other than the default equals-sign and comma.
+See the [separators page](reference-main-separators.md) regarding how to specify separators other than the default equals sign and comma.
## NIDX: Index-numbered (toolkit style)
@@ -323,7 +360,7 @@ GENMD-EOF
## Comments in data
-You can include comments within your data files, and either have them ignored, or passed directly through to the standard output as soon as they are encountered:
+You can include comments within your data files, and either have them ignored or passed directly through to the standard output as soon as they are encountered:
GENMD-RUN-COMMAND
mlr help comments-in-data-flags
diff --git a/docs/src/flatten-unflatten.md b/docs/src/flatten-unflatten.md
index 7a3c138d2..da0e817db 100644
--- a/docs/src/flatten-unflatten.md
+++ b/docs/src/flatten-unflatten.md
@@ -348,6 +348,50 @@ a.1,a.3,a.5
]
+## Non-inferencing cases
+
+An additional heuristic is that if a field name starts with a `.`, ends with
+a `.`, or has two or more consecutive `.` characters, no attempt is made
+to unflatten it on conversion from non-JSON to JSON.
+
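+<pre class="pre-highlight-in-pair">
+<b>cat data/flatten-dots.csv</b>
+</pre>
+<pre class="pre-non-highlight-in-pair">
+a,b.,.c,.,d..e,f.g
+1,2,3,4,5,6
+</pre>
+
+<pre class="pre-highlight-in-pair">
+<b>mlr --icsv --oxtab cat data/flatten-dots.csv</b>
+</pre>
+<pre class="pre-non-highlight-in-pair">
+a    1
+b.   2
+.c   3
+.    4
+d..e 5
+f.g  6
+</pre>
+
+<pre class="pre-highlight-in-pair">
+<b>mlr --icsv --ojson cat data/flatten-dots.csv</b>
+</pre>
+<pre class="pre-non-highlight-in-pair">
+[
+{
+  "a": 1,
+  "b.": 2,
+  ".c": 3,
+  ".": 4,
+  "d..e": 5,
+  "f": {
+    "g": 6
+  }
+}
+]
+</pre>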
+
## Manual control
To see what our options are for manually controlling flattening and
diff --git a/docs/src/flatten-unflatten.md.in b/docs/src/flatten-unflatten.md.in
index 68033d594..951ea1f58 100644
--- a/docs/src/flatten-unflatten.md.in
+++ b/docs/src/flatten-unflatten.md.in
@@ -156,6 +156,24 @@ GENMD-RUN-COMMAND
mlr --c2j cat data/non-consecutive.csv
GENMD-EOF
+## Non-inferencing cases
+
+An additional heuristic is that if a field name starts with a `.`, ends with
+a `.`, or has two or more consecutive `.` characters, no attempt is made
+to unflatten it on conversion from non-JSON to JSON.
+
+GENMD-RUN-COMMAND
+cat data/flatten-dots.csv
+GENMD-EOF
+
+GENMD-RUN-COMMAND
+mlr --icsv --oxtab cat data/flatten-dots.csv
+GENMD-EOF
+
+GENMD-RUN-COMMAND
+mlr --icsv --ojson cat data/flatten-dots.csv
+GENMD-EOF
+
## Manual control
To see what our options are for manually controlling flattening and
diff --git a/docs/src/how-to-release.md b/docs/src/how-to-release.md
index 2833f1417..b19529094 100644
--- a/docs/src/how-to-release.md
+++ b/docs/src/how-to-release.md
@@ -30,9 +30,9 @@ In this example I am using version 6.2.0 to 6.3.0; of course that will change fo
* If Go version is being updated: edit all three of
- * `go.mod`
- * `.github/workflows/go.yml`
- * `.github/workflows/release.yml`
+ * `go.mod`
+ * `.github/workflows/go.yml`
+ * `.github/workflows/release.yml`
* Create the release tarball:
@@ -40,7 +40,7 @@ In this example I am using version 6.2.0 to 6.3.0; of course that will change fo
* This creates `miller-6.3.0.tar.gz` which we'll upload to GitHub, the URL of which will be in our `miller.spec`
* Prepare the source RPM following [README-RPM.md](https://github.com/johnkerl/miller/blob/main/README-RPM.md).
-* Create the Github release tag:
+* Create the GitHub release tag:
* Don't forget the `v` in `v6.3.0`
* Write the release notes -- save as a pre-release until below
@@ -48,7 +48,7 @@ In this example I am using version 6.2.0 to 6.3.0; of course that will change fo
* Thanks to [PR 822](https://github.com/johnkerl/miller/pull/822) which introduces [goreleaser](https://github.com/johnkerl/miller/blob/main/.goreleaser.yml) there are versions for many platforms auto-built and auto-attached to the GitHub release.
* Attach the release tarball and SRPM. Double-check assets were successfully uploaded.
* Publish the release in pre-release mode, until all CI jobs finish successfully. Note that gorelease will create and attach the rest of the binaries.
- * Before marking the release as public, download an executable from among the generated binaries and make sure its `mlr version` prints what you expect -- else, restart this process.
+ * Before marking the release as public, download an executable from among the generated binaries and make sure its `mlr version` prints what you expect -- else, restart this process. On MacOS, run `xattr -d com.apple.quarantine ./mlr` first to remove the quarantine attribute from the downloaded binary.
* Then mark the release as public.
* Build the release-specific docs:
diff --git a/docs/src/how-to-release.md.in b/docs/src/how-to-release.md.in
index fac0248c1..522cdbfa9 100644
--- a/docs/src/how-to-release.md.in
+++ b/docs/src/how-to-release.md.in
@@ -14,9 +14,9 @@ In this example I am using version 6.2.0 to 6.3.0; of course that will change fo
* If Go version is being updated: edit all three of
- * `go.mod`
- * `.github/workflows/go.yml`
- * `.github/workflows/release.yml`
+ * `go.mod`
+ * `.github/workflows/go.yml`
+ * `.github/workflows/release.yml`
* Create the release tarball:
@@ -24,7 +24,7 @@ In this example I am using version 6.2.0 to 6.3.0; of course that will change fo
* This creates `miller-6.3.0.tar.gz` which we'll upload to GitHub, the URL of which will be in our `miller.spec`
* Prepare the source RPM following [README-RPM.md](https://github.com/johnkerl/miller/blob/main/README-RPM.md).
-* Create the Github release tag:
+* Create the GitHub release tag:
* Don't forget the `v` in `v6.3.0`
* Write the release notes -- save as a pre-release until below
@@ -32,7 +32,7 @@ In this example I am using version 6.2.0 to 6.3.0; of course that will change fo
* Thanks to [PR 822](https://github.com/johnkerl/miller/pull/822) which introduces [goreleaser](https://github.com/johnkerl/miller/blob/main/.goreleaser.yml) there are versions for many platforms auto-built and auto-attached to the GitHub release.
* Attach the release tarball and SRPM. Double-check assets were successfully uploaded.
* Publish the release in pre-release mode, until all CI jobs finish successfully. Note that gorelease will create and attach the rest of the binaries.
- * Before marking the release as public, download an executable from among the generated binaries and make sure its `mlr version` prints what you expect -- else, restart this process.
+ * Before marking the release as public, download an executable from among the generated binaries and make sure its `mlr version` prints what you expect -- else, restart this process. On MacOS, run `xattr -d com.apple.quarantine ./mlr` first to remove the quarantine attribute from the downloaded binary.
* Then mark the release as public.
* Build the release-specific docs:
diff --git a/docs/src/index.md b/docs/src/index.md
index 799123fca..bcb69c8ed 100644
--- a/docs/src/index.md
+++ b/docs/src/index.md
@@ -16,20 +16,20 @@ Quick links:
# Introduction
-**Miller is a command-line tool for querying, shaping, and reformatting data files in various formats including CSV, TSV, JSON, and JSON Lines.**
+**Miller is a command-line tool for querying, shaping, and reformatting data files in various formats, including CSV, TSV, JSON, and JSON Lines.**
-**The big picture:** Even well into the 21st century, our world is full of text-formatted data like CSV. Google _CSV memes_, for example. We need tooling to _thrive in this world_, nimbly manipulating data which is in CSVs. And we need tooling to _move beyond CSV_, to be able to pull data out and into other storage and processing systems. Miller is designed for both these goals.
+**The big picture:** Even well into the 21st century, our world is full of text-formatted data such as CSV. Google _CSV memes_, for example. We need tooling to _thrive in this world_, nimbly manipulating data which is in CSVs. And we need tooling to _move beyond CSV_, to be able to pull data out and into other storage and processing systems. Miller is designed for both of these goals.
In several senses, Miller is more than one tool:
**Format conversion:** You can convert CSV files to JSON, or vice versa, or
pretty-print your data horizontally or vertically to make it easier to read.
-**Data manipulation:** With a few keystrokes you can remove columns you don't care about -- or, make new ones.
+**Data manipulation:** With a few keystrokes, you can remove columns you don't care about -- or make new ones.
-**Pre-processing/post-processing vs standalone use:** You can use Miller to clean data files and put them into standard formats, perhaps in preparation to load them into a database or a hands-off data-processing pipeline. Or you can use it post-process and summary database-query output. As well, you can use Miller to explore and analyze your data interactively.
+**Pre-processing/post-processing vs standalone use:** You can use Miller to clean data files and put them into standard formats, perhaps in preparation for loading them into a database or a hands-off data-processing pipeline. Or you can use it to post-process and summarize database-query output. As well, you can use Miller to explore and analyze your data interactively.
-**Compact verbs vs programming language:** For low-keystroking you can do things like
+**Compact verbs vs programming language:** For low-keystroking, you can do things like
mlr --csv sort -f name input.csv @@ -39,16 +39,16 @@ pretty-print your data horizontally or vertically to make it easier to read. mlr --json head -n 1 myfile.json-The `sort`, `head`, etc are called *verbs*. They're analogs of familiar command-line tools like `sort`, `head`, and so on -- but they're aware of name-indexed, multi-line file formats like CSV, TSV, and JSON. In addition, though, using Miller's `put` verb you can use programming-language statements for expressions like +The `sort`, `head`, etc., are called *verbs*. They're analogs of familiar command-line tools like `sort`, `head`, and so on -- but they're aware of name-indexed, multi-line file formats like CSV, TSV, and JSON. In addition, though, using Miller's `put` verb, you can use programming-language statements for expressions like
mlr --csv put '$rate = $units / $seconds' input.csv-which allow you to succintly express your own logic. +which allow you to express your own logic succinctly. **Multiple domains:** People use Miller for data analysis, data science, software engineering, devops/system-administration, journalism, scientific research, and more. -In the following you can see how CSV, TSV, tabular, JSON, and other **file formats** share a common theme which is **lists of key-value-pairs**. Miller embraces this common theme. +In the following, you can see how CSV, TSV, tabular, JSON, and other **file formats** share a common theme which is **lists of key-value-pairs**. Miller embraces this common theme.  diff --git a/docs/src/index.md.in b/docs/src/index.md.in index 6d16f6d19..25073a3f1 100644 --- a/docs/src/index.md.in +++ b/docs/src/index.md.in @@ -1,19 +1,19 @@ # Introduction -**Miller is a command-line tool for querying, shaping, and reformatting data files in various formats including CSV, TSV, JSON, and JSON Lines.** +**Miller is a command-line tool for querying, shaping, and reformatting data files in various formats, including CSV, TSV, JSON, and JSON Lines.** -**The big picture:** Even well into the 21st century, our world is full of text-formatted data like CSV. Google _CSV memes_, for example. We need tooling to _thrive in this world_, nimbly manipulating data which is in CSVs. And we need tooling to _move beyond CSV_, to be able to pull data out and into other storage and processing systems. Miller is designed for both these goals. +**The big picture:** Even well into the 21st century, our world is full of text-formatted data such as CSV. Google _CSV memes_, for example. We need tooling to _thrive in this world_, nimbly manipulating data which is in CSVs. And we need tooling to _move beyond CSV_, to be able to pull data out and into other storage and processing systems. Miller is designed for both of these goals. 
In several senses, Miller is more than one tool: **Format conversion:** You can convert CSV files to JSON, or vice versa, or pretty-print your data horizontally or vertically to make it easier to read. -**Data manipulation:** With a few keystrokes you can remove columns you don't care about -- or, make new ones. +**Data manipulation:** With a few keystrokes, you can remove columns you don't care about -- or make new ones. -**Pre-processing/post-processing vs standalone use:** You can use Miller to clean data files and put them into standard formats, perhaps in preparation to load them into a database or a hands-off data-processing pipeline. Or you can use it post-process and summary database-query output. As well, you can use Miller to explore and analyze your data interactively. +**Pre-processing/post-processing vs standalone use:** You can use Miller to clean data files and put them into standard formats, perhaps in preparation for loading them into a database or a hands-off data-processing pipeline. Or you can use it post-process and summarize database-query output. As well, you can use Miller to explore and analyze your data interactively. -**Compact verbs vs programming language:** For low-keystroking you can do things like +**Compact verbs vs programming language:** For low-keystroking, you can do things like GENMD-SHOW-COMMAND mlr --csv sort -f name input.csv @@ -23,16 +23,16 @@ GENMD-SHOW-COMMAND mlr --json head -n 1 myfile.json GENMD-EOF -The `sort`, `head`, etc are called *verbs*. They're analogs of familiar command-line tools like `sort`, `head`, and so on -- but they're aware of name-indexed, multi-line file formats like CSV, TSV, and JSON. In addition, though, using Miller's `put` verb you can use programming-language statements for expressions like +The `sort`, `head`, etc., are called *verbs*. 
They're analogs of familiar command-line tools like `sort`, `head`, and so on -- but they're aware of name-indexed, multi-line file formats like CSV, TSV, and JSON. In addition, though, using Miller's `put` verb, you can use programming-language statements for expressions like GENMD-SHOW-COMMAND mlr --csv put '$rate = $units / $seconds' input.csv GENMD-EOF -which allow you to succintly express your own logic. +which allow you to express your own logic succinctly. **Multiple domains:** People use Miller for data analysis, data science, software engineering, devops/system-administration, journalism, scientific research, and more. -In the following you can see how CSV, TSV, tabular, JSON, and other **file formats** share a common theme which is **lists of key-value-pairs**. Miller embraces this common theme. +In the following, you can see how CSV, TSV, tabular, JSON, and other **file formats** share a common theme which is **lists of key-value-pairs**. Miller embraces this common theme.  diff --git a/docs/src/installing-miller.md b/docs/src/installing-miller.md index b5ae44227..9de4558ff 100644 --- a/docs/src/installing-miller.md +++ b/docs/src/installing-miller.md @@ -21,7 +21,7 @@ You can install Miller for various platforms as follows. Download a binary: * You can get binaries for several platforms on the [releases page](https://github.com/johnkerl/miller/releases). -* You can get latest (head) builds for Linux, MacOS, and Windows by visiting [https://github.com/johnkerl/miller/actions](https://github.com/johnkerl/miller/actions), selecting the latest build, and clicking _Artifacts_. (These are retained for 5 days after each commit.) +* You can get the latest (head) builds for Linux, MacOS, and Windows by visiting [https://github.com/johnkerl/miller/actions](https://github.com/johnkerl/miller/actions), selecting the latest build, and clicking _Artifacts_. (These are retained for 5 days after each commit.) 
* See also the [build page](build.md) if you prefer to build from source. Using a package manager: @@ -30,6 +30,7 @@ Using a package manager: * MacOS: `brew update` and `brew install miller`, or `sudo port selfupdate` and `sudo port install miller`, depending on your preference of [Homebrew](https://brew.sh) or [MacPorts](https://macports.org). * Windows: `choco install miller` using [Chocolatey](https://chocolatey.org). * Note: Miller 6 was released 2022-01-09; [several platforms](https://github.com/johnkerl/miller/blob/main/README-versions.md) may have Miller 5 available. +* As of Miller 6.16.0, you can do `snap install miller`. Note however that the executable is named `miller`, _not_ `mlr`. See also [https://snapcraft.io/miller](https://snapcraft.io/miller). See also: @@ -37,7 +38,7 @@ See also: * [@jauderho](https://github.com/jauderho)'s [docker images](https://hub.docker.com/r/jauderho/miller/tags) as discussed in [GitHub Discussions](https://github.com/johnkerl/miller/discussions/851#discussioncomment-1943255) * Example invocation: `docker run --rm -i jauderho/miller:latest --csv sort -f shape < ./example.csv` -Note that the [Miller releases page](https://github.com/johnkerl/miller/releases), `brew`, `macports`, `chocolatey`, and `conda` tend to have current versions; `yum` and `apt-get` may have outdate versions depending on your platform. +Note that the [Miller releases page](https://github.com/johnkerl/miller/releases), `brew`, `macports`, `chocolatey`, and `conda` tend to have current versions; `yum` and `apt-get` may have outdated versions depending on your platform. 
As a first check, you should be able to run `mlr --version` at your system's command prompt and see something like the following: @@ -50,7 +51,7 @@ mlr 6.0.0 A note on documentation: -* If you downloaded the Miller binary from a tagged release, or installed it using a package manager, you should see a version like `mlr 6.0.0` or `mlr 5.10.3` -- please see the [release docs page](release-docs.md) to find the documentation for your version. +* If you downloaded the Miller binary from a tagged release or installed it using a package manager, you should see a version like `mlr 6.0.0` or `mlr 5.10.3` -- please see the [release docs page](release-docs.md) to find the documentation for your version. * If you installed from source or using a recent build artifact from GitHub Actions, you should see a version like `mlr 6.0.0-dev` -- [https://miller.readthedocs.io](https://miller.readthedocs.io) is the correct reference, since it contains information for the latest contributions to the [Miller repository](https://github.com/johnkerl/miller). As a second check, given [example.csv](./example.csv) you should be able to do @@ -89,6 +90,6 @@ yellow circle true 9 87 63.5058 8.3350 purple square false 10 91 72.3735 8.2430 -If you run into issues on these checks, please check out the resources on the [community page](community.md) for help. +If you encounter issues with these checks, please refer to the resources on the [community page](community.md) for help. Otherwise, let's go on to [Miller in 10 minutes](10min.md)! diff --git a/docs/src/installing-miller.md.in b/docs/src/installing-miller.md.in index da908cdc5..74e5c9f53 100644 --- a/docs/src/installing-miller.md.in +++ b/docs/src/installing-miller.md.in @@ -5,7 +5,7 @@ You can install Miller for various platforms as follows. Download a binary: * You can get binaries for several platforms on the [releases page](https://github.com/johnkerl/miller/releases). 
-* You can get latest (head) builds for Linux, MacOS, and Windows by visiting [https://github.com/johnkerl/miller/actions](https://github.com/johnkerl/miller/actions), selecting the latest build, and clicking _Artifacts_. (These are retained for 5 days after each commit.) +* You can get the latest (head) builds for Linux, MacOS, and Windows by visiting [https://github.com/johnkerl/miller/actions](https://github.com/johnkerl/miller/actions), selecting the latest build, and clicking _Artifacts_. (These are retained for 5 days after each commit.) * See also the [build page](build.md) if you prefer to build from source. Using a package manager: @@ -14,6 +14,7 @@ Using a package manager: * MacOS: `brew update` and `brew install miller`, or `sudo port selfupdate` and `sudo port install miller`, depending on your preference of [Homebrew](https://brew.sh) or [MacPorts](https://macports.org). * Windows: `choco install miller` using [Chocolatey](https://chocolatey.org). * Note: Miller 6 was released 2022-01-09; [several platforms](https://github.com/johnkerl/miller/blob/main/README-versions.md) may have Miller 5 available. +* As of Miller 6.16.0, you can do `snap install miller`. Note however that the executable is named `miller`, _not_ `mlr`. See also [https://snapcraft.io/miller](https://snapcraft.io/miller). See also: @@ -21,7 +22,7 @@ See also: * [@jauderho](https://github.com/jauderho)'s [docker images](https://hub.docker.com/r/jauderho/miller/tags) as discussed in [GitHub Discussions](https://github.com/johnkerl/miller/discussions/851#discussioncomment-1943255) * Example invocation: `docker run --rm -i jauderho/miller:latest --csv sort -f shape < ./example.csv` -Note that the [Miller releases page](https://github.com/johnkerl/miller/releases), `brew`, `macports`, `chocolatey`, and `conda` tend to have current versions; `yum` and `apt-get` may have outdate versions depending on your platform. 
+Note that the [Miller releases page](https://github.com/johnkerl/miller/releases), `brew`, `macports`, `chocolatey`, and `conda` tend to have current versions; `yum` and `apt-get` may have outdated versions depending on your platform. As a first check, you should be able to run `mlr --version` at your system's command prompt and see something like the following: @@ -32,7 +33,7 @@ GENMD-EOF A note on documentation: -* If you downloaded the Miller binary from a tagged release, or installed it using a package manager, you should see a version like `mlr 6.0.0` or `mlr 5.10.3` -- please see the [release docs page](release-docs.md) to find the documentation for your version. +* If you downloaded the Miller binary from a tagged release or installed it using a package manager, you should see a version like `mlr 6.0.0` or `mlr 5.10.3` -- please see the [release docs page](release-docs.md) to find the documentation for your version. * If you installed from source or using a recent build artifact from GitHub Actions, you should see a version like `mlr 6.0.0-dev` -- [https://miller.readthedocs.io](https://miller.readthedocs.io) is the correct reference, since it contains information for the latest contributions to the [Miller repository](https://github.com/johnkerl/miller). As a second check, given [example.csv](./example.csv) you should be able to do @@ -45,6 +46,6 @@ GENMD-RUN-COMMAND mlr --icsv --opprint cat example.csv GENMD-EOF -If you run into issues on these checks, please check out the resources on the [community page](community.md) for help. +If you encounter issues with these checks, please refer to the resources on the [community page](community.md) for help. Otherwise, let's go on to [Miller in 10 minutes](10min.md)! 
diff --git a/docs/src/keystroke-savers.md b/docs/src/keystroke-savers.md index c62231709..ec15e9308 100644 --- a/docs/src/keystroke-savers.md +++ b/docs/src/keystroke-savers.md @@ -18,7 +18,7 @@ Quick links: ## Short format specifiers, including --c2p -In our examples so far we've often made use of `mlr --icsv --opprint` or `mlr --icsv --ojson`. These are such frequently occurring patterns that they have short options like `--c2p` and `--c2j`: +In our examples so far, we've often made use of `mlr --icsv --opprint` or `mlr --icsv --ojson`. These are such frequently occurring patterns that they have short options like `--c2p` and `--c2j`:
mlr --c2p head -n 2 example.csv @@ -59,7 +59,7 @@ You can get the full list [here](file-formats.md#data-conversion-keystroke-saver ## File names up front, including --from -Already we saw that you can put the filename first using `--from`. When you're interacting with your data at the command line, this makes it easier to up-arrow and append to the previous command: +Already, we saw that you can put the filename first using `--from`. When you're interacting with your data at the command line, this makes it easier to up-arrow and append to the previous command:-A solution here is Miller's +A solution here is Miller's [clean-whitespace verb](reference-verbs.md#clean-whitespace):mlr --c2p --from example.csv sort -nr index then head -n 3 @@ -110,7 +110,7 @@ I think `mlr --csv ...` explains itself better than `mlr -c ...`. Nonetheless, t ## .mlrrc file -If you want the default file format for Miller to be CSV, you can simply put `--csv` on a line by itself in your `~/.mlrrc` file. Then instead of `mlr --csv cat example.csv` you can just do `mlr cat example.csv`. This is just a personal default, though, so `mlr --opprint cat example.csv` will use default CSV format for input, and PPRINT (tabular) for output. +If you want the default file format for Miller to be CSV, you can put `--csv` on a line by itself in your `~/.mlrrc` file. Then, instead of `mlr --csv cat example.csv` you can just do `mlr cat example.csv`. This is just a personal default, though, so `mlr --opprint cat example.csv` will use default CSV format for input, and PPRINT (tabular) for output. You can read more about this at the [Customization](customization.md) page. @@ -126,6 +126,6 @@ fraction -f count \ filename-which-varies.csv-Typing this out can get a bit old, if the only thing that changes for you is the filename. +Typing this out can get a bit old if the only thing that changes for you is the filename. See [Scripting with Miller](scripting.md) for some keystroke-saving options. 
diff --git a/docs/src/keystroke-savers.md.in b/docs/src/keystroke-savers.md.in index 720dfad14..648c63fc7 100644 --- a/docs/src/keystroke-savers.md.in +++ b/docs/src/keystroke-savers.md.in @@ -2,7 +2,7 @@ ## Short format specifiers, including --c2p -In our examples so far we've often made use of `mlr --icsv --opprint` or `mlr --icsv --ojson`. These are such frequently occurring patterns that they have short options like `--c2p` and `--c2j`: +In our examples so far, we've often made use of `mlr --icsv --opprint` or `mlr --icsv --ojson`. These are such frequently occurring patterns that they have short options like `--c2p` and `--c2j`: GENMD-RUN-COMMAND mlr --c2p head -n 2 example.csv @@ -16,7 +16,7 @@ You can get the full list [here](file-formats.md#data-conversion-keystroke-saver ## File names up front, including --from -Already we saw that you can put the filename first using `--from`. When you're interacting with your data at the command line, this makes it easier to up-arrow and append to the previous command: +Already, we saw that you can put the filename first using `--from`. When you're interacting with your data at the command line, this makes it easier to up-arrow and append to the previous command: GENMD-RUN-COMMAND mlr --c2p --from example.csv sort -nr index then head -n 3 @@ -55,7 +55,7 @@ I think `mlr --csv ...` explains itself better than `mlr -c ...`. Nonetheless, t ## .mlrrc file -If you want the default file format for Miller to be CSV, you can simply put `--csv` on a line by itself in your `~/.mlrrc` file. Then instead of `mlr --csv cat example.csv` you can just do `mlr cat example.csv`. This is just a personal default, though, so `mlr --opprint cat example.csv` will use default CSV format for input, and PPRINT (tabular) for output. +If you want the default file format for Miller to be CSV, you can put `--csv` on a line by itself in your `~/.mlrrc` file. Then, instead of `mlr --csv cat example.csv` you can just do `mlr cat example.csv`. 
This is just a personal default, though, so `mlr --opprint cat example.csv` will use default CSV format for input, and PPRINT (tabular) for output. You can read more about this at the [Customization](customization.md) page. @@ -71,6 +71,6 @@ fraction -f count \ filename-which-varies.csv GENMD-EOF -Typing this out can get a bit old, if the only thing that changes for you is the filename. +Typing this out can get a bit old if the only thing that changes for you is the filename. See [Scripting with Miller](scripting.md) for some keystroke-saving options. diff --git a/docs/src/kubectl-and-helm.md b/docs/src/kubectl-and-helm.md index 38bd31abf..5f53001be 100644 --- a/docs/src/kubectl-and-helm.md +++ b/docs/src/kubectl-and-helm.md @@ -152,7 +152,7 @@ $ helm list | mlr --itsv --ojson head -n 1 ]
diff --git a/docs/src/kubectl-and-helm.md.in b/docs/src/kubectl-and-helm.md.in
index 2f7d7d26f..14c0facf4 100644
--- a/docs/src/kubectl-and-helm.md.in
+++ b/docs/src/kubectl-and-helm.md.in
@@ -136,7 +136,7 @@ $ helm list | mlr --itsv --ojson head -n 1
]
GENMD-EOF
-A solution here is Miller's
+A solution here is Miller's
[clean-whitespace verb](reference-verbs.md#clean-whitespace):
GENMD-CARDIFY
diff --git a/docs/src/manpage.md b/docs/src/manpage.md
index de7ce4b6f..39203a0c9 100644
--- a/docs/src/manpage.md
+++ b/docs/src/manpage.md
@@ -19,9 +19,7 @@ Quick links:
This is simply a copy of what you should see on running `man mlr` at a command prompt, once Miller is installed on your system.
-MILLER(1) MILLER(1)
-
-
+4mMILLER24m(1) 4mMILLER24m(1)
1mNAME0m
Miller -- like awk, sed, cut, join, and sort for name-indexed data such
@@ -50,7 +48,7 @@ MILLER(1) MILLER(1)
insertion-ordered hash map. This encompasses a variety of data
formats, including but not limited to the familiar CSV, TSV, and JSON.
(Miller can handle positionally-indexed data as a special case.) This
- manpage documents mlr 6.10.0.
+ manpage documents mlr 6.16.0.
1mEXAMPLES0m
mlr --icsv --opprint cat example.csv
@@ -103,7 +101,7 @@ MILLER(1) MILLER(1)
| 4 5 6 | Record 2: "apple":"4", "bat":"5", "cog":"6"
+---------------------+
- Markdown tabular (supported for output only):
+ Markdown tabular:
+-----------------------+
| | apple | bat | cog | |
| | --- | --- | --- | |
@@ -147,6 +145,7 @@ MILLER(1) MILLER(1)
mlr help comments-in-data-flags
mlr help compressed-data-flags
mlr help csv/tsv-only-flags
+ mlr help dkvp-only-flags
mlr help file-format-flags
mlr help flatten-unflatten-flags
mlr help format-conversion-keystroke-saver-flags
@@ -199,9 +198,9 @@ MILLER(1) MILLER(1)
json-parse json-stringify join label latin1-to-utf8 least-frequent
merge-fields most-frequent nest nothing put regularize remove-empty-columns
rename reorder repeat reshape sample sec2gmtdate sec2gmt seqgen shuffle
- skip-trivial-records sort sort-within-records split ssub stats1 stats2 step
- sub summary tac tail tee template top utf8-to-latin1 unflatten uniq unspace
- unsparsify
+ skip-trivial-records sort sort-within-records sparsify split ssub stats1
+ stats2 step sub summary surv tac tail tee template top utf8-to-latin1
+ unflatten uniq unspace unsparsify
1mFUNCTION LIST0m
abs acos acosh antimode any append apply arrayify asin asinh asserting_absent
@@ -225,13 +224,14 @@ MILLER(1) MILLER(1)
percentiles pow qnorm reduce regextract regextract_or_else rightpad round
roundm rstrip sec2dhms sec2gmt sec2gmtdate sec2hms sec2localdate sec2localtime
select sgn sha1 sha256 sha512 sin sinh skewness sort sort_collection splita
- splitax splitkv splitkvx splitnv splitnvx sqrt ssub stddev strfntime
- strfntime_local strftime strftime_local string strip strlen strpntime
- strpntime_local strptime strptime_local sub substr substr0 substr1 sum sum2
- sum3 sum4 sysntime system systime systimeint tan tanh tolower toupper truncate
- typeof unflatten unformat unformatx upntime uptime urand urand32 urandelement
- urandint urandrange utf8_to_latin1 variance version ! != !=~ % & && * ** + - .
- .* .+ .- ./ / // < << <= <=> == =~ > >= >> >>> ?: ?? ??? ^ ^^ | || ~
+ splitax splitkv splitkvx splitnv splitnvx sqrt ssub stat stddev strfntime
+ strfntime_local strftime strftime_local string strip strlen strmatch strmatchx
+ strpntime strpntime_local strptime strptime_local sub substr substr0 substr1
+ sum sum2 sum3 sum4 sysntime system systime systimeint tan tanh tolower toupper
+ truncate typeof unflatten unformat unformatx upntime uptime urand urand32
+ urandelement urandint urandrange utf8_to_latin1 variance version ! != !=~ % &
+ && * ** + - . .* .+ .- ./ / // < << <= <=> == =~ > >= >> >>> ?: ?? ??? ^ ^^ |
+ || ~
1mCOMMENTS-IN-DATA FLAGS0m
Miller lets you put comments in your data, such as
@@ -255,12 +255,14 @@ MILLER(1) MILLER(1)
within the input.
--pass-comments-with {string}
Immediately print commented lines within input, with
- specified prefix.
+ specified prefix. For CSV input format, the prefix
+ must be a single character.
--skip-comments Ignore commented lines (prefixed by `#`) within the
input.
--skip-comments-with {string}
Ignore commented lines within input, with specified
- prefix.
+ prefix. For CSV input format, the prefix must be a
+ single character.
1mCOMPRESSED-DATA FLAGS0m
Miller offers a few different ways to handle reading data files
@@ -339,6 +341,10 @@ MILLER(1) MILLER(1)
recreate missing headers.
--lazy-quotes Accepts quotes appearing in unquoted fields, and
non-doubled quotes appearing in quoted fields.
+ --no-auto-unsparsify For CSV/TSV output: if the record keys change from
+ one row to another, emit a blank line and a new
+ header line. This is non-compliant with RFC 4180 but
+ is helpful for heterogeneous data.
--no-implicit-csv-header or --no-implicit-tsv-header
Opposite of `--implicit-csv-header`. This is the
default anyway -- the main use is for the flags to
@@ -353,6 +359,16 @@ MILLER(1) MILLER(1)
-N Keystroke-saver for `--implicit-csv-header
--headerless-csv-output`.
+1mDKVP-ONLY FLAGS0m
+ These are flags which are applicable to DKVP format.
+
+ --incr-key Without this option, keyless DKVP fields are keyed by
+ field number. For example: `a=10,b=20,30,d=40,50` is
+ ingested as `$a=10,$b=20,$3=30,$d=40,$5=50`. With
+ this option, they're keyed by a running counter of
+ keyless fields. For example: `a=10,b=20,30,d=40,50`
+ is ingested as `$a=10,$b=20,$1=30,$d=40,$2=50`.
+
1mFILE-FORMAT FLAGS0m
See the File formats doc page, and or `mlr help file-formats`, for more
about file formats Miller supports.
@@ -365,9 +381,9 @@ MILLER(1) MILLER(1)
are overridden in all cases by setting output format to `format2`.
--asv or --asvlite Use ASV format for input and output data.
- --csv or -c Use CSV format for input and output data.
+ --csv or -c or --c2c Use CSV format for input and output data.
--csvlite Use CSV-lite format for input and output data.
- --dkvp Use DKVP format for input and output data.
+ --dkvp or --d2d Use DKVP format for input and output data.
--gen-field-name Specify field name for --igen. Defaults to "i".
--gen-start Specify start value for --igen. Defaults to 1.
--gen-step Specify step value for --igen. Defaults to 1.
@@ -382,6 +398,7 @@ MILLER(1) MILLER(1)
seqgen verb, which is more useful/intuitive.
--ijson Use JSON format for input data.
--ijsonl Use JSON Lines format for input data.
+ --imd or --imarkdown Use markdown-tabular format for input data.
--inidx Use NIDX format for input data.
--io {format name} Use format name for input and output data. For
example: `--io csv` is the same as `--csv`.
@@ -390,27 +407,27 @@ MILLER(1) MILLER(1)
--itsvlite Use TSV-lite format for input data.
--iusv or --iusvlite Use USV format for input data.
--ixtab Use XTAB format for input data.
- --json or -j Use JSON format for input and output data.
- --jsonl Use JSON Lines format for input and output data.
- --nidx Use NIDX format for input and output data.
+ --json or -j or --j2j Use JSON format for input and output data.
+ --jsonl or --l2l Use JSON Lines format for input and output data.
+ --nidx or --n2n Use NIDX format for input and output data.
--oasv or --oasvlite Use ASV format for output data.
--ocsv Use CSV format for output data.
--ocsvlite Use CSV-lite format for output data.
--odkvp Use DKVP format for output data.
--ojson Use JSON format for output data.
--ojsonl Use JSON Lines format for output data.
- --omd Use markdown-tabular format for output data.
+ --omd or --omarkdown Use markdown-tabular format for output data.
--onidx Use NIDX format for output data.
--opprint Use PPRINT format for output data.
--otsv Use TSV format for output data.
--otsvlite Use TSV-lite format for output data.
--ousv or --ousvlite Use USV format for output data.
--oxtab Use XTAB format for output data.
- --pprint Use PPRINT format for input and output data.
- --tsv or -t Use TSV format for input and output data.
+ --pprint or --p2p Use PPRINT format for input and output data.
+ --tsv or -t or --t2t Use TSV format for input and output data.
--tsvlite Use TSV-lite format for input and output data.
--usv or --usvlite Use USV format for input and output data.
- --xtab Use XTAB format for input and output data.
+ --xtab or --x2x Use XTAB format for input and output data.
--xvright Right-justify values for XTAB format.
-i {format name} Use format name for input data. For example: `-i csv`
is the same as `--icsv`.
@@ -420,7 +437,7 @@ MILLER(1) MILLER(1)
1mFLATTEN-UNFLATTEN FLAGS0m
These flags control how Miller converts record values which are maps or arrays, when input is JSON and output is non-JSON (flattening) or input is non-JSON and output is JSON (unflattening).
- See the Flatten/unflatten doc page for more information.
+ See the flatten/unflatten doc page https://miller.readthedocs.io/en/latest/flatten-unflatten for more information.
--flatsep or --jflatsep {string}
Separator for flattening multi-level JSON keys, e.g.
@@ -428,32 +445,31 @@ MILLER(1) MILLER(1)
formats. Defaults to `.`.
--no-auto-flatten When output is non-JSON, suppress the default
auto-flatten behavior. Default: if `$y = [7,8,9]`
- then this flattens to `y.1=7,y.2=8,y.3=9, and
+ then this flattens to `y.1=7,y.2=8,y.3=9`, and
similarly for maps. With `--no-auto-flatten`, instead
we get `$y=[1, 2, 3]`.
- --no-auto-unflatten When input non-JSON and output is JSON, suppress the
- default auto-unflatten behavior. Default: if the
+ --no-auto-unflatten When input is non-JSON and output is JSON, suppress
+ the default auto-unflatten behavior. Default: if the
input has `y.1=7,y.2=8,y.3=9` then this unflattens to
- `$y=[7,8,9]`. flattens to `y.1=7,y.2=8,y.3=9. With
- `--no-auto-flatten`, instead we get
- `${y.1}=7,${y.2}=8,${y.3}=9`.
+ `$y=[7,8,9]`. With `--no-auto-unflatten`, instead we
+ get `${y.1}=7,${y.2}=8,${y.3}=9`.
1mFORMAT-CONVERSION KEYSTROKE-SAVER FLAGS0m
As keystroke-savers for format-conversion you may use the following.
The letters c, t, j, l, d, n, x, p, and m refer to formats CSV, TSV, DKVP, NIDX,
- JSON, JSON Lines, XTAB, PPRINT, and markdown, respectively. Note that markdown
- format is available for output only.
+ JSON, JSON Lines, XTAB, PPRINT, and markdown, respectively.
- | In\out | CSV | TSV | JSON | JSONL | DKVP | NIDX | XTAB | PPRINT | Markdown |
- +--------+-------+-------+--------+--------+--------+--------+--------+----------+
- | CSV | | --c2t | --c2j | --c2l | --c2d | --c2n | --c2x | --c2p | --c2m |
- | TSV | --t2c | | --t2j | --t2l | --t2d | --t2n | --t2x | --t2p | --t2m |
- | JSON | --j2c | --j2t | | --j2l | --j2d | --j2n | --j2x | --j2p | --j2m |
- | JSONL | --l2c | --l2t | | | --l2d | --l2n | --l2x | --l2p | --l2m |
- | DKVP | --d2c | --d2t | --d2j | --d2l | | --d2n | --d2x | --d2p | --d2m |
- | NIDX | --n2c | --n2t | --n2j | --n2l | --n2d | | --n2x | --n2p | --n2m |
- | XTAB | --x2c | --x2t | --x2j | --x2l | --x2d | --x2n | | --x2p | --x2m |
- | PPRINT | --p2c | --p2t | --p2j | --p2l | --p2d | --p2n | --p2x | | --p2m |
+ | In\out | CSV | TSV | JSON | JSONL | DKVP | NIDX | XTAB | PPRINT | Markdown |
+ +----------+----------+----------+----------+-------+-------+-------+-------+--------+----------|
+ | CSV | --c2c,-c | --c2t | --c2j | --c2l | --c2d | --c2n | --c2x | --c2p | --c2m |
+ | TSV | --t2c | --t2t,-t | --t2j | --t2l | --t2d | --t2n | --t2x | --t2p | --t2m |
+ | JSON | --j2c | --j2t | --j2j,-j | --j2l | --j2d | --j2n | --j2x | --j2p | --j2m |
+ | JSONL | --l2c | --l2t | --l2j | --l2l | --l2d | --l2n | --l2x | --l2p | --l2m |
+ | DKVP | --d2c | --d2t | --d2j | --d2l | --d2d | --d2n | --d2x | --d2p | --d2m |
+ | NIDX | --n2c | --n2t | --n2j | --n2l | --n2d | --n2n | --n2x | --n2p | --n2m |
+ | XTAB | --x2c | --x2t | --x2j | --x2l | --x2d | --x2n | --x2x | --x2p | --x2m |
+ | PPRINT | --p2c | --p2t | --p2j | --p2l | --p2d | --p2n | --p2x | --p2p | --p2m |
+ | Markdown | --m2c | --m2t | --m2j | --m2l | --m2d | --m2n | --m2x | --m2p | |
-p Keystroke-saver for `--nidx --fs space --repifs`.
-T Keystroke-saver for `--nidx --fs tab`.
@@ -468,8 +484,8 @@ MILLER(1) MILLER(1)
--jvstack Put one key-value pair per line for JSON output
(multi-line output). This is the default for JSON
output format.
- --no-jlistwrap Wrap JSON output in outermost `[ ]`. This is the
- default for JSON Lines output format.
+ --no-jlistwrap Do not wrap JSON output in outermost `[ ]`. This is
+ the default for JSON Lines output format.
--no-jvstack Put objects/arrays all on one line for JSON output.
This is the default for JSON Lines output format.
@@ -568,6 +584,7 @@ MILLER(1) MILLER(1)
since direct-to-screen output for large files has its
own overhead.
--no-hash-records See --hash-records.
+ --norc Do not load a .mlrrc file.
--nr-progress-mod {m} With m a positive integer: print filename and record
count to os.Stderr every m input records.
--ofmt {format} E.g. `%.18f`, `%.0f`, `%9.6e`. Please use
@@ -643,8 +660,8 @@ MILLER(1) MILLER(1)
How you can control colorization:
* Suppression/unsuppression:
- * Environment variable `export MLR_NO_COLOR=true` means don't color
- even if stdout+TTY.
+ * Environment variable `export MLR_NO_COLOR=true` or `export NO_COLOR=true`
+ means don't color even if stdout+TTY.
* Environment variable `export MLR_ALWAYS_COLOR=true` means do color
even if not stdout+TTY.
For example, you might want to use this when piping mlr output to `less -r`.
@@ -695,8 +712,10 @@ MILLER(1) MILLER(1)
1mPPRINT-ONLY FLAGS0m
These are flags which are applicable to PPRINT format.
- --barred Prints a border around PPRINT output (not available
- for input).
+ --barred or --barred-output
+ Prints a border around PPRINT output.
+ --barred-input When used in conjunction with --pprint, accepts
+ barred input.
--right Right-justifies all fields for PPRINT output.
1mPROFILING FLAGS0m
@@ -761,13 +780,13 @@ MILLER(1) MILLER(1)
- To avoid backslashing, you can use any of the following names:
ascii_esc = "\x1b"
- ascii_etx = "\x04"
+ ascii_etx = "\x03"
ascii_fs = "\x1c"
ascii_gs = "\x1d"
- ascii_null = "\x01"
+ ascii_null = "\x00"
ascii_rs = "\x1e"
- ascii_soh = "\x02"
- ascii_stx = "\x03"
+ ascii_soh = "\x01"
+ ascii_stx = "\x02"
ascii_us = "\x1f"
asv_fs = "\x1f"
asv_rs = "\x1e"
@@ -801,11 +820,12 @@ MILLER(1) MILLER(1)
csv "," N/A "\n"
csvlite "," N/A "\n"
dkvp "," "=" "\n"
+ gen "," N/A "\n"
json N/A N/A N/A
markdown " " N/A "\n"
nidx " " N/A "\n"
pprint " " N/A "\n"
- tsv " " N/A "\n"
+ tsv " " N/A "\n"
xtab "\n" " " "\n\n"
--fs {string} Specify FS for input and output.
@@ -986,6 +1006,7 @@ MILLER(1) MILLER(1)
Options:
-f {a,b,c} Field names for distinct count.
+ -x {a,b,c} Field names to exclude for distinct count: use each record's others instead.
-n Show only the number of distinct values. Not compatible with -u.
-o {name} Field name for output count. Default "count".
Ignored with -u.
@@ -1024,7 +1045,7 @@ MILLER(1) MILLER(1)
-r Treat field names as regular expressions. "ab", "a.*b" will
match any field name containing the substring "ab" or matching
"a.*b", respectively; anchors of the form "^ab$", "^a.*b$" may
- be used. The -o flag is ignored when -r is present.
+ be used.
-h|--help Show this message.
Examples:
mlr cut -f hostname,status
@@ -1068,6 +1089,10 @@ MILLER(1) MILLER(1)
1mfilter0m
Usage: mlr filter [options] {DSL expression}
+ Lets you use a domain-specific language to programmatically filter which
+ stream records will be output.
+ See also: https://miller.readthedocs.io/en/latest/reference-verbs
+
Options:
-f {file name} File containing a DSL expression (see examples below). If the filename
is a directory, all *.mlr files in that directory are loaded.
@@ -1262,6 +1287,8 @@ MILLER(1) MILLER(1)
See also the `sub` and `ssub` verbs.
Options:
-f {a,b,c} Field names to convert.
+ -r {regex} Regular expression for field names to convert.
+ -a Convert all fields.
-h|--help Show this message.
1mhaving-fields0m
@@ -1335,6 +1362,8 @@ MILLER(1) MILLER(1)
--lk|--left-keep-field-names {a,b,c} If supplied, this means keep only the specified field
names from the left file. Automatically includes the join-field name(s). Helpful
for when you only want a limited subset of information from the left file.
+ Tip: you can use --lk "": this means the left file becomes solely a row-selector
+ for the input files.
--lp {text} Additional prefix for non-join output field names from
the left file
--rp {text} Additional prefix for non-join output field names from
@@ -1369,7 +1398,7 @@ MILLER(1) MILLER(1)
Likewise, if you have 'mlr --csv --implicit-csv-header ...' then the join-in file will be
expected to be headerless as well unless you put '--no-implicit-csv-header' after 'join'.
Please use "mlr --usage-separator-options" for information on specifying separators.
- Please see https://miller.readthedocs.io/en/latest/reference-verbs.html#join for more information
+ Please see https://miller.readthedocs.io/en/latest/reference-verbs#join for more information
including examples.
1mlabel0m
@@ -1413,6 +1442,7 @@ MILLER(1) MILLER(1)
antimode Find least-frequently-occurring values for fields; first-found wins tie
sum Compute sums of specified fields
mean Compute averages (sample means) of specified fields
+ mad Compute mean absolute deviation
var Compute sample variance of specified fields
stddev Compute sample standard deviation of specified fields
meaneb Estimate error bars for averages (assuming no sample autocorrelation)
@@ -1519,6 +1549,9 @@ MILLER(1) MILLER(1)
1mput0m
Usage: mlr put [options] {DSL expression}
+ Lets you use a domain-specific language to programmatically alter stream records.
+ See also: https://miller.readthedocs.io/en/latest/reference-verbs
+
Options:
-f {file name} File containing a DSL expression (see examples below). If the filename
is a directory, all *.mlr files in that directory are loaded.
@@ -1625,9 +1658,9 @@ MILLER(1) MILLER(1)
first-match replacement.
-h|--help Show this message.
Examples:
- mlr rename old_name,new_name'
- mlr rename old_name_1,new_name_1,old_name_2,new_name_2'
- mlr rename -r 'Date_[0-9]+,Date,' Rename all such fields to be "Date"
+ mlr rename old_name,new_name
+ mlr rename old_name_1,new_name_1,old_name_2,new_name_2
+ mlr rename -r 'Date_[0-9]+,Date' Rename all such fields to be "Date"
mlr rename -r '"Date_[0-9]+",Date' Same
mlr rename -r 'Date_([0-9]+).*,\1' Rename all such fields to be of the form 20151015
mlr rename -r '"name"i,Name' Rename "name", "Name", "NAME", etc. to "Name"
@@ -1817,6 +1850,7 @@ MILLER(1) MILLER(1)
-nf {comma-separated field names} Same as -n
-nr {comma-separated field names} Numerical descending; nulls sort first
-t {comma-separated field names} Natural ascending
+ -b Move sort fields to start of record, as in reorder -b
-tr|-rt {comma-separated field names} Natural descending
-h|--help Show this message.
@@ -1832,6 +1866,17 @@ MILLER(1) MILLER(1)
-r Recursively sort subobjects/submaps, e.g. for JSON input.
-h|--help Show this message.
+ 1msparsify0m
+ Usage: mlr sparsify [options]
+ Unsets fields for which the value is the empty string (or, optionally, another
+ specified value). Only makes sense with output format not being CSV or TSV.
+ Options:
+ -s {filler string} What values to remove. Defaults to the empty string.
+ -f {a,b,c} Specify field names to be operated on; any other fields won't be
+ modified. The default is to modify all fields.
+ -h|--help Show this message.
+ Example: if input is a=1,b=,c=3 then output is a=1,c=3.
+
1msplit0m
Usage: mlr split [options] {filename}
Options:
@@ -1880,6 +1925,8 @@ MILLER(1) MILLER(1)
the old string, like the `ssub` DSL function. See also the `gsub` and `sub` verbs.
Options:
-f {a,b,c} Field names to convert.
+ -r {regex} Regular expression for field names to convert.
+ -a Convert all fields.
-h|--help Show this message.
1mstats10m
@@ -1897,6 +1944,7 @@ MILLER(1) MILLER(1)
antimode Find least-frequently-occurring values for fields; first-found wins tie
sum Compute sums of specified fields
mean Compute averages (sample means) of specified fields
+ mad Compute mean absolute deviation
var Compute sample variance of specified fields
stddev Compute sample standard deviation of specified fields
meaneb Estimate error bars for averages (assuming no sample autocorrelation)
@@ -1931,7 +1979,7 @@ MILLER(1) MILLER(1)
Example: mlr stats1 -a min,p10,p50,p90,max -f value -g size,shape
Example: mlr stats1 -a count,mode -f size
Example: mlr stats1 -a count,mode -f size -g shape
- Example: mlr stats1 -a count,mode --fr '^[a-h].*$' -gr '^k.*$'
+ Example: mlr stats1 -a count,mode --fr '^[a-h].*$' --gr '^k.*$'
This computes count and mode statistics on all field names beginning
with a through h, grouped by all field names starting with k.
@@ -2026,6 +2074,8 @@ MILLER(1) MILLER(1)
See also the `gsub` and `ssub` verbs.
Options:
-f {a,b,c} Field names to convert.
+ -r {regex} Regular expression for field names to convert.
+ -a Convert all fields.
-h|--help Show this message.
1msummary0m
@@ -2068,8 +2118,18 @@ MILLER(1) MILLER(1)
-a {mean,sum,etc.} Use only the specified summarizers.
-x {mean,sum,etc.} Use all summarizers, except the specified ones.
--all Use all available summarizers.
+ --transpose Show output with field names as column names.
-h|--help Show this message.
+ 1msurv0m
+ Usage: mlr surv -d {duration-field} -s {status-field}
+
+ Estimate Kaplan-Meier survival curve (right-censored).
+ Options:
+ -d {field} Name of duration field (time-to-event or censoring).
+ -s {field} Name of status field (0=censored, 1=event).
+ -h, --help Show this message.
+
1mtac0m
Usage: mlr tac [options]
Prints records in reverse order from the order in which they were encountered.
@@ -2152,6 +2212,7 @@ MILLER(1) MILLER(1)
Options:
-g {d,e,f} Group-by-field names for uniq counts.
+ -x {a,b,c} Field names to exclude for uniq: use each record's others instead.
-c Show repeat counts in addition to unique values.
-n Show only the number of distinct values.
-o {name} Field name for output count. Default "count".
@@ -2306,7 +2367,7 @@ MILLER(1) MILLER(1)
(class=math #args=1) Ceiling: nearest integer at or above.
1mclean_whitespace0m
- (class=string #args=1) Same as collapse_whitespace and strip.
+ (class=string #args=1) Same as collapse_whitespace and strip, followed by type inference.
1mcollapse_whitespace0m
(class=string #args=1) Strip repeated whitespace from string.
@@ -2398,9 +2459,14 @@ MILLER(1) MILLER(1)
$* = fmtifnum($*, "%.6f") formats numeric fields in the current record, leaving non-numeric ones alone
1mfmtnum0m
- (class=conversion #args=2) Convert int/float/bool to string using printf-style format string (https://pkg.go.dev/fmt), e.g. '$s = fmtnum($n, "%08d")' or '$t = fmtnum($n, "%.6e")'. This function recurses on array and map values.
- Example:
- $x = fmtnum($x, "%.6f")
+ (class=conversion #args=2) Convert int/float/bool to string using printf-style format string (https://pkg.go.dev/fmt), e.g. '$s = fmtnum($n, "%08d")' or '$t = fmtnum($n, "%.6e")'. Miller-specific extension: "%_d" and "%_f" for comma-separated thousands. This function recurses on array and map values.
+ Examples:
+ $y = fmtnum($x, "%.6f")
+ $o = fmtnum($n, "%d")
+ $o = fmtnum($n, "%12d")
+ $y = fmtnum($x, "%.6_f")
+ $o = fmtnum($n, "%_d")
+ $o = fmtnum($n, "%12_d")
1mfold0m
(class=higher-order-functions #args=3) Given a map or array as first argument and a function as second argument, accumulates entries into a final output -- for example, sum or product. For arrays, the function should take two arguments, for accumulated value and array element. For maps, it should take four arguments, for accumulated key and value, and map-element key and value; it should return the updated accumulator as a new key-value pair (i.e. a single-entry map). The start value for the accumulator is taken from the third argument.
@@ -2953,6 +3019,18 @@ MILLER(1) MILLER(1)
Example:
ssub("abc.def", ".", "X") gives "abcXdef"
+ 1mstat0m
+ (class=system #args=1) Returns a map containing information about the provided path: "name" with string value, "size" as decimal int value, "mode" as octal int value, "modtime" as int-valued epoch seconds, and "isdir" as boolean value.
+ Examples:
+ stat("./mlr") gives {
+ "name": "mlr",
+ "size": 38391584,
+ "mode": 0755,
+ "modtime": 1715207874,
+ "isdir": false
+ }
+ stat("./mlr")["size"] gives 38391584
+
1mstddev0m
(class=stats #args=1) Returns the sample standard deviation of values in an array or map. Returns empty string AKA void for array/map of length less than two; returns error for non-array/non-map types.
Example:
@@ -2995,6 +3073,40 @@ MILLER(1) MILLER(1)
1mstrlen0m
(class=string #args=1) String length.
+ 1mstrmatch0m
+ (class=string #args=2) Boolean yes/no for whether the stringable first argument matches the regular-expression second argument. No regex captures are provided; please see `strmatchx`.
+ Examples:
+ strmatch("a", "abc") is false
+ strmatch("abc", "a") is true
+ strmatch("abc", "a[a-z]c") is true
+ strmatch("abc", "(a).(c)") is true
+ strmatch(12345, "34") is true
+
+ 1mstrmatchx0m
+ (class=string #args=2) Extended information for whether the stringable first argument matches the regular-expression second argument. Regex captures are provided in the return-value map; \1, \2, etc. are not set, in contrast to the `=~` operator. As well, while the `=~` operator limits matches to \1 through \9, an arbitrary number are supported here.
+ Examples:
+ strmatchx("a", "abc") returns:
+ {
+ "matched": false
+ }
+ strmatchx("abc", "a") returns:
+ {
+ "matched": true,
+ "full_capture": "a",
+ "full_start": 1,
+ "full_end": 1
+ }
+ strmatchx("[zy:3458]", "([a-z]+):([0-9]+)") returns:
+ {
+ "matched": true,
+ "full_capture": "zy:3458",
+ "full_start": 2,
+ "full_end": 8,
+ "captures": ["zy", "3458"],
+ "starts": [2, 5],
+ "ends": [3, 8]
+ }
+
1mstrpntime0m
(class=time #args=2) strpntime: Parses timestamp as integer nanoseconds since the epoch. See also strpntime_local.
Examples:
@@ -3647,7 +3759,5 @@ MILLER(1) MILLER(1)
MIME Type for Comma-Separated Values (CSV) Files, the Miller docsite
https://miller.readthedocs.io
-
-
- 2023-12-13 MILLER(1)
+ 2026-01-02 4mMILLER24m(1)
diff --git a/docs/src/manpage.txt b/docs/src/manpage.txt
index e7e3d3582..90bff3293 100644
--- a/docs/src/manpage.txt
+++ b/docs/src/manpage.txt
@@ -1,6 +1,4 @@
-MILLER(1) MILLER(1)
-
-
+4mMILLER24m(1) 4mMILLER24m(1)
1mNAME0m
Miller -- like awk, sed, cut, join, and sort for name-indexed data such
@@ -29,7 +27,7 @@ MILLER(1) MILLER(1)
insertion-ordered hash map. This encompasses a variety of data
formats, including but not limited to the familiar CSV, TSV, and JSON.
(Miller can handle positionally-indexed data as a special case.) This
- manpage documents mlr 6.10.0.
+ manpage documents mlr 6.16.0.
1mEXAMPLES0m
mlr --icsv --opprint cat example.csv
@@ -82,7 +80,7 @@ MILLER(1) MILLER(1)
| 4 5 6 | Record 2: "apple":"4", "bat":"5", "cog":"6"
+---------------------+
- Markdown tabular (supported for output only):
+ Markdown tabular:
+-----------------------+
| | apple | bat | cog | |
| | --- | --- | --- | |
@@ -126,6 +124,7 @@ MILLER(1) MILLER(1)
mlr help comments-in-data-flags
mlr help compressed-data-flags
mlr help csv/tsv-only-flags
+ mlr help dkvp-only-flags
mlr help file-format-flags
mlr help flatten-unflatten-flags
mlr help format-conversion-keystroke-saver-flags
@@ -178,9 +177,9 @@ MILLER(1) MILLER(1)
json-parse json-stringify join label latin1-to-utf8 least-frequent
merge-fields most-frequent nest nothing put regularize remove-empty-columns
rename reorder repeat reshape sample sec2gmtdate sec2gmt seqgen shuffle
- skip-trivial-records sort sort-within-records split ssub stats1 stats2 step
- sub summary tac tail tee template top utf8-to-latin1 unflatten uniq unspace
- unsparsify
+ skip-trivial-records sort sort-within-records sparsify split ssub stats1
+ stats2 step sub summary surv tac tail tee template top utf8-to-latin1
+ unflatten uniq unspace unsparsify
1mFUNCTION LIST0m
abs acos acosh antimode any append apply arrayify asin asinh asserting_absent
@@ -204,13 +203,14 @@ MILLER(1) MILLER(1)
percentiles pow qnorm reduce regextract regextract_or_else rightpad round
roundm rstrip sec2dhms sec2gmt sec2gmtdate sec2hms sec2localdate sec2localtime
select sgn sha1 sha256 sha512 sin sinh skewness sort sort_collection splita
- splitax splitkv splitkvx splitnv splitnvx sqrt ssub stddev strfntime
- strfntime_local strftime strftime_local string strip strlen strpntime
- strpntime_local strptime strptime_local sub substr substr0 substr1 sum sum2
- sum3 sum4 sysntime system systime systimeint tan tanh tolower toupper truncate
- typeof unflatten unformat unformatx upntime uptime urand urand32 urandelement
- urandint urandrange utf8_to_latin1 variance version ! != !=~ % & && * ** + - .
- .* .+ .- ./ / // < << <= <=> == =~ > >= >> >>> ?: ?? ??? ^ ^^ | || ~
+ splitax splitkv splitkvx splitnv splitnvx sqrt ssub stat stddev strfntime
+ strfntime_local strftime strftime_local string strip strlen strmatch strmatchx
+ strpntime strpntime_local strptime strptime_local sub substr substr0 substr1
+ sum sum2 sum3 sum4 sysntime system systime systimeint tan tanh tolower toupper
+ truncate typeof unflatten unformat unformatx upntime uptime urand urand32
+ urandelement urandint urandrange utf8_to_latin1 variance version ! != !=~ % &
+ && * ** + - . .* .+ .- ./ / // < << <= <=> == =~ > >= >> >>> ?: ?? ??? ^ ^^ |
+ || ~
1mCOMMENTS-IN-DATA FLAGS0m
Miller lets you put comments in your data, such as
@@ -234,12 +234,14 @@ MILLER(1) MILLER(1)
within the input.
--pass-comments-with {string}
Immediately print commented lines within input, with
- specified prefix.
+ specified prefix. For CSV input format, the prefix
+ must be a single character.
--skip-comments Ignore commented lines (prefixed by `#`) within the
input.
--skip-comments-with {string}
Ignore commented lines within input, with specified
- prefix.
+ prefix. For CSV input format, the prefix must be a
+ single character.
1mCOMPRESSED-DATA FLAGS0m
Miller offers a few different ways to handle reading data files
@@ -318,6 +320,10 @@ MILLER(1) MILLER(1)
recreate missing headers.
--lazy-quotes Accepts quotes appearing in unquoted fields, and
non-doubled quotes appearing in quoted fields.
+ --no-auto-unsparsify For CSV/TSV output: if the record keys change from
+ one row to another, emit a blank line and a new
+ header line. This is non-compliant with RFC 4180 but
+ is helpful for heterogeneous data.
--no-implicit-csv-header or --no-implicit-tsv-header
Opposite of `--implicit-csv-header`. This is the
default anyway -- the main use is for the flags to
@@ -332,6 +338,16 @@ MILLER(1) MILLER(1)
-N Keystroke-saver for `--implicit-csv-header
--headerless-csv-output`.
+1mDKVP-ONLY FLAGS0m
+ These are flags which are applicable to DKVP format.
+
+ --incr-key Without this option, keyless DKVP fields are keyed by
+ field number. For example: `a=10,b=20,30,d=40,50` is
+ ingested as `$a=10,$b=20,$3=30,$d=40,$5=50`. With
+ this option, they're keyed by a running counter of
+ keyless fields. For example: `a=10,b=20,30,d=40,50`
+ is ingested as `$a=10,$b=20,$1=30,$d=40,$2=50`.
+
1mFILE-FORMAT FLAGS0m
See the File formats doc page, and or `mlr help file-formats`, for more
about file formats Miller supports.
@@ -344,9 +360,9 @@ MILLER(1) MILLER(1)
are overridden in all cases by setting output format to `format2`.
--asv or --asvlite Use ASV format for input and output data.
- --csv or -c Use CSV format for input and output data.
+ --csv or -c or --c2c Use CSV format for input and output data.
--csvlite Use CSV-lite format for input and output data.
- --dkvp Use DKVP format for input and output data.
+ --dkvp or --d2d Use DKVP format for input and output data.
--gen-field-name Specify field name for --igen. Defaults to "i".
--gen-start Specify start value for --igen. Defaults to 1.
--gen-step Specify step value for --igen. Defaults to 1.
@@ -361,6 +377,7 @@ MILLER(1) MILLER(1)
seqgen verb, which is more useful/intuitive.
--ijson Use JSON format for input data.
--ijsonl Use JSON Lines format for input data.
+ --imd or --imarkdown Use markdown-tabular format for input data.
--inidx Use NIDX format for input data.
--io {format name} Use format name for input and output data. For
example: `--io csv` is the same as `--csv`.
@@ -369,27 +386,27 @@ MILLER(1) MILLER(1)
--itsvlite Use TSV-lite format for input data.
--iusv or --iusvlite Use USV format for input data.
--ixtab Use XTAB format for input data.
- --json or -j Use JSON format for input and output data.
- --jsonl Use JSON Lines format for input and output data.
- --nidx Use NIDX format for input and output data.
+ --json or -j or --j2j Use JSON format for input and output data.
+ --jsonl or --l2l Use JSON Lines format for input and output data.
+ --nidx or --n2n Use NIDX format for input and output data.
--oasv or --oasvlite Use ASV format for output data.
--ocsv Use CSV format for output data.
--ocsvlite Use CSV-lite format for output data.
--odkvp Use DKVP format for output data.
--ojson Use JSON format for output data.
--ojsonl Use JSON Lines format for output data.
- --omd Use markdown-tabular format for output data.
+ --omd or --omarkdown Use markdown-tabular format for output data.
--onidx Use NIDX format for output data.
--opprint Use PPRINT format for output data.
--otsv Use TSV format for output data.
--otsvlite Use TSV-lite format for output data.
--ousv or --ousvlite Use USV format for output data.
--oxtab Use XTAB format for output data.
- --pprint Use PPRINT format for input and output data.
- --tsv or -t Use TSV format for input and output data.
+ --pprint or --p2p Use PPRINT format for input and output data.
+ --tsv or -t or --t2t Use TSV format for input and output data.
--tsvlite Use TSV-lite format for input and output data.
--usv or --usvlite Use USV format for input and output data.
- --xtab Use XTAB format for input and output data.
+ --xtab or --x2x Use XTAB format for input and output data.
--xvright Right-justify values for XTAB format.
-i {format name} Use format name for input data. For example: `-i csv`
is the same as `--icsv`.
@@ -399,7 +416,7 @@ MILLER(1) MILLER(1)
1mFLATTEN-UNFLATTEN FLAGS0m
These flags control how Miller converts record values which are maps or arrays, when input is JSON and output is non-JSON (flattening) or input is non-JSON and output is JSON (unflattening).
- See the Flatten/unflatten doc page for more information.
+ See the flatten/unflatten doc page https://miller.readthedocs.io/en/latest/flatten-unflatten for more information.
--flatsep or --jflatsep {string}
Separator for flattening multi-level JSON keys, e.g.
@@ -407,32 +424,31 @@ MILLER(1) MILLER(1)
formats. Defaults to `.`.
--no-auto-flatten When output is non-JSON, suppress the default
auto-flatten behavior. Default: if `$y = [7,8,9]`
- then this flattens to `y.1=7,y.2=8,y.3=9, and
+ then this flattens to `y.1=7,y.2=8,y.3=9`, and
similarly for maps. With `--no-auto-flatten`, instead
we get `$y=[7, 8, 9]`.
- --no-auto-unflatten When input non-JSON and output is JSON, suppress the
- default auto-unflatten behavior. Default: if the
+ --no-auto-unflatten When input is non-JSON and output is JSON, suppress
+ the default auto-unflatten behavior. Default: if the
input has `y.1=7,y.2=8,y.3=9` then this unflattens to
- `$y=[7,8,9]`. flattens to `y.1=7,y.2=8,y.3=9. With
- `--no-auto-flatten`, instead we get
- `${y.1}=7,${y.2}=8,${y.3}=9`.
+ `$y=[7,8,9]`. With `--no-auto-unflatten`, instead we
+ get `${y.1}=7,${y.2}=8,${y.3}=9`.
1mFORMAT-CONVERSION KEYSTROKE-SAVER FLAGS0m
As keystroke-savers for format-conversion you may use the following.
The letters c, t, d, n, j, l, x, p, and m refer to formats CSV, TSV, DKVP, NIDX,
- JSON, JSON Lines, XTAB, PPRINT, and markdown, respectively. Note that markdown
- format is available for output only.
+ JSON, JSON Lines, XTAB, PPRINT, and markdown, respectively.
- | In\out | CSV | TSV | JSON | JSONL | DKVP | NIDX | XTAB | PPRINT | Markdown |
- +--------+-------+-------+--------+--------+--------+--------+--------+----------+
- | CSV | | --c2t | --c2j | --c2l | --c2d | --c2n | --c2x | --c2p | --c2m |
- | TSV | --t2c | | --t2j | --t2l | --t2d | --t2n | --t2x | --t2p | --t2m |
- | JSON | --j2c | --j2t | | --j2l | --j2d | --j2n | --j2x | --j2p | --j2m |
- | JSONL | --l2c | --l2t | | | --l2d | --l2n | --l2x | --l2p | --l2m |
- | DKVP | --d2c | --d2t | --d2j | --d2l | | --d2n | --d2x | --d2p | --d2m |
- | NIDX | --n2c | --n2t | --n2j | --n2l | --n2d | | --n2x | --n2p | --n2m |
- | XTAB | --x2c | --x2t | --x2j | --x2l | --x2d | --x2n | | --x2p | --x2m |
- | PPRINT | --p2c | --p2t | --p2j | --p2l | --p2d | --p2n | --p2x | | --p2m |
+ | In\out | CSV | TSV | JSON | JSONL | DKVP | NIDX | XTAB | PPRINT | Markdown |
+ +----------+----------+----------+----------+-------+-------+-------+-------+--------+----------+
+ | CSV | --c2c,-c | --c2t | --c2j | --c2l | --c2d | --c2n | --c2x | --c2p | --c2m |
+ | TSV | --t2c | --t2t,-t | --t2j | --t2l | --t2d | --t2n | --t2x | --t2p | --t2m |
+ | JSON | --j2c | --j2t | --j2j,-j | --j2l | --j2d | --j2n | --j2x | --j2p | --j2m |
+ | JSONL | --l2c | --l2t | --l2j | --l2l | --l2d | --l2n | --l2x | --l2p | --l2m |
+ | DKVP | --d2c | --d2t | --d2j | --d2l | --d2d | --d2n | --d2x | --d2p | --d2m |
+ | NIDX | --n2c | --n2t | --n2j | --n2l | --n2d | --n2n | --n2x | --n2p | --n2m |
+ | XTAB | --x2c | --x2t | --x2j | --x2l | --x2d | --x2n | --x2x | --x2p | --x2m |
+ | PPRINT | --p2c | --p2t | --p2j | --p2l | --p2d | --p2n | --p2x | --p2p | --p2m |
+ | Markdown | --m2c | --m2t | --m2j | --m2l | --m2d | --m2n | --m2x | --m2p | |
-p Keystroke-saver for `--nidx --fs space --repifs`.
-T Keystroke-saver for `--nidx --fs tab`.
@@ -447,8 +463,8 @@ MILLER(1) MILLER(1)
--jvstack Put one key-value pair per line for JSON output
(multi-line output). This is the default for JSON
output format.
- --no-jlistwrap Wrap JSON output in outermost `[ ]`. This is the
- default for JSON Lines output format.
+ --no-jlistwrap Do not wrap JSON output in outermost `[ ]`. This is
+ the default for JSON Lines output format.
--no-jvstack Put objects/arrays all on one line for JSON output.
This is the default for JSON Lines output format.
@@ -547,6 +563,7 @@ MILLER(1) MILLER(1)
since direct-to-screen output for large files has its
own overhead.
--no-hash-records See --hash-records.
+ --norc Do not load a .mlrrc file.
--nr-progress-mod {m} With m a positive integer: print filename and record
count to os.Stderr every m input records.
--ofmt {format} E.g. `%.18f`, `%.0f`, `%9.6e`. Please use
@@ -622,8 +639,8 @@ MILLER(1) MILLER(1)
How you can control colorization:
* Suppression/unsuppression:
- * Environment variable `export MLR_NO_COLOR=true` means don't color
- even if stdout+TTY.
+ * Environment variable `export MLR_NO_COLOR=true` or `export NO_COLOR=true`
+ means don't color even if stdout+TTY.
* Environment variable `export MLR_ALWAYS_COLOR=true` means do color
even if not stdout+TTY.
For example, you might want to use this when piping mlr output to `less -r`.
@@ -674,8 +691,10 @@ MILLER(1) MILLER(1)
1mPPRINT-ONLY FLAGS0m
These are flags which are applicable to PPRINT format.
- --barred Prints a border around PPRINT output (not available
- for input).
+ --barred or --barred-output
+ Prints a border around PPRINT output.
+ --barred-input When used in conjunction with --pprint, accepts
+ barred input.
--right Right-justifies all fields for PPRINT output.
1mPROFILING FLAGS0m
@@ -740,13 +759,13 @@ MILLER(1) MILLER(1)
- To avoid backslashing, you can use any of the following names:
ascii_esc = "\x1b"
- ascii_etx = "\x04"
+ ascii_etx = "\x03"
ascii_fs = "\x1c"
ascii_gs = "\x1d"
- ascii_null = "\x01"
+ ascii_null = "\x00"
ascii_rs = "\x1e"
- ascii_soh = "\x02"
- ascii_stx = "\x03"
+ ascii_soh = "\x01"
+ ascii_stx = "\x02"
ascii_us = "\x1f"
asv_fs = "\x1f"
asv_rs = "\x1e"
@@ -780,11 +799,12 @@ MILLER(1) MILLER(1)
csv "," N/A "\n"
csvlite "," N/A "\n"
dkvp "," "=" "\n"
+ gen "," N/A "\n"
json N/A N/A N/A
markdown " " N/A "\n"
nidx " " N/A "\n"
pprint " " N/A "\n"
- tsv " " N/A "\n"
+ tsv " " N/A "\n"
xtab "\n" " " "\n\n"
--fs {string} Specify FS for input and output.
@@ -965,6 +985,7 @@ MILLER(1) MILLER(1)
Options:
-f {a,b,c} Field names for distinct count.
+ -x {a,b,c} Field names to exclude for distinct count: use each record's others instead.
-n Show only the number of distinct values. Not compatible with -u.
-o {name} Field name for output count. Default "count".
Ignored with -u.
@@ -1003,7 +1024,7 @@ MILLER(1) MILLER(1)
-r Treat field names as regular expressions. "ab", "a.*b" will
match any field name containing the substring "ab" or matching
"a.*b", respectively; anchors of the form "^ab$", "^a.*b$" may
- be used. The -o flag is ignored when -r is present.
+ be used.
-h|--help Show this message.
Examples:
mlr cut -f hostname,status
@@ -1047,6 +1068,10 @@ MILLER(1) MILLER(1)
1mfilter0m
Usage: mlr filter [options] {DSL expression}
+ Lets you use a domain-specific language to programmatically filter which
+ stream records will be output.
+ See also: https://miller.readthedocs.io/en/latest/reference-verbs
+
Options:
-f {file name} File containing a DSL expression (see examples below). If the filename
is a directory, all *.mlr files in that directory are loaded.
@@ -1241,6 +1266,8 @@ MILLER(1) MILLER(1)
See also the `sub` and `ssub` verbs.
Options:
-f {a,b,c} Field names to convert.
+ -r {regex} Regular expression for field names to convert.
+ -a Convert all fields.
-h|--help Show this message.
1mhaving-fields0m
@@ -1314,6 +1341,8 @@ MILLER(1) MILLER(1)
--lk|--left-keep-field-names {a,b,c} If supplied, this means keep only the specified field
names from the left file. Automatically includes the join-field name(s). Helpful
for when you only want a limited subset of information from the left file.
+ Tip: you can use --lk "": this means the left file becomes solely a row-selector
+ for the input files.
--lp {text} Additional prefix for non-join output field names from
the left file
--rp {text} Additional prefix for non-join output field names from
@@ -1348,7 +1377,7 @@ MILLER(1) MILLER(1)
Likewise, if you have 'mlr --csv --implicit-csv-header ...' then the join-in file will be
expected to be headerless as well unless you put '--no-implicit-csv-header' after 'join'.
Please use "mlr --usage-separator-options" for information on specifying separators.
- Please see https://miller.readthedocs.io/en/latest/reference-verbs.html#join for more information
+ Please see https://miller.readthedocs.io/en/latest/reference-verbs#join for more information
including examples.
1mlabel0m
@@ -1392,6 +1421,7 @@ MILLER(1) MILLER(1)
antimode Find least-frequently-occurring values for fields; first-found wins tie
sum Compute sums of specified fields
mean Compute averages (sample means) of specified fields
+ mad Compute mean absolute deviation
var Compute sample variance of specified fields
stddev Compute sample standard deviation of specified fields
meaneb Estimate error bars for averages (assuming no sample autocorrelation)
@@ -1498,6 +1528,9 @@ MILLER(1) MILLER(1)
1mput0m
Usage: mlr put [options] {DSL expression}
+ Lets you use a domain-specific language to programmatically alter stream records.
+ See also: https://miller.readthedocs.io/en/latest/reference-verbs
+
Options:
-f {file name} File containing a DSL expression (see examples below). If the filename
is a directory, all *.mlr files in that directory are loaded.
@@ -1604,9 +1637,9 @@ MILLER(1) MILLER(1)
first-match replacement.
-h|--help Show this message.
Examples:
- mlr rename old_name,new_name'
- mlr rename old_name_1,new_name_1,old_name_2,new_name_2'
- mlr rename -r 'Date_[0-9]+,Date,' Rename all such fields to be "Date"
+ mlr rename old_name,new_name
+ mlr rename old_name_1,new_name_1,old_name_2,new_name_2
+ mlr rename -r 'Date_[0-9]+,Date' Rename all such fields to be "Date"
mlr rename -r '"Date_[0-9]+",Date' Same
mlr rename -r 'Date_([0-9]+).*,\1' Rename all such fields to be of the form 20151015
mlr rename -r '"name"i,Name' Rename "name", "Name", "NAME", etc. to "Name"
@@ -1796,6 +1829,7 @@ MILLER(1) MILLER(1)
-nf {comma-separated field names} Same as -n
-nr {comma-separated field names} Numerical descending; nulls sort first
-t {comma-separated field names} Natural ascending
+ -b Move sort fields to start of record, as in reorder -b
-tr|-rt {comma-separated field names} Natural descending
-h|--help Show this message.
@@ -1811,6 +1845,17 @@ MILLER(1) MILLER(1)
-r Recursively sort subobjects/submaps, e.g. for JSON input.
-h|--help Show this message.
+ 1msparsify0m
+ Usage: mlr sparsify [options]
+ Unsets fields for which the value is the empty string (or, optionally, another
+ specified value). Only makes sense with output format not being CSV or TSV.
+ Options:
+ -s {filler string} What values to remove. Defaults to the empty string.
+ -f {a,b,c} Specify field names to be operated on; any other fields won't be
+ modified. The default is to modify all fields.
+ -h|--help Show this message.
+ Example: if input is a=1,b=,c=3 then output is a=1,c=3.
+
1msplit0m
Usage: mlr split [options] {filename}
Options:
@@ -1859,6 +1904,8 @@ MILLER(1) MILLER(1)
the old string, like the `ssub` DSL function. See also the `gsub` and `sub` verbs.
Options:
-f {a,b,c} Field names to convert.
+ -r {regex} Regular expression for field names to convert.
+ -a Convert all fields.
-h|--help Show this message.
1mstats10m
@@ -1876,6 +1923,7 @@ MILLER(1) MILLER(1)
antimode Find least-frequently-occurring values for fields; first-found wins tie
sum Compute sums of specified fields
mean Compute averages (sample means) of specified fields
+ mad Compute mean absolute deviation
var Compute sample variance of specified fields
stddev Compute sample standard deviation of specified fields
meaneb Estimate error bars for averages (assuming no sample autocorrelation)
@@ -1910,7 +1958,7 @@ MILLER(1) MILLER(1)
Example: mlr stats1 -a min,p10,p50,p90,max -f value -g size,shape
Example: mlr stats1 -a count,mode -f size
Example: mlr stats1 -a count,mode -f size -g shape
- Example: mlr stats1 -a count,mode --fr '^[a-h].*$' -gr '^k.*$'
+ Example: mlr stats1 -a count,mode --fr '^[a-h].*$' --gr '^k.*$'
This computes count and mode statistics on all field names beginning
with a through h, grouped by all field names starting with k.
@@ -2005,6 +2053,8 @@ MILLER(1) MILLER(1)
See also the `gsub` and `ssub` verbs.
Options:
-f {a,b,c} Field names to convert.
+ -r {regex} Regular expression for field names to convert.
+ -a Convert all fields.
-h|--help Show this message.
1msummary0m
@@ -2047,8 +2097,18 @@ MILLER(1) MILLER(1)
-a {mean,sum,etc.} Use only the specified summarizers.
-x {mean,sum,etc.} Use all summarizers, except the specified ones.
--all Use all available summarizers.
+ --transpose Show output with field names as column names.
-h|--help Show this message.
+ 1msurv0m
+ Usage: mlr surv -d {duration-field} -s {status-field}
+
+ Estimate Kaplan-Meier survival curve (right-censored).
+ Options:
+ -d {field} Name of duration field (time-to-event or censoring).
+ -s {field} Name of status field (0=censored, 1=event).
+ -h|--help Show this message.
+
1mtac0m
Usage: mlr tac [options]
Prints records in reverse order from the order in which they were encountered.
@@ -2131,6 +2191,7 @@ MILLER(1) MILLER(1)
Options:
-g {d,e,f} Group-by-field names for uniq counts.
+ -x {a,b,c} Field names to exclude for uniq: use each record's others instead.
-c Show repeat counts in addition to unique values.
-n Show only the number of distinct values.
-o {name} Field name for output count. Default "count".
@@ -2285,7 +2346,7 @@ MILLER(1) MILLER(1)
(class=math #args=1) Ceiling: nearest integer at or above.
1mclean_whitespace0m
- (class=string #args=1) Same as collapse_whitespace and strip.
+ (class=string #args=1) Same as collapse_whitespace and strip, followed by type inference.
1mcollapse_whitespace0m
(class=string #args=1) Strip repeated whitespace from string.
@@ -2377,9 +2438,14 @@ MILLER(1) MILLER(1)
$* = fmtifnum($*, "%.6f") formats numeric fields in the current record, leaving non-numeric ones alone
1mfmtnum0m
- (class=conversion #args=2) Convert int/float/bool to string using printf-style format string (https://pkg.go.dev/fmt), e.g. '$s = fmtnum($n, "%08d")' or '$t = fmtnum($n, "%.6e")'. This function recurses on array and map values.
- Example:
- $x = fmtnum($x, "%.6f")
+ (class=conversion #args=2) Convert int/float/bool to string using printf-style format string (https://pkg.go.dev/fmt), e.g. '$s = fmtnum($n, "%08d")' or '$t = fmtnum($n, "%.6e")'. Miller-specific extension: "%_d" and "%_f" for comma-separated thousands. This function recurses on array and map values.
+ Examples:
+ $y = fmtnum($x, "%.6f")
+ $o = fmtnum($n, "%d")
+ $o = fmtnum($n, "%12d")
+ $y = fmtnum($x, "%.6_f")
+ $o = fmtnum($n, "%_d")
+ $o = fmtnum($n, "%12_d")
1mfold0m
(class=higher-order-functions #args=3) Given a map or array as first argument and a function as second argument, accumulates entries into a final output -- for example, sum or product. For arrays, the function should take two arguments, for accumulated value and array element. For maps, it should take four arguments, for accumulated key and value, and map-element key and value; it should return the updated accumulator as a new key-value pair (i.e. a single-entry map). The start value for the accumulator is taken from the third argument.
@@ -2932,6 +2998,18 @@ MILLER(1) MILLER(1)
Example:
ssub("abc.def", ".", "X") gives "abcXdef"
+ 1mstat0m
+ (class=system #args=1) Returns a map containing information about the provided path: "name" with string value, "size" as decimal int value, "mode" as octal int value, "modtime" as int-valued epoch seconds, and "isdir" as boolean value.
+ Examples:
+ stat("./mlr") gives {
+ "name": "mlr",
+ "size": 38391584,
+ "mode": 0755,
+ "modtime": 1715207874,
+ "isdir": false
+ }
+ stat("./mlr")["size"] gives 38391584
+
1mstddev0m
(class=stats #args=1) Returns the sample standard deviation of values in an array or map. Returns empty string AKA void for array/map of length less than two; returns error for non-array/non-map types.
Example:
@@ -2974,6 +3052,40 @@ MILLER(1) MILLER(1)
1mstrlen0m
(class=string #args=1) String length.
+ 1mstrmatch0m
+ (class=string #args=2) Boolean yes/no for whether the stringable first argument matches the regular-expression second argument. No regex captures are provided; please see `strmatchx`.
+ Examples:
+ strmatch("a", "abc") is false
+ strmatch("abc", "a") is true
+ strmatch("abc", "a[a-z]c") is true
+ strmatch("abc", "(a).(c)") is true
+ strmatch(12345, "34") is true
+
+ 1mstrmatchx0m
+ (class=string #args=2) Extended information for whether the stringable first argument matches the regular-expression second argument. Regex captures are provided in the return-value map; \1, \2, etc. are not set, in contrast to the `=~` operator. As well, while the `=~` operator limits matches to \1 through \9, an arbitrary number are supported here.
+ Examples:
+ strmatchx("a", "abc") returns:
+ {
+ "matched": false
+ }
+ strmatchx("abc", "a") returns:
+ {
+ "matched": true,
+ "full_capture": "a",
+ "full_start": 1,
+ "full_end": 1
+ }
+ strmatchx("[zy:3458]", "([a-z]+):([0-9]+)") returns:
+ {
+ "matched": true,
+ "full_capture": "zy:3458",
+ "full_start": 2,
+ "full_end": 8,
+ "captures": ["zy", "3458"],
+ "starts": [2, 5],
+ "ends": [3, 8]
+ }
+
1mstrpntime0m
(class=time #args=2) strpntime: Parses timestamp as integer nanoseconds since the epoch. See also strpntime_local.
Examples:
@@ -3626,6 +3738,4 @@ MILLER(1) MILLER(1)
MIME Type for Comma-Separated Values (CSV) Files, the Miller docsite
https://miller.readthedocs.io
-
-
- 2023-12-13 MILLER(1)
+ 2026-01-02 4mMILLER24m(1)
diff --git a/docs/src/miller-as-library.md b/docs/src/miller-as-library.md
index c17872ba0..3b09a4bbc 100644
--- a/docs/src/miller-as-library.md
+++ b/docs/src/miller-as-library.md
@@ -50,8 +50,8 @@ package main
import (
"fmt"
- "github.com/johnkerl/miller/pkg/bifs"
- "github.com/johnkerl/miller/pkg/mlrval"
+ "github.com/johnkerl/miller/v6/pkg/bifs"
+ "github.com/johnkerl/miller/v6/pkg/mlrval"
)
func main() {
@@ -86,11 +86,11 @@ import (
"fmt"
"os"
- "github.com/johnkerl/miller/pkg/bifs"
- "github.com/johnkerl/miller/pkg/cli"
- "github.com/johnkerl/miller/pkg/input"
- "github.com/johnkerl/miller/pkg/output"
- "github.com/johnkerl/miller/pkg/types"
+ "github.com/johnkerl/miller/v6/pkg/bifs"
+ "github.com/johnkerl/miller/v6/pkg/cli"
+ "github.com/johnkerl/miller/v6/pkg/input"
+ "github.com/johnkerl/miller/v6/pkg/output"
+ "github.com/johnkerl/miller/v6/pkg/types"
)
// Put your record-processing logic here.
@@ -128,7 +128,7 @@ func custom_options() *cli.TOptions {
func run_custom_processor(
fileNames []string,
options *cli.TOptions,
- record_processor func (irac *types.RecordAndContext) (*types.RecordAndContext, error),
+ record_processor func(irac *types.RecordAndContext) (*types.RecordAndContext, error),
) error {
outputStream := os.Stdout
outputIsStdout := true
diff --git a/docs/src/miller-as-library/main1.go b/docs/src/miller-as-library/main1.go
index c56f5a0db..68823b9f2 100644
--- a/docs/src/miller-as-library/main1.go
+++ b/docs/src/miller-as-library/main1.go
@@ -3,8 +3,8 @@ package main
import (
"fmt"
- "github.com/johnkerl/miller/pkg/bifs"
- "github.com/johnkerl/miller/pkg/mlrval"
+ "github.com/johnkerl/miller/v6/pkg/bifs"
+ "github.com/johnkerl/miller/v6/pkg/mlrval"
)
func main() {
diff --git a/docs/src/miller-as-library/main2.go b/docs/src/miller-as-library/main2.go
index c460a174a..8434f14bd 100644
--- a/docs/src/miller-as-library/main2.go
+++ b/docs/src/miller-as-library/main2.go
@@ -7,11 +7,11 @@ import (
"fmt"
"os"
- "github.com/johnkerl/miller/pkg/bifs"
- "github.com/johnkerl/miller/pkg/cli"
- "github.com/johnkerl/miller/pkg/input"
- "github.com/johnkerl/miller/pkg/output"
- "github.com/johnkerl/miller/pkg/types"
+ "github.com/johnkerl/miller/v6/pkg/bifs"
+ "github.com/johnkerl/miller/v6/pkg/cli"
+ "github.com/johnkerl/miller/v6/pkg/input"
+ "github.com/johnkerl/miller/v6/pkg/output"
+ "github.com/johnkerl/miller/v6/pkg/types"
)
// Put your record-processing logic here.
@@ -49,7 +49,7 @@ func custom_options() *cli.TOptions {
func run_custom_processor(
fileNames []string,
options *cli.TOptions,
- record_processor func (irac *types.RecordAndContext) (*types.RecordAndContext, error),
+ record_processor func(irac *types.RecordAndContext) (*types.RecordAndContext, error),
) error {
outputStream := os.Stdout
outputIsStdout := true
diff --git a/docs/src/miller-as-library/main3.go b/docs/src/miller-as-library/main3.go
index 07d4be50e..617488c33 100644
--- a/docs/src/miller-as-library/main3.go
+++ b/docs/src/miller-as-library/main3.go
@@ -7,11 +7,11 @@ import (
"fmt"
"os"
- "github.com/johnkerl/miller/pkg/cli"
- "github.com/johnkerl/miller/pkg/input"
- "github.com/johnkerl/miller/pkg/output"
- "github.com/johnkerl/miller/pkg/transformers"
- "github.com/johnkerl/miller/pkg/types"
+ "github.com/johnkerl/miller/v6/pkg/cli"
+ "github.com/johnkerl/miller/v6/pkg/input"
+ "github.com/johnkerl/miller/v6/pkg/output"
+ "github.com/johnkerl/miller/v6/pkg/transformers"
+ "github.com/johnkerl/miller/v6/pkg/types"
)
func convert_csv_to_json(fileNames []string) error {
@@ -89,10 +89,10 @@ func convert_csv_to_json(fileNames []string) error {
case ierr := <-inputErrorChannel:
retval = ierr
break
- case _ = <-dataProcessingErrorChannel:
+ case <-dataProcessingErrorChannel:
retval = errors.New("exiting due to data error") // details already printed
break
- case _ = <-doneWritingChannel:
+ case <-doneWritingChannel:
done = true
break
}
diff --git a/docs/src/miller-on-windows.md b/docs/src/miller-on-windows.md
index b45ce5c43..8ffb6a44b 100644
--- a/docs/src/miller-on-windows.md
+++ b/docs/src/miller-on-windows.md
@@ -18,7 +18,7 @@ Quick links:
## Native builds as of Miller 6
-Miller was originally developed for Unix-like operating systems including Linux and MacOS. Since Miller 5.2.0 which was the first version to support Windows at all, that support has been partial. But as of version 6.0.0, Miller builds directly on Windows.
+Miller was originally developed for Unix-like operating systems, including Linux and MacOS. Since Miller 5.2.0, which was the first version to support Windows at all, that support has been partial. But as of version 6.0.0, Miller builds directly on Windows.
**The experience is now almost the same on Windows as it is on Linux, NetBSD/FreeBSD, and MacOS.**
@@ -28,7 +28,7 @@ See [Installation](installing-miller.md) for how to get a copy of `mlr.exe`.
## Setup
-Simply place `mlr.exe` somewhere within your `PATH` variable.
+Place `mlr.exe` somewhere within your `PATH` variable.

@@ -38,7 +38,7 @@ To use Miller from within MSYS2/Cygwin, also make sure `mlr.exe` is within the `
## Differences
-The Windows-support code within Miller makes effort to support Linux/Unix/MacOS-like command-line syntax including single-quoting of expressions for `mlr put` and `mlr filter` -- and in the examples above, this often works. However, there are still some cases where more complex expressions aren't successfully parsed from the Windows prompt, even though they are from MSYS2:
+The Windows-support code within Miller makes an effort to support Linux/Unix/MacOS-like command-line syntax, including single-quoting of expressions for `mlr put` and `mlr filter` -- and in the examples above, this often works. However, there are still some cases where more complex expressions aren't successfully parsed from the Windows prompt, even though they are from MSYS2:

diff --git a/docs/src/miller-on-windows.md.in b/docs/src/miller-on-windows.md.in
index 1bd135d25..4b80ab7ae 100644
--- a/docs/src/miller-on-windows.md.in
+++ b/docs/src/miller-on-windows.md.in
@@ -2,7 +2,7 @@
## Native builds as of Miller 6
-Miller was originally developed for Unix-like operating systems including Linux and MacOS. Since Miller 5.2.0 which was the first version to support Windows at all, that support has been partial. But as of version 6.0.0, Miller builds directly on Windows.
+Miller was originally developed for Unix-like operating systems, including Linux and MacOS. Since Miller 5.2.0, which was the first version to support Windows at all, that support has been partial. But as of version 6.0.0, Miller builds directly on Windows.
**The experience is now almost the same on Windows as it is on Linux, NetBSD/FreeBSD, and MacOS.**
@@ -12,7 +12,7 @@ See [Installation](installing-miller.md) for how to get a copy of `mlr.exe`.
## Setup
-Simply place `mlr.exe` somewhere within your `PATH` variable.
+Place `mlr.exe` somewhere within your `PATH` variable.

@@ -22,7 +22,7 @@ To use Miller from within MSYS2/Cygwin, also make sure `mlr.exe` is within the `
## Differences
-The Windows-support code within Miller makes effort to support Linux/Unix/MacOS-like command-line syntax including single-quoting of expressions for `mlr put` and `mlr filter` -- and in the examples above, this often works. However, there are still some cases where more complex expressions aren't successfully parsed from the Windows prompt, even though they are from MSYS2:
+The Windows-support code within Miller makes an effort to support Linux/Unix/MacOS-like command-line syntax, including single-quoting of expressions for `mlr put` and `mlr filter` -- and in the examples above, this often works. However, there are still some cases where more complex expressions aren't successfully parsed from the Windows prompt, even though they are from MSYS2:

diff --git a/docs/src/miller-programming-language.md b/docs/src/miller-programming-language.md
index f8d2f027b..2b87c5106 100644
--- a/docs/src/miller-programming-language.md
+++ b/docs/src/miller-programming-language.md
@@ -16,11 +16,11 @@ Quick links:
# Intro to Miller's programming language
-In the [Miller in 10 minutes](10min.md) page we took a tour of some of Miller's most-used [verbs](reference-verbs.md) including `cat`, `head`, `tail`, `cut`, and `sort`. These are analogs of familiar system commands, but empowered by field-name indexing and file-format awareness: the system `sort` command only knows about lines and column names like `1,2,3,4`, while `mlr sort` knows about CSV/TSV/JSON/etc records, and field names like `color,shape,flag,index`.
+On the [Miller in 10 minutes](10min.md) page, we took a tour of some of Miller's most-used [verbs](reference-verbs.md), including `cat`, `head`, `tail`, `cut`, and `sort`. These are analogs of familiar system commands, but empowered by field-name indexing and file-format awareness: the system `sort` command only knows about lines and column names like `1,2,3,4`, while `mlr sort` knows about CSV/TSV/JSON/etc records, and field names like `color,shape,flag,index`.
-We also caught a glimpse of Miller's `put` and `filter` verbs. These two are special since they let you express statements using Miller's programming language. It's a *embedded domain-specific language* since it's inside Miller: often referred to simply as the *Miller DSL*.
+We also caught a glimpse of Miller's `put` and `filter` verbs. These two are special because they allow you to express statements using Miller's programming language. It's an *embedded domain-specific language* since it's inside Miller: often referred to simply as the *Miller DSL*.
-In the [DSL reference](reference-dsl.md) page we have a complete reference to Miller's programming language. For now, let's take a quick look at key features -- you can use as few or as many features as you like.
+On the [DSL reference](reference-dsl.md) page, we have a complete reference to Miller's programming language. For now, let's take a quick look at key features -- you can use as few or as many features as you like.
## Records and fields
@@ -45,9 +45,9 @@ purple square false 10 91 72.3735 8.2430 596.5747605000001
When we type that, a few things are happening:
-* We refer to fields in the input data using a dollar sign and then the field name, e.g. `$quantity`. (If a field name contains special characters like a dot or slash, just use curly braces: `${field.name}`.)
+* We refer to fields in the input data using a dollar sign and then the field name, e.g., `$quantity`. (If a field name contains special characters like a dot or slash, just use curly braces: `${field.name}`.)
* The expression `$cost = $quantity * $rate` is executed once per record of the data file. Our [example.csv](./example.csv) has 10 records so this expression was executed 10 times, with the field names `$quantity` and `$rate` each time bound to the current record's values for those fields.
-* On the left-hand side we have the new field name `$cost` which didn't come from the input data. Assignments to new variables result in a new field being placed after all the other ones. If we'd assigned to an existing field name, it would have been updated in-place.
+* On the left-hand side, we have the new field name `$cost`, which didn't come from the input data. Assignments to new variables result in a new field being placed after all the other ones. If we'd assigned to an existing field name, it would have been updated in place.
* The entire expression is surrounded by single quotes (with an adjustment needed on [Windows](miller-on-windows.md)), to get it past the system shell. Inside those, only double quotes have meaning in Miller's programming language.
## Multi-line statements, and statements-from-file
@@ -91,9 +91,9 @@ yellow circle true 9 8700 63.5058 8.3350 529.3208430000001
purple square false 10 9100 72.3735 8.2430 596.5747605000001
-Anything from a `#` character to end of line is a code comment.
+Anything from a `#` character to the end of the line is a code comment.
-One of Miller's key features is the ability to express data-transformation right there at the keyboard, interactively. But if you find yourself using expressions repeatedly, you can put everything between the single quotes into a file and refer to that using `put -f`:
+One of Miller's key features is the ability to express data transformation right there at the keyboard, interactively. But if you find yourself using expressions repeatedly, you can put everything between the single quotes into a file and refer to that using `put -f`:
cat dsl-example.mlr @@ -120,13 +120,13 @@ yellow circle true 9 8700 63.5058 8.3350 529.3208430000001 purple square false 10 9100 72.3735 8.2430 596.5747605000001-This becomes particularly important on Windows. Quite a bit of effort was put into making Miller on Windows be able to handle the kinds of single-quoted expressions we're showing here, but if you get syntax-error messages on Windows using examples in this documentation, you can put the parts between single quotes into a file and refer to that using `mlr put -f` -- or, use the triple-double-quote trick as described in the [Miller on Windows page](miller-on-windows.md). +This becomes particularly important on Windows. Quite a bit of effort was put into making Miller on Windows be able to handle the kinds of single-quoted expressions we're showing here. Still, if you get syntax-error messages on Windows using examples in this documentation, you can put the parts between single quotes into a file and refer to that using `mlr put -f` -- or, use the triple-double-quote trick as described in the [Miller on Windows page](miller-on-windows.md). ## Out-of-stream variables, begin, and end -Above we saw that your expression is executed once per record -- if a file has a million records, your expression will be executed a million times, once for each record. But you can mark statements to only be executed once, either before the record stream begins, or after the record stream is ended. If you know about [AWK](https://en.wikipedia.org/wiki/AWK), you might have noticed that Miller's programming language is loosely inspired by it, including the `begin` and `end` statements. +Above, we saw that your expression is executed once per record: if a file has a million records, your expression will be executed a million times, once for each record. But you can mark statements only to be executed once, either before the record stream begins or after the record stream is ended. 
If you know about [AWK](https://en.wikipedia.org/wiki/AWK), you might have noticed that Miller's programming language is loosely inspired by it, including the `begin` and `end` statements. -Above we also saw that names like `$quantity` are bound to each record in turn. +Above, we also saw that names like `$quantity` are bound to each record in turn. To make `begin` and `end` statements useful, we need somewhere to put things that persist across the duration of the record stream, and a way to emit them. Miller uses [**out-of-stream variables**](reference-dsl-variables.md#out-of-stream-variables) (or **oosvars** for short) whose names start with an `@` sigil, along with the [`emit`](reference-dsl-output-statements.md#emit-statements) keyword to write them into the output record stream: @@ -209,8 +209,8 @@ So, take this sum/count example as an indication of the kinds of things you can Also inspired by [AWK](https://en.wikipedia.org/wiki/AWK), the Miller DSL has the following special [**context variables**](reference-dsl-variables.md#built-in-variables): * `FILENAME` -- the filename the current record came from. Especially useful in things like `mlr ... *.csv`. -* `FILENUM` -- similarly, but integer 1,2,3,... rather than filenam.e -* `NF` -- the number of fields in the current record. Note that if you assign `$newcolumn = some value` then `NF` will increment. +* `FILENUM` -- similarly, but integer 1,2,3,... rather than filename. +* `NF` -- the number of fields in the current record. Note that if you assign `$newcolumn = some value`, then `NF` will increment. * `NR` -- starting from 1, counter of how many records processed so far. * `FNR` -- similar, but resets to 1 at the start of each file. @@ -290,12 +290,12 @@ purple square false 10 91 72.3735 8.2430 3628800 Note that here we used the `-f` flag to `put` to load our function definition, and also the `-e` flag to add another statement on the command line. 
(We could have also put `$fact = factorial(NR)` inside -`factorial-example.mlr` but that would have made that file less flexible for our +`factorial-example.mlr`, but that would have made that file less flexible for our future use.) ## If-statements, loops, and local variables -Suppose you want to only compute sums conditionally -- you can use an `if` statement: +Suppose you want only to compute sums conditionally -- you can use an `if` statement:
cat if-example.mlr
@@ -331,7 +331,7 @@ page](reference-dsl-control-structures.md#for-loops), Miller has a few kinds of
for-loops. In addition to the usual 3-part `for (i = 0; i < 10; i += 1)` kind
that many programming languages have, Miller also lets you loop over
[maps](reference-main-maps.md) and [arrays](reference-main-arrays.md). We
-haven't encountered maps and arrays yet in this introduction, but for now it
+haven't encountered maps and arrays yet in this introduction, but for now, it
suffices to know that `$*` is a special variable holding the current record as
a map:
@@ -375,14 +375,14 @@ Here we used the local variables `k` and `v`. Now we've seen four kinds of varia
* Local variables like `k`
* Built-in context variables like `NF` and `NR`
-If you're curious about scope and extent of local variables, you can read more in the [section on variables](reference-dsl-variables.md).
+If you're curious about the scope and extent of local variables, you can read more in the [section on variables](reference-dsl-variables.md).
## Arithmetic
Numbers in Miller's programming language are intended to operate with the principle of least surprise:
* Internally, numbers are either 64-bit signed integers or double-precision floating-point.
-* Sums, differences, and products of integers are also integers (so `2*3=6` not `6.0`) -- unless the result of the operation would overflow a 64-bit signed integer in which case the result is automatically converted to float. (If you ever want integer-to-integer arithmetic, use `x .+ y`, `x .* y`, etc.)
+* Sums, differences, and products of integers are also integers (so `2*3=6` not `6.0`) -- unless the result of the operation would overflow a 64-bit signed integer, in which case the result is automatically converted to float. (If you ever want integer-to-integer arithmetic, use `x .+ y`, `x .* y`, etc.)
* Quotients of integers are integers if the division is exact, else floating-point: so `6/2=3` but `7/2=3.5`.
You can read more about this in the [arithmetic reference](reference-main-arithmetic.md).
@@ -397,7 +397,7 @@ see more in the [null-data reference](reference-main-null-data.md) but the
basic idea is:
* Adding a number to absent gives the number back. This means you don't have to put `@sum = 0` in your `begin` blocks.
-* Any variable which has the absent value is not assigned. This means you don't have to check presence of things from one record to the next.
+* Any variable that has the absent value is not assigned. This means you don't have to check the presence of things from one record to the next.
For example, you can sum up all the `$a` values across records without having to check whether they're present or not:
diff --git a/docs/src/miller-programming-language.md.in b/docs/src/miller-programming-language.md.in
index eb5702f3b..91b9499bf 100644
--- a/docs/src/miller-programming-language.md.in
+++ b/docs/src/miller-programming-language.md.in
@@ -1,10 +1,10 @@
# Intro to Miller's programming language
-In the [Miller in 10 minutes](10min.md) page we took a tour of some of Miller's most-used [verbs](reference-verbs.md) including `cat`, `head`, `tail`, `cut`, and `sort`. These are analogs of familiar system commands, but empowered by field-name indexing and file-format awareness: the system `sort` command only knows about lines and column names like `1,2,3,4`, while `mlr sort` knows about CSV/TSV/JSON/etc records, and field names like `color,shape,flag,index`.
+On the [Miller in 10 minutes](10min.md) page, we took a tour of some of Miller's most-used [verbs](reference-verbs.md), including `cat`, `head`, `tail`, `cut`, and `sort`. These are analogs of familiar system commands, but empowered by field-name indexing and file-format awareness: the system `sort` command only knows about lines and column names like `1,2,3,4`, while `mlr sort` knows about CSV/TSV/JSON/etc records, and field names like `color,shape,flag,index`.
-We also caught a glimpse of Miller's `put` and `filter` verbs. These two are special since they let you express statements using Miller's programming language. It's a *embedded domain-specific language* since it's inside Miller: often referred to simply as the *Miller DSL*.
+We also caught a glimpse of Miller's `put` and `filter` verbs. These two are special because they allow you to express statements using Miller's programming language. It's an *embedded domain-specific language* since it's inside Miller: often referred to simply as the *Miller DSL*.
-In the [DSL reference](reference-dsl.md) page we have a complete reference to Miller's programming language. For now, let's take a quick look at key features -- you can use as few or as many features as you like.
+On the [DSL reference](reference-dsl.md) page, we have a complete reference to Miller's programming language. For now, let's take a quick look at key features -- you can use as few or as many features as you like.
## Records and fields
@@ -16,9 +16,9 @@ GENMD-EOF
When we type that, a few things are happening:
-* We refer to fields in the input data using a dollar sign and then the field name, e.g. `$quantity`. (If a field name contains special characters like a dot or slash, just use curly braces: `${field.name}`.)
+* We refer to fields in the input data using a dollar sign and then the field name, e.g., `$quantity`. (If a field name contains special characters like a dot or slash, just use curly braces: `${field.name}`.)
* The expression `$cost = $quantity * $rate` is executed once per record of the data file. Our [example.csv](./example.csv) has 10 records so this expression was executed 10 times, with the field names `$quantity` and `$rate` each time bound to the current record's values for those fields.
-* On the left-hand side we have the new field name `$cost` which didn't come from the input data. Assignments to new variables result in a new field being placed after all the other ones. If we'd assigned to an existing field name, it would have been updated in-place.
+* On the left-hand side, we have the new field name `$cost`, which didn't come from the input data. Assignments to new variables result in a new field being placed after all the other ones. If we'd assigned to an existing field name, it would have been updated in place.
* The entire expression is surrounded by single quotes (with an adjustment needed on [Windows](miller-on-windows.md)), to get it past the system shell. Inside those, only double quotes have meaning in Miller's programming language.
## Multi-line statements, and statements-from-file
@@ -36,9 +36,9 @@ mlr --c2p put '
' example.csv
GENMD-EOF
-Anything from a `#` character to end of line is a code comment.
+Anything from a `#` character to the end of the line is a code comment.
-One of Miller's key features is the ability to express data-transformation right there at the keyboard, interactively. But if you find yourself using expressions repeatedly, you can put everything between the single quotes into a file and refer to that using `put -f`:
+One of Miller's key features is the ability to express data transformation right there at the keyboard, interactively. But if you find yourself using expressions repeatedly, you can put everything between the single quotes into a file and refer to that using `put -f`:
GENMD-RUN-COMMAND
cat dsl-example.mlr
@@ -48,13 +48,13 @@ GENMD-RUN-COMMAND
mlr --c2p put -f dsl-example.mlr example.csv
GENMD-EOF
-This becomes particularly important on Windows. Quite a bit of effort was put into making Miller on Windows be able to handle the kinds of single-quoted expressions we're showing here, but if you get syntax-error messages on Windows using examples in this documentation, you can put the parts between single quotes into a file and refer to that using `mlr put -f` -- or, use the triple-double-quote trick as described in the [Miller on Windows page](miller-on-windows.md).
+This becomes particularly important on Windows. Quite a bit of effort was put into making Miller on Windows be able to handle the kinds of single-quoted expressions we're showing here. Still, if you get syntax-error messages on Windows using examples in this documentation, you can put the parts between single quotes into a file and refer to that using `mlr put -f` -- or, use the triple-double-quote trick as described in the [Miller on Windows page](miller-on-windows.md).
## Out-of-stream variables, begin, and end
-Above we saw that your expression is executed once per record -- if a file has a million records, your expression will be executed a million times, once for each record. But you can mark statements to only be executed once, either before the record stream begins, or after the record stream is ended. If you know about [AWK](https://en.wikipedia.org/wiki/AWK), you might have noticed that Miller's programming language is loosely inspired by it, including the `begin` and `end` statements.
+Above, we saw that your expression is executed once per record: if a file has a million records, your expression will be executed a million times, once for each record. But you can mark statements to be executed only once, either before the record stream begins or after the record stream has ended. If you know about [AWK](https://en.wikipedia.org/wiki/AWK), you might have noticed that Miller's programming language is loosely inspired by it, including the `begin` and `end` statements.
-Above we also saw that names like `$quantity` are bound to each record in turn.
+Above, we also saw that names like `$quantity` are bound to each record in turn.
To make `begin` and `end` statements useful, we need somewhere to put things that persist across the duration of the record stream, and a way to emit them. Miller uses [**out-of-stream variables**](reference-dsl-variables.md#out-of-stream-variables) (or **oosvars** for short) whose names start with an `@` sigil, along with the [`emit`](reference-dsl-output-statements.md#emit-statements) keyword to write them into the output record stream:
@@ -94,8 +94,8 @@ So, take this sum/count example as an indication of the kinds of things you can
Also inspired by [AWK](https://en.wikipedia.org/wiki/AWK), the Miller DSL has the following special [**context variables**](reference-dsl-variables.md#built-in-variables):
* `FILENAME` -- the filename the current record came from. Especially useful in things like `mlr ... *.csv`.
-* `FILENUM` -- similarly, but integer 1,2,3,... rather than filenam.e
-* `NF` -- the number of fields in the current record. Note that if you assign `$newcolumn = some value` then `NF` will increment.
+* `FILENUM` -- similarly, but integer 1,2,3,... rather than filename.
+* `NF` -- the number of fields in the current record. Note that if you assign `$newcolumn = some value`, then `NF` will increment.
* `NR` -- starting from 1, counter of how many records processed so far.
* `FNR` -- similar, but resets to 1 at the start of each file.
@@ -130,12 +130,12 @@ GENMD-EOF
Note that here we used the `-f` flag to `put` to load our function
definition, and also the `-e` flag to add another statement on the command
line. (We could have also put `$fact = factorial(NR)` inside
-`factorial-example.mlr` but that would have made that file less flexible for our
+`factorial-example.mlr`, but that would have made that file less flexible for our
future use.)
## If-statements, loops, and local variables
-Suppose you want to only compute sums conditionally -- you can use an `if` statement:
+Suppose you want to compute sums only conditionally -- you can use an `if` statement:
GENMD-RUN-COMMAND
cat if-example.mlr
@@ -152,7 +152,7 @@ page](reference-dsl-control-structures.md#for-loops), Miller has a few kinds of
for-loops. In addition to the usual 3-part `for (i = 0; i < 10; i += 1)` kind
that many programming languages have, Miller also lets you loop over
[maps](reference-main-maps.md) and [arrays](reference-main-arrays.md). We
-haven't encountered maps and arrays yet in this introduction, but for now it
+haven't encountered maps and arrays yet in this introduction, but for now, it
suffices to know that `$*` is a special variable holding the current record as
a map:
@@ -175,14 +175,14 @@ Here we used the local variables `k` and `v`. Now we've seen four kinds of varia
* Local variables like `k`
* Built-in context variables like `NF` and `NR`
-If you're curious about scope and extent of local variables, you can read more in the [section on variables](reference-dsl-variables.md).
+If you're curious about the scope and extent of local variables, you can read more in the [section on variables](reference-dsl-variables.md).
## Arithmetic
Numbers in Miller's programming language are intended to operate with the principle of least surprise:
* Internally, numbers are either 64-bit signed integers or double-precision floating-point.
-* Sums, differences, and products of integers are also integers (so `2*3=6` not `6.0`) -- unless the result of the operation would overflow a 64-bit signed integer in which case the result is automatically converted to float. (If you ever want integer-to-integer arithmetic, use `x .+ y`, `x .* y`, etc.)
+* Sums, differences, and products of integers are also integers (so `2*3=6` not `6.0`) -- unless the result of the operation would overflow a 64-bit signed integer, in which case the result is automatically converted to float. (If you ever want integer-to-integer arithmetic, use `x .+ y`, `x .* y`, etc.)
* Quotients of integers are integers if the division is exact, else floating-point: so `6/2=3` but `7/2=3.5`.
You can read more about this in the [arithmetic reference](reference-main-arithmetic.md).
@@ -197,7 +197,7 @@ see more in the [null-data reference](reference-main-null-data.md) but the
basic idea is:
* Adding a number to absent gives the number back. This means you don't have to put `@sum = 0` in your `begin` blocks.
-* Any variable which has the absent value is not assigned. This means you don't have to check presence of things from one record to the next.
+* Any variable that has the absent value is not assigned. This means you don't have to check the presence of things from one record to the next.
For example, you can sum up all the `$a` values across records without having to check whether they're present or not:
diff --git a/docs/src/mlr.1 b/docs/src/mlr.1
index 0a026dd4f..5aa5bd16f 100644
--- a/docs/src/mlr.1
+++ b/docs/src/mlr.1
@@ -722,7 +722,7 @@ Passes through input records with specified fields included/excluded.
-r Treat field names as regular expressions. "ab", "a.*b" will
match any field name containing the substring "ab" or matching
"a.*b", respectively; anchors of the form "^ab$", "^a.*b$" may
- be used. The -o flag is ignored when -r is present.
+ be used.
Examples:
mlr cut -f hostname,status
mlr cut -x -f hostname,status
diff --git a/docs/src/new-in-miller-6.md b/docs/src/new-in-miller-6.md
index 32633b6f8..86a52a40d 100644
--- a/docs/src/new-in-miller-6.md
+++ b/docs/src/new-in-miller-6.md
@@ -24,43 +24,23 @@ TL;DRs: [install](installing-miller.md), [binaries](https://github.com/johnkerl/
### Performance
-Performance is on par with Miller 5 for simple processing, and is far better than Miller 5 for
-complex processing chains -- the latter due to improved multicore utilization. CSV I/O is notably
-improved. See the [Performance benchmarks](#performance-benchmarks) section at the bottom of this
-page for details.
+Performance is on par with Miller 5 for simple processing, and is far better than Miller 5 for complex processing chains -- the latter due to improved multicore utilization. CSV I/O is notably improved. See the [Performance benchmarks](#performance-benchmarks) section at the bottom of this page for details.
### Documentation improvements
Documentation (what you're reading here) and online help (`mlr --help`) have been completely reworked.
-In the initial release, the focus was convincing users already familiar with
-`awk`/`grep`/`cut` that Miller was a viable alternative -- but over time it's
-become clear that many Miller users aren't expert with those tools. The focus
-has shifted toward a higher quantity of more introductory/accessible material
-for command-line data processing.
+In the initial release, the focus was on convincing users already familiar with `awk`, `grep`, and `cut` that Miller was a viable alternative; however, over time, it has become clear that many Miller users aren't experts with those tools. The focus has shifted toward a higher quantity of more introductory/accessible material for command-line data processing.
-Similarly, the FAQ/recipe material has been expanded to include more, and
-simpler, use-cases including resolved questions from
-[Miller Issues](https://github.com/johnkerl/miller/issues)
-and
-[Miller Discussions](https://github.com/johnkerl/miller/discussions);
-more complex/niche material has been pushed farther down. The long reference
-pages have been split up into separate pages. (See also
-[Structure of these documents](structure-of-these-documents.md).)
+Similarly, the FAQ/recipe material has been expanded to include more, and simpler, use-cases, including resolved questions from [Miller Issues](https://github.com/johnkerl/miller/issues) and [Miller Discussions](https://github.com/johnkerl/miller/discussions); more complex/niche material has been pushed farther down. The lengthy reference pages have been divided into separate pages. (See also [Structure of these documents](structure-of-these-documents.md).)
-One of the main feedback themes from the 2021 Miller User Survey was that some
-things should be easier to find. Namely, on each doc page there's now a banner
-across the top with things that should be one click away from the landing page
-(or any page): command-line flags, verbs, functions, glossary/acronyms, and a
-finder for docs by release.
+One of the main feedback themes from the 2021 Miller User Survey was that some things should be easier to find. Namely, on each doc page, there's now a banner across the top with things that should be one click away from the landing page (or any page): command-line flags, verbs, functions, glossary/acronyms, and a finder for docs by release.
-Since CSV is overwhelmingly the most popular data format for Miller, it is
-now discussed first, and more examples use CSV.
+Since CSV is overwhelmingly the most popular data format for Miller, it is now discussed first, and more examples use CSV.
### Improved Windows experience
-Stronger support for Windows (with or without MSYS2), with a couple of
-exceptions. See [Miller on Windows](miller-on-windows.md) for more information.
+Stronger support for Windows (with or without MSYS2), with a couple of exceptions. See [Miller on Windows](miller-on-windows.md) for more information.
Binaries are reliably available using GitHub Actions: see also [Installation](installing-miller.md).
@@ -89,9 +69,7 @@ Parse error on token ">" at line 63 column 7.
### Scripting
-Scripting is now easier -- support for `#!` with `sh`, as always, along with now support for `#!` with `mlr -s`. For
-Windows, `mlr -s` can also be used. These help reduce backslash-clutter and let you do more while typing less.
-See the [scripting page](scripting.md).
+Scripting is now easier -- support for `#!` with `sh`, as always, along with new support for `#!` with `mlr -s`. For Windows, `mlr -s` can also be used. These help reduce backslash clutter and let you do more while typing less. See the [scripting page](scripting.md).
### REPL
@@ -143,7 +121,7 @@ the `TZ` environment variable. Please see [DSL datetime/timezone functions](refe
### In-process support for compressed input
-In addition to `--prepipe gunzip`, you can now use the `--gzin` flag. In fact, if your files end in `.gz` you don't even need to do that -- Miller will autodetect by file extension and automatically uncompress `mlr --csv cat foo.csv.gz`. Similarly for `.z`, `.bz2`, and `.zst` files. Please see the page on [Compressed data](reference-main-compressed-data.md) for more information.
+In addition to `--prepipe gunzip`, you can now use the `--gzin` flag. In fact, if your files end in `.gz`, you don't even need to do that -- Miller will autodetect by file extension and automatically uncompress `mlr --csv cat foo.csv.gz`. The same applies to `.z`, `.bz2`, and `.zst` files. Please refer to the page on [Compressed Data](reference-main-compressed-data.md) for more information.
### Support for reading web URLs
@@ -171,9 +149,7 @@ purple,triangle,false,7,65,80.1405,5.8240
### Improved JSON / JSON Lines support, and arrays
-Arrays are now supported in Miller's `put`/`filter` programming language, as
-described in the [Arrays reference](reference-main-arrays.md). (Also, `array` is
-now a keyword so this is no longer usable as a local-variable or UDF name.)
+Arrays are now supported in Miller's `put`/`filter` programming language, as described in the [Arrays reference](reference-main-arrays.md). (Also, `array` is now a keyword, so this is no longer usable as a local variable or UDF name.)
JSON support is improved:
@@ -196,24 +172,13 @@ See also the [Arrays reference](reference-main-arrays.md) for more information.
### Improved numeric conversion
-The most central part of Miller 6 is a deep refactor of how data values are parsed
-from file contents, how types are inferred, and how they're converted back to
-text into output files.
+The most central part of Miller 6 is a deep refactor of how data values are parsed from file contents, how types are inferred, and how they're converted back to text into output files.
This was all initiated by [https://github.com/johnkerl/miller/issues/151](https://github.com/johnkerl/miller/issues/151).
-In Miller 5 and below, all values were stored as strings, then only converted
-to int/float as-needed, for example when a particular field was referenced in
-the `stats1` or `put` verbs. This led to awkwardnesses such as the `-S`
-and `-F` flags for `put` and `filter`.
+In Miller 5 and below, all values were stored as strings, then only converted to int/float as needed, for example, when a particular field was referenced in the `stats1` or `put` verbs. This led to awkwardnesses such as the `-S` and `-F` flags for `put` and `filter`.
-In Miller 6, things parseable as int/float are treated as such from the moment
-the input data is read, and these are passed along through the verb chain. All
-values are typed from when they're read, and their types are passed along.
-Meanwhile the original string representation of each value is also retained. If
-a numeric field isn't modified during the processing chain, it's printed out
-the way it arrived. Also, quoted values in JSON strings are flagged as being
-strings throughout the processing chain.
+In Miller 6, values parseable as integers or floating-point numbers are treated as such from the moment the input data is read, and these are passed along through the verb chain. All values are typed from when they're read, and their types are passed along. Meanwhile, the original string representation of each value is also retained. If a numeric field isn't modified during the processing chain, it's printed out the way it arrived. Additionally, quoted values in JSON strings are consistently flagged as strings throughout the processing chain.
For example (see [https://github.com/johnkerl/miller/issues/178](https://github.com/johnkerl/miller/issues/178)) you can now do
@@ -242,30 +207,21 @@ For example (see [https://github.com/johnkerl/miller/issues/178](https://github.
### Deduping of repeated field names
-By default, field names are deduped for all file formats except JSON / JSON Lines. So if you
-have an input record with `x=8,x=9` then the second field's key is renamed to
-`x_2` and so on -- the record scans as `x=8,x_2=9`. Use `mlr
---no-dedupe-field-names` to suppress this, and have the record be scanned as
-`x=9`.
+By default, field names are deduplicated for all file formats except JSON / JSON Lines. So if you have an input record with `x=8,x=9`, then the second field's key is renamed to `x_2` and so on -- the record scans as `x=8,x_2=9`. Use `mlr --no-dedupe-field-names` to suppress this, and have the record be scanned as `x=9`.
-For JSON and JSON Lines, the last duplicated key in an input record is always retained,
-regardless of `mlr --no-dedupe-field-names`: `{"x":8,"x":9}` scans as if it
-were `{"x":9}`.
+For JSON and JSON Lines, the last duplicated key in an input record is always retained, regardless of `mlr --no-dedupe-field-names`: `{"x":8,"x":9}` scans as if it were `{"x":9}`.
### Regex support for IFS and IPS
-You can now split fields on whitespace when whitespace is a mix of tabs and
-spaces. As well, you can use regular expressions for the input field-separator
-and the input pair-separator. Please see the section on
-[multi-character and regular-expression separators](reference-main-separators.md#multi-character-and-regular-expression-separators).
+You can now split fields on whitespace when whitespace is a mix of tabs and spaces. As well, you can use regular expressions for the input field-separator and the input pair-separator. Please see the section on [multi-character and regular-expression separators](reference-main-separators.md#multi-character-and-regular-expression-separators).
-In particular, for NIDX format, the default IFS now allows splitting on one or more of space or tab.
+In particular, for NIDX format, the default `IFS` now allows splitting on one or more of space or tab.
### Case-folded sorting options
-The [sort](reference-verbs.md#sort) verb now accepts `-c` and `-cr` options for case-folded ascending/descending sort, respetively.
+The [sort](reference-verbs.md#sort) verb now accepts `-c` and `-cr` options for case-folded ascending/descending sort, respectively.
-### New DSL functions / operators
+### New DSL functions and operators
* Higher-order functions [`select`](reference-dsl-builtin-functions.md#select), [`apply`](reference-dsl-builtin-functions.md#apply), [`reduce`](reference-dsl-builtin-functions.md#reduce), [`fold`](reference-dsl-builtin-functions.md#fold), and [`sort`](reference-dsl-builtin-functions.md#sort). See the [sorting page](sorting.md) and the [higher-order-functions page](reference-dsl-higher-order-functions.md) for more information.
@@ -293,30 +249,30 @@ The following differences are rather technical. If they don't sound familiar to
### Line endings
-The `--auto` flag is now ignored. Before, if a file had CR/LF (Windows-style) line endings on input (on any platform), it would have the same on output; likewise, LF (Unix-style) line endings. Now, files with CR/LF or LF line endings are processed on any platform, but the output line-ending is for the platform. E.g. reading CR/LF files on Linux will now produce LF output.
+The `--auto` flag is now ignored. Before, if a file had CR/LF (Windows-style) line endings on input (on any platform), it would have the same on output; likewise, LF (Unix-style) line endings. Now, files with CR/LF or LF line endings are processed on any platform, but the output line ending is for the platform. E.g., reading CR/LF files on Linux will now produce LF output.
### IFS and IPS as regular expressions
-IFS and IPS can be regular expressions now. Please see the section on [multi-character and regular-expression separators](reference-main-separators.md#multi-character-and-regular-expression-separators).
+IFS and IPS can now be regular expressions. Please see the section on [multi-character and regular-expression separators](reference-main-separators.md#multi-character-and-regular-expression-separators).
### JSON and JSON Lines formatting
* `--jknquoteint` and `jquoteall` are ignored; they were workarounds for the (now much-improved) type-inference and type-tracking in Miller 6.
* `--json-fatal-arrays-on-input`, `--json-map-arrays-on-input`, and `--json-skip-arrays-on-input` are ignored; Miller 6 now supports arrays fully.
* See also `mlr help legacy-flags` or the [legacy-flags reference](reference-main-flag-list.md#legacy-flags).
-* Miller 5 accepted input records either with or without enclosing `[...]`; on output, by default it produced single-line records without outermost `[...]`. Miller 5 let you customize output formatting using `--jvstack` (multi-line records) and `--jlistwrap` (write outermost `[...]`). _Thus, Miller 5's JSON output format, with default flags, was in fact [JSON Lines](file-formats.md#json-lines) all along._
+* Miller 5 accepted input records either with or without enclosing `[...]`; on output, by default, it produced single-line records without outermost `[...]`. Miller 5 let you customize output formatting using `--jvstack` (multi-line records) and `--jlistwrap` (write outermost `[...]`). _Thus, Miller 5's JSON output format, with default flags, was in fact [JSON Lines](file-formats.md#json-lines) all along._
* In Miller 6, [JSON Lines](file-formats.md#json-lines) is acknowledged explicitly.
* On input, your records are accepted whether or not they have outermost `[...]`, and regardless of line breaks, whether the specified input format is JSON or JSON Lines. (This is similar to [jq](https://stedolan.github.io/jq/).)
* With `--ojson`, output records are written multiline (pretty-printed), with outermost `[...]`.
* With `--ojsonl`, output records are written single-line, without outermost `[...]`.
* This makes `--jvstack` and `--jlistwrap` unnecessary. However, if you want outermost `[...]` with single-line records, you can use `--ojson --no-jvstack`.
-* Miller 5 tolerated trailing commas, which are not compliant with the JSON specification: for example, `{"x":1,"y":2,}`. Miller 6 uses a JSON parser which is compliant with the JSON specification and does not accept trailing commas.
+* Miller 5 tolerated trailing commas, which are not compliant with the JSON specification: for example, `{"x":1,"y":2,}`. Miller 6 uses a JSON parser that is compliant with the JSON specification and does not accept trailing commas.
### Type-inference
* The `-S` and `-F` flags to `mlr put` and `mlr filter` are ignored, since type-inference is no longer done in `mlr put` and `mlr filter`, but rather, when records are first read. You can use `mlr -S` and `mlr -A`, respectively, instead to control type-inference within the record-readers.
* Octal numbers like `0123` and `07` are type-inferred as string. Use `mlr -O` to infer them as octal integers. Note that `08` and `09` will then infer as decimal integers.
-* Any numbers prefix with `0o`, e.g. `0o377`, are already treated as octal regardless of `mlr -O` -- `mlr -O` only affects how leading-zero integers are handled.
+* Any numbers prefixed with `0o`, e.g., `0o377`, are already treated as octal, regardless of `mlr -O` -- `mlr -O` only affects how leading-zero integers are handled.
* See also the [miscellaneous-flags reference](reference-main-flag-list.md#miscellaneous-flags).
### Emit statements
@@ -341,13 +297,12 @@ This works in Miller 6 (and worked in Miller 5 as well) and is supported:
input=1
-Please see the [section on emit statements](reference-dsl-output-statements.md#emit1-and-emitemitpemitf)
-for more information.
+Please see the [section on emit statements](reference-dsl-output-statements.md#emit1-and-emitemitpemitf) for more information.
## Developer-specific aspects
* Miller has been ported from C to Go. Developer notes: [https://github.com/johnkerl/miller/blob/main/README-dev.md](https://github.com/johnkerl/miller/blob/main/README-dev.md).
-* Regression testing has been completely reworked, including regression-testing now running fully on Windows (alongside Linux and Mac) [on each GitHub commit](https://github.com/johnkerl/miller/actions).
+* Regression testing has been completely reworked, including regression-testing now running fully on Windows (alongside Linux and Mac) [on each GitHub commit](https://github.com/johnkerl/miller/actions).
## Performance benchmarks
diff --git a/docs/src/new-in-miller-6.md.in b/docs/src/new-in-miller-6.md.in
index c450a9622..2da9d3feb 100644
--- a/docs/src/new-in-miller-6.md.in
+++ b/docs/src/new-in-miller-6.md.in
@@ -8,43 +8,23 @@ TL;DRs: [install](installing-miller.md), [binaries](https://github.com/johnkerl/
### Performance
-Performance is on par with Miller 5 for simple processing, and is far better than Miller 5 for
-complex processing chains -- the latter due to improved multicore utilization. CSV I/O is notably
-improved. See the [Performance benchmarks](#performance-benchmarks) section at the bottom of this
-page for details.
+Performance is on par with Miller 5 for simple processing, and is far better than Miller 5 for complex processing chains -- the latter due to improved multicore utilization. CSV I/O is notably improved. See the [Performance benchmarks](#performance-benchmarks) section at the bottom of this page for details.
### Documentation improvements
Documentation (what you're reading here) and online help (`mlr --help`) have been completely reworked.
-In the initial release, the focus was convincing users already familiar with
-`awk`/`grep`/`cut` that Miller was a viable alternative -- but over time it's
-become clear that many Miller users aren't expert with those tools. The focus
-has shifted toward a higher quantity of more introductory/accessible material
-for command-line data processing.
+In the initial release, the focus was on convincing users already familiar with `awk`, `grep`, and `cut` that Miller was a viable alternative; however, over time, it has become clear that many Miller users aren't experts with those tools. The focus has shifted toward a higher quantity of more introductory/accessible material for command-line data processing.
-Similarly, the FAQ/recipe material has been expanded to include more, and
-simpler, use-cases including resolved questions from
-[Miller Issues](https://github.com/johnkerl/miller/issues)
-and
-[Miller Discussions](https://github.com/johnkerl/miller/discussions);
-more complex/niche material has been pushed farther down. The long reference
-pages have been split up into separate pages. (See also
-[Structure of these documents](structure-of-these-documents.md).)
+Similarly, the FAQ/recipe material has been expanded to include more, and simpler, use-cases, including resolved questions from [Miller Issues](https://github.com/johnkerl/miller/issues) and [Miller Discussions](https://github.com/johnkerl/miller/discussions); more complex/niche material has been pushed farther down. The lengthy reference pages have been divided into separate pages. (See also [Structure of these documents](structure-of-these-documents.md).)
-One of the main feedback themes from the 2021 Miller User Survey was that some
-things should be easier to find. Namely, on each doc page there's now a banner
-across the top with things that should be one click away from the landing page
-(or any page): command-line flags, verbs, functions, glossary/acronyms, and a
-finder for docs by release.
+One of the main feedback themes from the 2021 Miller User Survey was that some things should be easier to find. Namely, on each doc page, there's now a banner across the top with things that should be one click away from the landing page (or any page): command-line flags, verbs, functions, glossary/acronyms, and a finder for docs by release.
-Since CSV is overwhelmingly the most popular data format for Miller, it is
-now discussed first, and more examples use CSV.
+Since CSV is overwhelmingly the most popular data format for Miller, it is now discussed first, and more examples use CSV.
### Improved Windows experience
-Stronger support for Windows (with or without MSYS2), with a couple of
-exceptions. See [Miller on Windows](miller-on-windows.md) for more information.
+Stronger support for Windows (with or without MSYS2), with a couple of exceptions. See [Miller on Windows](miller-on-windows.md) for more information.
Binaries are reliably available using GitHub Actions: see also [Installation](installing-miller.md).
@@ -73,9 +53,7 @@ GENMD-EOF
### Scripting
-Scripting is now easier -- support for `#!` with `sh`, as always, along with now support for `#!` with `mlr -s`. For
-Windows, `mlr -s` can also be used. These help reduce backslash-clutter and let you do more while typing less.
-See the [scripting page](scripting.md).
+Scripting is now easier -- support for `#!` with `sh`, as always, along with new support for `#!` with `mlr -s`. For Windows, `mlr -s` can also be used. These help reduce backslash clutter and let you do more while typing less. See the [scripting page](scripting.md).
### REPL
@@ -125,7 +103,7 @@ the `TZ` environment variable. Please see [DSL datetime/timezone functions](refe
### In-process support for compressed input
-In addition to `--prepipe gunzip`, you can now use the `--gzin` flag. In fact, if your files end in `.gz` you don't even need to do that -- Miller will autodetect by file extension and automatically uncompress `mlr --csv cat foo.csv.gz`. Similarly for `.z`, `.bz2`, and `.zst` files. Please see the page on [Compressed data](reference-main-compressed-data.md) for more information.
+In addition to `--prepipe gunzip`, you can now use the `--gzin` flag. In fact, if your files end in `.gz` you don't even need to do that -- Miller will autodetect by file extension and automatically uncompress `mlr --csv cat foo.csv.gz`. The same applies to `.z`, `.bz2`, and `.zst` files. Please refer to the page on [Compressed Data](reference-main-compressed-data.md) for more information.
### Support for reading web URLs
@@ -140,9 +118,7 @@ GENMD-EOF
### Improved JSON / JSON Lines support, and arrays
-Arrays are now supported in Miller's `put`/`filter` programming language, as
-described in the [Arrays reference](reference-main-arrays.md). (Also, `array` is
-now a keyword so this is no longer usable as a local-variable or UDF name.)
+Arrays are now supported in Miller's `put`/`filter` programming language, as described in the [Arrays reference](reference-main-arrays.md). (Also, `array` is now a keyword, so this is no longer usable as a local variable or UDF name.)
JSON support is improved:
@@ -165,24 +141,13 @@ See also the [Arrays reference](reference-main-arrays.md) for more information.
### Improved numeric conversion
-The most central part of Miller 6 is a deep refactor of how data values are parsed
-from file contents, how types are inferred, and how they're converted back to
-text into output files.
+The most central part of Miller 6 is a deep refactor of how data values are parsed from file contents, how types are inferred, and how they're converted back to text into output files.
This was all initiated by [https://github.com/johnkerl/miller/issues/151](https://github.com/johnkerl/miller/issues/151).
-In Miller 5 and below, all values were stored as strings, then only converted
-to int/float as-needed, for example when a particular field was referenced in
-the `stats1` or `put` verbs. This led to awkwardnesses such as the `-S`
-and `-F` flags for `put` and `filter`.
+In Miller 5 and below, all values were stored as strings, then only converted to int/float as needed, for example, when a particular field was referenced in the `stats1` or `put` verbs. This led to awkwardnesses such as the `-S` and `-F` flags for `put` and `filter`.
-In Miller 6, things parseable as int/float are treated as such from the moment
-the input data is read, and these are passed along through the verb chain. All
-values are typed from when they're read, and their types are passed along.
-Meanwhile the original string representation of each value is also retained. If
-a numeric field isn't modified during the processing chain, it's printed out
-the way it arrived. Also, quoted values in JSON strings are flagged as being
-strings throughout the processing chain.
+In Miller 6, values parseable as integers or floating-point numbers are treated as such from the moment the input data is read, and these are passed along through the verb chain. All values are typed from when they're read, and their types are passed along. Meanwhile, the original string representation of each value is also retained. If a numeric field isn't modified during the processing chain, it's printed out the way it arrived. Additionally, quoted values in JSON strings are consistently flagged as strings throughout the processing chain.
For example (see [https://github.com/johnkerl/miller/issues/178](https://github.com/johnkerl/miller/issues/178)) you can now do
@@ -196,30 +161,21 @@ GENMD-EOF
### Deduping of repeated field names
-By default, field names are deduped for all file formats except JSON / JSON Lines. So if you
-have an input record with `x=8,x=9` then the second field's key is renamed to
-`x_2` and so on -- the record scans as `x=8,x_2=9`. Use `mlr
---no-dedupe-field-names` to suppress this, and have the record be scanned as
-`x=9`.
+By default, field names are deduplicated for all file formats except JSON / JSON Lines. So if you have an input record with `x=8,x=9`, then the second field's key is renamed to `x_2` and so on -- the record scans as `x=8,x_2=9`. Use `mlr --no-dedupe-field-names` to suppress this, and have the record be scanned as `x=9`.
-For JSON and JSON Lines, the last duplicated key in an input record is always retained,
-regardless of `mlr --no-dedupe-field-names`: `{"x":8,"x":9}` scans as if it
-were `{"x":9}`.
+For JSON and JSON Lines, the last duplicated key in an input record is always retained, regardless of `mlr --no-dedupe-field-names`: `{"x":8,"x":9}` scans as if it were `{"x":9}`.
### Regex support for IFS and IPS
-You can now split fields on whitespace when whitespace is a mix of tabs and
-spaces. As well, you can use regular expressions for the input field-separator
-and the input pair-separator. Please see the section on
-[multi-character and regular-expression separators](reference-main-separators.md#multi-character-and-regular-expression-separators).
+You can now split fields on whitespace when whitespace is a mix of tabs and spaces. As well, you can use regular expressions for the input field-separator and the input pair-separator. Please see the section on [multi-character and regular-expression separators](reference-main-separators.md#multi-character-and-regular-expression-separators).
-In particular, for NIDX format, the default IFS now allows splitting on one or more of space or tab.
+In particular, for NIDX format, the default `IFS` now allows splitting on one or more of space or tab.
### Case-folded sorting options
-The [sort](reference-verbs.md#sort) verb now accepts `-c` and `-cr` options for case-folded ascending/descending sort, respetively.
+The [sort](reference-verbs.md#sort) verb now accepts `-c` and `-cr` options for case-folded ascending/descending sort, respectively.
-### New DSL functions / operators
+### New DSL functions and operators
* Higher-order functions [`select`](reference-dsl-builtin-functions.md#select), [`apply`](reference-dsl-builtin-functions.md#apply), [`reduce`](reference-dsl-builtin-functions.md#reduce), [`fold`](reference-dsl-builtin-functions.md#fold), and [`sort`](reference-dsl-builtin-functions.md#sort). See the [sorting page](sorting.md) and the [higher-order-functions page](reference-dsl-higher-order-functions.md) for more information.
@@ -247,30 +203,30 @@ The following differences are rather technical. If they don't sound familiar to
### Line endings
-The `--auto` flag is now ignored. Before, if a file had CR/LF (Windows-style) line endings on input (on any platform), it would have the same on output; likewise, LF (Unix-style) line endings. Now, files with CR/LF or LF line endings are processed on any platform, but the output line-ending is for the platform. E.g. reading CR/LF files on Linux will now produce LF output.
+The `--auto` flag is now ignored. Before, if a file had CR/LF (Windows-style) line endings on input (on any platform), it would have the same on output; likewise, LF (Unix-style) line endings. Now, files with CR/LF or LF line endings are processed on any platform, but the output line ending is for the platform. E.g., reading CR/LF files on Linux will now produce LF output.
### IFS and IPS as regular expressions
-IFS and IPS can be regular expressions now. Please see the section on [multi-character and regular-expression separators](reference-main-separators.md#multi-character-and-regular-expression-separators).
+IFS and IPS can now be regular expressions. Please see the section on [multi-character and regular-expression separators](reference-main-separators.md#multi-character-and-regular-expression-separators).
### JSON and JSON Lines formatting
* `--jknquoteint` and `jquoteall` are ignored; they were workarounds for the (now much-improved) type-inference and type-tracking in Miller 6.
* `--json-fatal-arrays-on-input`, `--json-map-arrays-on-input`, and `--json-skip-arrays-on-input` are ignored; Miller 6 now supports arrays fully.
* See also `mlr help legacy-flags` or the [legacy-flags reference](reference-main-flag-list.md#legacy-flags).
-* Miller 5 accepted input records either with or without enclosing `[...]`; on output, by default it produced single-line records without outermost `[...]`. Miller 5 let you customize output formatting using `--jvstack` (multi-line records) and `--jlistwrap` (write outermost `[...]`). _Thus, Miller 5's JSON output format, with default flags, was in fact [JSON Lines](file-formats.md#json-lines) all along._
+* Miller 5 accepted input records either with or without enclosing `[...]`; on output, by default, it produced single-line records without outermost `[...]`. Miller 5 let you customize output formatting using `--jvstack` (multi-line records) and `--jlistwrap` (write outermost `[...]`). _Thus, Miller 5's JSON output format, with default flags, was in fact [JSON Lines](file-formats.md#json-lines) all along._
* In Miller 6, [JSON Lines](file-formats.md#json-lines) is acknowledged explicitly.
* On input, your records are accepted whether or not they have outermost `[...]`, and regardless of line breaks, whether the specified input format is JSON or JSON Lines. (This is similar to [jq](https://stedolan.github.io/jq/).)
* With `--ojson`, output records are written multiline (pretty-printed), with outermost `[...]`.
* With `--ojsonl`, output records are written single-line, without outermost `[...]`.
* This makes `--jvstack` and `--jlistwrap` unnecessary. However, if you want outermost `[...]` with single-line records, you can use `--ojson --no-jvstack`.
-* Miller 5 tolerated trailing commas, which are not compliant with the JSON specification: for example, `{"x":1,"y":2,}`. Miller 6 uses a JSON parser which is compliant with the JSON specification and does not accept trailing commas.
+* Miller 5 tolerated trailing commas, which are not compliant with the JSON specification: for example, `{"x":1,"y":2,}`. Miller 6 uses a JSON parser that is compliant with the JSON specification and does not accept trailing commas.
### Type-inference
* The `-S` and `-F` flags to `mlr put` and `mlr filter` are ignored, since type-inference is no longer done in `mlr put` and `mlr filter`, but rather, when records are first read. You can use `mlr -S` and `mlr -A`, respectively, instead to control type-inference within the record-readers.
* Octal numbers like `0123` and `07` are type-inferred as string. Use `mlr -O` to infer them as octal integers. Note that `08` and `09` will then infer as decimal integers.
-* Any numbers prefix with `0o`, e.g. `0o377`, are already treated as octal regardless of `mlr -O` -- `mlr -O` only affects how leading-zero integers are handled.
+* Any numbers prefixed with `0o`, e.g. `0o377`, are already treated as octal, regardless of `mlr -O` -- `mlr -O` only affects how leading-zero integers are handled.
* See also the [miscellaneous-flags reference](reference-main-flag-list.md#miscellaneous-flags).
### Emit statements
@@ -290,13 +246,12 @@ GENMD-RUN-COMMAND
mlr -n put 'end {@input={"a":1}; emit1 {"input":@input["a"]}}'
GENMD-EOF
-Please see the [section on emit statements](reference-dsl-output-statements.md#emit1-and-emitemitpemitf)
-for more information.
+Please see the [section on emit statements](reference-dsl-output-statements.md#emit1-and-emitemitpemitf) for more information.
## Developer-specific aspects
* Miller has been ported from C to Go. Developer notes: [https://github.com/johnkerl/miller/blob/main/README-dev.md](https://github.com/johnkerl/miller/blob/main/README-dev.md).
-* Regression testing has been completely reworked, including regression-testing now running fully on Windows (alongside Linux and Mac) [on each GitHub commit](https://github.com/johnkerl/miller/actions).
+* Regression testing has been completely reworked, including regression-testing now running fully on Windows (alongside Linux and Mac) [on each GitHub commit](https://github.com/johnkerl/miller/actions).
## Performance benchmarks
diff --git a/docs/src/online-help.md b/docs/src/online-help.md
index 5bbee15a1..bb8185e10 100644
--- a/docs/src/online-help.md
+++ b/docs/src/online-help.md
@@ -55,6 +55,7 @@ Flags:
mlr help comments-in-data-flags
mlr help compressed-data-flags
mlr help csv/tsv-only-flags
+ mlr help dkvp-only-flags
mlr help file-format-flags
mlr help flatten-unflatten-flags
mlr help format-conversion-keystroke-saver-flags
@@ -230,6 +231,7 @@ Options:
-nf {comma-separated field names} Same as -n
-nr {comma-separated field names} Numerical descending; nulls sort first
-t {comma-separated field names} Natural ascending
+-b Move sort fields to start of record, as in reorder -b
-tr|-rt {comma-separated field names} Natural descending
-h|--help Show this message.
diff --git a/docs/src/operating-on-all-records.md b/docs/src/operating-on-all-records.md
index 6663f1c18..668dcc367 100644
--- a/docs/src/operating-on-all-records.md
+++ b/docs/src/operating-on-all-records.md
@@ -274,8 +274,6 @@ array will have [null-gaps](reference-main-arrays.md) in it:
"value": 54
}
]
-[
-]
You can index `@records` by `@count` rather than `NR` to get a contiguous array:
diff --git a/docs/src/originality.md b/docs/src/originality.md
index 7ceb77ab3..6e7fd8c49 100644
--- a/docs/src/originality.md
+++ b/docs/src/originality.md
@@ -16,7 +16,7 @@ Quick links:
# How original is Miller?
-It isn't. Miller is one of many, many participants in the online-analytical-processing culture. Other key participants include `awk`, SQL, spreadsheets, etc. etc. etc. Far from being an original concept, Miller explicitly strives to imitate several existing tools:
+It isn't. Miller is just one of many participants in the online analytical processing culture. Other key participants include `awk`, SQL, spreadsheets, etc. etc. etc. Far from being an original concept, Miller explicitly strives to imitate several existing tools:
**The Unix toolkit**: Intentional similarities as described in [Unix-toolkit Context](unix-toolkit-context.md).
@@ -26,7 +26,7 @@ Recipes abound for command-line data analysis using the Unix toolkit. Here are j
* [http://www.gregreda.com/2013/07/15/unix-commands-for-data-science](http://www.gregreda.com/2013/07/15/unix-commands-for-data-science)
* [https://github.com/dbohdan/structured-text-tools](https://github.com/dbohdan/structured-text-tools)
-**RecordStream**: Miller owes particular inspiration to [RecordStream](https://github.com/benbernard/RecordStream). The key difference is that RecordStream is a Perl-based tool for manipulating JSON (including requiring it to separately manipulate other formats such as CSV into and out of JSON), while Miller is fast Go which handles its formats natively. The similarities include the `sort`, `stats1` (analog of RecordStream's `collate`), and `delta` operations, as well as `filter` and `put`, and pretty-print formatting.
+**RecordStream**: Miller owes particular inspiration to [RecordStream](https://github.com/benbernard/RecordStream). The key difference is that RecordStream is a Perl-based tool for manipulating JSON (including requiring it to separately manipulate other formats such as CSV into and out of JSON), while Miller is a fast Go tool that handles its formats natively. The similarities include the `sort`, `stats1` (analogous to RecordStream's `collate`), and `delta` operations, as well as `filter` and `put`, and the use of pretty-print formatting.
**stats_m**: A third source of lineage is my Python [stats_m](https://github.com/johnkerl/scripts-math/tree/master/stats) module. This includes simple single-pass algorithms which form Miller's `stats1` and `stats2` subcommands.
@@ -35,21 +35,21 @@ Recipes abound for command-line data analysis using the Unix toolkit. Here are j
**Added value**: Miller's added values include:
* Name-indexing, compared to the Unix toolkit's positional indexing.
-* Raw speed, compared to `awk`, RecordStream, `stats_m`, or various other kinds of Python/Ruby/etc. scripts one can easily create.
+* Raw speed, compared to `awk`, RecordStream, `stats_m`, or various other kinds of Python/Ruby/etc. scripts that one can easily create.
* Compact keystroking for many common tasks, with a decent amount of flexibility.
-* Ability to handle text files on the Unix pipe, without need for creating database tables, compared to SQL databases.
+* Ability to handle text files on the Unix pipe, without the need for creating database tables, compared to SQL databases.
* Various file formats, and on-the-fly format conversion.
**jq**: Miller does for name-indexed text what [jq](https://stedolan.github.io/jq/) does for JSON. If you're not already familiar with `jq`, please check it out!.
**What about similar tools?**
-Here's a comprehensive list: [https://github.com/dbohdan/structured-text-tools](https://github.com/dbohdan/structured-text-tools). Last I knew it doesn't mention [rows](https://github.com/turicas/rows) so here's a plug for that as well. As it turns out, I learned about most of these after writing Miller.
+Here's a comprehensive list: [https://github.com/dbohdan/structured-text-tools](https://github.com/dbohdan/structured-text-tools). Last I knew, it doesn't mention [rows](https://github.com/turicas/rows) so here's a plug for that as well. As it turns out, I learned about most of these after writing Miller.
-**What about DOTADIW?** One of the key points of the [Unix philosophy](http://en.wikipedia.org/wiki/Unix_philosophy) is that a tool should do one thing and do it well. Hence `sort` and `cut` do just one thing. Why does Miller put `awk`-like processing, a few SQL-like operations, and statistical reduction all into one tool? This is a fair question. First note that many standard tools, such as `awk` and `perl`, do quite a few things -- as does `jq`. But I could have pushed for putting format awareness and name-indexing options into `cut`, `awk`, and so on (so you could do `cut -f hostname,uptime` or `awk '{sum += $x*$y}END{print sum}'`). Patching `cut`, `sort`, etc. on multiple operating systems is a non-starter in terms of uptake. Moreover, it makes sense for me to have Miller be a tool which collects together format-aware record-stream processing into one place, with good reuse of Miller-internal library code for its various features.
+**What about DOTADIW?** One of the key points of the [Unix philosophy](http://en.wikipedia.org/wiki/Unix_philosophy) is that a tool should do one thing and do it well. Hence, `sort` and `cut` do just one thing. Why does Miller put `awk`-like processing, a few SQL-like operations, and statistical reduction all into one tool? This is a fair question. First, note that many standard tools, such as `awk` and `perl`, do quite a few things -- as does `jq`. But I could have pushed for putting format awareness and name-indexing options into `cut`, `awk`, and so on (so you could do `cut -f hostname,uptime` or `awk '{sum += $x*$y}END{print sum}'`). Patching `cut`, `sort`, etc., on multiple operating systems is a non-starter in terms of uptake. Moreover, it makes sense for me to have Miller be a tool that collects together format-aware record-stream processing into one place, with good reuse of Miller's internal library code for its various features.
-**Why not use Perl/Python/Ruby etc.?** Maybe you should. With those tools you'll get far more expressive power, and sufficiently quick turnaround time for small-to-medium-sized data. Using Miller you'll get something less than a complete programming language, but which is fast, with moderate amounts of flexibility and much less keystroking.
+**Why not use Perl/Python/Ruby, etc.?** Maybe you should. With those tools, you'll gain significantly more expressive power and a sufficiently quick turnaround time for small to medium-sized datasets. Using Miller, you'll get something less than a complete programming language, but which is fast, with moderate amounts of flexibility and much less keystroking.
-When I was first developing Miller I made a survey of several languages. Using low-level implementation languages like C, Go, Rust, and Nim, I'd need to create my own domain-specific language (DSL) which would always be less featured than a full programming language, but I'd get better performance. Using high-level interpreted languages such as Perl/Python/Ruby I'd get the language's `eval` for free and I wouldn't need a DSL; Miller would have mainly been a set of format-specific I/O hooks. If I'd gotten good enough performance from the latter I'd have done it without question and Miller would be far more flexible. But low-level languages win the performance criteria by a landslide so we have Miller in Go with a custom DSL.
+When I was first developing Miller, I made a survey of several languages. Using low-level implementation languages like C, Go, Rust, and Nim, I'd need to create my own domain-specific language (DSL), which would always be less featured than a full programming language, but I'd get better performance. Using high-level interpreted languages such as Perl/Python/Ruby, I'd get the language's `eval` for free and I wouldn't need a DSL; Miller would have mainly been a set of format-specific I/O hooks. If I'd gotten good enough performance from the latter, I'd have done it without question, and Miller would be far more flexible. But low-level languages win the performance criteria by a landslide, so we have Miller in Go with a custom DSL.
-**No, really, why one more command-line data-manipulation tool?** I wrote Miller because I was frustrated with tools like `grep`, `sed`, and so on being *line-aware* without being *format-aware*. The single most poignant example I can think of is seeing people grep data lines out of their CSV files and sadly losing their header lines. While some lighter-than-SQL processing is very nice to have, at core I wanted the format-awareness of [RecordStream](https://github.com/benbernard/RecordStream) combined with the raw speed of the Unix toolkit. Miller does precisely that.
+**No, really, why one more command-line data-manipulation tool?** I wrote Miller because I was frustrated with tools like `grep`, `sed`, and so on being *line-aware* without being *format-aware*. The single most poignant example I can think of is seeing people grep data lines from their CSV files and sadly losing their header lines. While some lighter-than-SQL processing is very nice to have, at core I wanted the format-awareness of [RecordStream](https://github.com/benbernard/RecordStream) combined with the raw speed of the Unix toolkit. Miller does precisely that.
diff --git a/docs/src/originality.md.in b/docs/src/originality.md.in
index d6825a9d1..15875e183 100644
--- a/docs/src/originality.md.in
+++ b/docs/src/originality.md.in
@@ -1,6 +1,6 @@
# How original is Miller?
-It isn't. Miller is one of many, many participants in the online-analytical-processing culture. Other key participants include `awk`, SQL, spreadsheets, etc. etc. etc. Far from being an original concept, Miller explicitly strives to imitate several existing tools:
+It isn't. Miller is just one of many participants in the online analytical processing culture. Other key participants include `awk`, SQL, spreadsheets, etc. etc. etc. Far from being an original concept, Miller explicitly strives to imitate several existing tools:
**The Unix toolkit**: Intentional similarities as described in [Unix-toolkit Context](unix-toolkit-context.md).
@@ -10,7 +10,7 @@ Recipes abound for command-line data analysis using the Unix toolkit. Here are j
* [http://www.gregreda.com/2013/07/15/unix-commands-for-data-science](http://www.gregreda.com/2013/07/15/unix-commands-for-data-science)
* [https://github.com/dbohdan/structured-text-tools](https://github.com/dbohdan/structured-text-tools)
-**RecordStream**: Miller owes particular inspiration to [RecordStream](https://github.com/benbernard/RecordStream). The key difference is that RecordStream is a Perl-based tool for manipulating JSON (including requiring it to separately manipulate other formats such as CSV into and out of JSON), while Miller is fast Go which handles its formats natively. The similarities include the `sort`, `stats1` (analog of RecordStream's `collate`), and `delta` operations, as well as `filter` and `put`, and pretty-print formatting.
+**RecordStream**: Miller owes particular inspiration to [RecordStream](https://github.com/benbernard/RecordStream). The key difference is that RecordStream is a Perl-based tool for manipulating JSON (including requiring it to separately manipulate other formats such as CSV into and out of JSON), while Miller is a fast Go tool that handles its formats natively. The similarities include the `sort`, `stats1` (analogous to RecordStream's `collate`), and `delta` operations, as well as `filter` and `put`, and the use of pretty-print formatting.
**stats_m**: A third source of lineage is my Python [stats_m](https://github.com/johnkerl/scripts-math/tree/master/stats) module. This includes simple single-pass algorithms which form Miller's `stats1` and `stats2` subcommands.
@@ -19,21 +19,21 @@ Recipes abound for command-line data analysis using the Unix toolkit. Here are j
**Added value**: Miller's added values include:
* Name-indexing, compared to the Unix toolkit's positional indexing.
-* Raw speed, compared to `awk`, RecordStream, `stats_m`, or various other kinds of Python/Ruby/etc. scripts one can easily create.
+* Raw speed, compared to `awk`, RecordStream, `stats_m`, or various other kinds of Python/Ruby/etc. scripts that one can easily create.
* Compact keystroking for many common tasks, with a decent amount of flexibility.
-* Ability to handle text files on the Unix pipe, without need for creating database tables, compared to SQL databases.
+* Ability to handle text files on the Unix pipe, without the need for creating database tables, compared to SQL databases.
* Various file formats, and on-the-fly format conversion.
**jq**: Miller does for name-indexed text what [jq](https://stedolan.github.io/jq/) does for JSON. If you're not already familiar with `jq`, please check it out!.
**What about similar tools?**
-Here's a comprehensive list: [https://github.com/dbohdan/structured-text-tools](https://github.com/dbohdan/structured-text-tools). Last I knew it doesn't mention [rows](https://github.com/turicas/rows) so here's a plug for that as well. As it turns out, I learned about most of these after writing Miller.
+Here's a comprehensive list: [https://github.com/dbohdan/structured-text-tools](https://github.com/dbohdan/structured-text-tools). Last I knew, it didn't mention [rows](https://github.com/turicas/rows), so here's a plug for that as well. As it turns out, I learned about most of these after writing Miller.
-**What about DOTADIW?** One of the key points of the [Unix philosophy](http://en.wikipedia.org/wiki/Unix_philosophy) is that a tool should do one thing and do it well. Hence `sort` and `cut` do just one thing. Why does Miller put `awk`-like processing, a few SQL-like operations, and statistical reduction all into one tool? This is a fair question. First note that many standard tools, such as `awk` and `perl`, do quite a few things -- as does `jq`. But I could have pushed for putting format awareness and name-indexing options into `cut`, `awk`, and so on (so you could do `cut -f hostname,uptime` or `awk '{sum += $x*$y}END{print sum}'`). Patching `cut`, `sort`, etc. on multiple operating systems is a non-starter in terms of uptake. Moreover, it makes sense for me to have Miller be a tool which collects together format-aware record-stream processing into one place, with good reuse of Miller-internal library code for its various features.
+**What about DOTADIW?** One of the key points of the [Unix philosophy](http://en.wikipedia.org/wiki/Unix_philosophy) is that a tool should do one thing and do it well. Hence, `sort` and `cut` do just one thing. Why does Miller put `awk`-like processing, a few SQL-like operations, and statistical reduction all into one tool? This is a fair question. First, note that many standard tools, such as `awk` and `perl`, do quite a few things -- as does `jq`. But I could have pushed for putting format awareness and name-indexing options into `cut`, `awk`, and so on (so you could do `cut -f hostname,uptime` or `awk '{sum += $x*$y}END{print sum}'`). Patching `cut`, `sort`, etc., on multiple operating systems is a non-starter in terms of uptake. Moreover, it makes sense for me to have Miller be a tool that collects together format-aware record-stream processing into one place, with good reuse of Miller's internal library code for its various features.
-**Why not use Perl/Python/Ruby etc.?** Maybe you should. With those tools you'll get far more expressive power, and sufficiently quick turnaround time for small-to-medium-sized data. Using Miller you'll get something less than a complete programming language, but which is fast, with moderate amounts of flexibility and much less keystroking.
+**Why not use Perl/Python/Ruby, etc.?** Maybe you should. With those tools, you'll gain significantly more expressive power and a sufficiently quick turnaround time for small- to medium-sized datasets. Using Miller, you'll get something less than a complete programming language, but one that is fast, with moderate amounts of flexibility and much less keystroking.
-When I was first developing Miller I made a survey of several languages. Using low-level implementation languages like C, Go, Rust, and Nim, I'd need to create my own domain-specific language (DSL) which would always be less featured than a full programming language, but I'd get better performance. Using high-level interpreted languages such as Perl/Python/Ruby I'd get the language's `eval` for free and I wouldn't need a DSL; Miller would have mainly been a set of format-specific I/O hooks. If I'd gotten good enough performance from the latter I'd have done it without question and Miller would be far more flexible. But low-level languages win the performance criteria by a landslide so we have Miller in Go with a custom DSL.
+When I was first developing Miller, I made a survey of several languages. Using low-level implementation languages like C, Go, Rust, and Nim, I'd need to create my own domain-specific language (DSL), which would always be less fully featured than a full programming language, but I'd get better performance. Using high-level interpreted languages such as Perl/Python/Ruby, I'd get the language's `eval` for free and I wouldn't need a DSL; Miller would have mainly been a set of format-specific I/O hooks. If I'd gotten good enough performance from the latter, I'd have done it without question, and Miller would be far more flexible. But low-level languages win on performance by a landslide, so we have Miller in Go with a custom DSL.
-**No, really, why one more command-line data-manipulation tool?** I wrote Miller because I was frustrated with tools like `grep`, `sed`, and so on being *line-aware* without being *format-aware*. The single most poignant example I can think of is seeing people grep data lines out of their CSV files and sadly losing their header lines. While some lighter-than-SQL processing is very nice to have, at core I wanted the format-awareness of [RecordStream](https://github.com/benbernard/RecordStream) combined with the raw speed of the Unix toolkit. Miller does precisely that.
+**No, really, why one more command-line data-manipulation tool?** I wrote Miller because I was frustrated with tools like `grep`, `sed`, and so on being *line-aware* without being *format-aware*. The single most poignant example I can think of is seeing people grep data lines from their CSV files and sadly losing their header lines. While some lighter-than-SQL processing is very nice to have, at core I wanted the format-awareness of [RecordStream](https://github.com/benbernard/RecordStream) combined with the raw speed of the Unix toolkit. Miller does precisely that.
diff --git a/docs/src/output-colorization.md b/docs/src/output-colorization.md
index 6282d2172..e94cfe91a 100644
--- a/docs/src/output-colorization.md
+++ b/docs/src/output-colorization.md
@@ -50,7 +50,7 @@ described below:
* Suppression/unsuppression:
- * `export MLR_NO_COLOR=true` means Miller won't color even when it normally would.
+ * `export MLR_NO_COLOR=true` or `export NO_COLOR=true` means Miller won't color even when it normally would.
* `export MLR_ALWAYS_COLOR=true` means Miller will color even when it normally would not. For example, you might want to use this when piping `mlr` output to `less -r`.
* Command-line flags `--no-color` or `-M`, `--always-color` or `-C`.
* On Windows, replace `export` with `set`
diff --git a/docs/src/output-colorization.md.in b/docs/src/output-colorization.md.in
index e60a53887..8c32c3f9b 100644
--- a/docs/src/output-colorization.md.in
+++ b/docs/src/output-colorization.md.in
@@ -34,7 +34,7 @@ described below:
* Suppression/unsuppression:
- * `export MLR_NO_COLOR=true` means Miller won't color even when it normally would.
+ * `export MLR_NO_COLOR=true` or `export NO_COLOR=true` means Miller won't color even when it normally would.
* `export MLR_ALWAYS_COLOR=true` means Miller will color even when it normally would not. For example, you might want to use this when piping `mlr` output to `less -r`.
* Command-line flags `--no-color` or `-M`, `--always-color` or `-C`.
* On Windows, replace `export` with `set`
diff --git a/docs/src/questions-about-joins.md b/docs/src/questions-about-joins.md
index b8bde2d46..e3974877e 100644
--- a/docs/src/questions-about-joins.md
+++ b/docs/src/questions-about-joins.md
@@ -118,9 +118,7 @@ However, if we ask for left-unpaireds, since there's no `color` column, we get a
id,code,color
4,ff0000,red
2,00ff00,green
-
-id,code
-3,0000ff
+3,0000ff,
To fix this, we can use **unsparsify**:
diff --git a/docs/src/record-heterogeneity.md b/docs/src/record-heterogeneity.md
index d02a52448..1eb7eb0b5 100644
--- a/docs/src/record-heterogeneity.md
+++ b/docs/src/record-heterogeneity.md
@@ -16,12 +16,11 @@ Quick links:
# Record-heterogeneity
-We think of CSV tables as rectangular: if there are 17 columns in the header
-then there are 17 columns for every row, else the data have a formatting error.
+We think of CSV tables as rectangular: if there are 17 columns in the header, then there are 17 columns for every row, else the data has a formatting error.
But heterogeneous data abound -- log-file entries, JSON documents, no-SQL
databases such as MongoDB, etc. -- not to mention **data-cleaning
-opportunities** we'll look at in this page. Miller offers several ways to
+opportunities** we'll look at on this page. Miller offers several ways to
handle data heterogeneity.
## Terminology, examples, and solutions
@@ -56,7 +55,7 @@ It has three records (written here using JSON Lines formatting):
Here every row has the same keys, in the same order: `a,b,c`.
-These are also sometimes called **rectangular** since if we pretty-print them we get a nice rectangle:
+These are also sometimes called **rectangular** since if we pretty-print them, we get a nice rectangle:
mlr --icsv --opprint cat data/het/hom.csv @@ -94,7 +93,7 @@ a,b,c This example is still homogeneous, though: every row has the same keys, in the same order: `a,b,c`. Empty values don't make the data heterogeneous. -Note however that we can use the [`fill-empty`](reference-verbs.md#fill-empty) verb to make these +Note, however, that we can use the [`fill-empty`](reference-verbs.md#fill-empty) verb to make these values non-empty, if we like:@@ -109,7 +108,7 @@ filler 8 9 ### Ragged data -Next let's look at non-well-formed CSV files. For a third example: +Next, let's look at non-well-formed CSV files. For a third example:cat data/het/ragged.csv @@ -130,17 +129,11 @@ If you `mlr --csv cat` this, you'll get an error message: a,b,c 1,2,3 mlr: mlr: CSV header/data length mismatch 3 != 2 at filename data/het/ragged.csv row 3. -.-There are two kinds of raggedness here. Since CSVs form records by zipping the -keys from the header line together with the values from each data line, the -second record has a missing value for key `c` (which ought to be fillable), -while the third record has a value `10` with no key for it. +There are two kinds of raggedness here. Since CSVs form records by zipping the keys from the header line, together with the values from each data line, the second record has a missing value for key `c` (which ought to be fillable), while the third record has a value `10` with no key for it. -Using the [`--allow-ragged-csv-input` flag](reference-main-flag-list.md#csv-only-flags) -we can fill values in too-short rows, and provide a key (column number starting -with 1) for too-long rows: +Using the [`--allow-ragged-csv-input` flag](reference-main-flag-list.md#csv-only-flags), we can fill values in too-short rows and provide a key (column number starting with 1) for too-long rows:mlr --icsv --ojson --allow-ragged-csv-input cat data/het/ragged.csv @@ -187,7 +180,7 @@ This kind of data arises often in practice. 
One reason is that, while many programming languages (including the Miller DSL) [preserve insertion order](reference-main-maps.md#insertion-order-is-preserved) in maps; others do not. So someone might have written `{"a":4,"b":5,"c":6}` in the source code, -but the data may not have printed that way into a given data file. +but the data may not have been printed that way into a given data file. We can use the [`regularize`](reference-verbs.md#regularize) or [`sort-within-records`](reference-verbs.md#sort-within-records) verb to order @@ -204,13 +197,13 @@ the keys: The `regularize` verb tries to re-order subsequent rows to look like the first (whatever order that is); the `sort-within-records` verb simply uses -alphabetical order (which is the same in the above example where the first +alphabetical order (which is the same in the above example, where the first record has keys in the order `a,b,c`). ### Sparse data Here's another frequently occurring situation -- quite often, systems will log -data for items which are present, but won't log data for items which aren't. +data for items that are present, but won't log data for items that aren't.mlr --json cat data/het/sparse.json @@ -237,8 +230,7 @@ data for items which are present, but won't log data for items which aren't. This data is called **sparse** (from the [data-storage term](https://en.wikipedia.org/wiki/Sparse_matrix)). -We can use the [`unsparsify`](reference-verbs.md#unsparsify) verb to make sure -every record has the same keys: +We can use the [`unsparsify`](reference-verbs.md#unsparsify) verb to make sure every record has the same keys:mlr --json unsparsify data/het/sparse.json @@ -283,12 +275,11 @@ xy55.east - /dev/sda1 failover true ## Reading and writing heterogeneous data -In the previous sections we saw different kinds of data heterogeneity, and ways -to transform the data to make it homogeneous. 
+In the previous sections, we saw different kinds of data heterogeneity and ways to transform the data to make it homogeneous. ### Non-rectangular file formats: JSON, XTAB, NIDX, DKVP -For these formats, record-heterogeneity comes naturally: +For these formats, record heterogeneity comes naturally:cat data/het/sparse.json @@ -372,16 +363,15 @@ record_count=150,resource=/path/to/second/file ### Rectangular file formats: CSV and pretty-print -CSV and pretty-print formats expect rectangular structure. But Miller lets you +CSV and pretty-print formats expect a rectangular structure. But Miller lets you process non-rectangular using CSV and pretty-print. -Miller simply prints a newline and a new header when there is a schema change --- where by _schema_ we mean simply the list of record keys in the order they -are encountered. When there is no schema change, you get CSV per se as a -special case. Likewise, Miller reads heterogeneous CSV or pretty-print input -the same way. The difference between CSV and CSV-lite is that the former is -[RFC-4180-compliant](file-formats.md#csvtsvasvusvetc), while the latter readily -handles heterogeneous data (which is non-compliant). For example: +For CSV-lite and TSV-lite, Miller prints a newline and a new header when there is a schema +change -- where by _schema_ we mean the list of record keys in the order they are +encountered. When there is no schema change, you get CSV per se as a special case. Likewise, Miller +reads heterogeneous CSV or pretty-print input the same way. The difference between CSV and CSV-lite +is that the former is [RFC-4180-compliant](file-formats.md#csvtsvasvusvetc), while the latter +readily handles heterogeneous data (which is non-compliant). For example:cat data/het.json @@ -446,30 +436,52 @@ record_count resource 150 /path/to/second/file-Miller handles explicit header changes as just shown. 
If your CSV input contains ragged data -- if there are implicit header changes (no intervening blank line and new header line) as seen above -- you can use `--allow-ragged-csv-input` (or keystroke-saver `--ragged`). ++mlr --ijson --ocsvlite group-like data/het.json +++resource,loadsec,ok +/path/to/file,0.45,true +/path/to/second/file,0.32,true +/some/other/path,0.97,false + +record_count,resource +100,/path/to/file +150,/path/to/second/file +-mlr --csv --ragged cat data/het/ragged.csv +mlr --ijson --ocsv group-like data/het.json +++resource,loadsec,ok +/path/to/file,0.45,true +/path/to/second/file,0.32,true +/some/other/path,0.97,false +mlr: CSV schema change: first keys "resource,loadsec,ok"; current keys "record_count,resource" +mlr: exiting due to data error. ++ +Miller handles explicit header changes as shown. If your CSV input contains ragged data -- if there are implicit header changes (no intervening blank line and new header line) as seen above -- you can use `--allow-ragged-csv-input` (or keystroke-saver `--ragged`). + ++mlr --csv --allow-ragged-csv-input cat data/het/ragged.csva,b,c 1,2,3 - -a,b -4,5 - -a,b,c,4 +4,5, 7,8,9,10## Processing heterogeneous data Above we saw how to make heterogeneous data homogeneous, and then how to print heterogeneous data. -As for other processing, record-heterogeneity is not a problem for Miller. +As for other processing, record heterogeneity is not a problem for Miller. 
Miller operates on specified fields and takes the rest along: for example, if -you are sorting on the `count` field then all records in the input stream must -have a `count` field but the other fields can vary, and moreover the sorted-on +you are sorting on the `count` field, then all records in the input stream must +have a `count` field, but the other fields can vary---and moreover the sorted-on field name(s) don't need to be in the same position on each line:diff --git a/docs/src/record-heterogeneity.md.in b/docs/src/record-heterogeneity.md.in index 1aab9dfaa..e3c128b57 100644 --- a/docs/src/record-heterogeneity.md.in +++ b/docs/src/record-heterogeneity.md.in @@ -1,11 +1,10 @@ # Record-heterogeneity -We think of CSV tables as rectangular: if there are 17 columns in the header -then there are 17 columns for every row, else the data have a formatting error. +We think of CSV tables as rectangular: if there are 17 columns in the header, then there are 17 columns for every row, else the data has a formatting error. But heterogeneous data abound -- log-file entries, JSON documents, no-SQL databases such as MongoDB, etc. -- not to mention **data-cleaning -opportunities** we'll look at in this page. Miller offers several ways to +opportunities** we'll look at on this page. Miller offers several ways to handle data heterogeneity. ## Terminology, examples, and solutions @@ -29,7 +28,7 @@ GENMD-EOF Here every row has the same keys, in the same order: `a,b,c`. -These are also sometimes called **rectangular** since if we pretty-print them we get a nice rectangle: +These are also sometimes called **rectangular** since if we pretty-print them, we get a nice rectangle: GENMD-RUN-COMMAND mlr --icsv --opprint cat data/het/hom.csv @@ -50,7 +49,7 @@ GENMD-EOF This example is still homogeneous, though: every row has the same keys, in the same order: `a,b,c`. Empty values don't make the data heterogeneous. 
-Note however that we can use the [`fill-empty`](reference-verbs.md#fill-empty) verb to make these +Note, however, that we can use the [`fill-empty`](reference-verbs.md#fill-empty) verb to make these values non-empty, if we like: GENMD-RUN-COMMAND @@ -59,7 +58,7 @@ GENMD-EOF ### Ragged data -Next let's look at non-well-formed CSV files. For a third example: +Next, let's look at non-well-formed CSV files. For a third example: GENMD-RUN-COMMAND cat data/het/ragged.csv @@ -71,14 +70,9 @@ GENMD-RUN-COMMAND-TOLERATING-ERROR mlr --csv cat data/het/ragged.csv GENMD-EOF -There are two kinds of raggedness here. Since CSVs form records by zipping the -keys from the header line together with the values from each data line, the -second record has a missing value for key `c` (which ought to be fillable), -while the third record has a value `10` with no key for it. +There are two kinds of raggedness here. Since CSVs form records by zipping the keys from the header line, together with the values from each data line, the second record has a missing value for key `c` (which ought to be fillable), while the third record has a value `10` with no key for it. -Using the [`--allow-ragged-csv-input` flag](reference-main-flag-list.md#csv-only-flags) -we can fill values in too-short rows, and provide a key (column number starting -with 1) for too-long rows: +Using the [`--allow-ragged-csv-input` flag](reference-main-flag-list.md#csv-only-flags), we can fill values in too-short rows and provide a key (column number starting with 1) for too-long rows: GENMD-RUN-COMMAND-TOLERATING-ERROR mlr --icsv --ojson --allow-ragged-csv-input cat data/het/ragged.csv @@ -101,7 +95,7 @@ This kind of data arises often in practice. One reason is that, while many programming languages (including the Miller DSL) [preserve insertion order](reference-main-maps.md#insertion-order-is-preserved) in maps; others do not. 
So someone might have written `{"a":4,"b":5,"c":6}` in the source code, -but the data may not have printed that way into a given data file. +but the data may not have been printed that way into a given data file. We can use the [`regularize`](reference-verbs.md#regularize) or [`sort-within-records`](reference-verbs.md#sort-within-records) verb to order @@ -113,13 +107,13 @@ GENMD-EOF The `regularize` verb tries to re-order subsequent rows to look like the first (whatever order that is); the `sort-within-records` verb simply uses -alphabetical order (which is the same in the above example where the first +alphabetical order (which is the same in the above example, where the first record has keys in the order `a,b,c`). ### Sparse data Here's another frequently occurring situation -- quite often, systems will log -data for items which are present, but won't log data for items which aren't. +data for items that are present, but won't log data for items that aren't. GENMD-RUN-COMMAND mlr --json cat data/het/sparse.json @@ -127,8 +121,7 @@ GENMD-EOF This data is called **sparse** (from the [data-storage term](https://en.wikipedia.org/wiki/Sparse_matrix)). -We can use the [`unsparsify`](reference-verbs.md#unsparsify) verb to make sure -every record has the same keys: +We can use the [`unsparsify`](reference-verbs.md#unsparsify) verb to make sure every record has the same keys: GENMD-RUN-COMMAND mlr --json unsparsify data/het/sparse.json @@ -142,12 +135,11 @@ GENMD-EOF ## Reading and writing heterogeneous data -In the previous sections we saw different kinds of data heterogeneity, and ways -to transform the data to make it homogeneous. +In the previous sections, we saw different kinds of data heterogeneity and ways to transform the data to make it homogeneous. 
### Non-rectangular file formats: JSON, XTAB, NIDX, DKVP -For these formats, record-heterogeneity comes naturally: +For these formats, record heterogeneity comes naturally: GENMD-RUN-COMMAND cat data/het/sparse.json @@ -177,16 +169,15 @@ GENMD-EOF ### Rectangular file formats: CSV and pretty-print -CSV and pretty-print formats expect rectangular structure. But Miller lets you +CSV and pretty-print formats expect a rectangular structure. But Miller lets you process non-rectangular using CSV and pretty-print. -Miller simply prints a newline and a new header when there is a schema change --- where by _schema_ we mean simply the list of record keys in the order they -are encountered. When there is no schema change, you get CSV per se as a -special case. Likewise, Miller reads heterogeneous CSV or pretty-print input -the same way. The difference between CSV and CSV-lite is that the former is -[RFC-4180-compliant](file-formats.md#csvtsvasvusvetc), while the latter readily -handles heterogeneous data (which is non-compliant). For example: +For CSV-lite and TSV-lite, Miller prints a newline and a new header when there is a schema +change -- where by _schema_ we mean the list of record keys in the order they are +encountered. When there is no schema change, you get CSV per se as a special case. Likewise, Miller +reads heterogeneous CSV or pretty-print input the same way. The difference between CSV and CSV-lite +is that the former is [RFC-4180-compliant](file-formats.md#csvtsvasvusvetc), while the latter +readily handles heterogeneous data (which is non-compliant). For example: GENMD-RUN-COMMAND cat data/het.json @@ -200,20 +191,28 @@ GENMD-RUN-COMMAND mlr --ijson --opprint group-like data/het.json GENMD-EOF -Miller handles explicit header changes as just shown. If your CSV input contains ragged data -- if there are implicit header changes (no intervening blank line and new header line) as seen above -- you can use `--allow-ragged-csv-input` (or keystroke-saver `--ragged`). 
+GENMD-RUN-COMMAND +mlr --ijson --ocsvlite group-like data/het.json +GENMD-EOF GENMD-RUN-COMMAND-TOLERATING-ERROR -mlr --csv --ragged cat data/het/ragged.csv +mlr --ijson --ocsv group-like data/het.json +GENMD-EOF + +Miller handles explicit header changes as shown. If your CSV input contains ragged data -- if there are implicit header changes (no intervening blank line and new header line) as seen above -- you can use `--allow-ragged-csv-input` (or keystroke-saver `--ragged`). + +GENMD-RUN-COMMAND +mlr --csv --allow-ragged-csv-input cat data/het/ragged.csv GENMD-EOF ## Processing heterogeneous data Above we saw how to make heterogeneous data homogeneous, and then how to print heterogeneous data. -As for other processing, record-heterogeneity is not a problem for Miller. +As for other processing, record heterogeneity is not a problem for Miller. Miller operates on specified fields and takes the rest along: for example, if -you are sorting on the `count` field then all records in the input stream must -have a `count` field but the other fields can vary, and moreover the sorted-on +you are sorting on the `count` field, then all records in the input stream must +have a `count` field, but the other fields can vary---and moreover the sorted-on field name(s) don't need to be in the same position on each line: GENMD-RUN-COMMAND diff --git a/docs/src/reference-dsl-builtin-functions.md b/docs/src/reference-dsl-builtin-functions.md index 8c3b49640..880ffb19f 100644 --- a/docs/src/reference-dsl-builtin-functions.md +++ b/docs/src/reference-dsl-builtin-functions.md @@ -16,9 +16,7 @@ Quick links: # DSL built-in functions -These are functions in the [Miller programming language](miller-programming-language.md) -that you can call when you use `mlr put` and `mlr filter`. For example, when you type - +These are functions in the [Miller programming language](miller-programming-language.md) that you can call when you use `mlr put` and `mlr filter`. 
For example, when you typemlr --icsv --opprint --from example.csv put ' $color = toupper($color); @@ -43,26 +41,13 @@ the `toupper` and `gsub` bits are _functions_. ## Overview -At the command line, you can use `mlr -f` and `mlr -F` for information much -like what's on this page. +At the command line, you can use `mlr -f` and `mlr -F` for information much like what's on this page. -Each function takes a specific number of arguments, as shown below, except for -functions marked as variadic such as `min` and `max`. (The latter compute min -and max of any number of arguments.) There is no notion of optional or -default-on-absent arguments. All argument-passing is positional rather than by -name; arguments are passed by value, not by reference. +Each function takes a specific number of arguments, as shown below, except for functions marked as variadic, such as `min` and `max`. (The latter compute the min and max of any number of arguments.) There is no notion of optional or default-on-absent arguments. All argument-passing is positional rather than by name; arguments are passed by value, not by reference. -At the command line, you can get a list of all functions using `mlr -f`, with -details using `mlr -F`. (Or, `mlr help usage-functions-by-class` to get -details in the order shown on this page.) You can get detail for a given -function using `mlr help function namegoeshere`, e.g. `mlr help function -gsub`. +At the command line, you can get a list of all functions using `mlr -f`, with details using `mlr -F`. (Or, `mlr help usage-functions-by-class` to get details in the order shown on this page.) You can get details for a given function using `mlr help function namegoeshere`, e.g., `mlr help function gsub`. -Operators are listed here along with functions. In this case, the -argument-count is the number of items involved in the infix operator, e.g. we -say `x+y` so the details for the `+` operator say that its number of arguments -is 2. 
Unary operators such as `!` and `~` show argument-count of 1; the ternary -`? :` operator shows an argument-count of 3. +Operators are listed here along with functions. In this case, the argument count refers to the number of items involved in the infix operator. For example, we say `x+y`, so the details for the `+` operator indicate that it has two arguments. Unary operators such as `!` and `~` show an argument count of 1; the ternary `? :` operator shows an argument count of 3. ## Functions by class @@ -75,8 +60,8 @@ is 2. Unary operators such as `!` and `~` show argument-count of 1; the ternary * [**Higher-order-functions functions**](#higher-order-functions-functions): [any](#any), [apply](#apply), [every](#every), [fold](#fold), [reduce](#reduce), [select](#select), [sort](#sort). * [**Math functions**](#math-functions): [abs](#abs), [acos](#acos), [acosh](#acosh), [asin](#asin), [asinh](#asinh), [atan](#atan), [atan2](#atan2), [atanh](#atanh), [cbrt](#cbrt), [ceil](#ceil), [cos](#cos), [cosh](#cosh), [erf](#erf), [erfc](#erfc), [exp](#exp), [expm1](#expm1), [floor](#floor), [invqnorm](#invqnorm), [log](#log), [log10](#log10), [log1p](#log1p), [logifit](#logifit), [max](#max), [min](#min), [qnorm](#qnorm), [round](#round), [roundm](#roundm), [sgn](#sgn), [sin](#sin), [sinh](#sinh), [sqrt](#sqrt), [tan](#tan), [tanh](#tanh), [urand](#urand), [urand32](#urand32), [urandelement](#urandelement), [urandint](#urandint), [urandrange](#urandrange). * [**Stats functions**](#stats-functions): [antimode](#antimode), [count](#count), [distinct_count](#distinct_count), [kurtosis](#kurtosis), [maxlen](#maxlen), [mean](#mean), [meaneb](#meaneb), [median](#median), [minlen](#minlen), [mode](#mode), [null_count](#null_count), [percentile](#percentile), [percentiles](#percentiles), [skewness](#skewness), [sort_collection](#sort_collection), [stddev](#stddev), [sum](#sum), [sum2](#sum2), [sum3](#sum3), [sum4](#sum4), [variance](#variance).
-* [**String functions**](#string-functions): [capitalize](#capitalize), [clean_whitespace](#clean_whitespace), [collapse_whitespace](#collapse_whitespace), [contains](#contains), [format](#format), [gssub](#gssub), [gsub](#gsub), [index](#index), [latin1_to_utf8](#latin1_to_utf8), [leftpad](#leftpad), [lstrip](#lstrip), [regextract](#regextract), [regextract_or_else](#regextract_or_else), [rightpad](#rightpad), [rstrip](#rstrip), [ssub](#ssub), [strip](#strip), [strlen](#strlen), [sub](#sub), [substr](#substr), [substr0](#substr0), [substr1](#substr1), [tolower](#tolower), [toupper](#toupper), [truncate](#truncate), [unformat](#unformat), [unformatx](#unformatx), [utf8_to_latin1](#utf8_to_latin1), [\.](#dot). -* [**System functions**](#system-functions): [exec](#exec), [hostname](#hostname), [os](#os), [system](#system), [version](#version). +* [**String functions**](#string-functions): [capitalize](#capitalize), [clean_whitespace](#clean_whitespace), [collapse_whitespace](#collapse_whitespace), [contains](#contains), [format](#format), [gssub](#gssub), [gsub](#gsub), [index](#index), [latin1_to_utf8](#latin1_to_utf8), [leftpad](#leftpad), [lstrip](#lstrip), [regextract](#regextract), [regextract_or_else](#regextract_or_else), [rightpad](#rightpad), [rstrip](#rstrip), [ssub](#ssub), [strip](#strip), [strlen](#strlen), [strmatch](#strmatch), [strmatchx](#strmatchx), [sub](#sub), [substr](#substr), [substr0](#substr0), [substr1](#substr1), [tolower](#tolower), [toupper](#toupper), [truncate](#truncate), [unformat](#unformat), [unformatx](#unformatx), [utf8_to_latin1](#utf8_to_latin1), [\.](#dot). +* [**System functions**](#system-functions): [exec](#exec), [hostname](#hostname), [os](#os), [stat](#stat), [system](#system), [version](#version). 
* [**Time functions**](#time-functions): [dhms2fsec](#dhms2fsec), [dhms2sec](#dhms2sec), [fsec2dhms](#fsec2dhms), [fsec2hms](#fsec2hms), [gmt2localtime](#gmt2localtime), [gmt2nsec](#gmt2nsec), [gmt2sec](#gmt2sec), [hms2fsec](#hms2fsec), [hms2sec](#hms2sec), [localtime2gmt](#localtime2gmt), [localtime2nsec](#localtime2nsec), [localtime2sec](#localtime2sec), [nsec2gmt](#nsec2gmt), [nsec2gmtdate](#nsec2gmtdate), [nsec2localdate](#nsec2localdate), [nsec2localtime](#nsec2localtime), [sec2dhms](#sec2dhms), [sec2gmt](#sec2gmt), [sec2gmtdate](#sec2gmtdate), [sec2hms](#sec2hms), [sec2localdate](#sec2localdate), [sec2localtime](#sec2localtime), [strfntime](#strfntime), [strfntime_local](#strfntime_local), [strftime](#strftime), [strftime_local](#strftime_local), [strpntime](#strpntime), [strpntime_local](#strpntime_local), [strptime](#strptime), [strptime_local](#strptime_local), [sysntime](#sysntime), [systime](#systime), [systimeint](#systimeint), [upntime](#upntime), [uptime](#uptime). * [**Typing functions**](#typing-functions): [asserting_absent](#asserting_absent), [asserting_array](#asserting_array), [asserting_bool](#asserting_bool), [asserting_boolean](#asserting_boolean), [asserting_empty](#asserting_empty), [asserting_empty_map](#asserting_empty_map), [asserting_error](#asserting_error), [asserting_float](#asserting_float), [asserting_int](#asserting_int), [asserting_map](#asserting_map), [asserting_nonempty_map](#asserting_nonempty_map), [asserting_not_array](#asserting_not_array), [asserting_not_empty](#asserting_not_empty), [asserting_not_map](#asserting_not_map), [asserting_not_null](#asserting_not_null), [asserting_null](#asserting_null), [asserting_numeric](#asserting_numeric), [asserting_present](#asserting_present), [asserting_string](#asserting_string), [is_absent](#is_absent), [is_array](#is_array), [is_bool](#is_bool), [is_boolean](#is_boolean), [is_empty](#is_empty), [is_empty_map](#is_empty_map), [is_error](#is_error), [is_float](#is_float), 
[is_int](#is_int), [is_map](#is_map), [is_nan](#is_nan), [is_nonempty_map](#is_nonempty_map), [is_not_array](#is_not_array), [is_not_empty](#is_not_empty), [is_not_map](#is_not_map), [is_not_null](#is_not_null), [is_null](#is_null), [is_numeric](#is_numeric), [is_present](#is_present), [is_string](#is_string), [typeof](#typeof). @@ -534,9 +519,14 @@ $* = fmtifnum($*, "%.6f") formats numeric fields in the current record, leaving ### fmtnum+### strmatch +-fmtnum (class=conversion #args=2) Convert int/float/bool to string using printf-style format string (https://pkg.go.dev/fmt), e.g. '$s = fmtnum($n, "%08d")' or '$t = fmtnum($n, "%.6e")'. This function recurses on array and map values. -Example: -$x = fmtnum($x, "%.6f") +fmtnum (class=conversion #args=2) Convert int/float/bool to string using printf-style format string (https://pkg.go.dev/fmt), e.g. '$s = fmtnum($n, "%08d")' or '$t = fmtnum($n, "%.6e")'. Miller-specific extension: "%_d" and "%_f" for comma-separated thousands. This function recurses on array and map values. +Examples: +$y = fmtnum($x, "%.6f") +$o = fmtnum($n, "%d") +$o = fmtnum($n, "%12d") +$y = fmtnum($x, "%.6_f") +$o = fmtnum($n, "%_d") +$o = fmtnum($n, "%12_d")@@ -1209,7 +1199,7 @@ capitalize (class=string #args=1) Convert string's first character to uppercase ### clean_whitespace-clean_whitespace (class=string #args=1) Same as collapse_whitespace and strip. +clean_whitespace (class=string #args=1) Same as collapse_whitespace and strip, followed by type inference.@@ -1350,6 +1340,46 @@ strlen (class=string #args=1) String length.+strmatch (class=string #args=2) Boolean yes/no for whether the stringable first argument matches the regular-expression second argument. No regex captures are provided; please see `strmatchx`.
+Examples: +strmatch("a", "abc") is false +strmatch("abc", "a") is true +strmatch("abc", "a[a-z]c") is true +strmatch("abc", "(a).(c)") is true +strmatch(12345, "34") is true ++ + +### strmatchx ++strmatchx (class=string #args=2) Extended information for whether the stringable first argument matches the regular-expression second argument. Regex captures are provided in the return-value map; \1, \2, etc. are not set, in contrast to the `=~` operator. As well, while the `=~` operator limits matches to \1 through \9, an arbitrary number are supported here. +Examples: +strmatchx("a", "abc") returns: + { + "matched": false + } +strmatchx("abc", "a") returns: + { + "matched": true, + "full_capture": "a", + "full_start": 1, + "full_end": 1 + } +strmatchx("[zy:3458]", "([a-z]+):([0-9]+)") returns: + { + "matched": true, + "full_capture": "zy:3458", + "full_start": 2, + "full_end": 8, + "captures": ["zy", "3458"], + "starts": [2, 5], + "ends": [3, 8] + } ++ + ### subsub (class=string #args=3) '$name = sub($name, "old", "new")': replace once (first match, if there are multiple matches), with support for regular expressions. Capture groups \1 through \9 in the new part are matched from (...) in the old part, and must be used within the same call to sub -- they don't persist for subsequent DSL statements. See also =~ and regextract. See also "Regular expressions" at https://miller.readthedocs.io. @@ -1457,6 +1487,21 @@ os (class=system #args=0) Returns the operating-system name as a string.+### stat ++stat (class=system #args=1) Returns a map containing information about the provided path: "name" with string value, "size" as decimal int value, "mode" as octal int value, "modtime" as int-valued epoch seconds, and "isdir" as boolean value. 
+Examples: +stat("./mlr") gives { + "name": "mlr", + "size": 38391584, + "mode": 0755, + "modtime": 1715207874, + "isdir": false +} +stat("./mlr")["size"] gives 38391584 ++ + ### systemsystem (class=system #args=1) Run command string, yielding its stdout minus final carriage return. diff --git a/docs/src/reference-dsl-builtin-functions.md.in b/docs/src/reference-dsl-builtin-functions.md.in index 4bb51082c..b535cd907 100644 --- a/docs/src/reference-dsl-builtin-functions.md.in +++ b/docs/src/reference-dsl-builtin-functions.md.in @@ -1,8 +1,6 @@ # DSL built-in functions -These are functions in the [Miller programming language](miller-programming-language.md) -that you can call when you use `mlr put` and `mlr filter`. For example, when you type - +These are functions in the [Miller programming language](miller-programming-language.md) that you can call when you use `mlr put` and `mlr filter`. For example, when you type GENMD-RUN-COMMAND mlr --icsv --opprint --from example.csv put ' $color = toupper($color); @@ -14,25 +12,12 @@ the `toupper` and `gsub` bits are _functions_. ## Overview -At the command line, you can use `mlr -f` and `mlr -F` for information much -like what's on this page. +At the command line, you can use `mlr -f` and `mlr -F` for information much like what's on this page. -Each function takes a specific number of arguments, as shown below, except for -functions marked as variadic such as `min` and `max`. (The latter compute min -and max of any number of arguments.) There is no notion of optional or -default-on-absent arguments. All argument-passing is positional rather than by -name; arguments are passed by value, not by reference. +Each function takes a specific number of arguments, as shown below, except for functions marked as variadic, such as `min` and `max`. (The latter compute the min and max of any number of arguments.) There is no notion of optional or default-on-absent arguments. 
All argument-passing is positional rather than by name; arguments are passed by value, not by reference. -At the command line, you can get a list of all functions using `mlr -f`, with -details using `mlr -F`. (Or, `mlr help usage-functions-by-class` to get -details in the order shown on this page.) You can get detail for a given -function using `mlr help function namegoeshere`, e.g. `mlr help function -gsub`. +At the command line, you can get a list of all functions using `mlr -f`, with details using `mlr -F`. (Or, `mlr help usage-functions-by-class` to get details in the order shown on this page.) You can get details for a given function using `mlr help function namegoeshere`, e.g., `mlr help function gsub`. -Operators are listed here along with functions. In this case, the -argument-count is the number of items involved in the infix operator, e.g. we -say `x+y` so the details for the `+` operator say that its number of arguments -is 2. Unary operators such as `!` and `~` show argument-count of 1; the ternary -`? :` operator shows an argument-count of 3. +Operators are listed here along with functions. In this case, the argument count refers to the number of items involved in the infix operator. For example, we say `x+y`, so the details for the `+` operator indicate that it has two arguments. Unary operators such as `!` and `~` show an argument count of 1; the ternary `? :` operator shows an argument count of 3.
GENMD-RUN-CONTENT-GENERATOR(./mk-func-info.rb) diff --git a/docs/src/reference-dsl-complexity.md b/docs/src/reference-dsl-complexity.md index 5fb579155..de97fa3f0 100644 --- a/docs/src/reference-dsl-complexity.md +++ b/docs/src/reference-dsl-complexity.md @@ -16,34 +16,9 @@ Quick links: # A note on the complexity of Miller's expression language -One of Miller's strengths is its brevity: it's much quicker -- and less -error-prone -- to type `mlr stats1 -a sum -f x,y -g a,b` than having to track -summation variables as in `awk`, or using Miller's [out-of-stream -variables](reference-dsl-variables.md#out-of-stream-variables). And the more -language features Miller's put-DSL has (for-loops, if-statements, nested -control structures, user-defined functions, etc.) then the *less* powerful it -begins to seem: because of the other programming-language features it *doesn't* -have (classes, exceptions, and so on). +One of Miller's strengths is its brevity: it's much quicker -- and less error-prone -- to type `mlr stats1 -a sum -f x,y -g a,b` than having to track summation variables as in `awk`, or using Miller's [out-of-stream variables](reference-dsl-variables.md#out-of-stream-variables). And the more language features Miller's put-DSL has (for-loops, if-statements, nested control structures, user-defined functions, etc.), then the *less* powerful it begins to seem: because of the other programming-language features it *doesn't* have (classes, exceptions, and so on). -When I was originally prototyping Miller in 2015, the primary decision I had -was whether to hand-code in a low-level language like C or Rust or Go, with my -own hand-rolled DSL, or whether to use a higher-level language (like Python or -Lua or Nim) and let the `put` statements be handled by the implementation -language's own `eval`: the implementation language would take the place of a -DSL. Multiple performance experiments showed me I could get better throughput -using the former, by a wide margin. 
So Miller is Go under the hood with a -hand-rolled DSL. +When I was initially prototyping Miller in 2015, the primary decision I had was whether to hand-code in a low-level language like C or Rust or Go, with my hand-rolled DSL, or whether to use a higher-level language (like Python or Lua or Nim) and let the `put` statements be handled by the implementation language's own `eval`: the implementation language would take the place of a DSL. Multiple performance experiments showed me I could get better throughput using the former, by a wide margin. So Miller is Go under the hood with a hand-rolled DSL. -I do want to keep focusing on what Miller is good at -- concise notation, low -latency, and high throughput -- and not add too much in terms of -high-level-language features to the DSL. That said, some sort of -customizability is a basic thing to want. As of 4.1.0 we have recursive -`for`/`while`/`if` [structures](reference-dsl-control-structures.md) on about -the same complexity level as `awk`; as of 5.0.0 we have [user-defined -functions](reference-dsl-user-defined-functions.md) and [map-valued -variables](reference-dsl-variables.md), again on about the same complexity level -as `awk` along with optional type-declaration syntax; as of Miller 6 we have -full support for [arrays](reference-main-arrays.md). While I'm excited by these -powerful language features, I hope to keep new features focused on Miller's -sweet spot which is speed plus simplicity. +I want to continue focusing on what Miller excels at — concise notation, low latency, and high throughput — and not add too many high-level language features to the DSL. That said, some customizability is a basic thing to want. 
As of 4.1.0, we have recursive `for`/`while`/`if` [structures](reference-dsl-control-structures.md) on about the same complexity level as `awk`; as of 5.0.0, we have [user-defined functions](reference-dsl-user-defined-functions.md) and [map-valued variables](reference-dsl-variables.md), again on about the same complexity level as `awk` along with optional type-declaration syntax; as of Miller 6, we have full support for [arrays](reference-main-arrays.md). While I'm excited by these powerful language features, I hope to keep new features focused on Miller's sweet spot, which is speed plus simplicity. diff --git a/docs/src/reference-dsl-complexity.md.in b/docs/src/reference-dsl-complexity.md.in index 81251b436..3087e00c1 100644 --- a/docs/src/reference-dsl-complexity.md.in +++ b/docs/src/reference-dsl-complexity.md.in @@ -1,33 +1,8 @@ # A note on the complexity of Miller's expression language -One of Miller's strengths is its brevity: it's much quicker -- and less -error-prone -- to type `mlr stats1 -a sum -f x,y -g a,b` than having to track -summation variables as in `awk`, or using Miller's [out-of-stream -variables](reference-dsl-variables.md#out-of-stream-variables). And the more -language features Miller's put-DSL has (for-loops, if-statements, nested -control structures, user-defined functions, etc.) then the *less* powerful it -begins to seem: because of the other programming-language features it *doesn't* -have (classes, exceptions, and so on). +One of Miller's strengths is its brevity: it's much quicker -- and less error-prone -- to type `mlr stats1 -a sum -f x,y -g a,b` than having to track summation variables as in `awk`, or using Miller's [out-of-stream variables](reference-dsl-variables.md#out-of-stream-variables). 
And the more language features Miller's put-DSL has (for-loops, if-statements, nested control structures, user-defined functions, etc.), then the *less* powerful it begins to seem: because of the other programming-language features it *doesn't* have (classes, exceptions, and so on). -When I was originally prototyping Miller in 2015, the primary decision I had -was whether to hand-code in a low-level language like C or Rust or Go, with my -own hand-rolled DSL, or whether to use a higher-level language (like Python or -Lua or Nim) and let the `put` statements be handled by the implementation -language's own `eval`: the implementation language would take the place of a -DSL. Multiple performance experiments showed me I could get better throughput -using the former, by a wide margin. So Miller is Go under the hood with a -hand-rolled DSL. +When I was initially prototyping Miller in 2015, the primary decision I had was whether to hand-code in a low-level language like C or Rust or Go, with my hand-rolled DSL, or whether to use a higher-level language (like Python or Lua or Nim) and let the `put` statements be handled by the implementation language's own `eval`: the implementation language would take the place of a DSL. Multiple performance experiments showed me I could get better throughput using the former, by a wide margin. So Miller is Go under the hood with a hand-rolled DSL. -I do want to keep focusing on what Miller is good at -- concise notation, low -latency, and high throughput -- and not add too much in terms of -high-level-language features to the DSL. That said, some sort of -customizability is a basic thing to want. 
As of 4.1.0 we have recursive -`for`/`while`/`if` [structures](reference-dsl-control-structures.md) on about -the same complexity level as `awk`; as of 5.0.0 we have [user-defined -functions](reference-dsl-user-defined-functions.md) and [map-valued -variables](reference-dsl-variables.md), again on about the same complexity level -as `awk` along with optional type-declaration syntax; as of Miller 6 we have -full support for [arrays](reference-main-arrays.md). While I'm excited by these -powerful language features, I hope to keep new features focused on Miller's -sweet spot which is speed plus simplicity. +I want to continue focusing on what Miller excels at — concise notation, low latency, and high throughput — and not add too many high-level language features to the DSL. That said, some customizability is a basic thing to want. As of 4.1.0, we have recursive `for`/`while`/`if` [structures](reference-dsl-control-structures.md) on about the same complexity level as `awk`; as of 5.0.0, we have [user-defined functions](reference-dsl-user-defined-functions.md) and [map-valued variables](reference-dsl-variables.md), again on about the same complexity level as `awk` along with optional type-declaration syntax; as of Miller 6, we have full support for [arrays](reference-main-arrays.md). While I'm excited by these powerful language features, I hope to keep new features focused on Miller's sweet spot, which is speed plus simplicity. diff --git a/docs/src/reference-dsl-control-structures.md b/docs/src/reference-dsl-control-structures.md index 16de01613..60bb52d95 100644 --- a/docs/src/reference-dsl-control-structures.md +++ b/docs/src/reference-dsl-control-structures.md @@ -18,7 +18,7 @@ Quick links: ## Pattern-action blocks -These are reminiscent of `awk` syntax. They can be used to allow assignments to be done only when appropriate -- e.g. for math-function domain restrictions, regex-matching, and so on: +These are reminiscent of `awk` syntax. 
They can be used to allow assignments to be done only when appropriate -- e.g., for math-function domain restrictions, regex-matching, and so on:mlr cat data/put-gating-example-1.dkvp @@ -64,7 +64,7 @@ a=some other name a=xyz_789,b=left_xyz,c=right_789-This produces heteregenous output which Miller, of course, has no problems with (see [Record Heterogeneity](record-heterogeneity.md)). But if you want homogeneous output, the curly braces can be replaced with a semicolon between the expression and the body statements. This causes `put` to evaluate the boolean expression (along with any side effects, namely, regex-captures `\1`, `\2`, etc.) but doesn't use it as a criterion for whether subsequent assignments should be executed. Instead, subsequent assignments are done unconditionally: +This produces heterogeneous output which Miller, of course, has no problems with (see [Record Heterogeneity](record-heterogeneity.md)). But if you want homogeneous output, the curly braces can be replaced with a semicolon between the expression and the body statements. This causes `put` to evaluate the boolean expression (along with any side effects, namely, regex-captures `\1`, `\2`, etc.) but doesn't use it as a criterion for whether subsequent assignments should be executed. Instead, subsequent assignments are done unconditionally:mlr --opprint put ' @@ -172,7 +172,7 @@ records](operating-on-all-records.md) for some options. ## For-loops -While Miller's `while` and `do-while` statements are much as in many other languages, `for` loops are more idiosyncratic to Miller. They are loops over key-value pairs, whether in stream records, out-of-stream variables, local variables, or map-literals: more reminiscent of `foreach`, as in (for example) PHP. There are **for-loops over map keys** and **for-loops over key-value tuples**. Additionally, Miller has a **C-style triple-for loop** with initialize, test, and update statements. Each is described below. 
+While Miller's `while` and `do-while` statements are much like those in many other languages, `for` loops are more idiosyncratic to Miller. They are loops over key-value pairs, whether in stream records, out-of-stream variables, local variables, or map-literals: more reminiscent of `foreach`, as in (for example) PHP. There are **for-loops over map keys** and **for-loops over key-value tuples**. Additionally, Miller has a **C-style triple-for loop** with initialize, test, and update statements. Each is described below. As with `while` and `do-while`, a `break` or `continue` within nested control structures will propagate to the innermost loop enclosing them, if any, and a `break` or `continue` outside a loop is a syntax error that will be flagged as soon as the expression is parsed, before any input records are ingested. @@ -260,11 +260,9 @@ value: true valuetype: bool ### Key-value for-loops -For [maps](reference-main-maps.md), the first loop variable is the key and the -second is the value; for [arrays](reference-main-arrays.md), the first loop -variable is the (1-up) array index and the second is the value. +For [maps](reference-main-maps.md), the first loop variable is the key, and the second is the value. For [arrays](reference-main-arrays.md), the first loop variable is the (1-based) array index, and the second is the value. -Single-level keys may be gotten at using either `for(k,v)` or `for((k),v)`; multi-level keys may be gotten at using `for((k1,k2,k3),v)` and so on. The `v` variable will be bound to a scalar value (non-array/non-map) if the map stops at that level, or to a map-valued or array-valued variable if the map goes deeper. If the map isn't deep enough then the loop body won't be executed. +Single-level keys may be obtained using either `for(k,v)` or `for((k),v)`; multi-level keys may be obtained using `for((k1,k2,k3),v)` and so on. 
The `v` variable will be bound to a scalar value (non-array/non-map) if the map stops at that level, or to a map-valued or array-valued variable if the map goes deeper. If the map isn't deep enough then the loop body won't be executed.cat data/for-srec-example.tbl @@ -333,7 +331,7 @@ eks wye 4 0.381399 0.134188 4.515587 18.062348 wye pan 5 0.573288 0.863624 6.4369119999999995 25.747647999999998-It can be confusing to modify the stream record while iterating over a copy of it, so instead you might find it simpler to use a local variable in the loop and only update the stream record after the loop: +It can be confusing to modify the stream record while iterating over a copy of it, so instead, you might find it simpler to use a local variable in the loop and only update the stream record after the loop:mlr --from data/small --opprint put ' @@ -355,7 +353,7 @@ eks wye 4 0.381399 0.134188 4.515587 wye pan 5 0.573288 0.863624 6.4369119999999995-You can also start iterating on sub-maps of an out-of-stream or local variable; you can loop over nested keys; you can loop over all out-of-stream variables. The bound variables are bound to a copy of the sub-map as it was before the loop started. The sub-map is specified by square-bracketed indices after `in`, and additional deeper indices are bound to loop key-variables. The terminal values are bound to the loop value-variable whenever the keys are not too shallow. The value-variable may refer to a terminal (string, number) or it may be map-valued if the map goes deeper. Example indexing is as follows: +You can also start iterating on sub-maps of an out-of-stream or local variable; you can loop over nested keys; you can loop over all out-of-stream variables. The bound variables are bound to a copy of the sub-map as it was before the loop started. The sub-map is specified by square-bracketed indices after `in`, and additional deeper indices are bound to loop key variables. 
The terminal values are bound to the loop value variable whenever the keys are not too shallow. The value variable may refer to a terminal (string, number) or it may be map-valued if the map goes deeper. Example indexing is as follows:# Parentheses are optional for single key: @@ -516,15 +514,15 @@ wye pan 5 0.573288 0.863624 15 31 Notes: -* In `for (start; continuation; update) { body }`, the start, continuation, and update statements may be empty, single statements, or multiple comma-separated statements. If the continuation is empty (e.g. `for(i=1;;i+=1)`) it defaults to true. +* In `for (start; continuation; update) { body }`, the start, continuation, and update statements may be empty, single statements, or multiple comma-separated statements. If the continuation is empty (e.g. `for(i=1;;i+=1)`), it defaults to true. * In particular, you may use `$`-variables and/or `@`-variables in the start, continuation, and/or update steps (as well as the body, of course). -* The typedecls such as `int` or `num` are optional. If a typedecl is provided (for a local variable), it binds a variable scoped to the for-loop regardless of whether a same-name variable is present in outer scope. If a typedecl is not provided, then the variable is scoped to the for-loop if no same-name variable is present in outer scope, or if a same-name variable is present in outer scope then it is modified. +* The typedecls such as `int` or `num` are optional. If a typedecl is provided (for a local variable), it binds a variable scoped to the for-loop regardless of whether a same-name variable is present in the outer scope. If a typedecl is not provided, then the variable is scoped to the for-loop if no same-name variable is present in the outer scope, or if a same-name variable is present in the outer scope, then it is modified. * Miller has no `++` or `--` operators. 
-* As with all `for`/`if`/`while` statements in Miller, the curly braces are required even if the body is a single statement, or empty. +* As with all `for`/`if`/`while` statements in Miller, the curly braces are required even if the body is a single statement or empty. ## Begin/end blocks diff --git a/docs/src/reference-dsl-control-structures.md.in b/docs/src/reference-dsl-control-structures.md.in index b7161804c..caffa9bdf 100644 --- a/docs/src/reference-dsl-control-structures.md.in +++ b/docs/src/reference-dsl-control-structures.md.in @@ -2,7 +2,7 @@ ## Pattern-action blocks -These are reminiscent of `awk` syntax. They can be used to allow assignments to be done only when appropriate -- e.g. for math-function domain restrictions, regex-matching, and so on: +These are reminiscent of `awk` syntax. They can be used to allow assignments to be done only when appropriate -- e.g., for math-function domain restrictions, regex-matching, and so on: GENMD-RUN-COMMAND mlr cat data/put-gating-example-1.dkvp @@ -24,7 +24,7 @@ mlr put ' data/put-gating-example-2.dkvp GENMD-EOF -This produces heteregenous output which Miller, of course, has no problems with (see [Record Heterogeneity](record-heterogeneity.md)). But if you want homogeneous output, the curly braces can be replaced with a semicolon between the expression and the body statements. This causes `put` to evaluate the boolean expression (along with any side effects, namely, regex-captures `\1`, `\2`, etc.) but doesn't use it as a criterion for whether subsequent assignments should be executed. Instead, subsequent assignments are done unconditionally: +This produces heterogeneous output which Miller, of course, has no problems with (see [Record Heterogeneity](record-heterogeneity.md)). But if you want homogeneous output, the curly braces can be replaced with a semicolon between the expression and the body statements. 
This causes `put` to evaluate the boolean expression (along with any side effects, namely, regex-captures `\1`, `\2`, etc.) but doesn't use it as a criterion for whether subsequent assignments should be executed. Instead, subsequent assignments are done unconditionally: GENMD-RUN-COMMAND mlr --opprint put ' @@ -120,7 +120,7 @@ records](operating-on-all-records.md) for some options. ## For-loops -While Miller's `while` and `do-while` statements are much as in many other languages, `for` loops are more idiosyncratic to Miller. They are loops over key-value pairs, whether in stream records, out-of-stream variables, local variables, or map-literals: more reminiscent of `foreach`, as in (for example) PHP. There are **for-loops over map keys** and **for-loops over key-value tuples**. Additionally, Miller has a **C-style triple-for loop** with initialize, test, and update statements. Each is described below. +While Miller's `while` and `do-while` statements are much like those in many other languages, `for` loops are more idiosyncratic to Miller. They are loops over key-value pairs, whether in stream records, out-of-stream variables, local variables, or map-literals: more reminiscent of `foreach`, as in (for example) PHP. There are **for-loops over map keys** and **for-loops over key-value tuples**. Additionally, Miller has a **C-style triple-for loop** with initialize, test, and update statements. Each is described below. As with `while` and `do-while`, a `break` or `continue` within nested control structures will propagate to the innermost loop enclosing them, if any, and a `break` or `continue` outside a loop is a syntax error that will be flagged as soon as the expression is parsed, before any input records are ingested. 
@@ -165,11 +165,9 @@ GENMD-EOF ### Key-value for-loops -For [maps](reference-main-maps.md), the first loop variable is the key and the -second is the value; for [arrays](reference-main-arrays.md), the first loop -variable is the (1-up) array index and the second is the value. +For [maps](reference-main-maps.md), the first loop variable is the key, and the second is the value. For [arrays](reference-main-arrays.md), the first loop variable is the (1-based) array index, and the second is the value. -Single-level keys may be gotten at using either `for(k,v)` or `for((k),v)`; multi-level keys may be gotten at using `for((k1,k2,k3),v)` and so on. The `v` variable will be bound to a scalar value (non-array/non-map) if the map stops at that level, or to a map-valued or array-valued variable if the map goes deeper. If the map isn't deep enough then the loop body won't be executed. +Single-level keys may be obtained using either `for(k,v)` or `for((k),v)`; multi-level keys may be obtained using `for((k1,k2,k3),v)` and so on. The `v` variable will be bound to a scalar value (non-array/non-map) if the map stops at that level, or to a map-valued or array-valued variable if the map goes deeper. If the map isn't deep enough then the loop body won't be executed. 
GENMD-RUN-COMMAND cat data/for-srec-example.tbl @@ -210,7 +208,7 @@ mlr --from data/small --opprint put ' ' GENMD-EOF -It can be confusing to modify the stream record while iterating over a copy of it, so instead you might find it simpler to use a local variable in the loop and only update the stream record after the loop: +It can be confusing to modify the stream record while iterating over a copy of it, so instead, you might find it simpler to use a local variable in the loop and only update the stream record after the loop: GENMD-RUN-COMMAND mlr --from data/small --opprint put ' @@ -224,7 +222,7 @@ mlr --from data/small --opprint put ' ' GENMD-EOF -You can also start iterating on sub-maps of an out-of-stream or local variable; you can loop over nested keys; you can loop over all out-of-stream variables. The bound variables are bound to a copy of the sub-map as it was before the loop started. The sub-map is specified by square-bracketed indices after `in`, and additional deeper indices are bound to loop key-variables. The terminal values are bound to the loop value-variable whenever the keys are not too shallow. The value-variable may refer to a terminal (string, number) or it may be map-valued if the map goes deeper. Example indexing is as follows: +You can also start iterating on sub-maps of an out-of-stream or local variable; you can loop over nested keys; you can loop over all out-of-stream variables. The bound variables are bound to a copy of the sub-map as it was before the loop started. The sub-map is specified by square-bracketed indices after `in`, and additional deeper indices are bound to loop key variables. The terminal values are bound to the loop value variable whenever the keys are not too shallow. The value variable may refer to a terminal (string, number) or it may be map-valued if the map goes deeper. 
Example indexing is as follows: GENMD-INCLUDE-ESCAPED(data/for-oosvar-example-0a.txt) @@ -333,15 +331,15 @@ GENMD-EOF Notes: -* In `for (start; continuation; update) { body }`, the start, continuation, and update statements may be empty, single statements, or multiple comma-separated statements. If the continuation is empty (e.g. `for(i=1;;i+=1)`) it defaults to true. +* In `for (start; continuation; update) { body }`, the start, continuation, and update statements may be empty, single statements, or multiple comma-separated statements. If the continuation is empty (e.g., `for(i=1;;i+=1)`), it defaults to true. * In particular, you may use `$`-variables and/or `@`-variables in the start, continuation, and/or update steps (as well as the body, of course). -* The typedecls such as `int` or `num` are optional. If a typedecl is provided (for a local variable), it binds a variable scoped to the for-loop regardless of whether a same-name variable is present in outer scope. If a typedecl is not provided, then the variable is scoped to the for-loop if no same-name variable is present in outer scope, or if a same-name variable is present in outer scope then it is modified. +* The typedecls such as `int` or `num` are optional. If a typedecl is provided (for a local variable), it binds a variable scoped to the for-loop regardless of whether a same-name variable is present in the outer scope. If a typedecl is not provided, then the variable is scoped to the for-loop if no same-name variable is present in the outer scope, or if a same-name variable is present in the outer scope, then it is modified. * Miller has no `++` or `--` operators. -* As with all `for`/`if`/`while` statements in Miller, the curly braces are required even if the body is a single statement, or empty. +* As with all `for`/`if`/`while` statements in Miller, the curly braces are required even if the body is a single statement or empty. 
## Begin/end blocks diff --git a/docs/src/reference-dsl-filter-statements.md b/docs/src/reference-dsl-filter-statements.md index 0a2de3dd3..3d2d733f2 100644 --- a/docs/src/reference-dsl-filter-statements.md +++ b/docs/src/reference-dsl-filter-statements.md @@ -36,7 +36,7 @@ red,square,true,2,15,79.2778,0.0130 red,circle,true,3,16,13.8103,2.9010-The former, of course, is a little easier to type. For another example: +The former is a little easier to type. For another example:mlr --csv put '@running_sum += $quantity; filter @running_sum > 500' example.csv diff --git a/docs/src/reference-dsl-filter-statements.md.in b/docs/src/reference-dsl-filter-statements.md.in index c3acd41e1..7f363593e 100644 --- a/docs/src/reference-dsl-filter-statements.md.in +++ b/docs/src/reference-dsl-filter-statements.md.in @@ -10,7 +10,7 @@ GENMD-RUN-COMMAND mlr --csv put 'filter NR==2 || NR==3' example.csv GENMD-EOF -The former, of course, is a little easier to type. For another example: +The former is a little easier to type. For another example: GENMD-RUN-COMMAND mlr --csv put '@running_sum += $quantity; filter @running_sum > 500' example.csv diff --git a/docs/src/reference-dsl-higher-order-functions.md b/docs/src/reference-dsl-higher-order-functions.md index d40cfd1e7..6e41bd281 100644 --- a/docs/src/reference-dsl-higher-order-functions.md +++ b/docs/src/reference-dsl-higher-order-functions.md @@ -29,23 +29,15 @@ As of [Miller 6](new-in-miller-6.md) you can use intuitive operations on arrays and maps, as an alternative to things which would otherwise require for-loops. -See also the [`get_keys`](reference-dsl-builtin-functions.md#get_keys) and -[`get_values`](reference-dsl-builtin-functions.md#get_values) functions which, -when given a map, return an array of its keys or an array of its values, -respectively. 
+See also the [`get_keys`](reference-dsl-builtin-functions.md#get_keys) and [`get_values`](reference-dsl-builtin-functions.md#get_values) functions which, when given a map, return an array of its keys or an array of its values, respectively. ## select -The [`select`](reference-dsl-builtin-functions.md#select) function takes a map -or array as its first argument and a function as second argument. It includes -each input element in the output if the function returns true. +The [`select`](reference-dsl-builtin-functions.md#select) function takes a map or array as its first argument and a function as its second argument. It includes each input element in the output if the function returns true. -For arrays, that function should take one argument, for array element; for -maps, it should take two, for map-element key and value. In either case it -should return a boolean. +For arrays, that function should take one argument, for an array element; for maps, it should take two, for a map element key and value. In either case, it should return a boolean. -A perhaps helpful analogy: the `select` function is to arrays and maps as the -[`filter`](reference-verbs.md#filter) is to records. +A perhaps helpful analogy: the `select` function is to arrays and maps as the [`filter`](reference-verbs.md#filter) is to records. Array examples: @@ -123,16 +115,11 @@ Values with last digit >= 5: ## apply -The [`apply`](reference-dsl-builtin-functions.md#apply) function takes a map -or array as its first argument and a function as second argument. It applies -the function to each element of the array or map. +The [`apply`](reference-dsl-builtin-functions.md#apply) function takes a map or array as its first argument and a function as its second argument. It applies the function to each element of the array or map. -For arrays, the function should take one argument, for array element; it should -return a new element. For maps, it should take two, for map-element key and -value. 
It should return a new key-value pair (i.e. a single-entry map). +For arrays, the function should take one argument, representing an array element, and return a new element. For maps, it should take two, for the map element key and value. It should return a new key-value pair (i.e., a single-entry map). -A perhaps helpful analogy: the `apply` function is to arrays and maps as the -[`put`](reference-verbs.md#put) is to records. +A perhaps helpful analogy: the `apply` function is to arrays and maps as the [`put`](reference-verbs.md#put) is to records. Array examples: @@ -232,17 +219,11 @@ Same, with upcased keys: ## reduce -The [`reduce`](reference-dsl-builtin-functions.md#reduce) function takes a map -or array as its first argument and a function as second argument. It accumulates entries into a final -output -- for example, sum or product. +The [`reduce`](reference-dsl-builtin-functions.md#reduce) function takes a map or array as its first argument and a function as its second argument. It accumulates entries into a final output, such as a sum or product. -For arrays, the function should take two arguments, for accumulated value and -array element; for maps, it should take four, for accumulated key and value -and map-element key and value. In either case it should return the updated -accumulator. +For arrays, the function should take two arguments, for the accumulated value and the array element; for maps, it should take four, for the accumulated key and value, and the map-element key and value. In either case it should return the updated accumulator. -The start value for the accumulator is the first element for arrays, or the -first element's key-value pair for maps. 
+The start value for the accumulator is the first element for arrays, or the first element's key-value pair for maps.mlr -n put ' @@ -370,10 +351,7 @@ String-join of values: ## fold -The [`fold`](reference-dsl-builtin-functions.md#fold) function is the same as -`reduce`, except that instead of the starting value for the accumulation being -taken from the first entry of the array/map, you specify it as the third -argument. +The [`fold`](reference-dsl-builtin-functions.md#fold) function is the same as `reduce`, except that instead of the starting value for the accumulation being taken from the first entry of the array/map, you specify it as the third argument.mlr -n put ' @@ -469,22 +447,13 @@ Sum of values with fold and 1000000 initial value: ## sort -The [`sort`](reference-dsl-builtin-functions.md#sort) function takes a map or -array as its first argument, and it can take a function as second argument. -Unlike the other higher-order functions, the second argument can be omitted -when the natural ordering is desired -- ordered by array element for arrays, or by -key for maps. +The [`sort`](reference-dsl-builtin-functions.md#sort) function takes a map or array as its first argument, and it can take a function as its second argument. Unlike the other higher-order functions, the second argument can be omitted when the natural ordering is desired -- ordered by array element for arrays, or by key for maps. -As a second option, character flags such as `r` for reverse or `c` for -case-folded lexical sort can be supplied as the second argument. +As a second option, character flags such as `r` for reverse or `c` for case-folded lexical sort can be supplied as the second argument. As a third option, a function can be supplied as the second argument. -For arrays, that function should take two arguments `a` and `b`, returning a -negative, zero, or positive number as `ab` respectively. 
-For maps, the function should take four arguments `ak`, `av`, `bk`, and `bv`, -again returning negative, zero, or positive, using `a` and `b`'s keys and -values. +For arrays, that function should take two arguments `a` and `b`, returning a negative, zero, or positive number as `a<b`, `a==b`, or `a>b` respectively. For maps, the function should take four arguments `ak`, `av`, `bk`, and `bv`, again returning negative, zero, or positive, using `a`'s and `b`'s keys and values. Array examples: @@ -703,9 +672,7 @@ red square false 6 64 77.1991 9.5310 ## Combined examples -Using a paradigm from the [page on operating on all -records](operating-on-all-records.md), we can retain a column from the input -data as an array, then apply some higher-order functions to it: +Using a paradigm from the [page on operating on all records](operating-on-all-records.md), we can retain a column from the input data as an array, then apply some higher-order functions to it:mlr --c2p cat example.csv @@ -776,7 +743,7 @@ Sorted, then cubed, then summed: ### Remember return -From other languages it's easy to accidentally write +From other languages, it's easy to write accidentallymlr -n put 'end { print select([1,2,3,4,5], func (e) { e >= 3 })}' @@ -833,7 +800,7 @@ but this does: 2187-### Built-in functions currently unsupported as arguments +### Built-in functions are currently unsupported as arguments [Built-in functions](reference-dsl-user-defined-functions.md) are, as of September 2021, a bit separate from [user-defined diff --git a/docs/src/reference-dsl-higher-order-functions.md.in b/docs/src/reference-dsl-higher-order-functions.md.in index ed044c006..de5ccbdf9 100644 --- a/docs/src/reference-dsl-higher-order-functions.md.in +++ b/docs/src/reference-dsl-higher-order-functions.md.in @@ -13,23 +13,15 @@ As of [Miller 6](new-in-miller-6.md) you can use intuitive operations on arrays and maps, as an alternative to things which would otherwise require for-loops. 
-See also the [`get_keys`](reference-dsl-builtin-functions.md#get_keys) and -[`get_values`](reference-dsl-builtin-functions.md#get_values) functions which, -when given a map, return an array of its keys or an array of its values, -respectively. +See also the [`get_keys`](reference-dsl-builtin-functions.md#get_keys) and [`get_values`](reference-dsl-builtin-functions.md#get_values) functions which, when given a map, return an array of its keys or an array of its values, respectively. ## select -The [`select`](reference-dsl-builtin-functions.md#select) function takes a map -or array as its first argument and a function as second argument. It includes -each input element in the output if the function returns true. +The [`select`](reference-dsl-builtin-functions.md#select) function takes a map or array as its first argument and a function as its second argument. It includes each input element in the output if the function returns true. -For arrays, that function should take one argument, for array element; for -maps, it should take two, for map-element key and value. In either case it -should return a boolean. +For arrays, that function should take one argument, for an array element; for maps, it should take two, for a map element key and value. In either case, it should return a boolean. -A perhaps helpful analogy: the `select` function is to arrays and maps as the -[`filter`](reference-verbs.md#filter) is to records. +A perhaps helpful analogy: the `select` function is to arrays and maps as the [`filter`](reference-verbs.md#filter) is to records. Array examples: @@ -75,16 +67,11 @@ GENMD-EOF ## apply -The [`apply`](reference-dsl-builtin-functions.md#apply) function takes a map -or array as its first argument and a function as second argument. It applies -the function to each element of the array or map. +The [`apply`](reference-dsl-builtin-functions.md#apply) function takes a map or array as its first argument and a function as its second argument. 
It applies the function to each element of the array or map. -For arrays, the function should take one argument, for array element; it should -return a new element. For maps, it should take two, for map-element key and -value. It should return a new key-value pair (i.e. a single-entry map). +For arrays, the function should take one argument, representing an array element, and return a new element. For maps, it should take two, for the map element key and value. It should return a new key-value pair (i.e., a single-entry map). -A perhaps helpful analogy: the `apply` function is to arrays and maps as the -[`put`](reference-verbs.md#put) is to records. +A perhaps helpful analogy: the `apply` function is to arrays and maps as the [`put`](reference-verbs.md#put) is to records. Array examples: @@ -134,17 +121,11 @@ GENMD-EOF ## reduce -The [`reduce`](reference-dsl-builtin-functions.md#reduce) function takes a map -or array as its first argument and a function as second argument. It accumulates entries into a final -output -- for example, sum or product. +The [`reduce`](reference-dsl-builtin-functions.md#reduce) function takes a map or array as its first argument and a function as its second argument. It accumulates entries into a final output, such as a sum or product. -For arrays, the function should take two arguments, for accumulated value and -array element; for maps, it should take four, for accumulated key and value -and map-element key and value. In either case it should return the updated -accumulator. +For arrays, the function should take two arguments, for the accumulated value and the array element; for maps, it should take four, for the accumulated key and value, and the map-element key and value. In either case it should return the updated accumulator. -The start value for the accumulator is the first element for arrays, or the -first element's key-value pair for maps. 
+The start value for the accumulator is the first element for arrays, or the first element's key-value pair for maps. GENMD-RUN-COMMAND mlr -n put ' @@ -213,10 +194,7 @@ GENMD-EOF ## fold -The [`fold`](reference-dsl-builtin-functions.md#fold) function is the same as -`reduce`, except that instead of the starting value for the accumulation being -taken from the first entry of the array/map, you specify it as the third -argument. +The [`fold`](reference-dsl-builtin-functions.md#fold) function is the same as `reduce`, except that instead of the starting value for the accumulation being taken from the first entry of the array/map, you specify it as the third argument. GENMD-RUN-COMMAND mlr -n put ' @@ -269,22 +247,13 @@ GENMD-EOF ## sort -The [`sort`](reference-dsl-builtin-functions.md#sort) function takes a map or -array as its first argument, and it can take a function as second argument. -Unlike the other higher-order functions, the second argument can be omitted -when the natural ordering is desired -- ordered by array element for arrays, or by -key for maps. +The [`sort`](reference-dsl-builtin-functions.md#sort) function takes a map or array as its first argument, and it can take a function as its second argument. Unlike the other higher-order functions, the second argument can be omitted when the natural ordering is desired -- ordered by array element for arrays, or by key for maps. -As a second option, character flags such as `r` for reverse or `c` for -case-folded lexical sort can be supplied as the second argument. +As a second option, character flags such as `r` for reverse or `c` for case-folded lexical sort can be supplied as the second argument. As a third option, a function can be supplied as the second argument. -For arrays, that function should take two arguments `a` and `b`, returning a -negative, zero, or positive number as `ab` respectively. 
-For maps, the function should take four arguments `ak`, `av`, `bk`, and `bv`, -again returning negative, zero, or positive, using `a` and `b`'s keys and -values. +For arrays, that function should take two arguments `a` and `b`, returning a negative, zero, or positive number as `a<b`, `a==b`, or `a>b` respectively. For maps, the function should take four arguments `ak`, `av`, `bk`, and `bv`, again returning negative, zero, or positive, using `a`'s and `b`'s keys and values. Array examples: @@ -379,9 +348,7 @@ GENMD-EOF ## Combined examples -Using a paradigm from the [page on operating on all -records](operating-on-all-records.md), we can retain a column from the input -data as an array, then apply some higher-order functions to it: +Using a paradigm from the [page on operating on all records](operating-on-all-records.md), we can retain a column from the input data as an array, then apply some higher-order functions to it: GENMD-RUN-COMMAND mlr --c2p cat example.csv @@ -426,7 +393,7 @@ GENMD-EOF ### Remember return -From other languages it's easy to accidentally write +From other languages, it's easy to write accidentally GENMD-RUN-COMMAND-TOLERATING-ERROR mlr -n put 'end { print select([1,2,3,4,5], func (e) { e >= 3 })}' @@ -465,7 +432,7 @@ mlr -n put ' ' GENMD-EOF -### Built-in functions currently unsupported as arguments +### Built-in functions are currently unsupported as arguments [Built-in functions](reference-dsl-user-defined-functions.md) are, as of September 2021, a bit separate from [user-defined diff --git a/docs/src/reference-dsl-operators.md b/docs/src/reference-dsl-operators.md index 921a02913..cdba1ca55 100644 --- a/docs/src/reference-dsl-operators.md +++ b/docs/src/reference-dsl-operators.md @@ -22,7 +22,7 @@ Operators are listed on the [DSL built-in functions page](reference-dsl-builtin- ## Operator precedence -Operators are listed in order of decreasing precedence, highest first. +Operators are listed in order of decreasing precedence, from highest to lowest. 
| Operators | Associativity | |-------------------------------|---------------| @@ -46,14 +46,13 @@ Operators are listed in order of decreasing precedence, highest first. | `? :` | right to left | | `=` | N/A for Miller (there is no $a=$b=$c) | -See also the [section on parsing and operator precedence in the REPL](repl.md#parsing-and-operator-precedence) -for information on how to examine operator precedence interactively. +See also the [section on parsing and operator precedence in the REPL](repl.md#parsing-and-operator-precedence) for information on how to examine operator precedence interactively. ## Operator and function semantics * Functions are often pass-throughs straight to the system-standard Go libraries. -* The [`min`](reference-dsl-builtin-functions.md#min) and [`max`](reference-dsl-builtin-functions.md#max) functions are different from other multi-argument functions which return null if any of their inputs are null: for [`min`](reference-dsl-builtin-functions.md#min) and [`max`](reference-dsl-builtin-functions.md#max), by contrast, if one argument is absent-null, the other is returned. Empty-null loses min or max against numeric or boolean; empty-null is less than any other string. +* The [`min`](reference-dsl-builtin-functions.md#min) and [`max`](reference-dsl-builtin-functions.md#max) functions are different from other multi-argument functions, which return null if any of their inputs are null: for [`min`](reference-dsl-builtin-functions.md#min) and [`max`](reference-dsl-builtin-functions.md#max), by contrast, if one argument is absent-null, the other is returned. Empty-null loses min or max against numeric or boolean; empty-null is less than any other string. * Symmetrically with respect to the bitwise OR, AND, and XOR operators [`|`](reference-dsl-builtin-functions.md#bitwise-or), @@ -71,7 +70,7 @@ for information on how to examine operator precedence interactively. The main use for the `.` operator is for string concatenation: `"abc" . 
"def"` is `"abc.def"`. -However, in Miller 6 it has optional use for map traversal. Example: +However, in Miller 6, it has an optional use for map traversal. Example:cat data/server-log.json @@ -109,8 +108,6 @@ However, in Miller 6 it has optional use for map traversal. Example:bar.baz bar.baz -[ -]This also works on the left-hand sides of assignment statements: @@ -148,7 +145,7 @@ This also works on the left-hand sides of assignment statements: A few caveats: -* This is why `.` has higher precedece than `+` in the table above -- in Miller 5 and below, where `.` was only used for concatenation, it had the same precedence as `+`. So you can now do this: +* This is why `.` has higher precedence than `+` in the table above -- in Miller 5 and below, where `.` was only used for concatenation, it had the same precedence as `+`. So you can now do this:mlr --json --from data/server-log.json put -q ' @@ -157,8 +154,6 @@ A few caveats:6989 -[ -]* However (awkwardly), if you want to use `.` for map-traversal as well as string-concatenation in the same statement, you'll need to insert parentheses, as the default associativity is left-to-right: @@ -170,8 +165,6 @@ A few caveats:(error) -[ -]@@ -181,6 +174,4 @@ A few caveats:GET -- api/check -[ -]diff --git a/docs/src/reference-dsl-operators.md.in b/docs/src/reference-dsl-operators.md.in index 73a92d9e1..a4b0322f4 100644 --- a/docs/src/reference-dsl-operators.md.in +++ b/docs/src/reference-dsl-operators.md.in @@ -6,7 +6,7 @@ Operators are listed on the [DSL built-in functions page](reference-dsl-builtin- ## Operator precedence -Operators are listed in order of decreasing precedence, highest first. +Operators are listed in order of decreasing precedence, from highest to lowest. | Operators | Associativity | |-------------------------------|---------------| @@ -30,14 +30,13 @@ Operators are listed in order of decreasing precedence, highest first. | `? 
:` | right to left | | `=` | N/A for Miller (there is no $a=$b=$c) | -See also the [section on parsing and operator precedence in the REPL](repl.md#parsing-and-operator-precedence) -for information on how to examine operator precedence interactively. +See also the [section on parsing and operator precedence in the REPL](repl.md#parsing-and-operator-precedence) for information on how to examine operator precedence interactively. ## Operator and function semantics * Functions are often pass-throughs straight to the system-standard Go libraries. -* The [`min`](reference-dsl-builtin-functions.md#min) and [`max`](reference-dsl-builtin-functions.md#max) functions are different from other multi-argument functions which return null if any of their inputs are null: for [`min`](reference-dsl-builtin-functions.md#min) and [`max`](reference-dsl-builtin-functions.md#max), by contrast, if one argument is absent-null, the other is returned. Empty-null loses min or max against numeric or boolean; empty-null is less than any other string. +* The [`min`](reference-dsl-builtin-functions.md#min) and [`max`](reference-dsl-builtin-functions.md#max) functions are different from other multi-argument functions, which return null if any of their inputs are null: for [`min`](reference-dsl-builtin-functions.md#min) and [`max`](reference-dsl-builtin-functions.md#max), by contrast, if one argument is absent-null, the other is returned. Empty-null loses min or max against numeric or boolean; empty-null is less than any other string. * Symmetrically with respect to the bitwise OR, AND, and XOR operators [`|`](reference-dsl-builtin-functions.md#bitwise-or), @@ -55,7 +54,7 @@ for information on how to examine operator precedence interactively. The main use for the `.` operator is for string concatenation: `"abc" . "def"` is `"abc.def"`. -However, in Miller 6 it has optional use for map traversal. Example: +However, in Miller 6, it has an optional use for map traversal. 
Example: GENMD-RUN-COMMAND cat data/server-log.json @@ -78,7 +77,7 @@ GENMD-EOF A few caveats: -* This is why `.` has higher precedece than `+` in the table above -- in Miller 5 and below, where `.` was only used for concatenation, it had the same precedence as `+`. So you can now do this: +* This is why `.` has higher precedence than `+` in the table above -- in Miller 5 and below, where `.` was only used for concatenation, it had the same precedence as `+`. So you can now do this: GENMD-RUN-COMMAND mlr --json --from data/server-log.json put -q ' diff --git a/docs/src/reference-dsl-output-statements.md b/docs/src/reference-dsl-output-statements.md index 0984b1fd5..cca9fc4ea 100644 --- a/docs/src/reference-dsl-output-statements.md +++ b/docs/src/reference-dsl-output-statements.md @@ -22,15 +22,15 @@ You can **output** variable-values or expressions in **five ways**: * Use **emit1**/**emit**/**emitp**/**emitf** to send out-of-stream variables' current values to the output record stream, e.g. `@sum += $x; emit1 @sum` which produces an extra record such as `sum=3.1648382`. These records, just like records from input file(s), participate in downstream [then-chaining](reference-main-then-chaining.md) to other verbs. -* Use the **print** or **eprint** keywords which immediately print an expression *directly to standard output or standard error*, respectively. Note that `dump`, `edump`, `print`, and `eprint` don't output records which participate in `then`-chaining; rather, they're just immediate prints to stdout/stderr. The `printn` and `eprintn` keywords are the same except that they don't print final newlines. Additionally, you can print to a specified file instead of stdout/stderr. +* Use the **print** or **eprint** keywords which immediately print an expression *directly to standard output or standard error*, respectively. 
Note that `dump`, `edump`, `print`, and `eprint` don't output records that participate in `then`-chaining; rather, they're just immediate prints to stdout/stderr. The `printn` and `eprintn` keywords are the same except that they don't print final newlines. Additionally, you can print to a specified file instead of stdout/stderr. * Use the **dump** or **edump** keywords, which *immediately print all out-of-stream variables as a JSON data structure to the standard output or standard error* (respectively). -* Use **tee** which formats the current stream record (not just an arbitrary string as with **print**) to a specific file. +* Use **tee**, which formats the current stream record (not just an arbitrary string as with **print**) to a specific file. -For the first two options you are populating the output-records stream which feeds into the next verb in a `then`-chain (if any), or which otherwise is formatted for output using `--o...` flags. +For the first two options, you are populating the output-records stream which feeds into the next verb in a `then`-chain (if any), or which otherwise is formatted for output using `--o...` flags. -For the last three options you are sending output directly to standard output, standard error, or a file. +For the last three options, you are sending output directly to standard output, standard error, or a file. ## Print statements @@ -38,7 +38,7 @@ The `print` statement is perhaps self-explanatory, but with a few light caveats: * There are four variants: `print` goes to stdout with final newline, `printn` goes to stdout without final newline (you can include one using "\n" in your output string), `eprint` goes to stderr with final newline, and `eprintn` goes to stderr without final newline. -* Output goes directly to stdout/stderr, respectively: data produced this way do not go downstream to the next verb in a `then`-chain. (Use `emit` for that.) 
+* Output goes directly to stdout/stderr, respectively: data produced this way does not go downstream to the next verb in a `then`-chain. (Use `emit` for that.) * Print statements are for strings (`print "hello"`), or things which can be made into strings: numbers (`print 3`, `print $a + $b`), or concatenations thereof (`print "a + b = " . ($a + $b)`). Maps (in `$*`, map-valued out-of-stream or local variables, and map literals) as well as arrays are printed as JSON. @@ -62,9 +62,9 @@ The `dump` statement is for printing expressions, including maps, directly to st * There are two variants: `dump` prints to stdout; `edump` prints to stderr. -* Output goes directly to stdout/stderr, respectively: data produced this way do not go downstream to the next verb in a `then`-chain. (Use `emit` for that.) +* Output goes directly to stdout/stderr, respectively: data produced this way does not go downstream to the next verb in a `then`-chain. (Use `emit` for that.) -* You can use `dump` to output single strings, numbers, or expressions including map-valued data. Map-valued data are printed as JSON. +* You can use `dump` to output single strings, numbers, or expressions including map-valued data. Map-valued data is printed as JSON. * If you use `dump` (or `edump`) with no arguments, you get a JSON structure representing the current values of all out-of-stream variables. @@ -76,7 +76,7 @@ The `dump` statement is for printing expressions, including maps, directly to st Records produced by a `mlr put` go downstream to the next verb in your `then`-chain, if any, or otherwise to standard output. If you want to additionally copy out records to files, you can do that using `tee`. -The syntax is, by example: +The syntax is, for example:mlr --from myfile.dat put 'tee > "tap.dat", $*' then sort -n index @@ -84,8 +84,7 @@ The syntax is, by example: First is `tee >`, then the filename expression (which can be an expression such as `"tap.".$a.".dat"`), then a comma, then `$*`. 
(Nothing else but `$*` is teeable.) -You can also write to a variable file name -- for example, you can split a -single file into multiple ones on field names: +You can also write to a variable file name -- for example, you can split a single file into multiple ones on field names:+## surv + +mlr --csv cat example.csv @@ -324,26 +323,12 @@ There are four variants: `emit1`, `emitf`, `emit`, and `emitp`. These are used to insert new records into the record stream -- or, optionally, redirect them to files. -Keep in mind that out-of-stream variables are a nested, multi-level -[map](reference-main-maps.md) (directly viewable as JSON using `dump`), while -Miller record values are as well during processing -- but records may be -flattened down for output to tabular formats. See the page [Flatten/unflatten: -JSON vs. tabular formats](flatten-unflatten.md) for more information. +Keep in mind that out-of-stream variables are a nested, multi-level [map](reference-main-maps.md) (directly viewable as JSON using `dump`), while Miller record values are as well during processing -- but records may be flattened down for output to tabular formats. See the page [Flatten/unflatten: JSON vs. tabular formats](flatten-unflatten.md) for more information. -* You can use `emit1` to emit any map-valued expression, including `$*`, - map-valued out-of-stream variables, the entire out-of-stream-variable - collection `@*`, map-valued local variables, map literals, or map-valued - function return values. -* For `emit`, `emitp`, and `emitf`, you can emit map-valued local variables, - map-valued field attributes (with `$`), map-va out-of-stream variables (with - `@`), `$*`, `@*`, or map literals (with outermost `{...}`) -- but not arbitrary - expressions which evaluate to map (such as function return values). 
+* You can use `emit1` to emit any map-valued expression, including `$*`, map-valued out-of-stream variables, the entire out-of-stream-variable collection `@*`, map-valued local variables, map literals, or map-valued function return values. +* For `emit`, `emitp`, and `emitf`, you can emit map-valued local variables, map-valued field attributes (with `$`), map-valued out-of-stream variables (with `@`), `$*`, `@*`, or map literals (with outermost `{...}`) -- but not arbitrary expressions which evaluate to map (such as function return values). -The reason for this is part historical and part technical. As we'll see below, -you can do lots of syntactical things with `emit`, `emitp`, and `emitf`, -including printing them side-by-side, index them, redirect the output to files, -etc. What this means syntactically is that Miller's parser needs to handle all -sorts of commas, parentheses, and so on: +The reason for this is partly historical and partly technical. As we'll see below, you can do lots of syntactical things with `emit`, `emitp`, and `emitf`, including printing them side-by-side, indexing them, redirecting the output to files, etc. 
What this means syntactically is that Miller's parser needs to handle all sorts of commas, parentheses, and so on:@@ -3794,9 +3831,9 @@ distinct_count 5 5 10000 10000 10000 mode pan wye 1 0.3467901443380824 0.7268028627434533 sum 0 0 50005000 4986.019681679581 5062.057444929905 mean - - 5000.5 0.49860196816795804 0.5062057444929905 -stddev - - 2886.8956799071675 0.2902925151144007 0.290880086426933 -var - - 8334166.666666667 0.08426974433144456 0.08461122467974003 -skewness - - 0 -0.0006899591185521965 -0.017849760120133784 +stddev - - 2886.8956799071675 0.29029251511440074 0.2908800864269331 +var - - 8334166.666666667 0.08426974433144457 0.08461122467974005 +skewness - - 0 -0.0006899591185517494 -0.01784976012013298 minlen 3 3 1 15 13 maxlen 3 3 5 22 22 min eks eks 1 0.00004509679127584487 0.00008818962627266114 @@ -3821,6 +3858,21 @@ mean - - 5000.5 0.49860196816795804 0.5062057444929905 median pan pan 5001 0.5011592202840128 0.5060212582772865emitf @count, @sum @@ -352,12 +337,7 @@ sorts of commas, parentheses, and so on: # etc-When we try to allow `emitf`/`emit`/`emitp` to handle arbitrary map-valued -expressions, like `mapexcept($*, mymap)` and so on, this inserts more syntactic -complexity in terms of commas, parentheses, and so on. The technical term is -_LR-1 shift-reduce conflicts_, but we can simply think of this in terms of the -parser not being able to efficiently disambiguate all the punctuational -opportunities. +When we try to allow `emitf`/`emit`/`emitp` to handle arbitrary map-valued expressions, like `mapexcept($*, mymap)` and so on, this inserts more syntactic complexity in terms of commas, parentheses, and so on. The technical term is _LR-1 shift-reduce conflicts_, but we can think of this in terms of the parser being unable to efficiently disambiguate all the punctuational opportunities. 
So, `emit1` can handle syntactic richness in the one thing being emitted; `emitf`, `emit`, and `emitp` can handle syntactic richness in the side-by-side @@ -365,7 +345,7 @@ placement, indexing, and redirection. (Mnemonic: If all you want is to insert a new record into the record stream, `emit1` is probably the _one_ you want.) -What this means is that if you want to emit an expression which evaluates to a map, you can do quite simply +What this means is that if you want to emit an expression that evaluates to a map, you can do it quite simply:mlr --c2p --from example.csv put -q ' @@ -386,7 +366,7 @@ id color shape flag k index quantity rate 10 purple square false 10 91 72.3735 8.2430-And if you want indexing, redirects, etc., just assign to a temporary variable and use one of the other emit variants: +And if you want indexing, redirects, etc., just assign to a temporary variable and use one of the other `emit` variants:mlr --c2p --from example.csv put -q ' @@ -410,7 +390,7 @@ id color shape flag k index quantity rate ## Emitf statements -Use **emitf** to output several out-of-stream variables side-by-side in the same output record. For `emitf` these mustn't have indexing using `@name[...]`. Example: +Use **emitf** to output several out-of-stream variables side-by-side in the same output record. For `emitf`, these mustn't have indexing using `@name[...]`. Example:@@ -3767,6 +3803,7 @@ Options: -a {mean,sum,etc.} Use only the specified summarizers. -x {mean,sum,etc.} Use all summarizers, except the specified ones. --all Use all available summarizers. +--transpose Show output with field names as column names.. -h|--help Show this message.mlr put -q ' @@ -426,7 +406,7 @@ count=5,x_sum=2.26476,y_sum=2.585083 ## Emit statements -Use **emit** to output an out-of-stream variable. If it's non-indexed you'll get a simple key-value pair: +Use **emit** to output an out-of-stream variable. 
If it's non-indexed, you'll get a simple key-value pair:-Semicolons are required between statements even if those statements are on separate lines. **Newlines** are for your convenience but have no syntactic meaning: line endings do not terminate statements. For example, adjacent assignment statements must be separated by semicolons even if those statements are on separate lines: +Semicolons are required between statements, even if those statements are on separate lines. **Newlines** are for your convenience but have no syntactic meaning: line endings do not terminate statements. For example, adjacent assignment statements must be separated by semicolons even if those statements are on separate lines:cat data/small @@ -455,7 +435,7 @@ a=wye,b=pan,i=5,x=0.573288,y=0.863624 sum=2.26476-If it's indexed then use as many names after `emit` as there are indices: +If it's indexed, then use as many names after `emit` as there are indices:mlr put -q '@sum[$a] += $x; end { dump }' data/small @@ -624,8 +604,7 @@ sum.wye.wye 0.204603 sum.wye.pan 0.573288-Use **--flatsep** to specify the character which joins multilevel -keys for `emitp` (it defaults to a colon): +Use **--flatsep** to specify the character that joins multilevel keys for `emitp` (it defaults to a colon):mlr --flatsep / put -q '@sum[$a][$b] += $x; end { emitp @sum, "a" }' data/small @@ -703,11 +682,11 @@ hat hat 182.8535323148762 381 0.47993053101017374 hat pan 168.5538067327806 363 0.4643355557376876-What this does is walk through the first out-of-stream variable (`@x_sum` in this example) as usual, then for each keylist found (e.g. `pan,wye`), include the values for the remaining out-of-stream variables (here, `@x_count` and `@x_mean`). You should use this when all out-of-stream variables in the emit statement have **the same shape and the same keylists**. 
+What this does is walk through the first out-of-stream variable (`@x_sum` in this example) as usual, then for each keylist found (e.g., `pan,wye`), include the values for the remaining out-of-stream variables (here, `@x_count` and `@x_mean`). You should use this when all out-of-stream variables in the emit statement have **the same shape and the same keylists**. ## Emit-all statements -Use **emit all** (or `emit @*` which is synonymous) to output all out-of-stream variables. You can use the following idiom to get various accumulators output side-by-side (reminiscent of `mlr stats1`): +Use **emit all** (or `emit @*`, which is synonymous) to output all out-of-stream variables. You can use the following idiom to get various accumulators' output side-by-side (reminiscent of `mlr stats1`):mlr --from data/small --opprint put -q ' diff --git a/docs/src/reference-dsl-output-statements.md.in b/docs/src/reference-dsl-output-statements.md.in index 3b42c2bc7..bfc142209 100644 --- a/docs/src/reference-dsl-output-statements.md.in +++ b/docs/src/reference-dsl-output-statements.md.in @@ -6,15 +6,15 @@ You can **output** variable-values or expressions in **five ways**: * Use **emit1**/**emit**/**emitp**/**emitf** to send out-of-stream variables' current values to the output record stream, e.g. `@sum += $x; emit1 @sum` which produces an extra record such as `sum=3.1648382`. These records, just like records from input file(s), participate in downstream [then-chaining](reference-main-then-chaining.md) to other verbs. -* Use the **print** or **eprint** keywords which immediately print an expression *directly to standard output or standard error*, respectively. Note that `dump`, `edump`, `print`, and `eprint` don't output records which participate in `then`-chaining; rather, they're just immediate prints to stdout/stderr. The `printn` and `eprintn` keywords are the same except that they don't print final newlines. 
Additionally, you can print to a specified file instead of stdout/stderr. +* Use the **print** or **eprint** keywords which immediately print an expression *directly to standard output or standard error*, respectively. Note that `dump`, `edump`, `print`, and `eprint` don't output records that participate in `then`-chaining; rather, they're just immediate prints to stdout/stderr. The `printn` and `eprintn` keywords are the same except that they don't print final newlines. Additionally, you can print to a specified file instead of stdout/stderr. * Use the **dump** or **edump** keywords, which *immediately print all out-of-stream variables as a JSON data structure to the standard output or standard error* (respectively). -* Use **tee** which formats the current stream record (not just an arbitrary string as with **print**) to a specific file. +* Use **tee**, which formats the current stream record (not just an arbitrary string as with **print**) to a specific file. -For the first two options you are populating the output-records stream which feeds into the next verb in a `then`-chain (if any), or which otherwise is formatted for output using `--o...` flags. +For the first two options, you are populating the output-records stream which feeds into the next verb in a `then`-chain (if any), or which otherwise is formatted for output using `--o...` flags. -For the last three options you are sending output directly to standard output, standard error, or a file. +For the last three options, you are sending output directly to standard output, standard error, or a file. ## Print statements @@ -22,7 +22,7 @@ The `print` statement is perhaps self-explanatory, but with a few light caveats: * There are four variants: `print` goes to stdout with final newline, `printn` goes to stdout without final newline (you can include one using "\n" in your output string), `eprint` goes to stderr with final newline, and `eprintn` goes to stderr without final newline. 
-* Output goes directly to stdout/stderr, respectively: data produced this way do not go downstream to the next verb in a `then`-chain. (Use `emit` for that.) +* Output goes directly to stdout/stderr, respectively: data produced this way does not go downstream to the next verb in a `then`-chain. (Use `emit` for that.) * Print statements are for strings (`print "hello"`), or things which can be made into strings: numbers (`print 3`, `print $a + $b`), or concatenations thereof (`print "a + b = " . ($a + $b)`). Maps (in `$*`, map-valued out-of-stream or local variables, and map literals) as well as arrays are printed as JSON. @@ -46,9 +46,9 @@ The `dump` statement is for printing expressions, including maps, directly to st * There are two variants: `dump` prints to stdout; `edump` prints to stderr. -* Output goes directly to stdout/stderr, respectively: data produced this way do not go downstream to the next verb in a `then`-chain. (Use `emit` for that.) +* Output goes directly to stdout/stderr, respectively: data produced this way does not go downstream to the next verb in a `then`-chain. (Use `emit` for that.) -* You can use `dump` to output single strings, numbers, or expressions including map-valued data. Map-valued data are printed as JSON. +* You can use `dump` to output single strings, numbers, or expressions including map-valued data. Map-valued data is printed as JSON. * If you use `dump` (or `edump`) with no arguments, you get a JSON structure representing the current values of all out-of-stream variables. @@ -60,7 +60,7 @@ The `dump` statement is for printing expressions, including maps, directly to st Records produced by a `mlr put` go downstream to the next verb in your `then`-chain, if any, or otherwise to standard output. If you want to additionally copy out records to files, you can do that using `tee`. 
-The syntax is, by example: +The syntax is, for example: GENMD-CARDIFY-HIGHLIGHT-ONE mlr --from myfile.dat put 'tee > "tap.dat", $*' then sort -n index @@ -68,8 +68,7 @@ GENMD-EOF First is `tee >`, then the filename expression (which can be an expression such as `"tap.".$a.".dat"`), then a comma, then `$*`. (Nothing else but `$*` is teeable.) -You can also write to a variable file name -- for example, you can split a -single file into multiple ones on field names: +You can also write to a variable file name -- for example, you can split a single file into multiple ones on field names: GENMD-RUN-COMMAND mlr --csv cat example.csv @@ -135,26 +134,12 @@ There are four variants: `emit1`, `emitf`, `emit`, and `emitp`. These are used to insert new records into the record stream -- or, optionally, redirect them to files. -Keep in mind that out-of-stream variables are a nested, multi-level -[map](reference-main-maps.md) (directly viewable as JSON using `dump`), while -Miller record values are as well during processing -- but records may be -flattened down for output to tabular formats. See the page [Flatten/unflatten: -JSON vs. tabular formats](flatten-unflatten.md) for more information. +Keep in mind that out-of-stream variables are a nested, multi-level [map](reference-main-maps.md) (directly viewable as JSON using `dump`), while Miller record values are as well during processing -- but records may be flattened down for output to tabular formats. See the page [Flatten/unflatten: JSON vs. tabular formats](flatten-unflatten.md) for more information. -* You can use `emit1` to emit any map-valued expression, including `$*`, - map-valued out-of-stream variables, the entire out-of-stream-variable - collection `@*`, map-valued local variables, map literals, or map-valued - function return values. 
-* For `emit`, `emitp`, and `emitf`, you can emit map-valued local variables, - map-valued field attributes (with `$`), map-va out-of-stream variables (with - `@`), `$*`, `@*`, or map literals (with outermost `{...}`) -- but not arbitrary - expressions which evaluate to map (such as function return values). +* You can use `emit1` to emit any map-valued expression, including `$*`, map-valued out-of-stream variables, the entire out-of-stream-variable collection `@*`, map-valued local variables, map literals, or map-valued function return values. +* For `emit`, `emitp`, and `emitf`, you can emit map-valued local variables, map-valued field attributes (with `$`), map-valued out-of-stream variables (with `@`), `$*`, `@*`, or map literals (with outermost `{...}`) -- but not arbitrary expressions which evaluate to map (such as function return values). -The reason for this is part historical and part technical. As we'll see below, -you can do lots of syntactical things with `emit`, `emitp`, and `emitf`, -including printing them side-by-side, index them, redirect the output to files, -etc. What this means syntactically is that Miller's parser needs to handle all -sorts of commas, parentheses, and so on: +The reason for this is partly historical and partly technical. As we'll see below, you can do lots of syntactical things with `emit`, `emitp`, and `emitf`, including printing them side-by-side, indexing them, redirecting the output to files, etc. What this means syntactically is that Miller's parser needs to handle all sorts of commas, parentheses, and so on: GENMD-CARDIFY emitf @count, @sum @@ -163,12 +148,7 @@ GENMD-CARDIFY # etc GENMD-EOF -When we try to allow `emitf`/`emit`/`emitp` to handle arbitrary map-valued -expressions, like `mapexcept($*, mymap)` and so on, this inserts more syntactic -complexity in terms of commas, parentheses, and so on. 
The technical term is -_LR-1 shift-reduce conflicts_, but we can simply think of this in terms of the -parser not being able to efficiently disambiguate all the punctuational -opportunities. +When we try to allow `emitf`/`emit`/`emitp` to handle arbitrary map-valued expressions, like `mapexcept($*, mymap)` and so on, this inserts more syntactic complexity in terms of commas, parentheses, and so on. The technical term is _LR-1 shift-reduce conflicts_, but we can think of this in terms of the parser being unable to efficiently disambiguate all the punctuational opportunities. So, `emit1` can handle syntactic richness in the one thing being emitted; `emitf`, `emit`, and `emitp` can handle syntactic richness in the side-by-side @@ -176,7 +156,7 @@ placement, indexing, and redirection. (Mnemonic: If all you want is to insert a new record into the record stream, `emit1` is probably the _one_ you want.) -What this means is that if you want to emit an expression which evaluates to a map, you can do quite simply +What this means is that if you want to emit an expression that evaluates to a map, you can do it quite simply: GENMD-RUN-COMMAND mlr --c2p --from example.csv put -q ' @@ -184,7 +164,7 @@ mlr --c2p --from example.csv put -q ' ' GENMD-EOF -And if you want indexing, redirects, etc., just assign to a temporary variable and use one of the other emit variants: +And if you want indexing, redirects, etc., just assign to a temporary variable and use one of the other `emit` variants: GENMD-RUN-COMMAND mlr --c2p --from example.csv put -q ' @@ -195,7 +175,7 @@ GENMD-EOF ## Emitf statements -Use **emitf** to output several out-of-stream variables side-by-side in the same output record. For `emitf` these mustn't have indexing using `@name[...]`. Example: +Use **emitf** to output several out-of-stream variables side-by-side in the same output record. For `emitf`, these mustn't have indexing using `@name[...]`. 
Example: GENMD-RUN-COMMAND mlr put -q ' @@ -208,7 +188,7 @@ GENMD-EOF ## Emit statements -Use **emit** to output an out-of-stream variable. If it's non-indexed you'll get a simple key-value pair: +Use **emit** to output an out-of-stream variable. If it's non-indexed, you'll get a simple key-value pair: GENMD-RUN-COMMAND cat data/small @@ -222,7 +202,7 @@ GENMD-RUN-COMMAND mlr put -q '@sum += $x; end { emit @sum }' data/small GENMD-EOF -If it's indexed then use as many names after `emit` as there are indices: +If it's indexed, then use as many names after `emit` as there are indices: GENMD-RUN-COMMAND mlr put -q '@sum[$a] += $x; end { dump }' data/small @@ -277,8 +257,7 @@ GENMD-RUN-COMMAND mlr --oxtab put -q '@sum[$a][$b] += $x; end { emitp @sum }' data/small GENMD-EOF -Use **--flatsep** to specify the character which joins multilevel -keys for `emitp` (it defaults to a colon): +Use **--flatsep** to specify the character that joins multilevel keys for `emitp` (it defaults to a colon): GENMD-RUN-COMMAND mlr --flatsep / put -q '@sum[$a][$b] += $x; end { emitp @sum, "a" }' data/small @@ -313,11 +292,11 @@ mlr --from data/medium --opprint put -q ' ' GENMD-EOF -What this does is walk through the first out-of-stream variable (`@x_sum` in this example) as usual, then for each keylist found (e.g. `pan,wye`), include the values for the remaining out-of-stream variables (here, `@x_count` and `@x_mean`). You should use this when all out-of-stream variables in the emit statement have **the same shape and the same keylists**. +What this does is walk through the first out-of-stream variable (`@x_sum` in this example) as usual, then for each keylist found (e.g., `pan,wye`), include the values for the remaining out-of-stream variables (here, `@x_count` and `@x_mean`). You should use this when all out-of-stream variables in the emit statement have **the same shape and the same keylists**. 
## Emit-all statements -Use **emit all** (or `emit @*` which is synonymous) to output all out-of-stream variables. You can use the following idiom to get various accumulators output side-by-side (reminiscent of `mlr stats1`): +Use **emit all** (or `emit @*`, which is synonymous) to output all out-of-stream variables. You can use the following idiom to get various accumulators' output side-by-side (reminiscent of `mlr stats1`): GENMD-RUN-COMMAND mlr --from data/small --opprint put -q ' diff --git a/docs/src/reference-dsl-syntax.md b/docs/src/reference-dsl-syntax.md index f2a8b45cb..9b51cdd61 100644 --- a/docs/src/reference-dsl-syntax.md +++ b/docs/src/reference-dsl-syntax.md @@ -63,7 +63,7 @@ hat wye 10002 0.321507044286237609 0.568893318795083758 5 9 4 2 data/s pan zee 10003 0.272054845593895200 0.425789896597056627 5 10 5 2 data/small2-Anything from a `#` character to end of line is a code comment. +Anything from a `#` character to the end of the line is a code comment.mlr --opprint filter '($x > 0.5 && $y < 0.5) || ($x < 0.5 && $y > 0.5)' \ @@ -147,11 +147,11 @@ a=eks,b=wye,i=4,x=0.381399,y=0.134188,xy=0.40431623334340655 a=wye,b=pan,i=5,x=0.573288,y=0.863624,xy=1.036583592538489-A suggested use-case here is defining functions in files, and calling them from command-line expressions. +A suggested use case here is defining functions in files and calling them from command-line expressions. -Another suggested use-case is putting default parameter values in files, e.g. using `begin{@count=is_present(@count)?@count:10}` in the file, where you can precede that using `begin{@count=40}` using `-e`. +Another suggested use case is putting default parameter values in files, e.g., using `begin{@count=is_present(@count)?@count:10}` in the file, where you can precede that using `begin{@count=40}` using `-e`. -Moreover, you can have one or more `-f` expressions (maybe one function per file, for example) and one or more `-e` expressions on the command line. 
If you mix `-f` and `-e` then the expressions are evaluated in the order encountered. +Moreover, you can have one or more `-f` expressions (maybe one function per file, for example) and one or more `-e` expressions on the command line. If you mix `-f` and `-e`, then the expressions are evaluated in the order encountered. ## Semicolons, commas, newlines, and curly braces @@ -180,7 +180,7 @@ x=1,y=2,3=,4=,5=,6=,7=,8=,9=,10=,foo=bar x=1,y=2,3=,4=,5=,6=,7=,8=,9=,10=,foo=barmlr put ' diff --git a/docs/src/reference-dsl-syntax.md.in b/docs/src/reference-dsl-syntax.md.in index aa918c944..46e71b81f 100644 --- a/docs/src/reference-dsl-syntax.md.in +++ b/docs/src/reference-dsl-syntax.md.in @@ -21,7 +21,7 @@ mlr --opprint put ' ' data/small data/small2 GENMD-EOF -Anything from a `#` character to end of line is a code comment. +Anything from a `#` character to the end of the line is a code comment. GENMD-RUN-COMMAND mlr --opprint filter '($x > 0.5 && $y < 0.5) || ($x < 0.5 && $y > 0.5)' \ @@ -62,11 +62,11 @@ GENMD-RUN-COMMAND mlr --from data/small put -f data/fe-example-4.mlr -e '$xy = f($x, $y)' GENMD-EOF -A suggested use-case here is defining functions in files, and calling them from command-line expressions. +A suggested use case here is defining functions in files and calling them from command-line expressions. -Another suggested use-case is putting default parameter values in files, e.g. using `begin{@count=is_present(@count)?@count:10}` in the file, where you can precede that using `begin{@count=40}` using `-e`. +Another suggested use case is putting default parameter values in files, e.g., using `begin{@count=is_present(@count)?@count:10}` in the file, where you can precede that using `begin{@count=40}` using `-e`. -Moreover, you can have one or more `-f` expressions (maybe one function per file, for example) and one or more `-e` expressions on the command line. If you mix `-f` and `-e` then the expressions are evaluated in the order encountered. 
+Moreover, you can have one or more `-f` expressions (maybe one function per file, for example) and one or more `-e` expressions on the command line. If you mix `-f` and `-e`, then the expressions are evaluated in the order encountered. ## Semicolons, commas, newlines, and curly braces @@ -84,7 +84,7 @@ GENMD-RUN-COMMAND echo x=1,y=2 | mlr put 'while (NF < 10) { $[NF+1] = ""}; $foo = "bar"' GENMD-EOF -Semicolons are required between statements even if those statements are on separate lines. **Newlines** are for your convenience but have no syntactic meaning: line endings do not terminate statements. For example, adjacent assignment statements must be separated by semicolons even if those statements are on separate lines: +Semicolons are required between statements, even if those statements are on separate lines. **Newlines** are for your convenience but have no syntactic meaning: line endings do not terminate statements. For example, adjacent assignment statements must be separated by semicolons even if those statements are on separate lines: GENMD-INCLUDE-ESCAPED(data/newline-example.txt) diff --git a/docs/src/reference-dsl-time.md b/docs/src/reference-dsl-time.md index 867bc8dc1..0a3aa721e 100644 --- a/docs/src/reference-dsl-time.md +++ b/docs/src/reference-dsl-time.md @@ -89,7 +89,7 @@ the [ISO8601](https://en.wikipedia.org/wiki/ISO_8601) format. This was the first (and initially only) human-readable date/time format supported by Miller going all the way back to Miller 1.0.0. -You can get these from epoch-seconds using the +You can get these from epoch-seconds using the [sec2gmt](reference-dsl-builtin-functions.md#sec2gmt) DSL function. (Note that the terms _UTC_ and _GMT_ are used interchangeably in Miller.) We also have [sec2gmtdate](reference-dsl-builtin-functions.md#sec2gmtdate) DSL function. @@ -200,7 +200,7 @@ mlr: TZ environment variable appears malformed: "This/Is/A/Typo" Note that for local times, Miller omits the `T` and the `Z` you see in GMT times. 
-We also have the +We also have the [gmt2localtime](reference-dsl-builtin-functions.md#gmt2localtime) and [localtime2gmt](reference-dsl-builtin-functions.md#localtime2gmt) convenience functions: diff --git a/docs/src/reference-dsl-time.md.in b/docs/src/reference-dsl-time.md.in index e2e02c397..869a58495 100644 --- a/docs/src/reference-dsl-time.md.in +++ b/docs/src/reference-dsl-time.md.in @@ -67,7 +67,7 @@ the [ISO8601](https://en.wikipedia.org/wiki/ISO_8601) format. This was the first (and initially only) human-readable date/time format supported by Miller going all the way back to Miller 1.0.0. -You can get these from epoch-seconds using the +You can get these from epoch-seconds using the [sec2gmt](reference-dsl-builtin-functions.md#sec2gmt) DSL function. (Note that the terms _UTC_ and _GMT_ are used interchangeably in Miller.) We also have [sec2gmtdate](reference-dsl-builtin-functions.md#sec2gmtdate) DSL function. @@ -142,7 +142,7 @@ GENMD-EOF Note that for local times, Miller omits the `T` and the `Z` you see in GMT times. -We also have the +We also have the [gmt2localtime](reference-dsl-builtin-functions.md#gmt2localtime) and [localtime2gmt](reference-dsl-builtin-functions.md#localtime2gmt) convenience functions: diff --git a/docs/src/reference-dsl-user-defined-functions.md b/docs/src/reference-dsl-user-defined-functions.md index d2be5a162..5197701de 100644 --- a/docs/src/reference-dsl-user-defined-functions.md +++ b/docs/src/reference-dsl-user-defined-functions.md @@ -16,7 +16,7 @@ Quick links: # DSL user-defined functions -As of Miller 5.0.0 you can define your own functions, as well as subroutines. +As of Miller 5.0.0, you can define your own functions, as well as subroutines. ## User-defined functions @@ -49,7 +49,7 @@ wye pan 5 0.573288 0.863624 211.38663947090302 120 Properties of user-defined functions: -* Function bodies start with `func` and a parameter list, defined outside of `begin`, `end`, or other `func` or `subr` blocks. (I.e. 
the Miller DSL has no nested functions.) +* Function bodies start with `func` and a parameter list, defined outside of `begin`, `end`, or other `func` or `subr` blocks. (I.e., the Miller DSL has no nested functions.) * A function (uniqified by its name) may not be redefined: either by redefining a user-defined function, or by redefining a built-in function. However, functions and subroutines have separate namespaces: you can define a subroutine `log` (for logging messages to stderr, say) which does not clash with the mathematical `log` (logarithm) function. @@ -61,7 +61,7 @@ Properties of user-defined functions: * When a return value is not implicitly returned, this results in a return value of [absent-null](reference-main-null-data.md). (In the example above, if there were records for which the argument to `f` is non-numeric, the assignments would be skipped.) See also the [null-data reference page](reference-main-null-data.md). -* See the section on [Local variables](reference-dsl-variables.md#local-variables) for information on scope and extent of arguments, as well as for information on the use of local variables within functions. +* See the section on [Local variables](reference-dsl-variables.md#local-variables) for information on the scope and extent of arguments, as well as for information on the use of local variables within functions. * See the section on [Expressions from files](reference-dsl-syntax.md#expressions-from-files) for information on the use of `-f` and `-e` flags. @@ -103,7 +103,7 @@ numcalls=15 Properties of user-defined subroutines: -* Subroutine bodies start with `subr` and a parameter list, defined outside of `begin`, `end`, or other `func` or `subr` blocks. (I.e. the Miller DSL has no nested subroutines.) +* Subroutine bodies start with `subr` and a parameter list, defined outside of `begin`, `end`, or other `func` or `subr` blocks. (I.e., the Miller DSL has no nested subroutines.) 
* A subroutine (uniqified by its name) may not be redefined. However, functions and subroutines have separate namespaces: you can define a subroutine `log` which does not clash with the mathematical `log` function. @@ -115,7 +115,7 @@ Properties of user-defined subroutines: * Argument values may be reassigned: they are not read-only. -* See the section on [local variables](reference-dsl-variables.md#local-variables) for information on scope and extent of arguments, as well as for information on the use of local variables within functions. +* See the section on [local variables](reference-dsl-variables.md#local-variables) for information on the scope and extent of arguments, as well as for information on the use of local variables within functions. * See the section on [Expressions from files](reference-dsl-syntax.md#expressions-from-files) for information on the use of `-f` and `-e` flags. @@ -123,15 +123,11 @@ Properties of user-defined subroutines: Subroutines cannot return values, and they are invoked by the keyword `call`. -In hindsight, subroutines needn't have been invented. If `foo` is a function -then you can write `foo(1,2,3)` while ignoring its return value, and that plays -the role of subroutine quite well. +In hindsight, subroutines needn't have been invented. If `foo` is a function, then you can write `foo(1,2,3)` while ignoring its return value, and that plays the role of a subroutine quite well. ## Loading a library of functions -If you have a file with UDFs you use frequently, say `my-udfs.mlr`, you can use -`--load` or `--mload` to define them for your Miller scripts. For example, in -your shell, +If you have a file with UDFs you use frequently, say `my-udfs.mlr`, you can use `--load` or `--mload` to define them for your Miller scripts. For example, in your shell,## step @@ -3685,6 +3719,8 @@ for the old string and not handling multiple matches, like the `sub` DSL functio
Options: -f {a,b,c} Field names to convert. +-r {regex} Regular expression for field names to convert. +-a Convert all fields. -h|--help Show this message.alias mlr='mlr --load ~/my-functions.mlr' @@ -149,8 +145,7 @@ See the [miscellaneous-flags page](reference-main-flag-list.md#miscellaneous-fla You can define unnamed functions and assign them to variables, or pass them to functions. -See also the [page on higher-order functions](reference-dsl-higher-order-functions.md) -for more information on +See also the [page on higher-order functions](reference-dsl-higher-order-functions.md) for more information on [`select`](reference-dsl-builtin-functions.md#select), [`apply`](reference-dsl-builtin-functions.md#apply), [`reduce`](reference-dsl-builtin-functions.md#reduce), @@ -209,9 +204,7 @@ purple square false 10 91 72.3735 8.2430 purple:square above Note that you need a semicolon after the closing curly brace of the function literal. -Unlike named functions, function literals (also known as unnamed functions) -have access to local variables defined in their enclosing scope. That's -so you can do things like this: +Unlike named functions, function literals (also known as unnamed functions) have access to local variables defined in their enclosing scope. That's so you can do things like this:mlr --c2p --from example.csv put ' diff --git a/docs/src/reference-dsl-user-defined-functions.md.in b/docs/src/reference-dsl-user-defined-functions.md.in index c9f0c6d7c..4d8bb0c18 100644 --- a/docs/src/reference-dsl-user-defined-functions.md.in +++ b/docs/src/reference-dsl-user-defined-functions.md.in @@ -1,6 +1,6 @@ # DSL user-defined functions -As of Miller 5.0.0 you can define your own functions, as well as subroutines. +As of Miller 5.0.0, you can define your own functions, as well as subroutines. 
## User-defined functions @@ -25,7 +25,7 @@ GENMD-EOF Properties of user-defined functions: -* Function bodies start with `func` and a parameter list, defined outside of `begin`, `end`, or other `func` or `subr` blocks. (I.e. the Miller DSL has no nested functions.) +* Function bodies start with `func` and a parameter list, defined outside of `begin`, `end`, or other `func` or `subr` blocks. (I.e., the Miller DSL has no nested functions.) * A function (uniqified by its name) may not be redefined: either by redefining a user-defined function, or by redefining a built-in function. However, functions and subroutines have separate namespaces: you can define a subroutine `log` (for logging messages to stderr, say) which does not clash with the mathematical `log` (logarithm) function. @@ -37,7 +37,7 @@ Properties of user-defined functions: * When a return value is not implicitly returned, this results in a return value of [absent-null](reference-main-null-data.md). (In the example above, if there were records for which the argument to `f` is non-numeric, the assignments would be skipped.) See also the [null-data reference page](reference-main-null-data.md). -* See the section on [Local variables](reference-dsl-variables.md#local-variables) for information on scope and extent of arguments, as well as for information on the use of local variables within functions. +* See the section on [Local variables](reference-dsl-variables.md#local-variables) for information on the scope and extent of arguments, as well as for information on the use of local variables within functions. * See the section on [Expressions from files](reference-dsl-syntax.md#expressions-from-files) for information on the use of `-f` and `-e` flags. @@ -67,7 +67,7 @@ GENMD-EOF Properties of user-defined subroutines: -* Subroutine bodies start with `subr` and a parameter list, defined outside of `begin`, `end`, or other `func` or `subr` blocks. (I.e. the Miller DSL has no nested subroutines.) 
+* Subroutine bodies start with `subr` and a parameter list, defined outside of `begin`, `end`, or other `func` or `subr` blocks. (I.e., the Miller DSL has no nested subroutines.) * A subroutine (uniqified by its name) may not be redefined. However, functions and subroutines have separate namespaces: you can define a subroutine `log` which does not clash with the mathematical `log` function. @@ -79,7 +79,7 @@ Properties of user-defined subroutines: * Argument values may be reassigned: they are not read-only. -* See the section on [local variables](reference-dsl-variables.md#local-variables) for information on scope and extent of arguments, as well as for information on the use of local variables within functions. +* See the section on [local variables](reference-dsl-variables.md#local-variables) for information on the scope and extent of arguments, as well as for information on the use of local variables within functions. * See the section on [Expressions from files](reference-dsl-syntax.md#expressions-from-files) for information on the use of `-f` and `-e` flags. @@ -87,15 +87,11 @@ Properties of user-defined subroutines: Subroutines cannot return values, and they are invoked by the keyword `call`. -In hindsight, subroutines needn't have been invented. If `foo` is a function -then you can write `foo(1,2,3)` while ignoring its return value, and that plays -the role of subroutine quite well. +In hindsight, subroutines needn't have been invented. If `foo` is a function, then you can write `foo(1,2,3)` while ignoring its return value, and that plays the role of a subroutine quite well. ## Loading a library of functions -If you have a file with UDFs you use frequently, say `my-udfs.mlr`, you can use -`--load` or `--mload` to define them for your Miller scripts. For example, in -your shell, +If you have a file with UDFs you use frequently, say `my-udfs.mlr`, you can use `--load` or `--mload` to define them for your Miller scripts. 
For example, in your shell, GENMD-CARDIFY-HIGHLIGHT-ONE alias mlr='mlr --load ~/my-functions.mlr' @@ -113,8 +109,7 @@ See the [miscellaneous-flags page](reference-main-flag-list.md#miscellaneous-fla You can define unnamed functions and assign them to variables, or pass them to functions. -See also the [page on higher-order functions](reference-dsl-higher-order-functions.md) -for more information on +See also the [page on higher-order functions](reference-dsl-higher-order-functions.md) for more information on [`select`](reference-dsl-builtin-functions.md#select), [`apply`](reference-dsl-builtin-functions.md#apply), [`reduce`](reference-dsl-builtin-functions.md#reduce), @@ -147,9 +142,7 @@ GENMD-EOF Note that you need a semicolon after the closing curly brace of the function literal. -Unlike named functions, function literals (also known as unnamed functions) -have access to local variables defined in their enclosing scope. That's -so you can do things like this: +Unlike named functions, function literals (also known as unnamed functions) have access to local variables defined in their enclosing scope. That's so you can do things like this: GENMD-RUN-COMMAND mlr --c2p --from example.csv put ' diff --git a/docs/src/reference-dsl-variables.md b/docs/src/reference-dsl-variables.md index 85ad66051..161afc018 100644 --- a/docs/src/reference-dsl-variables.md +++ b/docs/src/reference-dsl-variables.md @@ -18,11 +18,11 @@ Quick links: Miller has the following kinds of variables: -**Fields of stream records**, accessed using the `$` prefix. These refer to fields of the current data-stream record. For example, in `echo x=1,y=2 | mlr put '$z = $x + $y'`, `$x` and `$y` refer to input fields, and `$z` refers to a new, computed output field. In a few contexts, presented below, you can refer to the entire record as `$*`. +**Fields of stream records**, accessed using the `$` prefix. These refer to fields of the current data-stream record. 
For example, in `echo x=1,y=2 | mlr put '$z = $x + $y'`, `$x` and `$y` refer to input fields, and `$z` refers to a new, computed output field. In the following contexts, you can refer to the entire record as `$*`. -**Out-of-stream variables** accessed using the `@` prefix. These refer to data which persist from one record to the next, including in `begin` and `end` blocks (which execute before/after the record stream is consumed, respectively). You use them to remember values across records, such as sums, differences, counters, and so on. In a few contexts, presented below, you can refer to the entire out-of-stream-variables collection as `@*`. +**Out-of-stream variables** accessed using the `@` prefix. These refer to data that persists from one record to the next, including in `begin` and `end` blocks (which execute before/after the record stream is consumed, respectively). You use them to remember values across records, such as sums, differences, and counters, among other things. In the following contexts, you can refer to the entire out-of-stream-variables collection as `@*`. -**Local variables** are limited in scope and extent to the current statements being executed: these include function arguments, bound variables in for loops, and local variables. +**Local variables** are limited in scope and extent to the current statements being executed, including function arguments, bound variables in for loops, and local variables. **Built-in variables** such as `NF`, `NR`, `FILENAME`, `M_PI`, and `M_E`. These are all capital letters and are read-only (although some of them change value from one record to another). @@ -32,7 +32,7 @@ Miller has the following kinds of variables: Names of fields within stream records must be specified using a `$` in [filter and put expressions](reference-dsl.md), even though the dollar signs don't appear in the data stream itself. 
For integer-indexed data, this looks like `awk`'s `$1,$2,$3`, except that Miller allows non-numeric names such as `$quantity` or `$hostname`. Likewise, enclose string literals in double quotes in `filter` expressions even though they don't appear in file data. In particular, `mlr filter '$x=="abc"'` passes through the record `x=abc`. -If field names have **special characters** such as `.` then you can use braces, e.g. `'${field.name}'`. +If field names have **special characters** such as `.`, then you can use braces, e.g. `'${field.name}'`. You may also use a **computed field name** in square brackets, e.g. @@ -55,7 +55,7 @@ Their **extent** is limited to the current record; their **scope** is the `filte These are **read-write**: you can do `$y=2*$x`, `$x=$x+1`, etc. -Records are Miller's output: field names present in the input stream are passed through to output (written to standard output) unless fields are removed with `cut`, or records are excluded with `filter` or `put -q`, etc. Simply assign a value to a field and it will be output. +Records are Miller's output: field names present in the input stream are passed through to output (written to standard output) unless fields are removed with `cut`, or records are excluded with `filter` or `put -q`, etc. Simply assign a value to a field, and it will be output. ## Positional field names @@ -63,7 +63,7 @@ Even though Miller's main selling point is name-indexing, sometimes you really w Use `$[[3]]` to access the name of field 3. More generally, any expression evaluating to an integer can go between `$[[` and `]]`. -Then using a computed field name, `$[ $[[3]] ]` is the value in the third field. This has the shorter equivalent notation `$[[[3]]]`. +Then, using a computed field name, `$[ $[[3]] ]` is the value in the third field. This has the shorter equivalent notation `$[[[3]]]`.-Out-of-stream variables' **extent** is from the start to the end of the record stream, i.e. 
every time the `put` or `filter` statement referring to them is executed. +Out-of-stream variables' **extent** is from the start to the end of the record stream, i.e., every time the `put` or `filter` statement referring to them is executed. Out-of-stream variables are **read-write**: you can do `$sum=@sum`, `@sum=$sum`, etc. ## Indexed out-of-stream variables -Using an index on the `@count` and `@sum` variables, we get the benefit of the `-g` (group-by) option which `mlr stats1` and various other Miller commands have: +Using an index on the `@count` and `@sum` variables, we get the benefit of the `-g` (group-by) option, which `mlr stats1` and various other Miller commands have:mlr cat data/small @@ -131,7 +131,7 @@ a=eks,b=wye,i=4,x=NEW,y=0.134188 a=wye,b=pan,i=5,x=0.573288,y=NEW-Right-hand side accesses to non-existent fields -- i.e. with index less than 1 or greater than `NF` -- return an absent value. Likewise, left-hand side accesses only refer to fields which already exist. For example, if a field has 5 records then assigning the name or value of the 6th (or 600th) field results in a no-op. +Right-hand side accesses to non-existent fields -- i.e., with index less than 1 or greater than `NF` -- return an absent value. Likewise, left-hand side accesses only refer to fields that already exist. For example, if a record has 5 fields, then assigning the name or value of the 6th (or 600th) field results in a no-op.mlr put '$[[6]] = "NEW"' data/small @@ -155,11 +155,15 @@ a=eks,b=wye,i=4,x=0.381399,y=0.134188 a=wye,b=pan,i=5,x=0.573288,y=0.863624+!!! note + + You can use positional field names only in the [Miller DSL](reference-dsl.md), i.e., only with the verbs `put` and `filter`. + ## Out-of-stream variables -These are prefixed with an at-sign, e.g. `@sum`. 
Furthermore, unlike built-in variables and stream-record fields, they are maintained in an arbitrarily nested map: you can do `@sum += $quantity`, or `@sum[$color] += $quantity`, or `@sum[$color][$shape] += $quantity`. The keys for the multi-level map can be any expression which evaluates to string or integer: e.g. `@sum[NR] = $a + $b`, `@sum[$a."-".$b] = $x`, etc. +These are prefixed with an at-sign, e.g., `@sum`. Furthermore, unlike built-in variables and stream-record fields, they are maintained in an arbitrarily nested map: you can do `@sum += $quantity`, or `@sum[$color] += $quantity`, or `@sum[$color][$shape] += $quantity`. The keys for the multi-level map can be any expression that evaluates to string or integer: e.g. `@sum[NR] = $a + $b`, `@sum[$a."-".$b] = $x`, etc. -Their names and their values are entirely under your control; they change only when you assign to them. +Their names and their values are entirely under your control; they change only when you assign to them. Just as for field names in stream records, if you want to define out-of-stream variables with **special characters** such as `.` then you can use braces, e.g. `'@{variable.name}["index"]'`. @@ -194,13 +198,13 @@ sum=5 sum=50mlr put -q ' @@ -305,8 +309,8 @@ Local variables are similar to out-of-stream variables, except that their extent For example:-# Here I'm using a specified random-number seed so this example always -# produces the same output for this web document: in everyday practice we +# Here I'm using a specified random-number seed, so this example always +# produces the same output for this web document: in everyday practice, we # would leave off the --seed 12345 part. mlr --seed 12345 seqgen --start 1 --stop 10 then put ' func f(a, b) { # function arguments a and b @@ -337,7 +341,7 @@ i=10,o=15.37686787628025 Things which are completely unsurprising, resembling many other languages: -* Parameter names are bound to their arguments but can be reassigned, e.g. 
if there is a parameter named `a` then you can reassign the value of `a` to be something else within the function if you like. +* Parameter names are bound to their arguments but can be reassigned, e.g., if there is a parameter named `a`, then you can reassign the value of `a` to be something else within the function if you like. * However, you cannot redeclare the *type* of an argument or a local: `var a=1; var a=2` is an error but `var a=1; a=2` is OK. @@ -351,13 +355,13 @@ Things which are completely unsurprising, resembling many other languages: Things which are perhaps surprising compared to other languages: -* Type declarations using `var`, or typed using `num`, `int`, `float`, `str`, `bool`, `arr`, `map`, `funct` are not necessary to declare local variables. Function arguments and variables bound in for-loops over stream records and out-of-stream variables are *implicitly* declared using `var`. (Some examples are shown below.) +* Type declarations using `var`, or typed using `num`, `int`, `float`, `str`, `bool`, `arr`, `map`, `funct`, are not necessary to declare local variables. Function arguments and variables bound in for-loops over stream records and out-of-stream variables are *implicitly* declared using `var`. (Some examples are shown below.) -* Type-checking is done at assignment time. For example, `float f = 0` is an error (since `0` is an integer), as is `float f = 0.0; f = 1`. For this reason I prefer to use `num` over `float` in most contexts since `num` encompasses integer and floating-point values. More information is at [Type-checking](reference-dsl-variables.md#type-checking). +* Type-checking is done at assignment time. For example, `float f = 0` is an error (since `0` is an integer), as is `float f = 0.0; f = 1`. For this reason, I prefer to use `num` over `float` in most contexts, as `num` encompasses both integer and floating-point values. For more information, refer to [Type-checking](reference-dsl-variables.md#type-checking). 
* Bound variables in for-loops over stream records and out-of-stream variables are implicitly local to that block. E.g. in `for (k, v in $*) { ... }` `for ((k1, k2), v in @*) { ... }` if there are `k`, `v`, etc. in the enclosing scope then those will be masked by the loop-local bound variables in the loop, and moreover the values of the loop-local bound variables are not available after the end of the loop. -* For C-style triple-for loops, if a for-loop variable is defined using `var`, `int`, etc. then it is scoped to that for-loop. E.g. `for (i = 0; i < 10; i += 1) { ... }` and `for (int i = 0; i < 10; i += 1) { ... }`. (This is unsurprising.). If there is no typedecl and an outer-scope variable of that name exists, then it is used. (This is also unsurprising.) But if there is no outer-scope variable of that name, then the variable is scoped to the for-loop only. +* For C-style triple-for loops, if a for-loop variable is defined using `var`, `int`, etc., then it is scoped to that for-loop. E.g. `for (i = 0; i < 10; i += 1) { ... }` and `for (int i = 0; i < 10; i += 1) { ... }`. (This is unsurprising.). If there is no typedecl and an outer-scope variable of that name exists, then it is used. (This is also unsurprising.) But if there is no outer-scope variable of that name, then the variable is scoped to the for-loop only. The following example demonstrates the scope rules: @@ -474,7 +478,7 @@ print "outer j =", j; # j is undefined in this scope. ## Map literals -Miller's `put`/`filter` DSL has four kinds of maps. **Stream records** are (single-level) maps from name to value. **Out-of-stream variables** and **local variables** can also be maps, although they can be multi-level maps (e.g. `@sum[$x][$y]`). The fourth kind is **map literals**. These cannot be on the left-hand side of assignment expressions. Syntactically they look like JSON, although Miller allows string and integer keys in its map literals while JSON allows only string keys (e.g. 
`"3"` rather than `3`). Note though that integer keys become stringified in Miller: `@mymap[3]=4` results in `@mymap` being `{"3":4}`. +Miller's `put`/`filter` DSL has four kinds of maps. **Stream records** are (single-level) maps from name to value. **Out-of-stream variables** and **local variables** can also be maps, although they can be multi-level maps (e.g. `@sum[$x][$y]`). The fourth kind is **map literals**. These cannot be on the left-hand side of assignment expressions. Syntactically, they look like JSON, although Miller allows string and integer keys in its map literals while JSON allows only string keys (e.g., `"3"` rather than `3`). Note, though, that integer keys become stringified in Miller: `@mymap[3]=4` results in `@mymap` being `{"3":4}`. For example, the following swaps the input stream's `a` and `i` fields, modifies `y`, and drops the rest: @@ -561,7 +565,7 @@ there are the read-only separator variables `IRS`, `ORS`, `IFS`, `OFS`, `IPS`, and `OPS` as discussed on the [separators page](reference-main-separators.md), and the flatten/unflatten separator `FLATSEP` discussed on the [flatten/unflatten page](flatten-unflatten.md). Lastly, the `ENV` map allows -read/write access to environment variables, e.g. `ENV["HOME"]` or +read/write access to environment variables, e.g., `ENV["HOME"]` or `ENV["foo_".$hostname]` or `ENV["VERSION"]="1.2.3"`. @@ -604,7 +608,7 @@ system environment variables at the time Miller starts. Any changes made to `ENV` by assigning to it will affect any subprocesses, such as using [piped tee](reference-dsl-output-statements.md#redirected-output-statements). -Their **scope is global**: you can refer to them in any `filter` or `put` statement. Their values are assigned by the input-record reader: +Their **scope is global**: you can refer to them in any `filter` or `put` statement. 
The input-record reader assigns their values:@@ -2086,6 +2095,7 @@ Options: antimode Find least-frequently-occurring values for fields; first-found wins tie sum Compute sums of specified fields mean Compute averages (sample means) of specified fields + mad Compute mean absolute deviation var Compute sample variance of specified fields stddev Compute sample standard deviation of specified fields meaneb Estimate error bars for averages (assuming no sample autocorrelation) @@ -2298,6 +2308,9 @@ Options:mlr --csv put '$nr = NR' data/a.csv @@ -630,11 +634,11 @@ a,b,c,nr The **extent** is for the duration of the put/filter: in a `begin` statement (which executes before the first input record is consumed) you will find `NR=1` and in an `end` statement (which is executed after the last input record is consumed) you will find `NR` to be the total number of records ingested. -These are all **read-only** for the `mlr put` and `mlr filter` DSL: they may be assigned from, e.g. `$nr=NR`, but they may not be assigned to: `NR=100` is a syntax error. +These are all **read-only** for the `mlr put` and `mlr filter` DSL: they may be assigned from, e.g., `$nr=NR`, but they may not be assigned to: `NR=100` is a syntax error. ## Type-checking -Miller's `put`/`filter` DSL supports two optional kinds of type-checking. One is inline **type-tests** and **type-assertions** within expressions. The other is **type declarations** for assignments to local variables, binding of arguments to user-defined functions, and return values from user-defined functions, These are discussed in the following subsections. +Miller's `put`/`filter` DSL supports two optional kinds of type-checking. One is inline **type tests** and **type assertions** within expressions. The other is **type declarations** for assignments to local variables, binding of arguments to user-defined functions, and return values from user-defined functions. These are discussed in the following subsections. 
Use of type-checking is entirely up to you: omit it if you want flexibility with heterogeneous data; use it if you want to help catch misspellings in your DSL code or unexpected irregularities in your input data. @@ -695,22 +699,22 @@ asserting_string See [Data-cleaning Examples](data-cleaning-examples.md) for examples of how to use these. -### Type-declarations for local variables, function parameter, and function return values +### Type declarations for local variables, function parameters, and function return values Local variables can be defined either untyped as in `x = 1`, or typed as in `int x = 1`. Types include **var** (explicitly untyped), **int**, **float**, **num** (int or float), **str**, **bool**, **arr**, **map**, and **funct**. These optional type declarations are enforced at the time values are assigned to variables: whether at the initial value assignment as in `int x = 1` or in any subsequent assignments to the same variable farther down in the scope. The reason for `num` is that `int` and `float` typedecls are very precise:+* Each user-defined function has its own frame for captures. For example: + +-float a = 0; # Runtime error since 0 is int not float -int b = 1.0; # Runtime error since 1.0 is float not int +float a = 0; # Runtime error since 0 is int, not float +int b = 1.0; # Runtime error since 1.0 is float, not int num c = 0; # OK num d = 1.0; # OK-A suggestion is to use `num` for general use when you want numeric content, and use `int` when you genuinely want integer-only values, e.g. in loop indices or map keys (since Miller map keys can only be strings or ints). +A suggestion is to use `num` for general use when you want numeric content, and use `int` when you genuinely want integer-only values, e.g., in loop indices or map keys (since Miller map keys can only be strings or ints). -The `var` type declaration indicates no type restrictions, e.g. `var x = 1` has the same type restrictions on `x` as `x = 1`. 
The difference is in intentional shadowing: if you have `x = 1` in outer scope and `x = 2` in inner scope (e.g. within a for-loop or an if-statement) then outer-scope `x` has value 2 after the second assignment. But if you have `var x = 2` in the inner scope, then you are declaring a variable scoped to the inner block.) For example: +The `var` type declaration indicates no type restrictions, e.g., `var x = 1` has the same type restrictions on `x` as `x = 1`. The difference is in intentional shadowing: if you have `x = 1` in outer scope and `x = 2` in inner scope (e.g., within a for-loop or an if-statement) then outer-scope `x` has value 2 after the second assignment. But if you have `var x = 2` in the inner scope, then you are declaring a variable scoped to the inner block.) For example:x = 1; @@ -728,7 +732,7 @@ if (NR == 4) { print x; # Value of this x is still 1-Likewise function arguments can optionally be typed, with type enforced when the function is called: +Likewise, function arguments can optionally be typed, with type enforced when the function is called:func f(map m, int i) { @@ -760,7 +764,7 @@ func f(map m, int i): bool { } ... ... - # In Miller if your functions don't explicitly return a value, they return absent-null. + # In Miller, if your functions don't explicitly return a value, they return absent-null. # So it would also be a runtime error on reaching the end of this function without # an explicit return statement. 
} @@ -841,7 +845,7 @@ Example recursive copy of out-of-stream variables: }-Example of out-of-stream variable assigned to full stream record, where the 2nd record is stashed, and the 4th record is overwritten with that: +Example of an out-of-stream variable assigned to the full stream record, where the 2nd record is stashed, and the 4th record is overwritten with that:mlr put 'NR == 2 {@keep = $*}; NR == 4 {$* = @keep}' data/small diff --git a/docs/src/reference-dsl-variables.md.in b/docs/src/reference-dsl-variables.md.in index 68fca60e4..0b9ddf60b 100644 --- a/docs/src/reference-dsl-variables.md.in +++ b/docs/src/reference-dsl-variables.md.in @@ -2,11 +2,11 @@ Miller has the following kinds of variables: -**Fields of stream records**, accessed using the `$` prefix. These refer to fields of the current data-stream record. For example, in `echo x=1,y=2 | mlr put '$z = $x + $y'`, `$x` and `$y` refer to input fields, and `$z` refers to a new, computed output field. In a few contexts, presented below, you can refer to the entire record as `$*`. +**Fields of stream records**, accessed using the `$` prefix. These refer to fields of the current data-stream record. For example, in `echo x=1,y=2 | mlr put '$z = $x + $y'`, `$x` and `$y` refer to input fields, and `$z` refers to a new, computed output field. In the following contexts, you can refer to the entire record as `$*`. -**Out-of-stream variables** accessed using the `@` prefix. These refer to data which persist from one record to the next, including in `begin` and `end` blocks (which execute before/after the record stream is consumed, respectively). You use them to remember values across records, such as sums, differences, counters, and so on. In a few contexts, presented below, you can refer to the entire out-of-stream-variables collection as `@*`. +**Out-of-stream variables** accessed using the `@` prefix. 
These refer to data that persists from one record to the next, including in `begin` and `end` blocks (which execute before/after the record stream is consumed, respectively). You use them to remember values across records, such as sums, differences, and counters, among other things. In the following contexts, you can refer to the entire out-of-stream-variables collection as `@*`. -**Local variables** are limited in scope and extent to the current statements being executed: these include function arguments, bound variables in for loops, and local variables. +**Local variables** are limited in scope and extent to the current statements being executed, including function arguments, bound variables in for loops, and local variables. **Built-in variables** such as `NF`, `NR`, `FILENAME`, `M_PI`, and `M_E`. These are all capital letters and are read-only (although some of them change value from one record to another). @@ -16,7 +16,7 @@ Miller has the following kinds of variables: Names of fields within stream records must be specified using a `$` in [filter and put expressions](reference-dsl.md), even though the dollar signs don't appear in the data stream itself. For integer-indexed data, this looks like `awk`'s `$1,$2,$3`, except that Miller allows non-numeric names such as `$quantity` or `$hostname`. Likewise, enclose string literals in double quotes in `filter` expressions even though they don't appear in file data. In particular, `mlr filter '$x=="abc"'` passes through the record `x=abc`. -If field names have **special characters** such as `.` then you can use braces, e.g. `'${field.name}'`. +If field names have **special characters** such as `.`, then you can use braces, e.g. `'${field.name}'`. You may also use a **computed field name** in square brackets, e.g. @@ -36,7 +36,7 @@ Their **extent** is limited to the current record; their **scope** is the `filte These are **read-write**: you can do `$y=2*$x`, `$x=$x+1`, etc. 
-Records are Miller's output: field names present in the input stream are passed through to output (written to standard output) unless fields are removed with `cut`, or records are excluded with `filter` or `put -q`, etc. Simply assign a value to a field and it will be output. +Records are Miller's output: field names present in the input stream are passed through to output (written to standard output) unless fields are removed with `cut`, or records are excluded with `filter` or `put -q`, etc. Simply assign a value to a field, and it will be output. ## Positional field names @@ -44,7 +44,7 @@ Even though Miller's main selling point is name-indexing, sometimes you really w Use `$[[3]]` to access the name of field 3. More generally, any expression evaluating to an integer can go between `$[[` and `]]`. -Then using a computed field name, `$[ $[[3]] ]` is the value in the third field. This has the shorter equivalent notation `$[[[3]]]`. +Then, using a computed field name, `$[ $[[3]] ]` is the value in the third field. This has the shorter equivalent notation `$[[[3]]]`. GENMD-RUN-COMMAND mlr cat data/small @@ -70,7 +70,7 @@ GENMD-RUN-COMMAND mlr put '$[[[NR]]] = "NEW"' data/small GENMD-EOF -Right-hand side accesses to non-existent fields -- i.e. with index less than 1 or greater than `NF` -- return an absent value. Likewise, left-hand side accesses only refer to fields which already exist. For example, if a field has 5 records then assigning the name or value of the 6th (or 600th) field results in a no-op. +Right-hand side accesses to non-existent fields -- i.e., with index less than 1 or greater than `NF` -- return an absent value. Likewise, left-hand side accesses only refer to fields that already exist. For example, if a record has 5 fields, then assigning the name or value of the 6th (or 600th) field results in a no-op. GENMD-RUN-COMMAND mlr put '$[[6]] = "NEW"' data/small @@ -80,11 +80,15 @@ GENMD-RUN-COMMAND mlr put '$[[[6]]] = "NEW"' data/small GENMD-EOF +!!! 
note + + You can use positional field names only in the [Miller DSL](reference-dsl.md), i.e., only with the verbs `put` and `filter`. + ## Out-of-stream variables -These are prefixed with an at-sign, e.g. `@sum`. Furthermore, unlike built-in variables and stream-record fields, they are maintained in an arbitrarily nested map: you can do `@sum += $quantity`, or `@sum[$color] += $quantity`, or `@sum[$color][$shape] += $quantity`. The keys for the multi-level map can be any expression which evaluates to string or integer: e.g. `@sum[NR] = $a + $b`, `@sum[$a."-".$b] = $x`, etc. +These are prefixed with an at-sign, e.g., `@sum`. Furthermore, unlike built-in variables and stream-record fields, they are maintained in an arbitrarily nested map: you can do `@sum += $quantity`, or `@sum[$color] += $quantity`, or `@sum[$color][$shape] += $quantity`. The keys for the multi-level map can be any expression that evaluates to string or integer: e.g. `@sum[NR] = $a + $b`, `@sum[$a."-".$b] = $x`, etc. -Their names and their values are entirely under your control; they change only when you assign to them. +Their names and their values are entirely under your control; they change only when you assign to them. Just as for field names in stream records, if you want to define out-of-stream variables with **special characters** such as `.` then you can use braces, e.g. `'@{variable.name}["index"]'`. @@ -106,13 +110,13 @@ mlr put '@sum += $a; end {emit @sum}' \ data/a.dkvp GENMD-EOF -Out-of-stream variables' **extent** is from the start to the end of the record stream, i.e. every time the `put` or `filter` statement referring to them is executed. +Out-of-stream variables' **extent** is from the start to the end of the record stream, i.e., every time the `put` or `filter` statement referring to them is executed. Out-of-stream variables are **read-write**: you can do `$sum=@sum`, `@sum=$sum`, etc.
## Indexed out-of-stream variables -Using an index on the `@count` and `@sum` variables, we get the benefit of the `-g` (group-by) option which `mlr stats1` and various other Miller commands have: +Using an index on the `@count` and `@sum` variables, we get the benefit of the `-g` (group-by) option, which `mlr stats1` and various other Miller commands have: GENMD-RUN-COMMAND mlr put -q ' @@ -169,8 +173,8 @@ Local variables are similar to out-of-stream variables, except that their extent For example: GENMD-RUN-COMMAND -# Here I'm using a specified random-number seed so this example always -# produces the same output for this web document: in everyday practice we +# Here I'm using a specified random-number seed, so this example always +# produces the same output for this web document: in everyday practice, we # would leave off the --seed 12345 part. mlr --seed 12345 seqgen --start 1 --stop 10 then put ' func f(a, b) { # function arguments a and b @@ -189,7 +193,7 @@ GENMD-EOF Things which are completely unsurprising, resembling many other languages: -* Parameter names are bound to their arguments but can be reassigned, e.g. if there is a parameter named `a` then you can reassign the value of `a` to be something else within the function if you like. +* Parameter names are bound to their arguments but can be reassigned, e.g., if there is a parameter named `a`, then you can reassign the value of `a` to be something else within the function if you like. * However, you cannot redeclare the *type* of an argument or a local: `var a=1; var a=2` is an error but `var a=1; a=2` is OK. @@ -203,13 +207,13 @@ Things which are completely unsurprising, resembling many other languages: Things which are perhaps surprising compared to other languages: -* Type declarations using `var`, or typed using `num`, `int`, `float`, `str`, `bool`, `arr`, `map`, `funct` are not necessary to declare local variables. 
Function arguments and variables bound in for-loops over stream records and out-of-stream variables are *implicitly* declared using `var`. (Some examples are shown below.) +* Type declarations using `var`, or typed using `num`, `int`, `float`, `str`, `bool`, `arr`, `map`, `funct`, are not necessary to declare local variables. Function arguments and variables bound in for-loops over stream records and out-of-stream variables are *implicitly* declared using `var`. (Some examples are shown below.) -* Type-checking is done at assignment time. For example, `float f = 0` is an error (since `0` is an integer), as is `float f = 0.0; f = 1`. For this reason I prefer to use `num` over `float` in most contexts since `num` encompasses integer and floating-point values. More information is at [Type-checking](reference-dsl-variables.md#type-checking). +* Type-checking is done at assignment time. For example, `float f = 0` is an error (since `0` is an integer), as is `float f = 0.0; f = 1`. For this reason, I prefer to use `num` over `float` in most contexts, as `num` encompasses both integer and floating-point values. For more information, refer to [Type-checking](reference-dsl-variables.md#type-checking). * Bound variables in for-loops over stream records and out-of-stream variables are implicitly local to that block. E.g. in `for (k, v in $*) { ... }` `for ((k1, k2), v in @*) { ... }` if there are `k`, `v`, etc. in the enclosing scope then those will be masked by the loop-local bound variables in the loop, and moreover the values of the loop-local bound variables are not available after the end of the loop. -* For C-style triple-for loops, if a for-loop variable is defined using `var`, `int`, etc. then it is scoped to that for-loop. E.g. `for (i = 0; i < 10; i += 1) { ... }` and `for (int i = 0; i < 10; i += 1) { ... }`. (This is unsurprising.). If there is no typedecl and an outer-scope variable of that name exists, then it is used. (This is also unsurprising.) 
But if there is no outer-scope variable of that name, then the variable is scoped to the for-loop only. +* For C-style triple-for loops, if a for-loop variable is defined using `var`, `int`, etc., then it is scoped to that for-loop. E.g. `for (i = 0; i < 10; i += 1) { ... }` and `for (int i = 0; i < 10; i += 1) { ... }`. (This is unsurprising.). If there is no typedecl and an outer-scope variable of that name exists, then it is used. (This is also unsurprising.) But if there is no outer-scope variable of that name, then the variable is scoped to the for-loop only. The following example demonstrates the scope rules: @@ -233,7 +237,7 @@ GENMD-EOF ## Map literals -Miller's `put`/`filter` DSL has four kinds of maps. **Stream records** are (single-level) maps from name to value. **Out-of-stream variables** and **local variables** can also be maps, although they can be multi-level maps (e.g. `@sum[$x][$y]`). The fourth kind is **map literals**. These cannot be on the left-hand side of assignment expressions. Syntactically they look like JSON, although Miller allows string and integer keys in its map literals while JSON allows only string keys (e.g. `"3"` rather than `3`). Note though that integer keys become stringified in Miller: `@mymap[3]=4` results in `@mymap` being `{"3":4}`. +Miller's `put`/`filter` DSL has four kinds of maps. **Stream records** are (single-level) maps from name to value. **Out-of-stream variables** and **local variables** can also be maps, although they can be multi-level maps (e.g. `@sum[$x][$y]`). The fourth kind is **map literals**. These cannot be on the left-hand side of assignment expressions. Syntactically, they look like JSON, although Miller allows string and integer keys in its map literals while JSON allows only string keys (e.g., `"3"` rather than `3`). Note, though, that integer keys become stringified in Miller: `@mymap[3]=4` results in `@mymap` being `{"3":4}`. 
For example, the following swaps the input stream's `a` and `i` fields, modifies `y`, and drops the rest: @@ -296,7 +300,7 @@ there are the read-only separator variables `IRS`, `ORS`, `IFS`, `OFS`, `IPS`, and `OPS` as discussed on the [separators page](reference-main-separators.md), and the flatten/unflatten separator `FLATSEP` discussed on the [flatten/unflatten page](flatten-unflatten.md). Lastly, the `ENV` map allows -read/write access to environment variables, e.g. `ENV["HOME"]` or +read/write access to environment variables, e.g., `ENV["HOME"]` or `ENV["foo_".$hostname]` or `ENV["VERSION"]="1.2.3"`. @@ -316,7 +320,7 @@ system environment variables at the time Miller starts. Any changes made to `ENV` by assigning to it will affect any subprocesses, such as using [piped tee](reference-dsl-output-statements.md#redirected-output-statements). -Their **scope is global**: you can refer to them in any `filter` or `put` statement. Their values are assigned by the input-record reader: +Their **scope is global**: you can refer to them in any `filter` or `put` statement. The input-record reader assigns their values: GENMD-RUN-COMMAND mlr --csv put '$nr = NR' data/a.csv @@ -328,11 +332,11 @@ GENMD-EOF The **extent** is for the duration of the put/filter: in a `begin` statement (which executes before the first input record is consumed) you will find `NR=1` and in an `end` statement (which is executed after the last input record is consumed) you will find `NR` to be the total number of records ingested. -These are all **read-only** for the `mlr put` and `mlr filter` DSL: they may be assigned from, e.g. `$nr=NR`, but they may not be assigned to: `NR=100` is a syntax error. +These are all **read-only** for the `mlr put` and `mlr filter` DSL: they may be assigned from, e.g., `$nr=NR`, but they may not be assigned to: `NR=100` is a syntax error. ## Type-checking -Miller's `put`/`filter` DSL supports two optional kinds of type-checking. 
One is inline **type-tests** and **type-assertions** within expressions. The other is **type declarations** for assignments to local variables, binding of arguments to user-defined functions, and return values from user-defined functions, These are discussed in the following subsections. +Miller's `put`/`filter` DSL supports two optional kinds of type-checking. One is inline **type tests** and **type assertions** within expressions. The other is **type declarations** for assignments to local variables, binding of arguments to user-defined functions, and return values from user-defined functions. These are discussed in the following subsections. Use of type-checking is entirely up to you: omit it if you want flexibility with heterogeneous data; use it if you want to help catch misspellings in your DSL code or unexpected irregularities in your input data. @@ -350,22 +354,22 @@ GENMD-EOF See [Data-cleaning Examples](data-cleaning-examples.md) for examples of how to use these. -### Type-declarations for local variables, function parameter, and function return values +### Type declarations for local variables, function parameters, and function return values Local variables can be defined either untyped as in `x = 1`, or typed as in `int x = 1`. Types include **var** (explicitly untyped), **int**, **float**, **num** (int or float), **str**, **bool**, **arr**, **map**, and **funct**. These optional type declarations are enforced at the time values are assigned to variables: whether at the initial value assignment as in `int x = 1` or in any subsequent assignments to the same variable farther down in the scope. 
The reason for `num` is that `int` and `float` typedecls are very precise: GENMD-CARDIFY -float a = 0; # Runtime error since 0 is int not float -int b = 1.0; # Runtime error since 1.0 is float not int +float a = 0; # Runtime error since 0 is int, not float +int b = 1.0; # Runtime error since 1.0 is float, not int num c = 0; # OK num d = 1.0; # OK GENMD-EOF -A suggestion is to use `num` for general use when you want numeric content, and use `int` when you genuinely want integer-only values, e.g. in loop indices or map keys (since Miller map keys can only be strings or ints). +A suggestion is to use `num` for general use when you want numeric content, and use `int` when you genuinely want integer-only values, e.g., in loop indices or map keys (since Miller map keys can only be strings or ints). -The `var` type declaration indicates no type restrictions, e.g. `var x = 1` has the same type restrictions on `x` as `x = 1`. The difference is in intentional shadowing: if you have `x = 1` in outer scope and `x = 2` in inner scope (e.g. within a for-loop or an if-statement) then outer-scope `x` has value 2 after the second assignment. But if you have `var x = 2` in the inner scope, then you are declaring a variable scoped to the inner block.) For example: +The `var` type declaration indicates no type restrictions, e.g., `var x = 1` has the same type restrictions on `x` as `x = 1`. The difference is in intentional shadowing: if you have `x = 1` in outer scope and `x = 2` in inner scope (e.g., within a for-loop or an if-statement) then outer-scope `x` has value 2 after the second assignment. But if you have `var x = 2` in the inner scope, then you are declaring a variable scoped to the inner block.
For example: GENMD-CARDIFY x = 1; @@ -383,7 +387,7 @@ if (NR == 4) { print x; # Value of this x is still 1 GENMD-EOF -Likewise function arguments can optionally be typed, with type enforced when the function is called: +Likewise, function arguments can optionally be typed, with type enforced when the function is called: GENMD-CARDIFY func f(map m, int i) { @@ -415,7 +419,7 @@ func f(map m, int i): bool { } ... ... - # In Miller if your functions don't explicitly return a value, they return absent-null. + # In Miller, if your functions don't explicitly return a value, they return absent-null. # So it would also be a runtime error on reaching the end of this function without # an explicit return statement. } @@ -478,7 +482,7 @@ mlr --opprint --from data/small put -q ' ' GENMD-EOF -Example of out-of-stream variable assigned to full stream record, where the 2nd record is stashed, and the 4th record is overwritten with that: +Example of an out-of-stream variable assigned to the full stream record, where the 2nd record is stashed, and the 4th record is overwritten with that: GENMD-RUN-COMMAND mlr put 'NR == 2 {@keep = $*}; NR == 4 {$* = @keep}' data/small diff --git a/docs/src/reference-main-env-vars.md b/docs/src/reference-main-env-vars.md index 3b3302b8c..295973d58 100644 --- a/docs/src/reference-main-env-vars.md +++ b/docs/src/reference-main-env-vars.md @@ -19,6 +19,6 @@ Quick links: The following environment variables affect how Miller works: * `MLRRC`: see [Customization](customization.md). -* `MLR_NO_COLOR`, `MLR_ALWAYS_COLOR`, `MLR_KEY_COLOR`, `MLR_VALUE_COLOR`, `MLR_PASS_COLOR`, `MLR_FAIL_COLOR`, `MLR_REPL_PS1_COLOR`, `MLR_REPL_PS2_COLOR`, `MLR_HELP_COLOR`: see [Output Colorization](output-colorization.md). +* `MLR_NO_COLOR`, `NO_COLOR`, `MLR_ALWAYS_COLOR`, `MLR_KEY_COLOR`, `MLR_VALUE_COLOR`, `MLR_PASS_COLOR`, `MLR_FAIL_COLOR`, `MLR_REPL_PS1_COLOR`, `MLR_REPL_PS2_COLOR`, `MLR_HELP_COLOR`: see [Output Colorization](output-colorization.md). 
* `MLR_REPL_PS1`, `MLR_REPL_PS2`: see [REPL](repl.md). diff --git a/docs/src/reference-main-env-vars.md.in b/docs/src/reference-main-env-vars.md.in index 5c74638a1..869c73f3f 100644 --- a/docs/src/reference-main-env-vars.md.in +++ b/docs/src/reference-main-env-vars.md.in @@ -3,6 +3,6 @@ The following environment variables affect how Miller works: * `MLRRC`: see [Customization](customization.md). -* `MLR_NO_COLOR`, `MLR_ALWAYS_COLOR`, `MLR_KEY_COLOR`, `MLR_VALUE_COLOR`, `MLR_PASS_COLOR`, `MLR_FAIL_COLOR`, `MLR_REPL_PS1_COLOR`, `MLR_REPL_PS2_COLOR`, `MLR_HELP_COLOR`: see [Output Colorization](output-colorization.md). +* `MLR_NO_COLOR`, `NO_COLOR`, `MLR_ALWAYS_COLOR`, `MLR_KEY_COLOR`, `MLR_VALUE_COLOR`, `MLR_PASS_COLOR`, `MLR_FAIL_COLOR`, `MLR_REPL_PS1_COLOR`, `MLR_REPL_PS2_COLOR`, `MLR_HELP_COLOR`: see [Output Colorization](output-colorization.md). * `MLR_REPL_PS1`, `MLR_REPL_PS2`: see [REPL](repl.md). diff --git a/docs/src/reference-main-flag-list.md b/docs/src/reference-main-flag-list.md index f9ce597ff..e0f36f3af 100644 --- a/docs/src/reference-main-flag-list.md +++ b/docs/src/reference-main-flag-list.md @@ -63,9 +63,9 @@ Notes: **Flags:** * `--pass-comments`: Immediately print commented lines (prefixed by `#`) within the input. -* `--pass-comments-with {string}`: Immediately print commented lines within input, with specified prefix. +* `--pass-comments-with {string}`: Immediately print commented lines within input, with specified prefix. For CSV input format, the prefix must be a single character. * `--skip-comments`: Ignore commented lines (prefixed by `#`) within the input. -* `--skip-comments-with {string}`: Ignore commented lines within input, with specified prefix. +* `--skip-comments-with {string}`: Ignore commented lines within input, with specified prefix. For CSV input format, the prefix must be a single character. ## Compressed-data flags @@ -123,10 +123,20 @@ These are flags which are applicable to CSV format. 
* `--headerless-csv-output or --ho or --headerless-tsv-output`: Print only CSV/TSV data lines; do not print CSV/TSV header lines. * `--implicit-csv-header or --headerless-csv-input or --hi or --implicit-tsv-header`: Use 1,2,3,... as field labels, rather than from line 1 of input files. Tip: combine with `label` to recreate missing headers. * `--lazy-quotes`: Accepts quotes appearing in unquoted fields, and non-doubled quotes appearing in quoted fields. +* `--no-auto-unsparsify`: For CSV/TSV output: if the record keys change from one row to another, emit a blank line and a new header line. This is non-compliant with RFC 4180 but is helpful for heterogeneous data. * `--no-implicit-csv-header or --no-implicit-tsv-header`: Opposite of `--implicit-csv-header`. This is the default anyway -- the main use is for the flags to `mlr join` if you have main file(s) which are headerless but you want to join in on a file which does have a CSV/TSV header. Then you could use `mlr --csv --implicit-csv-header join --no-implicit-csv-header -l your-join-in-with-header.csv ... your-headerless.csv`. * `--quote-all`: Force double-quoting of CSV fields. * `-N`: Keystroke-saver for `--implicit-csv-header --headerless-csv-output`. +## DKVP-only flags + +These are flags which are applicable to DKVP format. + + +**Flags:** + +* `--incr-key`: Without this option, keyless DKVP fields are keyed by field number. For example: `a=10,b=20,30,d=40,50` is ingested as `$a=10,$b=20,$3=30,$d=40,$5=50`. With this option, they're keyed by a running counter of keyless fields. For example: `a=10,b=20,30,d=40,50` is ingested as `$a=10,$b=20,$1=30,$d=40,$2=50`. + ## File-format flags See the File formats doc page, and or `mlr help file-formats`, for more @@ -143,9 +153,9 @@ are overridden in all cases by setting output format to `format2`. **Flags:** * `--asv or --asvlite`: Use ASV format for input and output data. -* `--csv or -c`: Use CSV format for input and output data.
+* `--csv or -c or --c2c`: Use CSV format for input and output data. * `--csvlite`: Use CSV-lite format for input and output data. -* `--dkvp`: Use DKVP format for input and output data. +* `--dkvp or --d2d`: Use DKVP format for input and output data. * `--gen-field-name`: Specify field name for --igen. Defaults to "i". * `--gen-start`: Specify start value for --igen. Defaults to 1. * `--gen-step`: Specify step value for --igen. Defaults to 1. @@ -157,6 +167,7 @@ are overridden in all cases by setting output format to `format2`. * `--igen`: Ignore input files and instead generate sequential numeric input using --gen-field-name, --gen-start, --gen-step, and --gen-stop values. See also the seqgen verb, which is more useful/intuitive. * `--ijson`: Use JSON format for input data. * `--ijsonl`: Use JSON Lines format for input data. +* `--imd or --imarkdown`: Use markdown-tabular format for input data. * `--inidx`: Use NIDX format for input data. * `--io {format name}`: Use format name for input and output data. For example: `--io csv` is the same as `--csv`. * `--ipprint`: Use PPRINT format for input data. @@ -164,27 +175,27 @@ are overridden in all cases by setting output format to `format2`. * `--itsvlite`: Use TSV-lite format for input data. * `--iusv or --iusvlite`: Use USV format for input data. * `--ixtab`: Use XTAB format for input data. -* `--json or -j`: Use JSON format for input and output data. -* `--jsonl`: Use JSON Lines format for input and output data. -* `--nidx`: Use NIDX format for input and output data. +* `--json or -j or --j2j`: Use JSON format for input and output data. +* `--jsonl or --l2l`: Use JSON Lines format for input and output data. +* `--nidx or --n2n`: Use NIDX format for input and output data. * `--oasv or --oasvlite`: Use ASV format for output data. * `--ocsv`: Use CSV format for output data. * `--ocsvlite`: Use CSV-lite format for output data. * `--odkvp`: Use DKVP format for output data. * `--ojson`: Use JSON format for output data. 
* `--ojsonl`: Use JSON Lines format for output data. -* `--omd`: Use markdown-tabular format for output data. +* `--omd or --omarkdown`: Use markdown-tabular format for output data. * `--onidx`: Use NIDX format for output data. * `--opprint`: Use PPRINT format for output data. * `--otsv`: Use TSV format for output data. * `--otsvlite`: Use TSV-lite format for output data. * `--ousv or --ousvlite`: Use USV format for output data. * `--oxtab`: Use XTAB format for output data. -* `--pprint`: Use PPRINT format for input and output data. -* `--tsv or -t`: Use TSV format for input and output data. +* `--pprint or --p2p`: Use PPRINT format for input and output data. +* `--tsv or -t or --t2t`: Use TSV format for input and output data. * `--tsvlite`: Use TSV-lite format for input and output data. * `--usv or --usvlite`: Use USV format for input and output data. -* `--xtab`: Use XTAB format for input and output data. +* `--xtab or --x2x`: Use XTAB format for input and output data. * `--xvright`: Right-justify values for XTAB format. * `-i {format name}`: Use format name for input data. For example: `-i csv` is the same as `--icsv`. * `-o {format name}`: Use format name for output data. For example: `-o csv` is the same as `--ocsv`. @@ -193,14 +204,14 @@ are overridden in all cases by setting output format to `format2`. These flags control how Miller converts record values which are maps or arrays, when input is JSON and output is non-JSON (flattening) or input is non-JSON and output is JSON (unflattening). -See the Flatten/unflatten doc page for more information. +See the flatten/unflatten doc page https://miller.readthedocs.io/en/latest/flatten-unflatten for more information. **Flags:** * `--flatsep or --jflatsep {string}`: Separator for flattening multi-level JSON keys, e.g. `{"a":{"b":3}}` becomes `a:b => 3` for non-JSON formats. Defaults to `.`. -* `--no-auto-flatten`: When output is non-JSON, suppress the default auto-flatten behavior. 
Default: if `$y = [7,8,9]` then this flattens to `y.1=7,y.2=8,y.3=9, and similarly for maps. With `--no-auto-flatten`, instead we get `$y=[1, 2, 3]`. -* `--no-auto-unflatten`: When input non-JSON and output is JSON, suppress the default auto-unflatten behavior. Default: if the input has `y.1=7,y.2=8,y.3=9` then this unflattens to `$y=[7,8,9]`. flattens to `y.1=7,y.2=8,y.3=9. With `--no-auto-flatten`, instead we get `${y.1}=7,${y.2}=8,${y.3}=9`. +* `--no-auto-flatten`: When output is non-JSON, suppress the default auto-flatten behavior. Default: if `$y = [7,8,9]` then this flattens to `y.1=7,y.2=8,y.3=9`, and similarly for maps. With `--no-auto-flatten`, instead we get `$y=[7, 8, 9]`. +* `--no-auto-unflatten`: When input is non-JSON and output is JSON, suppress the default auto-unflatten behavior. Default: if the input has `y.1=7,y.2=8,y.3=9` then this unflattens to `$y=[7,8,9]`. With `--no-auto-unflatten`, instead we get `${y.1}=7,${y.2}=8,${y.3}=9`. ## Format-conversion keystroke-saver flags @@ -233,7 +244,7 @@ These are flags which are applicable to JSON output format. * `--jlistwrap or --jl`: Wrap JSON output in outermost `[ ]`. This is the default for JSON output format. * `--jvquoteall`: Force all JSON values -- recursively into lists and object -- to string. * `--jvstack`: Put one key-value pair per line for JSON output (multi-line output). This is the default for JSON output format. -* `--no-jlistwrap`: Wrap JSON output in outermost `[ ]`. This is the default for JSON Lines output format. +* `--no-jlistwrap`: Do not wrap JSON output in outermost `[ ]`. This is the default for JSON Lines output format. * `--no-jvstack`: Put objects/arrays all on one line for JSON output. This is the default for JSON Lines output format. ## Legacy flags @@ -278,6 +289,7 @@ These are flags which don't fit into any other category.
* `--no-dedupe-field-names`: By default, if an input record has a field named `x` and another also named `x`, the second will be renamed `x_2`, and so on. With this flag provided, the second `x`'s value will replace the first `x`'s value when the record is read. This flag has no effect on JSON input records, where duplicate keys always result in the last one's value being retained. * `--no-fflush`: Let buffered output not be written after every output record. The default is flush output after every record if the output is to the terminal, or less often if the output is to a file or a pipe. The default is a significant performance optimization for large files. Use this flag to allow less-frequent updates when output is to the terminal. This is unlikely to be a noticeable performance improvement, since direct-to-screen output for large files has its own overhead. * `--no-hash-records`: See --hash-records. +* `--norc`: Do not load a .mlrrc file. * `--nr-progress-mod {m}`: With m a positive integer: print filename and record count to os.Stderr every m input records. * `--ofmt {format}`: E.g. `%.18f`, `%.0f`, `%9.6e`. Please use sprintf-style codes (https://pkg.go.dev/fmt) for floating-point numbers. If not specified, default formatting is used. See also the `fmtnum` function and the `format-values` verb. * `--ofmte {n}`: Use --ofmte 6 as shorthand for --ofmt %.6e, etc. @@ -325,8 +337,8 @@ Mechanisms for coloring: How you can control colorization: * Suppression/unsuppression: - * Environment variable `export MLR_NO_COLOR=true` means don't color - even if stdout+TTY. + * Environment variable `export MLR_NO_COLOR=true` or `export NO_COLOR=true` + means don't color even if stdout+TTY. * Environment variable `export MLR_ALWAYS_COLOR=true` means do color even if not stdout+TTY. For example, you might want to use this when piping mlr output to `less -r`. @@ -372,7 +384,8 @@ These are flags which are applicable to PPRINT format. 
**Flags:** -* `--barred`: Prints a border around PPRINT output (not available for input). +* `--barred or --barred-output`: Prints a border around PPRINT output. +* `--barred-input`: When used in conjunction with --pprint, accepts barred input. * `--right`: Right-justifies all fields for PPRINT output. ## Profiling flags @@ -434,13 +447,13 @@ Notes about all other separators: - To avoid backslashing, you can use any of the following names: ascii_esc = "\x1b" - ascii_etx = "\x04" + ascii_etx = "\x03" ascii_fs = "\x1c" ascii_gs = "\x1d" - ascii_null = "\x01" + ascii_null = "\x00" ascii_rs = "\x1e" - ascii_soh = "\x02" - ascii_stx = "\x03" + ascii_soh = "\x01" + ascii_stx = "\x02" ascii_us = "\x1f" asv_fs = "\x1f" asv_rs = "\x1e" @@ -474,6 +487,7 @@ Notes about all other separators: csv "," N/A "\n" csvlite "," N/A "\n" dkvp "," "=" "\n" + gen "," N/A "\n" json N/A N/A N/A markdown " " N/A "\n" nidx " " N/A "\n" diff --git a/docs/src/reference-main-null-data.md b/docs/src/reference-main-null-data.md index 63bfffaa9..175ae2ad2 100644 --- a/docs/src/reference-main-null-data.md +++ b/docs/src/reference-main-null-data.md @@ -125,7 +125,7 @@ with the exception that the `min` and `max` functions are special: if one argume x=,y=3,a=3,b=-Likewise, empty works like 0 for addition and subtraction, and multiplication: +Likewise, empty works like 0 for addition and subtraction, and like 1 for multiplication:echo 'x=,y=3' | mlr put '$a = $x + $y; $b = $x - $y; $c = $x * $y' diff --git a/docs/src/reference-main-null-data.md.in b/docs/src/reference-main-null-data.md.in index 087edaa78..3ac1051ac 100644 --- a/docs/src/reference-main-null-data.md.in +++ b/docs/src/reference-main-null-data.md.in @@ -54,7 +54,7 @@ GENMD-RUN-COMMAND echo 'x=,y=3' | mlr put '$a=min($x,$y);$b=max($x,$y)' GENMD-EOF -Likewise, empty works like 0 for addition and subtraction, and multiplication: +Likewise, empty works like 0 for addition and subtraction, and like 1 for multiplication: GENMD-RUN-COMMAND echo 
'x=,y=3' | mlr put '$a = $x + $y; $b = $x - $y; $c = $x * $y' diff --git a/docs/src/reference-main-overview.md b/docs/src/reference-main-overview.md index cc9c3a0b3..b7e1a97c4 100644 --- a/docs/src/reference-main-overview.md +++ b/docs/src/reference-main-overview.md @@ -66,7 +66,7 @@ See also the [Glossary](glossary.md) for more about terms such as When you type `mlr {something} myfile.dat`, the `{something}` part is called a **verb**. It specifies how you want to transform your data. Most of the verbs are counterparts of built-in system tools like `cut` and `sort` -- but with file-format awareness, and giving you the ability to refer to fields by name. -The verbs `put` and `filter` are special in that they have a rich expression language (domain-specific language, or "DSL"). More information about them can be found at on the [Intro to Miller's programming language page](miller-programming-language.md); see also [DSL reference](reference-dsl.md) for more details. +The verbs `put` and `filter` are special in that they have a rich expression language (domain-specific language, or "DSL"). More information about them can be found on the [Intro to Miller's Programming Language page](miller-programming-language.md); see also the [DSL Reference](reference-dsl.md) for more details. Here's a comparison of verbs and `put`/`filter` DSL expressions: diff --git a/docs/src/reference-main-overview.md.in b/docs/src/reference-main-overview.md.in index 413b358e7..42c3b8f0c 100644 --- a/docs/src/reference-main-overview.md.in +++ b/docs/src/reference-main-overview.md.in @@ -35,7 +35,7 @@ See also the [Glossary](glossary.md) for more about terms such as When you type `mlr {something} myfile.dat`, the `{something}` part is called a **verb**. It specifies how you want to transform your data. Most of the verbs are counterparts of built-in system tools like `cut` and `sort` -- but with file-format awareness, and giving you the ability to refer to fields by name. 
-The verbs `put` and `filter` are special in that they have a rich expression language (domain-specific language, or "DSL"). More information about them can be found at on the [Intro to Miller's programming language page](miller-programming-language.md); see also [DSL reference](reference-dsl.md) for more details. +The verbs `put` and `filter` are special in that they have a rich expression language (domain-specific language, or "DSL"). More information about them can be found on the [Intro to Miller's Programming Language page](miller-programming-language.md); see also the [DSL Reference](reference-dsl.md) for more details. Here's a comparison of verbs and `put`/`filter` DSL expressions: diff --git a/docs/src/reference-main-regular-expressions.md b/docs/src/reference-main-regular-expressions.md index f15b55f59..60126f5fa 100644 --- a/docs/src/reference-main-regular-expressions.md +++ b/docs/src/reference-main-regular-expressions.md @@ -61,9 +61,9 @@ name=jane,regex=^j.*e$ name=bull,regex=^b[ou]ll$-## Regex captures +## Regex captures for the `=~` operator -Regex captures of the form `\0` through `\9` are supported as +Regex captures of the form `\0` through `\9` are supported as follows: * Captures have in-function context for `sub` and `gsub`. For example, the first `\1,\2` pair belong to the first `sub` and the second `\1,\2` pair belong to the second `sub`: @@ -77,6 +77,24 @@ Regex captures of the form `\0` through `\9` are supported as mlr put '$a =~ "(..)_(....); $b = "left_\1"; $c = "right_\2"'+mlr -n put ' +func f() { + if ("456 defg" =~ "([0-9]+) ([a-z]+)") { + print "INNER: \1 \2"; + } +} +end { + if ("123 abc" =~ "([0-9]+) ([a-z]+)") { + print "OUTER PRE: \1 \2"; + f(); + print "OUTER POST: \1 \2"; + } +}' ++ * The captures are not retained across multiple puts. 
For example, here the `\1,\2` won't be expanded from the regex capture:@@ -85,6 +103,142 @@ Regex captures of the form `\0` through `\9` are supported as * Up to nine matches are supported: `\1` through `\9`, while `\0` is the entire match string; `\15` is treated as `\1` followed by an unrelated `5`. +## Resetting captures + +If you use `(...)` in your regular expression, then up to 9 matches are supported for the `=~` +operator, and an arbitrary number of matches are supported for the `strmatchx` DSL function. + +* Before any match is done, `"\1"` etc. in a string evaluate to themselves. +* After a successful match is done, `"\1"` etc. in a string evaluate to the matched substring. +* After an unsuccessful match is done, `"\1"` etc. in a string evaluate to the empty string. +* You can match against `null` to reset to the original state. + ++mlr repl +++ +[mlr] "\1:\2" +"\1:\2" + +[mlr] "abc" =~ "..." +true + +[mlr] "\1:\2" +":" + +[mlr] "abc" =~ "(.).(.)" +true + +[mlr] "\1:\2" +"a:c" + +[mlr] "abc" =~ "(.)x(.)" +false + +[mlr] "\1:\2" +":" + +[mlr] "abc" =~ null + +[mlr] "\1:\2" +"\1:\2" ++ +## The `strmatch` and `strmatchx` DSL functions + +The `=~` and `!=~` operators have been in Miller for a long time, and they will continue to be +supported. They do, however, have some deficiencies. As of Miller 6.11 and beyond, the `strmatch` +and `strmatchx` functions provide more robust ways to do capturing. + +First, some examples. 
+ +The `strmatch` function only returns a boolean result, and it doesn't set `\0..\9`: + ++mlr repl +++ +[mlr] strmatch("abc", "....") +false + +[mlr] strmatch("abc", "...") +true + +[mlr] strmatch("abc", "(.).(.)") +true + +[mlr] strmatch("[ab:3458]", "([a-z]+):([0-9]+)") +true ++ +The `strmatchx` function also doesn't set `\0..\9`, but returns a map-valued result: + ++mlr repl +++ +[mlr] strmatchx("abc", "....") +{ + "matched": false +} + +[mlr] strmatchx("abc", "...") +{ + "matched": true, + "full_capture": "abc", + "full_start": 1, + "full_end": 3 +} + +[mlr] strmatchx("abc", "(.).(.)") +{ + "matched": true, + "full_capture": "abc", + "full_start": 1, + "full_end": 3, + "captures": ["a", "c"], + "starts": [1, 3], + "ends": [1, 3] +} + +[mlr] "[ab:3458]" =~ "([a-z]+):([0-9]+)" +true + +[mlr] "\1" +"ab" + +[mlr] "\2" +"3458" + +[mlr] strmatchx("[ab:3458]", "([a-z]+):([0-9]+)") +{ + "matched": true, + "full_capture": "ab:3458", + "full_start": 2, + "full_end": 8, + "captures": ["ab", "3458"], + "starts": [2, 5], + "ends": [3, 8] +} ++ +Notes: + +* When there is no match, the result from `strmatchx` only has the `"matched":false` key/value pair. +* When there is a match with no captures, the result from `strmatchx` has the `"matched":true` key/value pair, + as well as `full_capture` (taking the place of `\0` set by `=~`), and `full_start` and `full_end` + which `=~` does not offer. +* When there is a match with captures, the result from `strmatchx` also has the `captures` array + whose slots 1, 2, 3, ... are the same as would have been set by `=~` via `\1, \2, \3, ...`. + However, `strmatchx` offers an arbitrary number of captures, not just `\1..\9`. + Additionally, the `starts` and `ends` arrays are indices into the input string. +* Since you hold the return value from `strmatchx`, you can operate on it as you wish --- instead of + relying on the (function-scoped) globals `\0..\9`. 
+* The price paid is that using `strmatchx` does indeed tend to take more keystrokes than `=~`. + ## More information Regular expressions are those supported by the [Go regexp package](https://pkg.go.dev/regexp), which in turn are of type [RE2](https://github.com/google/re2/wiki/Syntax) except for `\C`: @@ -97,7 +251,8 @@ package syntax // import "regexp/syntax" Package syntax parses regular expressions into parse trees and compiles parse trees into programs. Most clients of regular expressions will use the facilities -of package regexp (such as Compile and Match) instead of this package. +of package regexp (such as regexp.Compile and regexp.Match) instead of this +package. # Syntax @@ -147,6 +302,7 @@ Grouping: (re) numbered capturing group (submatch) (?Pre) named & numbered capturing group (submatch) + (? re) named & numbered capturing group (submatch) (?:re) non-capturing group (?flags) set flags within current group; non-capturing (?flags:re) set flags during re; non-capturing diff --git a/docs/src/reference-main-regular-expressions.md.in b/docs/src/reference-main-regular-expressions.md.in index e81f24552..893378627 100644 --- a/docs/src/reference-main-regular-expressions.md.in +++ b/docs/src/reference-main-regular-expressions.md.in @@ -36,9 +36,9 @@ GENMD-RUN-COMMAND mlr filter '$name =~ $regex' data/regex-in-data.dat GENMD-EOF -## Regex captures +## Regex captures for the `=~` operator -Regex captures of the form `\0` through `\9` are supported as +Regex captures of the form `\0` through `\9` are supported as follows: * Captures have in-function context for `sub` and `gsub`. For example, the first `\1,\2` pair belong to the first `sub` and the second `\1,\2` pair belong to the second `sub`: @@ -52,6 +52,24 @@ GENMD-SHOW-COMMAND mlr put '$a =~ "(..)_(....); $b = "left_\1"; $c = "right_\2"' GENMD-EOF +* Each user-defined function has its own frame for captures. 
For example: + +GENMD-SHOW-COMMAND +mlr -n put ' +func f() { + if ("456 defg" =~ "([0-9]+) ([a-z]+)") { + print "INNER: \1 \2"; + } +} +end { + if ("123 abc" =~ "([0-9]+) ([a-z]+)") { + print "OUTER PRE: \1 \2"; + f(); + print "OUTER POST: \1 \2"; + } +}' +GENMD-EOF + * The captures are not retained across multiple puts. For example, here the `\1,\2` won't be expanded from the regex capture: GENMD-SHOW-COMMAND @@ -60,6 +78,136 @@ GENMD-EOF * Up to nine matches are supported: `\1` through `\9`, while `\0` is the entire match string; `\15` is treated as `\1` followed by an unrelated `5`. +## Resetting captures + +If you use `(...)` in your regular expression, then up to 9 matches are supported for the `=~` +operator, and an arbitrary number of matches are supported for the `strmatchx` DSL function. + +* Before any match is done, `"\1"` etc. in a string evaluate to themselves. +* After a successful match is done, `"\1"` etc. in a string evaluate to the matched substring. +* After an unsuccessful match is done, `"\1"` etc. in a string evaluate to the empty string. +* You can match against `null` to reset to the original state. + +GENMD-CARDIFY-HIGHLIGHT-ONE +mlr repl + +[mlr] "\1:\2" +"\1:\2" + +[mlr] "abc" =~ "..." +true + +[mlr] "\1:\2" +":" + +[mlr] "abc" =~ "(.).(.)" +true + +[mlr] "\1:\2" +"a:c" + +[mlr] "abc" =~ "(.)x(.)" +false + +[mlr] "\1:\2" +":" + +[mlr] "abc" =~ null + +[mlr] "\1:\2" +"\1:\2" +GENMD-EOF + +## The `strmatch` and `strmatchx` DSL functions + +The `=~` and `!=~` operators have been in Miller for a long time, and they will continue to be +supported. They do, however, have some deficiencies. As of Miller 6.11 and beyond, the `strmatch` +and `strmatchx` functions provide more robust ways to do capturing. + +First, some examples. 
+ +The `strmatch` function only returns a boolean result, and it doesn't set `\0..\9`: + +GENMD-CARDIFY-HIGHLIGHT-ONE +mlr repl + +[mlr] strmatch("abc", "....") +false + +[mlr] strmatch("abc", "...") +true + +[mlr] strmatch("abc", "(.).(.)") +true + +[mlr] strmatch("[ab:3458]", "([a-z]+):([0-9]+)") +true +GENMD-EOF + +The `strmatchx` function also doesn't set `\0..\9`, but returns a map-valued result: + +GENMD-CARDIFY-HIGHLIGHT-ONE +mlr repl + +[mlr] strmatchx("abc", "....") +{ + "matched": false +} + +[mlr] strmatchx("abc", "...") +{ + "matched": true, + "full_capture": "abc", + "full_start": 1, + "full_end": 3 +} + +[mlr] strmatchx("abc", "(.).(.)") +{ + "matched": true, + "full_capture": "abc", + "full_start": 1, + "full_end": 3, + "captures": ["a", "c"], + "starts": [1, 3], + "ends": [1, 3] +} + +[mlr] "[ab:3458]" =~ "([a-z]+):([0-9]+)" +true + +[mlr] "\1" +"ab" + +[mlr] "\2" +"3458" + +[mlr] strmatchx("[ab:3458]", "([a-z]+):([0-9]+)") +{ + "matched": true, + "full_capture": "ab:3458", + "full_start": 2, + "full_end": 8, + "captures": ["ab", "3458"], + "starts": [2, 5], + "ends": [3, 8] +} +GENMD-EOF + +Notes: + +* When there is no match, the result from `strmatchx` only has the `"matched":false` key/value pair. +* When there is a match with no captures, the result from `strmatchx` has the `"matched":true` key/value pair, + as well as `full_capture` (taking the place of `\0` set by `=~`), and `full_start` and `full_end` + which `=~` does not offer. +* When there is a match with captures, the result from `strmatchx` also has the `captures` array + whose slots 1, 2, 3, ... are the same as would have been set by `=~` via `\1, \2, \3, ...`. + However, `strmatchx` offers an arbitrary number of captures, not just `\1..\9`. + Additionally, the `starts` and `ends` arrays are indices into the input string. +* Since you hold the return value from `strmatchx`, you can operate on it as you wish --- instead of + relying on the (function-scoped) globals `\0..\9`. 
+* The price paid is that using `strmatchx` does indeed tend to take more keystrokes than `=~`. + ## More information Regular expressions are those supported by the [Go regexp package](https://pkg.go.dev/regexp), which in turn are of type [RE2](https://github.com/google/re2/wiki/Syntax) except for `\C`: diff --git a/docs/src/reference-main-separators.md b/docs/src/reference-main-separators.md index c13241e65..8ed7612a7 100644 --- a/docs/src/reference-main-separators.md +++ b/docs/src/reference-main-separators.md @@ -187,13 +187,13 @@ Many things we'd like to write as separators need to be escaped from the shell ascii_esc = "\x1b" -ascii_etx = "\x04" +ascii_etx = "\x03" ascii_fs = "\x1c" ascii_gs = "\x1d" -ascii_null = "\x01" +ascii_null = "\x00" ascii_rs = "\x1e" -ascii_soh = "\x02" -ascii_stx = "\x03" +ascii_soh = "\x01" +ascii_stx = "\x02" ascii_us = "\x1f" asv_fs = "\x1f" asv_rs = "\x1e" diff --git a/docs/src/reference-main-strings.md b/docs/src/reference-main-strings.md index df35284f4..b16b03483 100644 --- a/docs/src/reference-main-strings.md +++ b/docs/src/reference-main-strings.md @@ -197,4 +197,4 @@ See also [https://en.wikipedia.org/wiki/Escape_sequences_in_C](https://en.wikipe These replacements apply only to strings you key in for the DSL expressions for `filter` and `put`: that is, if you type `\t` in a string literal for a `filter`/`put` expression, it will be turned into a tab character. If you want a backslash followed by a `t`, then please type `\\t`. -However, these replacements are done automatically only for string literals within DSL expressions -- they are not done automatically to fields within your data stream. If you wish to make these replacements, you can do (for example) `mlr put '$field = gsub($field, "\\t", "\t")'`. If you need to make such a replacement for all fields in your data, you should probably use the system `sed` command instead. 
+However, these replacements are done automatically only for string literals within DSL expressions -- they are not done automatically to fields within your data stream. If you wish to make these replacements, you can do (for example) `mlr put '$field = gsub($field, "\\t", "\t")'`. If you need to make such a replacement for all fields in your data, you should probably use the system `sed` command instead. diff --git a/docs/src/reference-main-strings.md.in b/docs/src/reference-main-strings.md.in index e67560550..7ad9e431d 100644 --- a/docs/src/reference-main-strings.md.in +++ b/docs/src/reference-main-strings.md.in @@ -143,4 +143,4 @@ See also [https://en.wikipedia.org/wiki/Escape_sequences_in_C](https://en.wikipe These replacements apply only to strings you key in for the DSL expressions for `filter` and `put`: that is, if you type `\t` in a string literal for a `filter`/`put` expression, it will be turned into a tab character. If you want a backslash followed by a `t`, then please type `\\t`. -However, these replacements are done automatically only for string literals within DSL expressions -- they are not done automatically to fields within your data stream. If you wish to make these replacements, you can do (for example) `mlr put '$field = gsub($field, "\\t", "\t")'`. If you need to make such a replacement for all fields in your data, you should probably use the system `sed` command instead. +However, these replacements are done automatically only for string literals within DSL expressions -- they are not done automatically to fields within your data stream. If you wish to make these replacements, you can do (for example) `mlr put '$field = gsub($field, "\\t", "\t")'`. If you need to make such a replacement for all fields in your data, you should probably use the system `sed` command instead. 
diff --git a/docs/src/reference-verbs.md b/docs/src/reference-verbs.md index 89bbc2b71..b50c97d7d 100644 --- a/docs/src/reference-verbs.md +++ b/docs/src/reference-verbs.md @@ -596,6 +596,7 @@ Same as uniq -c. Options: -f {a,b,c} Field names for distinct count. +-x {a,b,c} Field names to exclude for distinct count: use each record's others instead. -n Show only the number of distinct values. Not compatible with -u. -o {name} Field name for output count. Default "count". Ignored with -u. @@ -803,7 +804,7 @@ Options: -r Treat field names as regular expressions. "ab", "a.*b" will match any field name containing the substring "ab" or matching "a.*b", respectively; anchors of the form "^ab$", "^a.*b$" may - be used. The -o flag is ignored when -r is present. + be used. -h|--help Show this message. Examples: mlr cut -f hostname,status @@ -969,6 +970,10 @@ a,b,cUsage: mlr filter [options] {DSL expression} +Lets you use a domain-specific language to programmatically filter which +stream records will be output. +See also: https://miller.readthedocs.io/en/latest/reference-verbs + Options: -f {file name} File containing a DSL expression (see examples below). If the filename is a directory, all *.mlr files in that directory are loaded. @@ -981,7 +986,7 @@ Options: Since the expression pieces are simply concatenated, please be sure to use intervening semicolons to separate expressions.) --s name=value: Predefines out-of-stream variable @name to have +-s name=value: Predefines out-of-stream variable @name to have Thus mlr put -s foo=97 '$column += @foo' is like mlr put 'begin {@foo = 97} $column += @foo'. The value part is subject to type-inferencing. @@ -1459,6 +1464,8 @@ for the old string and handling multiple matches, like the `gsub` DSL function. See also the `sub` and `ssub` verbs. Options: -f {a,b,c} Field names to convert. +-r {regex} Regular expression for field names to convert. +-a Convert all fields. 
-h|--help Show this message.@@ -1670,6 +1677,8 @@ Options: --lk|--left-keep-field-names {a,b,c} If supplied, this means keep only the specified field names from the left file. Automatically includes the join-field name(s). Helpful for when you only want a limited subset of information from the left file. + Tip: you can use --lk "": this means the left file becomes solely a row-selector + for the input files. --lp {text} Additional prefix for non-join output field names from the left file --rp {text} Additional prefix for non-join output field names from @@ -1704,7 +1713,7 @@ be specified CSV as well unless you override with 'mlr --csv ... join --ijson -l Likewise, if you have 'mlr --csv --implicit-csv-header ...' then the join-in file will be expected to be headerless as well unless you put '--no-implicit-csv-header' after 'join'. Please use "mlr --usage-separator-options" for information on specifying separators. -Please see https://miller.readthedocs.io/en/latest/reference-verbs.html#join for more information +Please see https://miller.readthedocs.io/en/latest/reference-verbs#join for more information including examples.Usage: mlr put [options] {DSL expression} +Lets you use a domain-specific language to programmatically alter stream records. +See also: https://miller.readthedocs.io/en/latest/reference-verbs + Options: -f {file name} File containing a DSL expression (see examples below). If the filename is a directory, all *.mlr files in that directory are loaded. @@ -2310,7 +2323,7 @@ Options: Since the expression pieces are simply concatenated, please be sure to use intervening semicolons to separate expressions.) --s name=value: Predefines out-of-stream variable @name to have +-s name=value: Predefines out-of-stream variable @name to have Thus mlr put -s foo=97 '$column += @foo' is like mlr put 'begin {@foo = 97} $column += @foo'. The value part is subject to type-inferencing. @@ -2452,9 +2465,9 @@ Options: first-match replacement. -h|--help Show this message. 
Examples: -mlr rename old_name,new_name' -mlr rename old_name_1,new_name_1,old_name_2,new_name_2' -mlr rename -r 'Date_[0-9]+,Date,' Rename all such fields to be "Date" +mlr rename old_name,new_name +mlr rename old_name_1,new_name_1,old_name_2,new_name_2 +mlr rename -r 'Date_[0-9]+,Date' Rename all such fields to be "Date" mlr rename -r '"Date_[0-9]+",Date' Same mlr rename -r 'Date_([0-9]+).*,\1' Rename all such fields to be of the form 20151015 mlr rename -r '"name"i,Name' Rename "name", "Name", "NAME", etc. to "Name" @@ -2947,6 +2960,7 @@ Options: -nf {comma-separated field names} Same as -n -nr {comma-separated field names} Numerical descending; nulls sort first -t {comma-separated field names} Natural ascending +-b Move sort fields to start of record, as in reorder -b -tr|-rt {comma-separated field names} Natural descending -h|--help Show this message. @@ -3123,6 +3137,23 @@ a b c 9 8 7+## sparsify + ++mlr sparsify --help +++Usage: mlr sparsify [options] +Unsets fields for which the key is the empty string (or, optionally, another +specified value). Only makes sense with output format not being CSV or TSV. +Options: +-s {filler string} What values to remove. Defaults to the empty string. +-f {a,b,c} Specify field names to be operated on; any other fields won't be + modified. The default is to modify all fields. +-h|--help Show this message. +Example: if input is a=1,b=,c=3 then output is a=1,c=3. ++ ## split@@ -3182,6 +3213,8 @@ Replaces old string with new string in specified field(s), without regex support the old string, like the `ssub` DSL function. See also the `gsub` and `sub` verbs. Options: -f {a,b,c} Field names to convert. +-r {regex} Regular expression for field names to convert. +-a Convert all fields. 
-h|--help Show this message.@@ -3239,6 +3272,7 @@ Options: antimode Find least-frequently-occurring values for fields; first-found wins tie sum Compute sums of specified fields mean Compute averages (sample means) of specified fields + mad Compute mean absolute deviation var Compute sample variance of specified fields stddev Compute sample standard deviation of specified fields meaneb Estimate error bars for averages (assuming no sample autocorrelation) @@ -3273,7 +3307,7 @@ Options: Example: mlr stats1 -a min,p10,p50,p90,max -f value -g size,shape Example: mlr stats1 -a count,mode -f size Example: mlr stats1 -a count,mode -f size -g shape -Example: mlr stats1 -a count,mode --fr '^[a-h].*$' -gr '^k.*$' +Example: mlr stats1 -a count,mode --fr '^[a-h].*$' --gr '^k.*$' This computes count and mode statistics on all field names beginning with a through h, grouped by all field names starting with k. @@ -3406,14 +3440,14 @@ fields, optionally categorized by one or more fields. data/medium-x_y_cov 0.000042574820827444476 -x_y_corr 0.0005042001844467462 -y_y_cov 0.08461122467974003 +x_y_cov 0.00004257482082749404 +x_y_corr 0.0005042001844473328 +y_y_cov 0.08461122467974005 y_y_corr 1 -x2_xy_cov 0.04188382281779374 -x2_xy_corr 0.630174342037994 -x2_y2_cov -0.00030953725962542085 -x2_y2_corr -0.0034249088761121966 +x2_xy_cov 0.041883822817793716 +x2_xy_corr 0.6301743420379936 +x2_y2_cov -0.0003095372596253918 +x2_y2_corr -0.003424908876111875@@ -3422,12 +3456,12 @@ x2_y2_corr -0.0034249088761121966 data/medium-a x_y_ols_m x_y_ols_b x_y_ols_n x_y_r2 y_y_ols_m y_y_ols_b y_y_ols_n y_y_r2 xy_y2_ols_m xy_y2_ols_b xy_y2_ols_n xy_y2_r2 -pan 0.01702551273681908 0.5004028922897639 2081 0.00028691820445814767 1 0 2081 1 0.8781320866715662 0.11908230147563566 2081 0.41749827377311266 -eks 0.0407804923685586 0.48140207967651016 1965 0.0016461239223448587 1 0 1965 1 0.8978728611690183 0.10734054433612333 1965 0.45563223864254526 -wye -0.03915349075204814 0.5255096523974456 1966 
0.0015051268704373607 1 0 1966 1 0.8538317334220835 0.1267454301662969 1966 0.38991721818599295 -zee 0.0027812364960399147 0.5043070448033061 2047 0.000007751652858786137 1 0 2047 1 0.8524439912011013 0.12401684308018937 2047 0.39356598090006495 -hat -0.018620577041095078 0.5179005397264935 1941 0.0003520036646055585 1 0 1941 1 0.8412305086345014 0.13557328318623216 1941 0.3687944261732265 +a x_y_ols_m x_y_ols_b x_y_ols_n x_y_r2 y_y_ols_m y_y_ols_b y_y_ols_n y_y_r2 xy_y2_ols_m xy_y2_ols_b xy_y2_ols_n xy_y2_r2 +pan 0.017025512736819345 0.500402892289764 2081 0.00028691820445815624 1 -0.00000000000000002890430283104539 2081 1 0.8781320866715664 0.11908230147563569 2081 0.4174982737731127 +eks 0.04078049236855813 0.4814020796765104 1965 0.0016461239223448218 1 0.00000000000000017862676354313703 1965 1 0.897872861169018 0.1073405443361234 1965 0.4556322386425451 +wye -0.03915349075204785 0.5255096523974457 1966 0.0015051268704373377 1 0.00000000000000004464425401127647 1966 1 0.8538317334220837 0.1267454301662969 1966 0.3899172181859931 +zee 0.0027812364960401333 0.5043070448033061 2047 0.000007751652858787357 1 0.00000000000000004819404567023685 2047 1 0.8524439912011011 0.12401684308018947 2047 0.39356598090006495 +hat -0.018620577041095272 0.5179005397264937 1941 0.00035200366460556604 1 -0.00000000000000003400445761787692 1941 1 0.8412305086345017 0.13557328318623207 1941 0.3687944261732266Here's an example simple line-fit. The `x` and `y` @@ -3513,11 +3547,11 @@ upsec_count_pca_quality 0.9999590846136102 donesec 92.33051350964094 color purple -upsec_count_pca_m -39.03009744795354 -upsec_count_pca_b 979.9883413064914 +upsec_count_pca_m -39.030097447953594 +upsec_count_pca_b 979.9883413064917 upsec_count_pca_n 21 upsec_count_pca_quality 0.9999908956206317 -donesec 25.10852919630297 +donesec 25.108529196302943+mlr surv --help +++Usage: mlr surv -d {duration-field} -s {status-field} + +Estimate Kaplan-Meier survival curve (right-censored). 
+Options: + -d {field} Name of duration field (time-to-event or censoring). + -s {field} Name of status field (0=censored, 1=event). + -h, --help Show this message. ++ ## tac@@ -4066,6 +4118,7 @@ count-distinct. For uniq, -f is a synonym for -g. Options: -g {d,e,f} Group-by-field names for uniq counts. +-x {a,b,c} Field names to exclude for uniq: use each record's others instead. -c Show repeat counts in addition to unique values. -n Show only the number of distinct values. -o {name} Field name for output count. Default "count". @@ -4081,7 +4134,7 @@ There are two main ways to use `mlr uniq`: the first way is with `-g` to specify wc -l data/colored-shapes.csv- 10079 data/colored-shapes.csv +10079 data/colored-shapes.csv@@ -4238,7 +4291,7 @@ color=purple,shape=square,flag=0 wc -l data/repeats.dkvp- 57 data/repeats.dkvp +57 data/repeats.dkvpdiff --git a/docs/src/reference-verbs.md.in b/docs/src/reference-verbs.md.in index 44feda3de..5f6f31097 100644 --- a/docs/src/reference-verbs.md.in +++ b/docs/src/reference-verbs.md.in @@ -995,6 +995,12 @@ GENMD-RUN-COMMAND mlr --ijson --opprint sort-within-records data/sort-within-records.json GENMD-EOF +## sparsify + +GENMD-RUN-COMMAND +mlr sparsify --help +GENMD-EOF + ## split GENMD-RUN-COMMAND @@ -1155,6 +1161,12 @@ GENMD-RUN-COMMAND mlr --from data/medium --opprint summary --transpose -a mean,median,mode GENMD-EOF +## surv + +GENMD-RUN-COMMAND +mlr surv --help +GENMD-EOF + ## tac GENMD-RUN-COMMAND diff --git a/docs/src/release-docs.md b/docs/src/release-docs.md index 4b9380d72..22924b141 100644 --- a/docs/src/release-docs.md +++ b/docs/src/release-docs.md @@ -16,7 +16,7 @@ Quick links: # Documents for releases -If your `mlr version` says something like `mlr 6.0.0-dev`, with the `-dev` suffix, you're likely building from source, or you've obtained a recent artifact from GitHub Actions -- +If your `mlr version` says something like `mlr 6.0.0-dev`, with the `-dev` suffix, you're likely building from source, or you've obtained a 
recent artifact from GitHub Actions -- the page [https://miller.readthedocs.io/en/main](https://miller.readthedocs.io/en/main) contains information for the latest contributions to the [Miller repository](https://github.com/johnkerl/miller). If your `mlr version` says something like `Miller v5.10.2` or `mlr 6.0.0`, without the `-dev` suffix, you're likely using a Miller executable from a package manager -- please see below for the documentation for Miller as of the release you're using. @@ -24,6 +24,12 @@ If your `mlr version` says something like `Miller v5.10.2` or `mlr 6.0.0`, witho | Release | Docs | Release notes | |---------|---------------------------------------------------------------------|---------------| main | [main branch](https://miller.readthedocs.io/en/main) | N/A | +6.14.0 | [Miller 6.14.0](https://miller.readthedocs.io/en/6.14.0) | [Survival curve, misc. features and bugfixes](https://github.com/johnkerl/miller/releases/tag/v6.14.0) | +6.13.0 | [Miller 6.13.0](https://miller.readthedocs.io/en/6.13.0) | [File-stat DSL function, new stats accumulator, misc. 
bugfixes](https://github.com/johnkerl/miller/releases/tag/v6.13.0) | +6.12.0 | [Miller 6.12.0](https://miller.readthedocs.io/en/6.12.0) | [New sparsify verb, wide-table performance improvement, thousands separator for fmtnum function](https://github.com/johnkerl/miller/releases/tag/v6.12.0) | +6.11.0 | [Miller 6.11.0](https://miller.readthedocs.io/en/6.11.0) | [CSV/TSV auto-unsparsify, regex-fieldname support for reorder/sub/ssub/gsub, strmatch DSL function, and more](https://github.com/johnkerl/miller/releases/tag/v6.11.0) | +6.10.0 | [Miller 6.10.0](https://miller.readthedocs.io/en/6.10.0) | [Add --files option; bugfixes; use Go 1.19](https://github.com/johnkerl/miller/releases/tag/v6.10.0) | +6.9.0 | [Miller 6.9.0](https://miller.readthedocs.io/en/6.9.0) | [Nanosecond timestamps, ZSTD compression, improved data-error handling, and more](https://github.com/johnkerl/miller/releases/tag/v6.9.0) | 6.8.0 | [Miller 6.8.0](https://miller.readthedocs.io/en/6.8.0) | [New case verb, index DSL function, and more](https://github.com/johnkerl/miller/releases/tag/v6.8.0) | 6.7.0 | [Miller 6.7.0](https://miller.readthedocs.io/en/6.7.0) | [New leftpad/rightpad DSL functions, unspace verb, and more](https://github.com/johnkerl/miller/releases/tag/v6.7.0) | 6.6.0 | [Miller 6.6.0](https://miller.readthedocs.io/en/6.6.0) | [Bugfixes and unspace verb](https://github.com/johnkerl/miller/releases/tag/v6.6.0) | diff --git a/docs/src/release-docs.md.in b/docs/src/release-docs.md.in index 41bd54025..4b89cf87d 100644 --- a/docs/src/release-docs.md.in +++ b/docs/src/release-docs.md.in @@ -1,6 +1,6 @@ # Documents for releases -If your `mlr version` says something like `mlr 6.0.0-dev`, with the `-dev` suffix, you're likely building from source, or you've obtained a recent artifact from GitHub Actions -- +If your `mlr version` says something like `mlr 6.0.0-dev`, with the `-dev` suffix, you're likely building from source, or you've obtained a recent artifact from GitHub Actions -- the page 
[https://miller.readthedocs.io/en/main](https://miller.readthedocs.io/en/main) contains information for the latest contributions to the [Miller repository](https://github.com/johnkerl/miller). If your `mlr version` says something like `Miller v5.10.2` or `mlr 6.0.0`, without the `-dev` suffix, you're likely using a Miller executable from a package manager -- please see below for the documentation for Miller as of the release you're using. @@ -8,6 +8,12 @@ If your `mlr version` says something like `Miller v5.10.2` or `mlr 6.0.0`, witho | Release | Docs | Release notes | |---------|---------------------------------------------------------------------|---------------| main | [main branch](https://miller.readthedocs.io/en/main) | N/A | +6.14.0 | [Miller 6.14.0](https://miller.readthedocs.io/en/6.14.0) | [Survival curve, misc. features and bugfixes](https://github.com/johnkerl/miller/releases/tag/v6.14.0) | +6.13.0 | [Miller 6.13.0](https://miller.readthedocs.io/en/6.13.0) | [File-stat DSL function, new stats accumulator, misc. 
bugfixes](https://github.com/johnkerl/miller/releases/tag/v6.13.0) | +6.12.0 | [Miller 6.12.0](https://miller.readthedocs.io/en/6.12.0) | [New sparsify verb, wide-table performance improvement, thousands separator for fmtnum function](https://github.com/johnkerl/miller/releases/tag/v6.12.0) | +6.11.0 | [Miller 6.11.0](https://miller.readthedocs.io/en/6.11.0) | [CSV/TSV auto-unsparsify, regex-fieldname support for reorder/sub/ssub/gsub, strmatch DSL function, and more](https://github.com/johnkerl/miller/releases/tag/v6.11.0) | +6.10.0 | [Miller 6.10.0](https://miller.readthedocs.io/en/6.10.0) | [Add --files option; bugfixes; use Go 1.19](https://github.com/johnkerl/miller/releases/tag/v6.10.0) | +6.9.0 | [Miller 6.9.0](https://miller.readthedocs.io/en/6.9.0) | [Nanosecond timestamps, ZSTD compression, improved data-error handling, and more](https://github.com/johnkerl/miller/releases/tag/v6.9.0) | 6.8.0 | [Miller 6.8.0](https://miller.readthedocs.io/en/6.8.0) | [New case verb, index DSL function, and more](https://github.com/johnkerl/miller/releases/tag/v6.8.0) | 6.7.0 | [Miller 6.7.0](https://miller.readthedocs.io/en/6.7.0) | [New leftpad/rightpad DSL functions, unspace verb, and more](https://github.com/johnkerl/miller/releases/tag/v6.7.0) | 6.6.0 | [Miller 6.6.0](https://miller.readthedocs.io/en/6.6.0) | [Bugfixes and unspace verb](https://github.com/johnkerl/miller/releases/tag/v6.6.0) | diff --git a/docs/src/scripting.md b/docs/src/scripting.md index 71c6b22a0..4766dcb50 100644 --- a/docs/src/scripting.md +++ b/docs/src/scripting.md @@ -234,7 +234,7 @@ then fraction -f count Points: -* Same as above, where the `#!` line isn't needed. (But you can include a `#!` line; `mlr -s` will simply see it as a comment line.). +* Same as above, where the `#!` line isn't needed. (But you can include a `#!` line; `mlr -s` will simply see it as a comment line.) * As above, you don't need all the backslashing for line-continuations. 
* As above, you don't need the explicit `--` or `"$@"`. diff --git a/docs/src/scripting.md.in b/docs/src/scripting.md.in index 3234c9398..f29fe8b63 100644 --- a/docs/src/scripting.md.in +++ b/docs/src/scripting.md.in @@ -101,7 +101,7 @@ GENMD-EOF Points: -* Same as above, where the `#!` line isn't needed. (But you can include a `#!` line; `mlr -s` will simply see it as a comment line.). +* Same as above, where the `#!` line isn't needed. (But you can include a `#!` line; `mlr -s` will simply see it as a comment line.) * As above, you don't need all the backslashing for line-continuations. * As above, you don't need the explicit `--` or `"$@"`. diff --git a/docs/src/shapes-of-data.md b/docs/src/shapes-of-data.md index bab58b7f0..f97040543 100644 --- a/docs/src/shapes-of-data.md +++ b/docs/src/shapes-of-data.md @@ -33,7 +33,7 @@ Also try `od -xcv` and/or `cat -e` on your file to check for non-printable chara Use the `file` command to see if there are CR/LF terminators (in this case, there are not):-file data/colours.csv +file data/colours.csvdata/colours.csv: Unicode text, UTF-8 text @@ -42,7 +42,7 @@ data/colours.csv: Unicode text, UTF-8 text Look at the file to find names of fields:-cat data/colours.csv +cat data/colours.csvKEY;DE;EN;ES;FI;FR;IT;NL;PL;TO;TR @@ -53,13 +53,13 @@ masterdata_colourcode_2;Schwarz;Black;Negro;Musta;Noir;Nero;Zwart;Czarny;Negru;S Extract a few fields:-mlr --csv cut -f KEY,PL,TO data/colours.csv +mlr --csv cut -f KEY,PL,TO data/colours.csvUse XTAB output format to get a sharper picture of where records/fields are being split:-mlr --icsv --oxtab cat data/colours.csv +mlr --icsv --oxtab cat data/colours.csvKEY;DE;EN;ES;FI;FR;IT;NL;PL;TO;TR masterdata_colourcode_1;Weiß;White;Blanco;Valkoinen;Blanc;Bianco;Wit;Biały;Alb;Beyaz @@ -70,7 +70,7 @@ KEY;DE;EN;ES;FI;FR;IT;NL;PL;TO;TR masterdata_colourcode_2;Schwarz;Black;Negro;Mu Using XTAB output format makes it clearer that `KEY;DE;...;TR` is being treated as a single field name in the CSV header, 
and likewise each subsequent line is being treated as a single field value. This is because the default field separator is a comma but we have semicolons here. Use XTAB again with different field separator (`--fs semicolon`):-mlr --icsv --ifs semicolon --oxtab cat data/colours.csv +mlr --icsv --ifs semicolon --oxtab cat data/colours.csvKEY masterdata_colourcode_1 @@ -101,7 +101,7 @@ TR Siyah Using the new field-separator, retry the cut:-mlr --csv --fs semicolon cut -f KEY,PL,TO data/colours.csv +mlr --csv --fs semicolon cut -f KEY,PL,TO data/colours.csvKEY;PL;TO diff --git a/docs/src/shapes-of-data.md.in b/docs/src/shapes-of-data.md.in index c32b0dad1..3636f406d 100644 --- a/docs/src/shapes-of-data.md.in +++ b/docs/src/shapes-of-data.md.in @@ -17,14 +17,14 @@ Also try `od -xcv` and/or `cat -e` on your file to check for non-printable chara Use the `file` command to see if there are CR/LF terminators (in this case, there are not): GENMD-CARDIFY-HIGHLIGHT-ONE -file data/colours.csv +file data/colours.csv data/colours.csv: Unicode text, UTF-8 text GENMD-EOF Look at the file to find names of fields: GENMD-CARDIFY-HIGHLIGHT-ONE -cat data/colours.csv +cat data/colours.csv KEY;DE;EN;ES;FI;FR;IT;NL;PL;TO;TR masterdata_colourcode_1;Weiß;White;Blanco;Valkoinen;Blanc;Bianco;Wit;Biały;Alb;Beyaz masterdata_colourcode_2;Schwarz;Black;Negro;Musta;Noir;Nero;Zwart;Czarny;Negru;Siyah @@ -33,13 +33,13 @@ GENMD-EOF Extract a few fields: GENMD-CARDIFY-HIGHLIGHT-ONE -mlr --csv cut -f KEY,PL,TO data/colours.csv +mlr --csv cut -f KEY,PL,TO data/colours.csv GENMD-EOF Use XTAB output format to get a sharper picture of where records/fields are being split: GENMD-CARDIFY-HIGHLIGHT-ONE -mlr --icsv --oxtab cat data/colours.csv +mlr --icsv --oxtab cat data/colours.csv KEY;DE;EN;ES;FI;FR;IT;NL;PL;TO;TR masterdata_colourcode_1;Weiß;White;Blanco;Valkoinen;Blanc;Bianco;Wit;Biały;Alb;Beyaz KEY;DE;EN;ES;FI;FR;IT;NL;PL;TO;TR 
masterdata_colourcode_2;Schwarz;Black;Negro;Musta;Noir;Nero;Zwart;Czarny;Negru;Siyah @@ -48,7 +48,7 @@ GENMD-EOF Using XTAB output format makes it clearer that `KEY;DE;...;TR` is being treated as a single field name in the CSV header, and likewise each subsequent line is being treated as a single field value. This is because the default field separator is a comma but we have semicolons here. Use XTAB again with different field separator (`--fs semicolon`): GENMD-CARDIFY-HIGHLIGHT-ONE -mlr --icsv --ifs semicolon --oxtab cat data/colours.csv +mlr --icsv --ifs semicolon --oxtab cat data/colours.csv KEY masterdata_colourcode_1 DE Weiß EN White @@ -77,7 +77,7 @@ GENMD-EOF Using the new field-separator, retry the cut: GENMD-CARDIFY-HIGHLIGHT-ONE -mlr --csv --fs semicolon cut -f KEY,PL,TO data/colours.csv +mlr --csv --fs semicolon cut -f KEY,PL,TO data/colours.csv KEY;PL;TO masterdata_colourcode_1;Biały;Alb masterdata_colourcode_2;Czarny;Negru diff --git a/docs/src/sorting.md b/docs/src/sorting.md index 68e1f4a02..7d876eda2 100644 --- a/docs/src/sorting.md +++ b/docs/src/sorting.md @@ -214,6 +214,8 @@ a b c ## The sort function by example +The Miller DSL has a [`sort`](reference-dsl-builtin-functions.md#sort) function: + * It returns a sorted copy of an input array or map. * Without second argument, uses Miller's default ordering which is numbers numerically, then strings lexically. * With second which is string, takes sorting flags from it: `"f"` for lexical or `"c"` for case-folded lexical, or `"t"` for natural sort order. An additional `"r"` in this string is for reverse/descending. diff --git a/docs/src/sorting.md.in b/docs/src/sorting.md.in index 28617c697..0d59836e9 100644 --- a/docs/src/sorting.md.in +++ b/docs/src/sorting.md.in @@ -66,6 +66,8 @@ GENMD-EOF ## The sort function by example +The Miller DSL has a [`sort`](reference-dsl-builtin-functions.md#sort) function: + * It returns a sorted copy of an input array or map. 
* Without second argument, uses Miller's default ordering which is numbers numerically, then strings lexically. * With second which is string, takes sorting flags from it: `"f"` for lexical or `"c"` for case-folded lexical, or `"t"` for natural sort order. An additional `"r"` in this string is for reverse/descending. diff --git a/docs/src/statistics-examples.md b/docs/src/statistics-examples.md index b1b7ea7b3..2e80e8a39 100644 --- a/docs/src/statistics-examples.md +++ b/docs/src/statistics-examples.md @@ -23,7 +23,7 @@ For one or more specified field names, simply compute p25 and p75, then write thmlr --oxtab stats1 -f x -a p25,p75 \ then put '$x_iqr = $x_p75 - $x_p25' \ - data/medium + data/mediumx_p25 0.24667037823231752 @@ -40,7 +40,7 @@ For wildcarded field names, first compute p25 and p75, then loop over field name $["\1_iqr"] = $["\1_p75"] - $["\1_p25"] } }' \ - data/medium + data/mediumi_p25 2501 diff --git a/docs/src/statistics-examples.md.in b/docs/src/statistics-examples.md.in index a98ead194..1da4aa235 100644 --- a/docs/src/statistics-examples.md.in +++ b/docs/src/statistics-examples.md.in @@ -7,7 +7,7 @@ For one or more specified field names, simply compute p25 and p75, then write th GENMD-RUN-COMMAND mlr --oxtab stats1 -f x -a p25,p75 \ then put '$x_iqr = $x_p75 - $x_p25' \ - data/medium + data/medium GENMD-EOF For wildcarded field names, first compute p25 and p75, then loop over field names with `p25` in them: @@ -19,7 +19,7 @@ mlr --oxtab stats1 --fr '[i-z]' -a p25,p75 \ $["\1_iqr"] = $["\1_p75"] - $["\1_p25"] } }' \ - data/medium + data/medium GENMD-EOF ## Computing weighted means diff --git a/docs/src/structure-of-these-documents.md b/docs/src/structure-of-these-documents.md index 5d2993ee7..cdaeef8a9 100644 --- a/docs/src/structure-of-these-documents.md +++ b/docs/src/structure-of-these-documents.md @@ -19,13 +19,13 @@ Quick links: The goal is _multiple levels of detail_. 
* The [Introduction page](index.md) is the shortest: headlines and **essential summary**. -* The _Getting started_ section is for **new or near-new users** who want some simple examples along with connecting narrative. The goal is to get a new user up and running, able to do some interesting things with their own data. +* The _Getting started_ section is for **new or near-new users** who want some simple examples along with a connecting narrative. The goal is to get a new user up and running, enabling them to perform interesting tasks with their own data. * The _Miller in more detail_ section is just-past-introductory, **tell-me-more material** about some of the things that make Miller unique: what file formats it handles (and how it handles them), how it relates to other tools in the Unix toolkit, and so on. * The _FAQs and examples_ section is non-introductory for people looking for various ways to do things by example. The discussion is pragmatic rather than theoretical, and **use-case-driven**. * The _Background_ section is some **non-essential historical** and meta material on why Miller was created. -* The _Reference_ section aims to answer all questions the previous sections didn't. The discussion is **concept-driven**, although there are still plenty of examples throughout for concreteness. - * _Main reference_ goes carefully through various aspects of Miller, concept by concept. - * _DSL reference_ focuses on the [Miller programming language](miller-programming-language.md), again following a concept-at-a-time approach. - * _Misc. reference_ is aptly named, with things like build-from-source notes. - * _Documents for previous releases_ is not only for historical curiosity -- experience has shown that various Linux/BSD distros update their Miller versions on their own cadences, so the version on your system (as shown by `mlr --version`) might be best-served by its respective documentation version. 
-* Lastly, new with the Miller 6 documents is a very easy-to-access **Search field** at the top of each page. +* The _Reference_ section aims to answer all questions that the previous sections didn't. The discussion is **concept-driven**, although it includes numerous examples throughout for concreteness. + * The main reference carefully examines various aspects of Miller, concept by concept. + * The _DSL reference_ focuses on the [Miller programming language](miller-programming-language.md), again following a concept-at-a-time approach. + * The _miscellaneous reference_ is aptly named, with things like build-from-source notes. + * _Documents for previous releases_ is not only for historical curiosity -- experience has shown that various Linux/BSD distros update their Miller versions on their own cadences, so the version on your system (as shown by `mlr --version`) might be best served by its respective documentation version. +* Lastly, new with the Miller 6 documents is an easy-to-access **Search field** at the top of each page. diff --git a/docs/src/structure-of-these-documents.md.in b/docs/src/structure-of-these-documents.md.in index 9cb40d3a1..29a558d51 100644 --- a/docs/src/structure-of-these-documents.md.in +++ b/docs/src/structure-of-these-documents.md.in @@ -3,13 +3,13 @@ The goal is _multiple levels of detail_. * The [Introduction page](index.md) is the shortest: headlines and **essential summary**. -* The _Getting started_ section is for **new or near-new users** who want some simple examples along with connecting narrative. The goal is to get a new user up and running, able to do some interesting things with their own data. +* The _Getting started_ section is for **new or near-new users** who want some simple examples along with a connecting narrative. The goal is to get a new user up and running, enabling them to perform interesting tasks with their own data. 
* The _Miller in more detail_ section is just-past-introductory, **tell-me-more material** about some of the things that make Miller unique: what file formats it handles (and how it handles them), how it relates to other tools in the Unix toolkit, and so on. * The _FAQs and examples_ section is non-introductory for people looking for various ways to do things by example. The discussion is pragmatic rather than theoretical, and **use-case-driven**. * The _Background_ section is some **non-essential historical** and meta material on why Miller was created. -* The _Reference_ section aims to answer all questions the previous sections didn't. The discussion is **concept-driven**, although there are still plenty of examples throughout for concreteness. - * _Main reference_ goes carefully through various aspects of Miller, concept by concept. - * _DSL reference_ focuses on the [Miller programming language](miller-programming-language.md), again following a concept-at-a-time approach. - * _Misc. reference_ is aptly named, with things like build-from-source notes. - * _Documents for previous releases_ is not only for historical curiosity -- experience has shown that various Linux/BSD distros update their Miller versions on their own cadences, so the version on your system (as shown by `mlr --version`) might be best-served by its respective documentation version. -* Lastly, new with the Miller 6 documents is a very easy-to-access **Search field** at the top of each page. +* The _Reference_ section aims to answer all questions that the previous sections didn't. The discussion is **concept-driven**, although it includes numerous examples throughout for concreteness. + * The main reference carefully examines various aspects of Miller, concept by concept. + * The _DSL reference_ focuses on the [Miller programming language](miller-programming-language.md), again following a concept-at-a-time approach. 
+ * The _miscellaneous reference_ is aptly named, with things like build-from-source notes. + * _Documents for previous releases_ is not only for historical curiosity -- experience has shown that various Linux/BSD distros update their Miller versions on their own cadences, so the version on your system (as shown by `mlr --version`) might be best served by its respective documentation version. +* Lastly, new with the Miller 6 documents is an easy-to-access **Search field** at the top of each page. diff --git a/docs/src/swipes.sh b/docs/src/swipes.sh new file mode 100755 index 000000000..f5f1064f2 --- /dev/null +++ b/docs/src/swipes.sh @@ -0,0 +1,6 @@ +#!/bin/bash + +for x in *.md.in; do + sed -i .emd 's/ *$//' $x + rm $x.emd +done diff --git a/docs/src/two-pass-algorithms.md b/docs/src/two-pass-algorithms.md index 146f3a81e..e475aebf3 100644 --- a/docs/src/two-pass-algorithms.md +++ b/docs/src/two-pass-algorithms.md @@ -598,8 +598,8 @@ hat pan 0.4643355557376876 x_count 10000 x_sum 4986.019681679581 x_mean 0.49860196816795804 -x_var 0.08426974433144456 -x_stddev 0.2902925151144007 +x_var 0.08426974433144457 +x_stddev 0.29029251511440074diff --git a/docs/src/unix-toolkit-context.md b/docs/src/unix-toolkit-context.md index 1687f4868..ffc8ede78 100644 --- a/docs/src/unix-toolkit-context.md +++ b/docs/src/unix-toolkit-context.md @@ -63,9 +63,9 @@ Likewise with `mlr sort`, `mlr tac`, and so on. ## awk-like features: mlr filter and mlr put -* `mlr filter` includes/excludes records based on a filter expression, e.g. `mlr filter '$count > 10'`. +* `mlr filter` includes/excludes records based on a filter expression, e.g., `mlr filter '$count > 10'`. -* `mlr put` adds a new field as a function of others, e.g. `mlr put '$xy = $x * $y'` or `mlr put '$counter = NR'`. +* `mlr put` adds a new field as a function of others, e.g., `mlr put '$xy = $x * $y'` or `mlr put '$counter = NR'`. 
* The `$name` syntax is straight from `awk`'s `$1 $2 $3` (adapted to name-based indexing), as are the variables `FS`, `OFS`, `RS`, `ORS`, `NF`, `NR`, and `FILENAME`. The `ENV[...]` syntax is from Ruby. @@ -73,7 +73,7 @@ Likewise with `mlr sort`, `mlr tac`, and so on. * Like `awk`, Miller (as of v5.0.0) allows you to define new functions within its `put` and `filter` expression language. Further programmability comes from chaining with `then`. -* As with `awk`, `$`-variables are stream variables and all verbs (such as `cut`, `stats1`, `put`, etc.) as well as `put`/`filter` statements operate on streams. This means that you define actions to be done on each record and then stream your data through those actions. The built-in variables `NF`, `NR`, etc. change from one record to another, `$x` is a label for field `x` in the current record, and the input to `sqrt($x)` changes from one record to the next. The expression language for the `put` and `filter` verbs additionally allows you to define `begin {...}` and `end {...}` blocks for actions to be taken before and after records are processed, respectively. +* As with `awk`, `$`-variables are stream variables and all verbs (such as `cut`, `stats1`, `put`, etc.) as well as `put`/`filter` statements operate on streams. This means that you define actions to be done on each record and then stream your data through those actions. The built-in variables `NF`, `NR`, etc., change from one record to another, `$x` is a label for field `x` in the current record, and the input to `sqrt($x)` changes from one record to the next. The expression language for the `put` and `filter` verbs additionally allows you to define `begin {...}` and `end {...}` blocks for actions to be taken before and after records are processed, respectively. * As with `awk`, Miller's `put`/`filter` language lets you set `@sum=0` before records are read, then update that sum on each record, then print its value at the end. 
Unlike `awk`, Miller makes syntactically explicit the difference between variables with extent across all records (names starting with `@`, such as `@sum`) and variables which are local to the current expression invocation (names starting without `@`, such as `sum`). diff --git a/docs/src/unix-toolkit-context.md.in b/docs/src/unix-toolkit-context.md.in index bea7b27f3..14da2d777 100644 --- a/docs/src/unix-toolkit-context.md.in +++ b/docs/src/unix-toolkit-context.md.in @@ -26,9 +26,9 @@ Likewise with `mlr sort`, `mlr tac`, and so on. ## awk-like features: mlr filter and mlr put -* `mlr filter` includes/excludes records based on a filter expression, e.g. `mlr filter '$count > 10'`. +* `mlr filter` includes/excludes records based on a filter expression, e.g., `mlr filter '$count > 10'`. -* `mlr put` adds a new field as a function of others, e.g. `mlr put '$xy = $x * $y'` or `mlr put '$counter = NR'`. +* `mlr put` adds a new field as a function of others, e.g., `mlr put '$xy = $x * $y'` or `mlr put '$counter = NR'`. * The `$name` syntax is straight from `awk`'s `$1 $2 $3` (adapted to name-based indexing), as are the variables `FS`, `OFS`, `RS`, `ORS`, `NF`, `NR`, and `FILENAME`. The `ENV[...]` syntax is from Ruby. @@ -36,7 +36,7 @@ Likewise with `mlr sort`, `mlr tac`, and so on. * Like `awk`, Miller (as of v5.0.0) allows you to define new functions within its `put` and `filter` expression language. Further programmability comes from chaining with `then`. -* As with `awk`, `$`-variables are stream variables and all verbs (such as `cut`, `stats1`, `put`, etc.) as well as `put`/`filter` statements operate on streams. This means that you define actions to be done on each record and then stream your data through those actions. The built-in variables `NF`, `NR`, etc. change from one record to another, `$x` is a label for field `x` in the current record, and the input to `sqrt($x)` changes from one record to the next. 
The expression language for the `put` and `filter` verbs additionally allows you to define `begin {...}` and `end {...}` blocks for actions to be taken before and after records are processed, respectively. +* As with `awk`, `$`-variables are stream variables and all verbs (such as `cut`, `stats1`, `put`, etc.) as well as `put`/`filter` statements operate on streams. This means that you define actions to be done on each record and then stream your data through those actions. The built-in variables `NF`, `NR`, etc., change from one record to another, `$x` is a label for field `x` in the current record, and the input to `sqrt($x)` changes from one record to the next. The expression language for the `put` and `filter` verbs additionally allows you to define `begin {...}` and `end {...}` blocks for actions to be taken before and after records are processed, respectively. * As with `awk`, Miller's `put`/`filter` language lets you set `@sum=0` before records are read, then update that sum on each record, then print its value at the end. Unlike `awk`, Miller makes syntactically explicit the difference between variables with extent across all records (names starting with `@`, such as `@sum`) and variables which are local to the current expression invocation (names starting without `@`, such as `sum`). diff --git a/docs/src/why.md b/docs/src/why.md index a8b2ed585..aa00458be 100644 --- a/docs/src/why.md +++ b/docs/src/why.md @@ -20,44 +20,44 @@ Someone asked me the other day about design, tradeoffs, thought process, why I f ## Who is Miller for? -For background, I'm a software engineer, with a heavy devops bent and a non-trivial amount of data-engineering in my career. 
**Initially I wrote Miller mainly for myself:** I'm coder-friendly (being a coder); I'm Github-friendly; most of my data are well-structured or easily structurable (TSV-formatted SQL-query output, CSV files, log files, JSON data structures); I care about interoperability between all the various formats Miller supports (I've encountered them all); I do all my work on Linux or OS X. +For background, I'm a software engineer with a heavy devops bent and a non-trivial amount of data engineering in my career. **Initially, I wrote Miller mainly for myself:** I'm coder-friendly (being a coder); I'm Github-friendly; most of my data is either well-structured or easily structurable (TSV-formatted SQL-query output, CSV files, log files, JSON data structures); I care about interoperability between all the various formats Miller supports (I've encountered them all); I do all my work on Linux or OS X. -But now there's this neat little tool **which seems to be useful for people in various disciplines**. I don't even know entirely *who*. I can click through Github starrers and read a bit about what they seem to do, but not everyone that uses Miller is even *on* Github (or stars things). I've gotten a lot of feature requests through Github -- but only from people who are Github users. Not everyone's a coder (it seems like a lot of Miller's Github starrers are devops folks like myself, or data-science-ish people, or biology/genomics folks.) A lot of people care 100% about CSV. And so on. +But now there's this neat little tool **which seems to be useful for people in various disciplines**. I don't even know entirely *who*. I can click through Github starrers and read a bit about what they seem to do, but not everyone who uses Miller is even *on* Github (or stars things). I've gotten a lot of feature requests through Github -- but only from people who are Github users. 
Not everyone's a coder (it seems like many of Miller's Github starrers are devops folks like myself, or data-science-ish people, or biology/genomics folks.) A lot of people care 100% about CSV. And so on. -So the reason for the [Miller User Survey](https://github.com/johnkerl/miller/discussions/542) is to answer questions such as: does Miller do what you need? Do you use it for all sorts of things, or just one or two nice things? Are there things you wish it did but it doesn't? Is it almost there, or just nowhere near what you want? Are there not enough features or way too many? Are the docs too complicated; do you have a hard time finding out how to do what you want? Should I think differently about what this tool even *is* in the first place? Should I think differently about who it's for? +So the reason for the [Miller User Survey](https://github.com/johnkerl/miller/discussions/542) is to answer questions such as: does Miller do what you need? Do you use it for all sorts of things, or just one or two nice things? Are there things you wish it did, but it doesn't? Is it almost there, or just nowhere near what you want? Are there not enough features or way too many? Are the docs too complicated? Do you have a hard time finding out how to do what you want? Should I think differently about what this tool even *is* in the first place? Should I think differently about who it's for? ## What was Miller created to do? -First: there are tools like `xsv` which handles CSV marvelously and `jq` which handles JSON marvelously, and so on -- but I over the years of my career in the software industry I've found myself, and others, doing a lot of ad-hoc things which really were fundamentally the same *except* for format. 
So the number one thing about Miller is doing common things while supporting **multiple formats**: (a) ingest a list of records where a record is a list of key-value pairs (however represented in the input files); (b) transform that stream of records; (c) emit the transformed stream -- either in the same format as input, or in a different format. +The first thing: there are tools like `xsv` which handles CSV marvelously and `jq` which handles JSON marvelously, and so on -- but over the years of my career in the software industry, I've found myself, and others, doing a lot of ad-hoc things which were fundamentally the same *except* for format. So the number one thing about Miller is doing common things while supporting **multiple formats**: (a) ingest a list of records where a record is a list of key-value pairs (however represented in the input files); (b) transform that stream of records; (c) emit the transformed stream -- either in the same format as input, or in a different format. -Second thing, a lot like the first: just as I didn't want to build something only for a single file format, I didn't want to build something only for one problem domain. In my work doing software engineering, devops, data engineering, etc. I saw a lot of commonalities and I wanted to **solve as many problems simultaneously as possible**. +The second thing is a lot like the first: just as I didn't want to build something only for a single file format, I didn't want to build something only for one problem domain. In my work doing software engineering, devops, data engineering, etc., I saw a lot of commonalities, and I wanted to **solve as many problems simultaneously as possible**. -Third: it had to be **streaming**. As time goes by and we (some of us, sometimes) have machines with tens or hundreds of GB of RAM, it's maybe less important, but I'm unhappy with tools which ingest all data, then do stuff, then emit all data. 
One reason is to be able to handle files bigger than available RAM. Another reason is to be able to handle input which trickles in, e.g. you have some process emitting data now and then and you can pipe it to Miller and it will emit transformed records one at a time. +Third: it had to be **streaming**. As time goes by and we (some of us, sometimes) have machines with tens or hundreds of GB of RAM, it's less important, but I'm unhappy with tools that ingest all data, then do stuff, then emit all data. One reason is to be able to handle files bigger than available RAM. Another reason is to be able to handle input which trickles in, e.g., you have some process emitting data now and then, and you can pipe it to Miller and it will emit transformed records one at a time. -Fourth: it had to be **fast**. This precludes all sorts of very nice things written in Ruby, for example. I love Ruby as a very expressive language, and I have several very useful little utility scripts written in Ruby. But a few years ago I ported over some of my old tried-and-true C programs and the lines-of-code count was a *lot* lower -- it was great! Until I ran them on multi-GB files and realized they took 60x as long to complete. So I couldn't write Miller in Ruby, or in languages like it. I was going to have to do something in a low-level language in order to make it performant. +Fourth: it had to be **fast**. This precludes all sorts of very nice things written in Ruby, for example. I love Ruby as a very expressive language, and I have several very useful little utility scripts written in Ruby. But a few years ago, I ported over some of my old tried-and-true C programs and the lines-of-code count was a *lot* lower -- it was great! Until I ran them on multi-GB files and realized they took 60x as long to complete. So I couldn't write Miller in Ruby, or languages like it. I was going to have to do something in a low-level language in order to make it performant. 
-Fifth thing: I wanted Miller to be **pipe-friendly and interoperate with other command-line tools**. Since the basic paradigm is ingest records, transform records, emit records -- where the input and output formats can be the same or different, and the transform can be complex, or just pass-through -- this means you can use it to transform data, or re-format it, or both. So if you just want to do data-cleaning/prep/formatting and do all the "real" work in R, you can. If you just want a little glue script between other tools you can get that. And if you want to do non-trivial data-reduction in Miller you can. +The fifth thing: I wanted Miller to be **pipe-friendly and interoperate with other command-line tools**. Since the basic paradigm is ingest records, transform records, emit records -- where the input and output formats can be the same or different, and the transform can be complex, or just pass-through -- this means you can use it to transform data, or re-format it, or both. So if you just want to do data-cleaning/prep/formatting and do all the "real" work in R, you can. If you want a little glue script between other tools, you can get that. And if you want to do non-trivial data-reduction in Miller, you can. -Sixth thing: Must have **comprehensive documentation and unit-test**. Since Miller handles a lot of formats and solves a lot of problems, there's a lot to test and a lot to keep working correctly as I add features or optimize. And I wanted it to be able to explain itself -- not only through web docs like the one you're reading but also through `man mlr` and `mlr --help`, `mlr sort --help`, etc. +Sixth thing: Must have **comprehensive documentation and unit-test**. Since Miller handles a wide range of formats and solves numerous problems, there's a lot to test and a lot to keep working correctly as I add features or optimize. 
And I wanted it to be able to explain itself -- not only through web docs like the one you're reading but also through `man mlr` and `mlr --help`, `mlr sort --help`, etc. -Seventh thing: **Must have a domain-specific language** (DSL) **but also must let you do common things without it**. All those little verbs Miller has to help you *avoid* having to write for-loops are great. I use them for keystroke-saving: `mlr stats1 -a mean,stddev,min,max -f quantity`, for example, without you having to write for-loops or define accumulator variables. But you also have to be able to break out of that and write arbitrary code when you want to: `mlr put '$distance = $rate * $time'` or anything else you can think up. In Perl/AWK/etc. it's all DSL. In xsv et al. it's all verbs. In Miller I like having the combination. +Seventh thing: **Must have a domain-specific language** (DSL) **but also must let you do everyday things without it**. All those little verbs Miller has to help you *avoid* having to write for-loops are great. I use them for keystroke-saving: `mlr stats1 -a mean,stddev,min,max -f quantity`, for example, without you having to write for-loops or define accumulator variables. But you also have to be able to break out of that and write arbitrary code when you want to: `mlr put '$distance = $rate * $time'` or anything else you can think up. In Perl/AWK/etc. it's all DSL. In xsv et al. it's all verbs. In Miller, I like having the combination. -Eighth thing: It's an **awful lot of fun to write**. In my experience I didn't find any tools which do multi-format, streaming, efficient, multi-purpose, with DSL and non-DSL, so I wrote one. But I don't guarantee it's unique in the world. It fills a niche in the world (people use it) but it also fills a niche in my life. +Eighth thing: It's an **awful lot of fun to write**. In my experience, I didn't find any tools that do multi-format, streaming, efficient, multi-purpose, with DSL and non-DSL, so I wrote one. 
But I don't guarantee it's unique in the world. It fills a niche in the world (people use it), but it also fills a niche in my life. ## Tradeoffs -Miller is command-line-only by design. People who want a graphical user interface won't find it here. This is in part (a) accommodating my personal preferences, and in part (b) guided by my experience/belief that the command line is very expressive. Steeper learning curve than a GUI, yes. I consider that price worth paying for the tool-niche which Miller occupies. +Miller is command-line-only by design. People who want a graphical user interface won't find it here. This is in part (a) accommodating my personal preferences, and in part (b) guided by my experience/belief that the command line is very expressive. Steeper learning curve than a GUI, yes. That price is worth paying for the tool-niche which Miller occupies. -Another tradeoff: supporting lists of records keeps me supporting only what can be expressed in *all* of those formats. For example, `[1,2,3,4,5]` is valid but unmillerable JSON: the list elements are not records. So Miller can't (and won't) handle arbitrary JSON -- because Miller only handles tabular data which can be expressed in a variety of formats. +Another tradeoff: supporting lists of records keeps me supporting only what can be expressed in *all* of those formats. For example, `[1,2,3,4,5]` is valid but unmillerable JSON: the list elements are not records. So Miller can't (and won't) handle arbitrary JSON -- because Miller only handles tabular data, which can be expressed in a variety of formats. -A third tradeoff is doing build-from-scratch in a low-level language. It'd be quicker to write (but slower to run) if written in a high-level language. If Miller were written in Python, it would be implemented in significantly fewer lines of code than its current Go implementation. The DSL would just be an `eval` of Python code. 
And it would run slower, but maybe not enough slower to be a problem for most folks. Later I found out about the [rows](https://github.com/turicas/rows) tool -- if you find Miller useful, you should check out `rows` as well. +A third tradeoff is building from scratch in a low-level language. It'd be quicker to write (but slower to run) if written in a high-level language. If Miller were written in Python, it would be implemented in significantly fewer lines of code than its current Go implementation. The DSL would be an `eval` of Python code. And it would run slower, but maybe not slow enough to be a problem for most people. Later, I discovered the [rows](https://github.com/turicas/rows) tool -- if you find Miller useful, you should also check out `rows`. -A fourth tradeoff is in the DSL (more visibly so in 5.0.0 but already in pre-5.0.0): how much to make it dynamically typed -- so you can just say `y=x+1` with a minimum number of keystrokes -- vs. having it do a good job of telling you when you've made a typo. This is a common paradigm across *all* languages. Some like Ruby you don't declare anything and they're quick to code little stuff in but programs of even a few thousand lines (which isn't large in the software world) become insanely unmanageable. Then, Java at the other extreme, does scale and is very typesafe -- but you have to type in a lot of punctuation, angle brackets, datatypes, repetition, etc. just to be able to get anything done. And some in the middle like Go are typesafe but with type-inference which aim to do the best of both. In the Miller (5.0.0) DSL you get `y=x+1` by default but you can have things like `int y = x+1` etc. so the typesafety is opt-in. See also the [Type-checking page](reference-dsl-variables.md#type-checking) for more information on this. 
+A fourth tradeoff is in the DSL (more visibly so in 5.0.0 but already in pre-5.0.0): how much to make it dynamically typed -- so you can just say `y=x+1` with a minimum number of keystrokes -- vs. having it do a good job of telling you when you've made a typo. This is a common paradigm across *all* languages. In some languages, like Ruby, you don't declare anything, and they're quick to code little stuff in, but programs of even a few thousand lines (which isn't large in the software world) become insanely unmanageable. Then, Java at the other extreme, does scale and is very typesafe -- but you have to type in a lot of punctuation, angle brackets, datatypes, repetition, etc., just to be able to get anything done. And some in the middle, like Go, are typesafe but with type inference, which aim to do the best of both. In the Miller (5.0.0) DSL, you get `y=x+1` by default, but you can have things like `int y = x+1` etc., so the typesafety is opt-in. See also the [Type-checking page](reference-dsl-variables.md#type-checking) for more information on this. ## Related tools -Here's a comprehensive list: [https://github.com/dbohdan/structured-text-tools](https://github.com/dbohdan/structured-text-tools). It doesn't mention [rows](https://github.com/turicas/rows) so here's a plug for that as well. +Here's a comprehensive list: [https://github.com/dbohdan/structured-text-tools](https://github.com/dbohdan/structured-text-tools). It doesn't mention [rows](https://github.com/turicas/rows), so here's a plug for that as well. ## Moving forward -I originally aimed Miller at people who already know what `sed`/`awk`/`cut`/`sort`/`join` are and wanted some options. But as time goes by I realize that tools like this can be useful to folks who *don't* know what those things are; people who aren't primarily coders; people who are scientists, or data scientists. These days some journalists do data analysis. 
So moving forward in terms of docs, I am working on having more cookbook, follow-by-example stuff in addition to the existing language-reference kinds of stuff. And continuing to seek out input from people who use Miller on where to go next. +I initially aimed Miller at people who already know what `sed`/`awk`/`cut`/`sort`/`join` are and wanted some options. But as time goes by, I realize that tools like this can be helpful to folks who *don't* know what those things are; people who aren't primarily coders; people who are scientists, or data scientists. These days some journalists do data analysis. Moving forward in terms of docs, I am working on having more cookbook, follow-by-example stuff in addition to the existing language-reference kinds of stuff. And continuing to seek out input from people who use Miller on where to go next. diff --git a/docs/src/why.md.in b/docs/src/why.md.in index 3c83c39c4..3e176a460 100644 --- a/docs/src/why.md.in +++ b/docs/src/why.md.in @@ -4,44 +4,44 @@ Someone asked me the other day about design, tradeoffs, thought process, why I f ## Who is Miller for? -For background, I'm a software engineer, with a heavy devops bent and a non-trivial amount of data-engineering in my career. **Initially I wrote Miller mainly for myself:** I'm coder-friendly (being a coder); I'm Github-friendly; most of my data are well-structured or easily structurable (TSV-formatted SQL-query output, CSV files, log files, JSON data structures); I care about interoperability between all the various formats Miller supports (I've encountered them all); I do all my work on Linux or OS X. +For background, I'm a software engineer with a heavy devops bent and a non-trivial amount of data engineering in my career. 
**Initially, I wrote Miller mainly for myself:** I'm coder-friendly (being a coder); I'm Github-friendly; most of my data is either well-structured or easily structurable (TSV-formatted SQL-query output, CSV files, log files, JSON data structures); I care about interoperability between all the various formats Miller supports (I've encountered them all); I do all my work on Linux or OS X. -But now there's this neat little tool **which seems to be useful for people in various disciplines**. I don't even know entirely *who*. I can click through Github starrers and read a bit about what they seem to do, but not everyone that uses Miller is even *on* Github (or stars things). I've gotten a lot of feature requests through Github -- but only from people who are Github users. Not everyone's a coder (it seems like a lot of Miller's Github starrers are devops folks like myself, or data-science-ish people, or biology/genomics folks.) A lot of people care 100% about CSV. And so on. +But now there's this neat little tool **which seems to be useful for people in various disciplines**. I don't even know entirely *who*. I can click through Github starrers and read a bit about what they seem to do, but not everyone who uses Miller is even *on* Github (or stars things). I've gotten a lot of feature requests through Github -- but only from people who are Github users. Not everyone's a coder (it seems like many of Miller's Github starrers are devops folks like myself, or data-science-ish people, or biology/genomics folks.) A lot of people care 100% about CSV. And so on. -So the reason for the [Miller User Survey](https://github.com/johnkerl/miller/discussions/542) is to answer questions such as: does Miller do what you need? Do you use it for all sorts of things, or just one or two nice things? Are there things you wish it did but it doesn't? Is it almost there, or just nowhere near what you want? Are there not enough features or way too many? 
Are the docs too complicated; do you have a hard time finding out how to do what you want? Should I think differently about what this tool even *is* in the first place? Should I think differently about who it's for? +So the reason for the [Miller User Survey](https://github.com/johnkerl/miller/discussions/542) is to answer questions such as: does Miller do what you need? Do you use it for all sorts of things, or just one or two nice things? Are there things you wish it did, but it doesn't? Is it almost there, or just nowhere near what you want? Are there not enough features or way too many? Are the docs too complicated? Do you have a hard time finding out how to do what you want? Should I think differently about what this tool even *is* in the first place? Should I think differently about who it's for? ## What was Miller created to do? -First: there are tools like `xsv` which handles CSV marvelously and `jq` which handles JSON marvelously, and so on -- but I over the years of my career in the software industry I've found myself, and others, doing a lot of ad-hoc things which really were fundamentally the same *except* for format. So the number one thing about Miller is doing common things while supporting **multiple formats**: (a) ingest a list of records where a record is a list of key-value pairs (however represented in the input files); (b) transform that stream of records; (c) emit the transformed stream -- either in the same format as input, or in a different format. +The first thing: there are tools like `xsv` which handles CSV marvelously and `jq` which handles JSON marvelously, and so on -- but over the years of my career in the software industry I've found myself, and others, doing a lot of ad-hoc things which were fundamentally the same *except* for format. 
So the number one thing about Miller is doing common things while supporting **multiple formats**: (a) ingest a list of records where a record is a list of key-value pairs (however represented in the input files); (b) transform that stream of records; (c) emit the transformed stream -- either in the same format as input, or in a different format. -Second thing, a lot like the first: just as I didn't want to build something only for a single file format, I didn't want to build something only for one problem domain. In my work doing software engineering, devops, data engineering, etc. I saw a lot of commonalities and I wanted to **solve as many problems simultaneously as possible**. +The second thing is a lot like the first: just as I didn't want to build something only for a single file format, I didn't want to build something only for one problem domain. In my work doing software engineering, devops, data engineering, etc. I saw a lot of commonalities, and I wanted to **solve as many problems simultaneously as possible**. -Third: it had to be **streaming**. As time goes by and we (some of us, sometimes) have machines with tens or hundreds of GB of RAM, it's maybe less important, but I'm unhappy with tools which ingest all data, then do stuff, then emit all data. One reason is to be able to handle files bigger than available RAM. Another reason is to be able to handle input which trickles in, e.g. you have some process emitting data now and then and you can pipe it to Miller and it will emit transformed records one at a time. +Third: it had to be **streaming**. As time goes by and we (some of us, sometimes) have machines with tens or hundreds of GB of RAM, it's less important, but I'm unhappy with tools that ingest all data, then do stuff, then emit all data. One reason is to be able to handle files bigger than available RAM. 
Another reason is to be able to handle input which trickles in, e.g., you have some process emitting data now and then, and you can pipe it to Miller and it will emit transformed records one at a time. -Fourth: it had to be **fast**. This precludes all sorts of very nice things written in Ruby, for example. I love Ruby as a very expressive language, and I have several very useful little utility scripts written in Ruby. But a few years ago I ported over some of my old tried-and-true C programs and the lines-of-code count was a *lot* lower -- it was great! Until I ran them on multi-GB files and realized they took 60x as long to complete. So I couldn't write Miller in Ruby, or in languages like it. I was going to have to do something in a low-level language in order to make it performant. +Fourth: it had to be **fast**. This precludes all sorts of very nice things written in Ruby, for example. I love Ruby as a very expressive language, and I have several very useful little utility scripts written in Ruby. But a few years ago, I ported over some of my old tried-and-true C programs and the lines-of-code count was a *lot* lower -- it was great! Until I ran them on multi-GB files and realized they took 60x as long to complete. So I couldn't write Miller in Ruby, or languages like it. I was going to have to do something in a low-level language in order to make it performant. -Fifth thing: I wanted Miller to be **pipe-friendly and interoperate with other command-line tools**. Since the basic paradigm is ingest records, transform records, emit records -- where the input and output formats can be the same or different, and the transform can be complex, or just pass-through -- this means you can use it to transform data, or re-format it, or both. So if you just want to do data-cleaning/prep/formatting and do all the "real" work in R, you can. If you just want a little glue script between other tools you can get that. 
And if you want to do non-trivial data-reduction in Miller you can. +The fifth thing: I wanted Miller to be **pipe-friendly and interoperate with other command-line tools**. Since the basic paradigm is ingest records, transform records, emit records -- where the input and output formats can be the same or different, and the transform can be complex, or just pass-through -- this means you can use it to transform data, or re-format it, or both. So if you just want to do data-cleaning/prep/formatting and do all the "real" work in R, you can. If you want a little glue script between other tools, you can get that. And if you want to do non-trivial data-reduction in Miller, you can. -Sixth thing: Must have **comprehensive documentation and unit-test**. Since Miller handles a lot of formats and solves a lot of problems, there's a lot to test and a lot to keep working correctly as I add features or optimize. And I wanted it to be able to explain itself -- not only through web docs like the one you're reading but also through `man mlr` and `mlr --help`, `mlr sort --help`, etc. +Sixth thing: Must have **comprehensive documentation and unit-test**. Since Miller handles a wide range of formats and solves numerous problems, there's a lot to test and a lot to keep working correctly as I add features or optimize. And I wanted it to be able to explain itself -- not only through web docs like the one you're reading but also through `man mlr` and `mlr --help`, `mlr sort --help`, etc. -Seventh thing: **Must have a domain-specific language** (DSL) **but also must let you do common things without it**. All those little verbs Miller has to help you *avoid* having to write for-loops are great. I use them for keystroke-saving: `mlr stats1 -a mean,stddev,min,max -f quantity`, for example, without you having to write for-loops or define accumulator variables. 
But you also have to be able to break out of that and write arbitrary code when you want to: `mlr put '$distance = $rate * $time'` or anything else you can think up. In Perl/AWK/etc. it's all DSL. In xsv et al. it's all verbs. In Miller I like having the combination. +Seventh thing: **Must have a domain-specific language** (DSL) **but also must let you do everyday things without it**. All those little verbs Miller has to help you *avoid* having to write for-loops are great. I use them for keystroke-saving: `mlr stats1 -a mean,stddev,min,max -f quantity`, for example, without you having to write for-loops or define accumulator variables. But you also have to be able to break out of that and write arbitrary code when you want to: `mlr put '$distance = $rate * $time'` or anything else you can think up. In Perl/AWK/etc. it's all DSL. In xsv et al. it's all verbs. In Miller, I like having the combination. -Eighth thing: It's an **awful lot of fun to write**. In my experience I didn't find any tools which do multi-format, streaming, efficient, multi-purpose, with DSL and non-DSL, so I wrote one. But I don't guarantee it's unique in the world. It fills a niche in the world (people use it) but it also fills a niche in my life. +Eighth thing: It's an **awful lot of fun to write**. In my experience, I didn't find any tools that do multi-format, streaming, efficient, multi-purpose, with DSL and non-DSL, so I wrote one. But I don't guarantee it's unique in the world. It fills a niche in the world (people use it), but it also fills a niche in my life. ## Tradeoffs -Miller is command-line-only by design. People who want a graphical user interface won't find it here. This is in part (a) accommodating my personal preferences, and in part (b) guided by my experience/belief that the command line is very expressive. Steeper learning curve than a GUI, yes. I consider that price worth paying for the tool-niche which Miller occupies. +Miller is command-line-only by design. 
People who want a graphical user interface won't find it here. This is in part (a) accommodating my personal preferences, and in part (b) guided by my experience/belief that the command line is very expressive. Steeper learning curve than a GUI, yes. That price is worth paying for the tool-niche which Miller occupies. -Another tradeoff: supporting lists of records keeps me supporting only what can be expressed in *all* of those formats. For example, `[1,2,3,4,5]` is valid but unmillerable JSON: the list elements are not records. So Miller can't (and won't) handle arbitrary JSON -- because Miller only handles tabular data which can be expressed in a variety of formats. +Another tradeoff: supporting lists of records keeps me supporting only what can be expressed in *all* of those formats. For example, `[1,2,3,4,5]` is valid but unmillerable JSON: the list elements are not records. So Miller can't (and won't) handle arbitrary JSON -- because Miller only handles tabular data, which can be expressed in a variety of formats. -A third tradeoff is doing build-from-scratch in a low-level language. It'd be quicker to write (but slower to run) if written in a high-level language. If Miller were written in Python, it would be implemented in significantly fewer lines of code than its current Go implementation. The DSL would just be an `eval` of Python code. And it would run slower, but maybe not enough slower to be a problem for most folks. Later I found out about the [rows](https://github.com/turicas/rows) tool -- if you find Miller useful, you should check out `rows` as well. +A third tradeoff is building from scratch in a low-level language. It'd be quicker to write (but slower to run) if written in a high-level language. If Miller were written in Python, it would be implemented in significantly fewer lines of code than its current Go implementation. The DSL would be an `eval` of Python code. And it would run slower, but maybe not slow enough to be a problem for most people. 
Later, I discovered the [rows](https://github.com/turicas/rows) tool -- if you find Miller useful, you should also check out `rows`. -A fourth tradeoff is in the DSL (more visibly so in 5.0.0 but already in pre-5.0.0): how much to make it dynamically typed -- so you can just say `y=x+1` with a minimum number of keystrokes -- vs. having it do a good job of telling you when you've made a typo. This is a common paradigm across *all* languages. Some like Ruby you don't declare anything and they're quick to code little stuff in but programs of even a few thousand lines (which isn't large in the software world) become insanely unmanageable. Then, Java at the other extreme, does scale and is very typesafe -- but you have to type in a lot of punctuation, angle brackets, datatypes, repetition, etc. just to be able to get anything done. And some in the middle like Go are typesafe but with type-inference which aim to do the best of both. In the Miller (5.0.0) DSL you get `y=x+1` by default but you can have things like `int y = x+1` etc. so the typesafety is opt-in. See also the [Type-checking page](reference-dsl-variables.md#type-checking) for more information on this. +A fourth tradeoff is in the DSL (more visibly so in 5.0.0 but already in pre-5.0.0): how much to make it dynamically typed -- so you can just say `y=x+1` with a minimum number of keystrokes -- vs. having it do a good job of telling you when you've made a typo. This is a common paradigm across *all* languages. In some languages, like Ruby, you don't declare anything, and they're quick to code little stuff in, but programs of even a few thousand lines (which isn't large in the software world) become insanely unmanageable. Then, Java at the other extreme, does scale and is very typesafe -- but you have to type in a lot of punctuation, angle brackets, datatypes, repetition, etc., just to be able to get anything done. 
And some in the middle, like Go, are typesafe but with type inference, which aim to do the best of both. In the Miller (5.0.0) DSL, you get `y=x+1` by default, but you can have things like `int y = x+1` etc., so the typesafety is opt-in. See also the [Type-checking page](reference-dsl-variables.md#type-checking) for more information on this. ## Related tools -Here's a comprehensive list: [https://github.com/dbohdan/structured-text-tools](https://github.com/dbohdan/structured-text-tools). It doesn't mention [rows](https://github.com/turicas/rows) so here's a plug for that as well. +Here's a comprehensive list: [https://github.com/dbohdan/structured-text-tools](https://github.com/dbohdan/structured-text-tools). It doesn't mention [rows](https://github.com/turicas/rows), so here's a plug for that as well. ## Moving forward -I originally aimed Miller at people who already know what `sed`/`awk`/`cut`/`sort`/`join` are and wanted some options. But as time goes by I realize that tools like this can be useful to folks who *don't* know what those things are; people who aren't primarily coders; people who are scientists, or data scientists. These days some journalists do data analysis. So moving forward in terms of docs, I am working on having more cookbook, follow-by-example stuff in addition to the existing language-reference kinds of stuff. And continuing to seek out input from people who use Miller on where to go next. +I initially aimed Miller at people who already know what `sed`/`awk`/`cut`/`sort`/`join` are and wanted some options. But as time goes by, I realize that tools like this can be helpful to folks who *don't* know what those things are; people who aren't primarily coders; people who are scientists, or data scientists. These days some journalists do data analysis. Moving forward in terms of docs, I am working on having more cookbook, follow-by-example stuff in addition to the existing language-reference kinds of stuff. 
And continuing to seek out input from people who use Miller on where to go next. diff --git a/go.mod b/go.mod index d6288da32..10b971673 100644 --- a/go.mod +++ b/go.mod @@ -1,4 +1,4 @@ -module github.com/johnkerl/miller +module github.com/johnkerl/miller/v6 // The repo is 'miller' and the executable is 'mlr', going back many years and // predating the Go port. @@ -7,35 +7,39 @@ module github.com/johnkerl/miller // executable would be 'miller' not 'mlr'. // // So we have cmd/mlr/main.go: -// * go build github.com/johnkerl/miller/cmd/mlr -// * go install github.com/johnkerl/miller/cmd/mlr +// * go build github.com/johnkerl/miller/v6/cmd/mlr +// * go install github.com/johnkerl/miller/v6/cmd/mlr // go get github.com/johnkerl/lumin@v1.0.0 // Local development: // replace github.com/johnkerl/lumin => /Users/kerl/git/johnkerl/lumin -go 1.19 +go 1.24.0 require ( github.com/facette/natsort v0.0.0-20181210072756-2cd4dd1e2dcb github.com/johnkerl/lumin v1.0.0 github.com/kballard/go-shellquote v0.0.0-20180428030007-95032a82bc51 - github.com/klauspost/compress v1.16.7 - github.com/lestrrat-go/strftime v1.0.6 + github.com/klauspost/compress v1.18.3 + github.com/kshedden/statmodel v0.0.0-20210519035403-ee97d3e48df1 + github.com/lestrrat-go/strftime v1.1.1 github.com/mattn/go-isatty v0.0.20 github.com/nine-lives-later/go-windows-terminal-sequences v1.0.4 github.com/pkg/profile v1.7.0 - github.com/stretchr/testify v1.8.4 - golang.org/x/sys v0.15.0 - golang.org/x/term v0.15.0 - golang.org/x/text v0.14.0 + github.com/stretchr/testify v1.11.1 + golang.org/x/sys v0.40.0 + golang.org/x/term v0.39.0 + golang.org/x/text v0.33.0 ) require ( github.com/davecgh/go-spew v1.1.1 // indirect github.com/felixge/fgprof v0.9.3 // indirect + github.com/golang/snappy v1.0.0 // indirect github.com/google/pprof v0.0.0-20211214055906-6f57359322fd // indirect - github.com/pkg/errors v0.9.1 // indirect + github.com/kshedden/dstream v0.0.0-20190512025041-c4c410631beb // indirect 
github.com/pmezard/go-difflib v1.0.0 // indirect + golang.org/x/tools v0.40.0 // indirect + gonum.org/v1/gonum v0.16.0 // indirect gopkg.in/yaml.v3 v3.0.1 // indirect ) diff --git a/go.sum b/go.sum index 25fed86dd..0a7bba556 100644 --- a/go.sum +++ b/go.sum @@ -8,6 +8,8 @@ github.com/facette/natsort v0.0.0-20181210072756-2cd4dd1e2dcb h1:IT4JYU7k4ikYg1S github.com/facette/natsort v0.0.0-20181210072756-2cd4dd1e2dcb/go.mod h1:bH6Xx7IW64qjjJq8M2u4dxNaBiDfKK+z/3eGDpXEQhc= github.com/felixge/fgprof v0.9.3 h1:VvyZxILNuCiUCSXtPtYmmtGvb65nqXh2QFWc0Wpf2/g= github.com/felixge/fgprof v0.9.3/go.mod h1:RdbpDgzqYVh/T9fPELJyV7EYJuHB55UTEULNun8eiPw= +github.com/golang/snappy v1.0.0 h1:Oy607GVXHs7RtbggtPBnr2RmDArIsAefDwvrdWvRhGs= +github.com/golang/snappy v1.0.0/go.mod h1:/XxbfmMg8lxefKM7IXC3fBNl/7bRcc72aCRzEWrmP2Q= github.com/google/pprof v0.0.0-20211214055906-6f57359322fd h1:1FjCyPC+syAzJ5/2S8fqdZK1R22vvA0J7JZKcuOIQ7Y= github.com/google/pprof v0.0.0-20211214055906-6f57359322fd/go.mod h1:KgnwoLYCZ8IQu3XUZ8Nc/bM9CCZFOyjUNOSygVozoDg= github.com/ianlancetaylor/demangle v0.0.0-20210905161508-09a460cdf81d/go.mod h1:aYm2/VgdVmcIU8iMfdMvDMsRAQjcfZSKFby6HOFvi/w= @@ -15,37 +17,42 @@ github.com/johnkerl/lumin v1.0.0 h1:CV34cHZOJ92Y02RbQ0rd4gA0C06Qck9q8blOyaPoWpU= github.com/johnkerl/lumin v1.0.0/go.mod h1:eLf5AdQOaLvzZ2zVy4REr/DSeEwG+CZreHwNLICqv9E= github.com/kballard/go-shellquote v0.0.0-20180428030007-95032a82bc51 h1:Z9n2FFNUXsshfwJMBgNA0RU6/i7WVaAegv3PtuIHPMs= github.com/kballard/go-shellquote v0.0.0-20180428030007-95032a82bc51/go.mod h1:CzGEWj7cYgsdH8dAjBGEr58BoE7ScuLd+fwFZ44+/x8= -github.com/klauspost/compress v1.16.7 h1:2mk3MPGNzKyxErAw8YaohYh69+pa4sIQSC0fPGCFR9I= -github.com/klauspost/compress v1.16.7/go.mod h1:ntbaceVETuRiXiv4DpjP66DpAtAGkEQskQzEyD//IeE= +github.com/klauspost/compress v1.18.3 h1:9PJRvfbmTabkOX8moIpXPbMMbYN60bWImDDU7L+/6zw= +github.com/klauspost/compress v1.18.3/go.mod h1:R0h/fSBs8DE4ENlcrlib3PsXS61voFxhIs2DeRhCvJ4= +github.com/kshedden/dstream 
v0.0.0-20190512025041-c4c410631beb h1:Z5BVHFk/DLOIUAd2NycF0mLtKfhl7ynm4Uy5+AFhT48= +github.com/kshedden/dstream v0.0.0-20190512025041-c4c410631beb/go.mod h1:+U+6yzfITr4/teU2YhxWhdyw6YzednT/16/UBMjlDrU= +github.com/kshedden/statmodel v0.0.0-20210519035403-ee97d3e48df1 h1:UyIQ1VTQq/0CS/wLYjf3DV6uRKTd1xcsng3BccM4XCY= +github.com/kshedden/statmodel v0.0.0-20210519035403-ee97d3e48df1/go.mod h1:uvVFnikBpVz7S1pdsyUI+BBRlz64vmU6Q+kviiB+fpU= github.com/lestrrat-go/envload v0.0.0-20180220234015-a3eb8ddeffcc h1:RKf14vYWi2ttpEmkA4aQ3j4u9dStX2t4M8UM6qqNsG8= github.com/lestrrat-go/envload v0.0.0-20180220234015-a3eb8ddeffcc/go.mod h1:kopuH9ugFRkIXf3YoqHKyrJ9YfUFsckUU9S7B+XP+is= -github.com/lestrrat-go/strftime v1.0.6 h1:CFGsDEt1pOpFNU+TJB0nhz9jl+K0hZSLE205AhTIGQQ= -github.com/lestrrat-go/strftime v1.0.6/go.mod h1:f7jQKgV5nnJpYgdEasS+/y7EsTb8ykN2z68n3TtcTaw= +github.com/lestrrat-go/strftime v1.1.1 h1:zgf8QCsgj27GlKBy3SU9/8MMgegZ8UCzlCyHYrUF0QU= +github.com/lestrrat-go/strftime v1.1.1/go.mod h1:YDrzHJAODYQ+xxvrn5SG01uFIQAeDTzpxNVppCz7Nmw= github.com/mattn/go-isatty v0.0.20 h1:xfD0iDuEKnDkl03q4limB+vH+GxLEtL/jb4xVJSWWEY= github.com/mattn/go-isatty v0.0.20/go.mod h1:W+V8PltTTMOvKvAeJH7IuucS94S2C6jfK/D7dTCTo3Y= github.com/nine-lives-later/go-windows-terminal-sequences v1.0.4 h1:NC4H8hewgaktBqMI5yzy6L/Vln5/H7BEziyxaE2fX3Y= github.com/nine-lives-later/go-windows-terminal-sequences v1.0.4/go.mod h1:eUQxpEiJy001RoaLXrNa5+QQLYiEgmEafwWuA3ppJSo= -github.com/pkg/errors v0.9.1 h1:FEBLx1zS214owpjy7qsBeixbURkuhQAwrK5UwLGTwt4= -github.com/pkg/errors v0.9.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0= github.com/pkg/profile v1.7.0 h1:hnbDkaNWPCLMO9wGLdBFTIZvzDrDfBM2072E1S9gJkA= github.com/pkg/profile v1.7.0/go.mod h1:8Uer0jas47ZQMJ7VD+OHknK4YDY07LPUC6dEvqDjvNo= github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= github.com/stretchr/objx v0.1.0/go.mod 
h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= github.com/stretchr/objx v0.4.0/go.mod h1:YvHI0jy2hoMjB+UWwv71VJQ9isScKT/TqJzVSSt89Yw= -github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI= github.com/stretchr/testify v1.7.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= github.com/stretchr/testify v1.8.0/go.mod h1:yNjHg4UonilssWZ8iaSj1OCr/vHnekPRkoO+kdMU+MU= -github.com/stretchr/testify v1.8.4 h1:CcVxjf3Q8PM0mHUKJCdn+eZZtm5yQwehR5yeSVQQcUk= -github.com/stretchr/testify v1.8.4/go.mod h1:sz/lmYIOXD/1dqDmKjjqLyZ2RngseejIcXlSw2iwfAo= +github.com/stretchr/testify v1.11.1 h1:7s2iGBzp5EwR7/aIZr8ao5+dra3wiQyKjjFuvgVKu7U= +github.com/stretchr/testify v1.11.1/go.mod h1:wZwfW3scLgRK+23gO65QZefKpKQRnfz6sD981Nm4B6U= golang.org/x/sys v0.0.0-20211007075335-d3039528d8ac/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= -golang.org/x/sys v0.15.0 h1:h48lPFYpsTvQJZF4EKyI4aLHaev3CxivZmv7yZig9pc= -golang.org/x/sys v0.15.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= -golang.org/x/term v0.15.0 h1:y/Oo/a/q3IXu26lQgl04j/gjuBDOBlx7X6Om1j2CPW4= -golang.org/x/term v0.15.0/go.mod h1:BDl952bC7+uMoWR75FIrCDx79TPU9oHkTZ9yRbYOrX0= -golang.org/x/text v0.14.0 h1:ScX5w1eTa3QqT8oi6+ziP7dTV1S2+ALU0bI+0zXKWiQ= -golang.org/x/text v0.14.0/go.mod h1:18ZOQIKpY8NJVqYksKHtTdi31H5itFRjB5/qKTNYzSU= +golang.org/x/sys v0.40.0 h1:DBZZqJ2Rkml6QMQsZywtnjnnGvHza6BTfYFWY9kjEWQ= +golang.org/x/sys v0.40.0/go.mod h1:OgkHotnGiDImocRcuBABYBEXf8A9a87e/uXjp9XT3ks= +golang.org/x/term v0.39.0 h1:RclSuaJf32jOqZz74CkPA9qFuVTX7vhLlpfj/IGWlqY= +golang.org/x/term v0.39.0/go.mod h1:yxzUCTP/U+FzoxfdKmLaA0RV1WgE0VY7hXBwKtY/4ww= +golang.org/x/text v0.33.0 h1:B3njUFyqtHDUI5jMn1YIr5B0IE2U0qck04r6d4KPAxE= +golang.org/x/text v0.33.0/go.mod h1:LuMebE6+rBincTi9+xWTY8TztLzKHc/9C1uBCG27+q8= +golang.org/x/tools v0.40.0 h1:yLkxfA+Qnul4cs9QA3KnlFu0lVmd8JJfoq+E41uSutA= +golang.org/x/tools v0.40.0/go.mod 
h1:Ik/tzLRlbscWpqqMRjyWYDisX8bG13FrdXp3o4Sr9lc= +gonum.org/v1/gonum v0.16.0 h1:5+ul4Swaf3ESvrOnidPp4GZbzf0mxVQpDCYUQE7OJfk= +gonum.org/v1/gonum v0.16.0/go.mod h1:fef3am4MQ93R2HHpKnLk4/Tbh/s0+wqD5nfa6Pnwy4E= gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405 h1:yhCVgyC4o1eVCa2tZl7eS0r+SDo693bJlVdllGtEeKM= gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= diff --git a/man/manpage.txt b/man/manpage.txt index e7e3d3582..90bff3293 100644 --- a/man/manpage.txt +++ b/man/manpage.txt @@ -1,6 +1,4 @@ -MILLER(1) MILLER(1) - - +4mMILLER24m(1) 4mMILLER24m(1) 1mNAME0m Miller -- like awk, sed, cut, join, and sort for name-indexed data such @@ -29,7 +27,7 @@ MILLER(1) MILLER(1) insertion-ordered hash map. This encompasses a variety of data formats, including but not limited to the familiar CSV, TSV, and JSON. (Miller can handle positionally-indexed data as a special case.) This - manpage documents mlr 6.10.0. + manpage documents mlr 6.16.0. 
1mEXAMPLES0m mlr --icsv --opprint cat example.csv @@ -82,7 +80,7 @@ MILLER(1) MILLER(1) | 4 5 6 | Record 2: "apple":"4", "bat":"5", "cog":"6" +---------------------+ - Markdown tabular (supported for output only): + Markdown tabular: +-----------------------+ | | apple | bat | cog | | | | --- | --- | --- | | @@ -126,6 +124,7 @@ MILLER(1) MILLER(1) mlr help comments-in-data-flags mlr help compressed-data-flags mlr help csv/tsv-only-flags + mlr help dkvp-only-flags mlr help file-format-flags mlr help flatten-unflatten-flags mlr help format-conversion-keystroke-saver-flags @@ -178,9 +177,9 @@ MILLER(1) MILLER(1) json-parse json-stringify join label latin1-to-utf8 least-frequent merge-fields most-frequent nest nothing put regularize remove-empty-columns rename reorder repeat reshape sample sec2gmtdate sec2gmt seqgen shuffle - skip-trivial-records sort sort-within-records split ssub stats1 stats2 step - sub summary tac tail tee template top utf8-to-latin1 unflatten uniq unspace - unsparsify + skip-trivial-records sort sort-within-records sparsify split ssub stats1 + stats2 step sub summary surv tac tail tee template top utf8-to-latin1 + unflatten uniq unspace unsparsify 1mFUNCTION LIST0m abs acos acosh antimode any append apply arrayify asin asinh asserting_absent @@ -204,13 +203,14 @@ MILLER(1) MILLER(1) percentiles pow qnorm reduce regextract regextract_or_else rightpad round roundm rstrip sec2dhms sec2gmt sec2gmtdate sec2hms sec2localdate sec2localtime select sgn sha1 sha256 sha512 sin sinh skewness sort sort_collection splita - splitax splitkv splitkvx splitnv splitnvx sqrt ssub stddev strfntime - strfntime_local strftime strftime_local string strip strlen strpntime - strpntime_local strptime strptime_local sub substr substr0 substr1 sum sum2 - sum3 sum4 sysntime system systime systimeint tan tanh tolower toupper truncate - typeof unflatten unformat unformatx upntime uptime urand urand32 urandelement - urandint urandrange utf8_to_latin1 variance version ! 
!= !=~ % & && * ** + - . - .* .+ .- ./ / // < << <= <=> == =~ > >= >> >>> ?: ?? ??? ^ ^^ | || ~ + splitax splitkv splitkvx splitnv splitnvx sqrt ssub stat stddev strfntime + strfntime_local strftime strftime_local string strip strlen strmatch strmatchx + strpntime strpntime_local strptime strptime_local sub substr substr0 substr1 + sum sum2 sum3 sum4 sysntime system systime systimeint tan tanh tolower toupper + truncate typeof unflatten unformat unformatx upntime uptime urand urand32 + urandelement urandint urandrange utf8_to_latin1 variance version ! != !=~ % & + && * ** + - . .* .+ .- ./ / // < << <= <=> == =~ > >= >> >>> ?: ?? ??? ^ ^^ | + || ~ 1mCOMMENTS-IN-DATA FLAGS0m Miller lets you put comments in your data, such as @@ -234,12 +234,14 @@ MILLER(1) MILLER(1) within the input. --pass-comments-with {string} Immediately print commented lines within input, with - specified prefix. + specified prefix. For CSV input format, the prefix + must be a single character. --skip-comments Ignore commented lines (prefixed by `#`) within the input. --skip-comments-with {string} Ignore commented lines within input, with specified - prefix. + prefix. For CSV input format, the prefix must be a + single character. 1mCOMPRESSED-DATA FLAGS0m Miller offers a few different ways to handle reading data files @@ -318,6 +320,10 @@ MILLER(1) MILLER(1) recreate missing headers. --lazy-quotes Accepts quotes appearing in unquoted fields, and non-doubled quotes appearing in quoted fields. + --no-auto-unsparsify For CSV/TSV output: if the record keys change from + one row to another, emit a blank line and a new + header line. This is non-compliant with RFC 4180 but + is helpful for heterogeneous data. --no-implicit-csv-header or --no-implicit-tsv-header Opposite of `--implicit-csv-header`. This is the default anyway -- the main use is for the flags to @@ -332,6 +338,16 @@ MILLER(1) MILLER(1) -N Keystroke-saver for `--implicit-csv-header --headerless-csv-output`. 
+1mDKVP-ONLY FLAGS0m + These are flags which are applicable to DKVP format. + + --incr-key Without this option, keyless DKVP fields are keyed by + field number. For example: `a=10,b=20,30,d=40,50` is + ingested as `$a=10,$b=20,$3=30,$d=40,$5=50`. With + this option, they're keyed by a running counter of + keyless fields. For example: `a=10,b=20,30,d=40,50` + is ingested as `$a=10,$b=20,$1=30,$d=40,$2=50`. + 1mFILE-FORMAT FLAGS0m See the File formats doc page, and or `mlr help file-formats`, for more about file formats Miller supports. @@ -344,9 +360,9 @@ MILLER(1) MILLER(1) are overridden in all cases by setting output format to `format2`. --asv or --asvlite Use ASV format for input and output data. - --csv or -c Use CSV format for input and output data. + --csv or -c or --c2c Use CSV format for input and output data. --csvlite Use CSV-lite format for input and output data. - --dkvp Use DKVP format for input and output data. + --dkvp or --d2d Use DKVP format for input and output data. --gen-field-name Specify field name for --igen. Defaults to "i". --gen-start Specify start value for --igen. Defaults to 1. --gen-step Specify step value for --igen. Defaults to 1. @@ -361,6 +377,7 @@ MILLER(1) MILLER(1) seqgen verb, which is more useful/intuitive. --ijson Use JSON format for input data. --ijsonl Use JSON Lines format for input data. + --imd or --imarkdown Use markdown-tabular format for input data. --inidx Use NIDX format for input data. --io {format name} Use format name for input and output data. For example: `--io csv` is the same as `--csv`. @@ -369,27 +386,27 @@ MILLER(1) MILLER(1) --itsvlite Use TSV-lite format for input data. --iusv or --iusvlite Use USV format for input data. --ixtab Use XTAB format for input data. - --json or -j Use JSON format for input and output data. - --jsonl Use JSON Lines format for input and output data. - --nidx Use NIDX format for input and output data. + --json or -j or --j2j Use JSON format for input and output data. 
+ --jsonl or --l2l Use JSON Lines format for input and output data. + --nidx or --n2n Use NIDX format for input and output data. --oasv or --oasvlite Use ASV format for output data. --ocsv Use CSV format for output data. --ocsvlite Use CSV-lite format for output data. --odkvp Use DKVP format for output data. --ojson Use JSON format for output data. --ojsonl Use JSON Lines format for output data. - --omd Use markdown-tabular format for output data. + --omd or --omarkdown Use markdown-tabular format for output data. --onidx Use NIDX format for output data. --opprint Use PPRINT format for output data. --otsv Use TSV format for output data. --otsvlite Use TSV-lite format for output data. --ousv or --ousvlite Use USV format for output data. --oxtab Use XTAB format for output data. - --pprint Use PPRINT format for input and output data. - --tsv or -t Use TSV format for input and output data. + --pprint or --p2p Use PPRINT format for input and output data. + --tsv or -t or --t2t Use TSV format for input and output data. --tsvlite Use TSV-lite format for input and output data. --usv or --usvlite Use USV format for input and output data. - --xtab Use XTAB format for input and output data. + --xtab or --x2x Use XTAB format for input and output data. --xvright Right-justify values for XTAB format. -i {format name} Use format name for input data. For example: `-i csv` is the same as `--icsv`. @@ -399,7 +416,7 @@ MILLER(1) MILLER(1) 1mFLATTEN-UNFLATTEN FLAGS0m These flags control how Miller converts record values which are maps or arrays, when input is JSON and output is non-JSON (flattening) or input is non-JSON and output is JSON (unflattening). - See the Flatten/unflatten doc page for more information. + See the flatten/unflatten doc page https://miller.readthedocs.io/en/latest/flatten-unflatten for more information. --flatsep or --jflatsep {string} Separator for flattening multi-level JSON keys, e.g. @@ -407,32 +424,31 @@ MILLER(1) MILLER(1) formats. Defaults to `.`. 
--no-auto-flatten When output is non-JSON, suppress the default auto-flatten behavior. Default: if `$y = [7,8,9]` - then this flattens to `y.1=7,y.2=8,y.3=9, and + then this flattens to `y.1=7,y.2=8,y.3=9`, and similarly for maps. With `--no-auto-flatten`, instead we get `$y=[1, 2, 3]`. - --no-auto-unflatten When input non-JSON and output is JSON, suppress the - default auto-unflatten behavior. Default: if the + --no-auto-unflatten When input is non-JSON and output is JSON, suppress + the default auto-unflatten behavior. Default: if the input has `y.1=7,y.2=8,y.3=9` then this unflattens to - `$y=[7,8,9]`. flattens to `y.1=7,y.2=8,y.3=9. With - `--no-auto-flatten`, instead we get - `${y.1}=7,${y.2}=8,${y.3}=9`. + `$y=[7,8,9]`. With `--no-auto-unflatten`, instead we + get `${y.1}=7,${y.2}=8,${y.3}=9`. 1mFORMAT-CONVERSION KEYSTROKE-SAVER FLAGS0m As keystroke-savers for format-conversion you may use the following. The letters c, t, j, l, d, n, x, p, and m refer to formats CSV, TSV, DKVP, NIDX, - JSON, JSON Lines, XTAB, PPRINT, and markdown, respectively. Note that markdown - format is available for output only. + JSON, JSON Lines, XTAB, PPRINT, and markdown, respectively. 
- | In\out | CSV | TSV | JSON | JSONL | DKVP | NIDX | XTAB | PPRINT | Markdown | - +--------+-------+-------+--------+--------+--------+--------+--------+----------+ - | CSV | | --c2t | --c2j | --c2l | --c2d | --c2n | --c2x | --c2p | --c2m | - | TSV | --t2c | | --t2j | --t2l | --t2d | --t2n | --t2x | --t2p | --t2m | - | JSON | --j2c | --j2t | | --j2l | --j2d | --j2n | --j2x | --j2p | --j2m | - | JSONL | --l2c | --l2t | | | --l2d | --l2n | --l2x | --l2p | --l2m | - | DKVP | --d2c | --d2t | --d2j | --d2l | | --d2n | --d2x | --d2p | --d2m | - | NIDX | --n2c | --n2t | --n2j | --n2l | --n2d | | --n2x | --n2p | --n2m | - | XTAB | --x2c | --x2t | --x2j | --x2l | --x2d | --x2n | | --x2p | --x2m | - | PPRINT | --p2c | --p2t | --p2j | --p2l | --p2d | --p2n | --p2x | | --p2m | + | In\out | CSV | TSV | JSON | JSONL | DKVP | NIDX | XTAB | PPRINT | Markdown | + +----------+----------+----------+----------+-------+-------+-------+-------+--------+----------+ + | CSV | --c2c,-c | --c2t | --c2j | --c2l | --c2d | --c2n | --c2x | --c2p | --c2m | + | TSV | --t2c | --t2t,-t | --t2j | --t2l | --t2d | --t2n | --t2x | --t2p | --t2m | + | JSON | --j2c | --j2t | --j2j,-j | --j2l | --j2d | --j2n | --j2x | --j2p | --j2m | + | JSONL | --l2c | --l2t | --l2j | --l2l | --l2d | --l2n | --l2x | --l2p | --l2m | + | DKVP | --d2c | --d2t | --d2j | --d2l | --d2d | --d2n | --d2x | --d2p | --d2m | + | NIDX | --n2c | --n2t | --n2j | --n2l | --n2d | --n2n | --n2x | --n2p | --n2m | + | XTAB | --x2c | --x2t | --x2j | --x2l | --x2d | --x2n | --x2x | --x2p | --x2m | + | PPRINT | --p2c | --p2t | --p2j | --p2l | --p2d | --p2n | --p2x | --p2p | --p2m | + | Markdown | --m2c | --m2t | --m2j | --m2l | --m2d | --m2n | --m2x | --m2p | | -p Keystroke-saver for `--nidx --fs space --repifs`. -T Keystroke-saver for `--nidx --fs tab`. @@ -447,8 +463,8 @@ MILLER(1) MILLER(1) --jvstack Put one key-value pair per line for JSON output (multi-line output). This is the default for JSON output format. 
- --no-jlistwrap Wrap JSON output in outermost `[ ]`. This is the - default for JSON Lines output format. + --no-jlistwrap Do not wrap JSON output in outermost `[ ]`. This is + the default for JSON Lines output format. --no-jvstack Put objects/arrays all on one line for JSON output. This is the default for JSON Lines output format. @@ -547,6 +563,7 @@ MILLER(1) MILLER(1) since direct-to-screen output for large files has its own overhead. --no-hash-records See --hash-records. + --norc Do not load a .mlrrc file. --nr-progress-mod {m} With m a positive integer: print filename and record count to os.Stderr every m input records. --ofmt {format} E.g. `%.18f`, `%.0f`, `%9.6e`. Please use @@ -622,8 +639,8 @@ MILLER(1) MILLER(1) How you can control colorization: * Suppression/unsuppression: - * Environment variable `export MLR_NO_COLOR=true` means don't color - even if stdout+TTY. + * Environment variable `export MLR_NO_COLOR=true` or `export NO_COLOR=true` + means don't color even if stdout+TTY. * Environment variable `export MLR_ALWAYS_COLOR=true` means do color even if not stdout+TTY. For example, you might want to use this when piping mlr output to `less -r`. @@ -674,8 +691,10 @@ MILLER(1) MILLER(1) 1mPPRINT-ONLY FLAGS0m These are flags which are applicable to PPRINT format. - --barred Prints a border around PPRINT output (not available - for input). + --barred or --barred-output + Prints a border around PPRINT output. + --barred-input When used in conjunction with --pprint, accepts + barred input. --right Right-justifies all fields for PPRINT output. 
1mPROFILING FLAGS0m @@ -740,13 +759,13 @@ MILLER(1) MILLER(1) - To avoid backslashing, you can use any of the following names: ascii_esc = "\x1b" - ascii_etx = "\x04" + ascii_etx = "\x03" ascii_fs = "\x1c" ascii_gs = "\x1d" - ascii_null = "\x01" + ascii_null = "\x00" ascii_rs = "\x1e" - ascii_soh = "\x02" - ascii_stx = "\x03" + ascii_soh = "\x01" + ascii_stx = "\x02" ascii_us = "\x1f" asv_fs = "\x1f" asv_rs = "\x1e" @@ -780,11 +799,12 @@ MILLER(1) MILLER(1) csv "," N/A "\n" csvlite "," N/A "\n" dkvp "," "=" "\n" + gen "," N/A "\n" json N/A N/A N/A markdown " " N/A "\n" nidx " " N/A "\n" pprint " " N/A "\n" - tsv " " N/A "\n" + tsv " " N/A "\n" xtab "\n" " " "\n\n" --fs {string} Specify FS for input and output. @@ -965,6 +985,7 @@ MILLER(1) MILLER(1) Options: -f {a,b,c} Field names for distinct count. + -x {a,b,c} Field names to exclude for distinct count: use each record's others instead. -n Show only the number of distinct values. Not compatible with -u. -o {name} Field name for output count. Default "count". Ignored with -u. @@ -1003,7 +1024,7 @@ MILLER(1) MILLER(1) -r Treat field names as regular expressions. "ab", "a.*b" will match any field name containing the substring "ab" or matching "a.*b", respectively; anchors of the form "^ab$", "^a.*b$" may - be used. The -o flag is ignored when -r is present. + be used. -h|--help Show this message. Examples: mlr cut -f hostname,status @@ -1047,6 +1068,10 @@ MILLER(1) MILLER(1) 1mfilter0m Usage: mlr filter [options] {DSL expression} + Lets you use a domain-specific language to programmatically filter which + stream records will be output. + See also: https://miller.readthedocs.io/en/latest/reference-verbs + Options: -f {file name} File containing a DSL expression (see examples below). If the filename is a directory, all *.mlr files in that directory are loaded. @@ -1241,6 +1266,8 @@ MILLER(1) MILLER(1) See also the `sub` and `ssub` verbs. Options: -f {a,b,c} Field names to convert. 
+ -r {regex} Regular expression for field names to convert. + -a Convert all fields. -h|--help Show this message. 1mhaving-fields0m @@ -1314,6 +1341,8 @@ MILLER(1) MILLER(1) --lk|--left-keep-field-names {a,b,c} If supplied, this means keep only the specified field names from the left file. Automatically includes the join-field name(s). Helpful for when you only want a limited subset of information from the left file. + Tip: you can use --lk "": this means the left file becomes solely a row-selector + for the input files. --lp {text} Additional prefix for non-join output field names from the left file --rp {text} Additional prefix for non-join output field names from @@ -1348,7 +1377,7 @@ MILLER(1) MILLER(1) Likewise, if you have 'mlr --csv --implicit-csv-header ...' then the join-in file will be expected to be headerless as well unless you put '--no-implicit-csv-header' after 'join'. Please use "mlr --usage-separator-options" for information on specifying separators. - Please see https://miller.readthedocs.io/en/latest/reference-verbs.html#join for more information + Please see https://miller.readthedocs.io/en/latest/reference-verbs#join for more information including examples. 1mlabel0m @@ -1392,6 +1421,7 @@ MILLER(1) MILLER(1) antimode Find least-frequently-occurring values for fields; first-found wins tie sum Compute sums of specified fields mean Compute averages (sample means) of specified fields + mad Compute mean absolute deviation var Compute sample variance of specified fields stddev Compute sample standard deviation of specified fields meaneb Estimate error bars for averages (assuming no sample autocorrelation) @@ -1498,6 +1528,9 @@ MILLER(1) MILLER(1) 1mput0m Usage: mlr put [options] {DSL expression} + Lets you use a domain-specific language to programmatically alter stream records. + See also: https://miller.readthedocs.io/en/latest/reference-verbs + Options: -f {file name} File containing a DSL expression (see examples below). 
If the filename is a directory, all *.mlr files in that directory are loaded. @@ -1604,9 +1637,9 @@ MILLER(1) MILLER(1) first-match replacement. -h|--help Show this message. Examples: - mlr rename old_name,new_name' - mlr rename old_name_1,new_name_1,old_name_2,new_name_2' - mlr rename -r 'Date_[0-9]+,Date,' Rename all such fields to be "Date" + mlr rename old_name,new_name + mlr rename old_name_1,new_name_1,old_name_2,new_name_2 + mlr rename -r 'Date_[0-9]+,Date' Rename all such fields to be "Date" mlr rename -r '"Date_[0-9]+",Date' Same mlr rename -r 'Date_([0-9]+).*,\1' Rename all such fields to be of the form 20151015 mlr rename -r '"name"i,Name' Rename "name", "Name", "NAME", etc. to "Name" @@ -1796,6 +1829,7 @@ MILLER(1) MILLER(1) -nf {comma-separated field names} Same as -n -nr {comma-separated field names} Numerical descending; nulls sort first -t {comma-separated field names} Natural ascending + -b Move sort fields to start of record, as in reorder -b -tr|-rt {comma-separated field names} Natural descending -h|--help Show this message. @@ -1811,6 +1845,17 @@ MILLER(1) MILLER(1) -r Recursively sort subobjects/submaps, e.g. for JSON input. -h|--help Show this message. + 1msparsify0m + Usage: mlr sparsify [options] + Unsets fields for which the value is the empty string (or, optionally, another + specified value). Only makes sense with output format not being CSV or TSV. + Options: + -s {filler string} What values to remove. Defaults to the empty string. + -f {a,b,c} Specify field names to be operated on; any other fields won't be + modified. The default is to modify all fields. + -h|--help Show this message. + Example: if input is a=1,b=,c=3 then output is a=1,c=3. + 1msplit0m Usage: mlr split [options] {filename} Options: @@ -1859,6 +1904,8 @@ MILLER(1) MILLER(1) the old string, like the `ssub` DSL function. See also the `gsub` and `sub` verbs. Options: -f {a,b,c} Field names to convert. + -r {regex} Regular expression for field names to convert. 
+ -a Convert all fields. -h|--help Show this message. 1mstats10m @@ -1876,6 +1923,7 @@ MILLER(1) MILLER(1) antimode Find least-frequently-occurring values for fields; first-found wins tie sum Compute sums of specified fields mean Compute averages (sample means) of specified fields + mad Compute mean absolute deviation var Compute sample variance of specified fields stddev Compute sample standard deviation of specified fields meaneb Estimate error bars for averages (assuming no sample autocorrelation) @@ -1910,7 +1958,7 @@ MILLER(1) MILLER(1) Example: mlr stats1 -a min,p10,p50,p90,max -f value -g size,shape Example: mlr stats1 -a count,mode -f size Example: mlr stats1 -a count,mode -f size -g shape - Example: mlr stats1 -a count,mode --fr '^[a-h].*$' -gr '^k.*$' + Example: mlr stats1 -a count,mode --fr '^[a-h].*$' --gr '^k.*$' This computes count and mode statistics on all field names beginning with a through h, grouped by all field names starting with k. @@ -2005,6 +2053,8 @@ MILLER(1) MILLER(1) See also the `gsub` and `ssub` verbs. Options: -f {a,b,c} Field names to convert. + -r {regex} Regular expression for field names to convert. + -a Convert all fields. -h|--help Show this message. 1msummary0m @@ -2047,8 +2097,18 @@ MILLER(1) MILLER(1) -a {mean,sum,etc.} Use only the specified summarizers. -x {mean,sum,etc.} Use all summarizers, except the specified ones. --all Use all available summarizers. + --transpose Show output with field names as column names. -h|--help Show this message. + 1msurv0m + Usage: mlr surv -d {duration-field} -s {status-field} + + Estimate Kaplan-Meier survival curve (right-censored). + Options: + -d {field} Name of duration field (time-to-event or censoring). + -s {field} Name of status field (0=censored, 1=event). + -h, --help Show this message. + 1mtac0m Usage: mlr tac [options] Prints records in reverse order from the order in which they were encountered. 
@@ -2131,6 +2191,7 @@ MILLER(1) MILLER(1) Options: -g {d,e,f} Group-by-field names for uniq counts. + -x {a,b,c} Field names to exclude for uniq: use each record's others instead. -c Show repeat counts in addition to unique values. -n Show only the number of distinct values. -o {name} Field name for output count. Default "count". @@ -2285,7 +2346,7 @@ MILLER(1) MILLER(1) (class=math #args=1) Ceiling: nearest integer at or above. 1mclean_whitespace0m - (class=string #args=1) Same as collapse_whitespace and strip. + (class=string #args=1) Same as collapse_whitespace and strip, followed by type inference. 1mcollapse_whitespace0m (class=string #args=1) Strip repeated whitespace from string. @@ -2377,9 +2438,14 @@ MILLER(1) MILLER(1) $* = fmtifnum($*, "%.6f") formats numeric fields in the current record, leaving non-numeric ones alone 1mfmtnum0m - (class=conversion #args=2) Convert int/float/bool to string using printf-style format string (https://pkg.go.dev/fmt), e.g. '$s = fmtnum($n, "%08d")' or '$t = fmtnum($n, "%.6e")'. This function recurses on array and map values. - Example: - $x = fmtnum($x, "%.6f") + (class=conversion #args=2) Convert int/float/bool to string using printf-style format string (https://pkg.go.dev/fmt), e.g. '$s = fmtnum($n, "%08d")' or '$t = fmtnum($n, "%.6e")'. Miller-specific extension: "%_d" and "%_f" for comma-separated thousands. This function recurses on array and map values. + Examples: + $y = fmtnum($x, "%.6f") + $o = fmtnum($n, "%d") + $o = fmtnum($n, "%12d") + $y = fmtnum($x, "%.6_f") + $o = fmtnum($n, "%_d") + $o = fmtnum($n, "%12_d") 1mfold0m (class=higher-order-functions #args=3) Given a map or array as first argument and a function as second argument, accumulates entries into a final output -- for example, sum or product. For arrays, the function should take two arguments, for accumulated value and array element. 
For maps, it should take four arguments, for accumulated key and value, and map-element key and value; it should return the updated accumulator as a new key-value pair (i.e. a single-entry map). The start value for the accumulator is taken from the third argument. @@ -2932,6 +2998,18 @@ MILLER(1) MILLER(1) Example: ssub("abc.def", ".", "X") gives "abcXdef" + 1mstat0m + (class=system #args=1) Returns a map containing information about the provided path: "name" with string value, "size" as decimal int value, "mode" as octal int value, "modtime" as int-valued epoch seconds, and "isdir" as boolean value. + Examples: + stat("./mlr") gives { + "name": "mlr", + "size": 38391584, + "mode": 0755, + "modtime": 1715207874, + "isdir": false + } + stat("./mlr")["size"] gives 38391584 + 1mstddev0m (class=stats #args=1) Returns the sample standard deviation of values in an array or map. Returns empty string AKA void for array/map of length less than two; returns error for non-array/non-map types. Example: @@ -2974,6 +3052,40 @@ MILLER(1) MILLER(1) 1mstrlen0m (class=string #args=1) String length. + 1mstrmatch0m + (class=string #args=2) Boolean yes/no for whether the stringable first argument matches the regular-expression second argument. No regex captures are provided; please see `strmatchx`. + Examples: + strmatch("a", "abc") is false + strmatch("abc", "a") is true + strmatch("abc", "a[a-z]c") is true + strmatch("abc", "(a).(c)") is true + strmatch(12345, "34") is true + + 1mstrmatchx0m + (class=string #args=2) Extended information for whether the stringable first argument matches the regular-expression second argument. Regex captures are provided in the return-value map; \1, \2, etc. are not set, in contrast to the `=~` operator. As well, while the `=~` operator limits matches to \1 through \9, an arbitrary number are supported here. 
+ Examples: + strmatchx("a", "abc") returns: + { + "matched": false + } + strmatchx("abc", "a") returns: + { + "matched": true, + "full_capture": "a", + "full_start": 1, + "full_end": 1 + } + strmatchx("[zy:3458]", "([a-z]+):([0-9]+)") returns: + { + "matched": true, + "full_capture": "zy:3458", + "full_start": 2, + "full_end": 8, + "captures": ["zy", "3458"], + "starts": [2, 5], + "ends": [3, 8] + } + 1mstrpntime0m (class=time #args=2) strpntime: Parses timestamp as integer nanoseconds since the epoch. See also strpntime_local. Examples: @@ -3626,6 +3738,4 @@ MILLER(1) MILLER(1) MIME Type for Comma-Separated Values (CSV) Files, the Miller docsite https://miller.readthedocs.io - - - 2023-12-13 MILLER(1) + 2026-01-02 4mMILLER24m(1) diff --git a/man/mkman.rb b/man/mkman.rb index 6b13f5462..325515fe5 100755 --- a/man/mkman.rb +++ b/man/mkman.rb @@ -19,7 +19,7 @@ def main # Live code-generation needs to be using mlr from *this* tree, not from # somewhere else in the PATH. unless File.executable?('../mlr') - $stderr.puts "#{$0}: Need ../../mlr to exist: please check 'make build' in ../.." + $stderr.puts "#{$0}: Need ../mlr to exist: please check 'make build' in ../.." exit 1 end `../mlr --version` diff --git a/man/mlr.1 b/man/mlr.1 index 4d5ee4f5c..f36d5e2f0 100644 --- a/man/mlr.1 +++ b/man/mlr.1 @@ -2,12 +2,12 @@ .\" Title: mlr .\" Author: [see the "AUTHOR" section] .\" Generator: ./mkman.rb -.\" Date: 2023-12-13 +.\" Date: 2026-01-02 .\" Manual: \ \& .\" Source: \ \& .\" Language: English .\" -.TH "MILLER" "1" "2023-12-13" "\ \&" "\ \&" +.TH "MILLER" "1" "2026-01-02" "\ \&" "\ \&" .\" ----------------------------------------------------------------- .\" * Portability definitions .\" ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -47,7 +47,7 @@ on integer-indexed fields: if the natural data structure for the latter is the array, then Miller's natural data structure is the insertion-ordered hash map. 
This encompasses a variety of data formats, including but not limited to the familiar CSV, TSV, and JSON. (Miller can handle positionally-indexed data as -a special case.) This manpage documents mlr 6.10.0. +a special case.) This manpage documents mlr 6.16.0. .SH "EXAMPLES" .sp @@ -111,7 +111,7 @@ PPRINT: pretty-printed tabular | 4 5 6 | Record 2: "apple":"4", "bat":"5", "cog":"6" +---------------------+ -Markdown tabular (supported for output only): +Markdown tabular: +-----------------------+ | | apple | bat | cog | | | | --- | --- | --- | | @@ -161,6 +161,7 @@ Flags: mlr help comments-in-data-flags mlr help compressed-data-flags mlr help csv/tsv-only-flags + mlr help dkvp-only-flags mlr help file-format-flags mlr help flatten-unflatten-flags mlr help format-conversion-keystroke-saver-flags @@ -219,9 +220,9 @@ fraction gap grep group-by group-like gsub having-fields head histogram json-parse json-stringify join label latin1-to-utf8 least-frequent merge-fields most-frequent nest nothing put regularize remove-empty-columns rename reorder repeat reshape sample sec2gmtdate sec2gmt seqgen shuffle -skip-trivial-records sort sort-within-records split ssub stats1 stats2 step -sub summary tac tail tee template top utf8-to-latin1 unflatten uniq unspace -unsparsify +skip-trivial-records sort sort-within-records sparsify split ssub stats1 +stats2 step sub summary surv tac tail tee template top utf8-to-latin1 +unflatten uniq unspace unsparsify .fi .if n \{\ .RE @@ -251,13 +252,14 @@ nsec2gmt nsec2gmtdate nsec2localdate nsec2localtime null_count os percentile percentiles pow qnorm reduce regextract regextract_or_else rightpad round roundm rstrip sec2dhms sec2gmt sec2gmtdate sec2hms sec2localdate sec2localtime select sgn sha1 sha256 sha512 sin sinh skewness sort sort_collection splita -splitax splitkv splitkvx splitnv splitnvx sqrt ssub stddev strfntime -strfntime_local strftime strftime_local string strip strlen strpntime -strpntime_local strptime strptime_local sub substr 
substr0 substr1 sum sum2 -sum3 sum4 sysntime system systime systimeint tan tanh tolower toupper truncate -typeof unflatten unformat unformatx upntime uptime urand urand32 urandelement -urandint urandrange utf8_to_latin1 variance version ! != !=~ % & && * ** + - . -\&.* .+ .- ./ / // < << <= <=> == =~ > >= >> >>> ?: ?? ??? ^ ^^ | || ~ +splitax splitkv splitkvx splitnv splitnvx sqrt ssub stat stddev strfntime +strfntime_local strftime strftime_local string strip strlen strmatch strmatchx +strpntime strpntime_local strptime strptime_local sub substr substr0 substr1 +sum sum2 sum3 sum4 sysntime system systime systimeint tan tanh tolower toupper +truncate typeof unflatten unformat unformatx upntime uptime urand urand32 +urandelement urandint urandrange utf8_to_latin1 variance version ! != !=~ % & +&& * ** + - . .* .+ .- ./ / // < << <= <=> == =~ > >= >> >>> ?: ?? ??? ^ ^^ | +|| ~ .fi .if n \{\ .RE @@ -289,12 +291,14 @@ Notes: within the input. --pass-comments-with {string} Immediately print commented lines within input, with - specified prefix. + specified prefix. For CSV input format, the prefix + must be a single character. --skip-comments Ignore commented lines (prefixed by `#`) within the input. --skip-comments-with {string} Ignore commented lines within input, with specified - prefix. + prefix. For CSV input format, the prefix must be a + single character. .fi .if n \{\ .RE @@ -389,6 +393,10 @@ These are flags which are applicable to CSV format. recreate missing headers. --lazy-quotes Accepts quotes appearing in unquoted fields, and non-doubled quotes appearing in quoted fields. +--no-auto-unsparsify For CSV/TSV output: if the record keys change from + one row to another, emit a blank line and a new + header line. This is non-compliant with RFC 4180 but + it is helpful for heterogeneous data. --no-implicit-csv-header or --no-implicit-tsv-header Opposite of `--implicit-csv-header`. 
This is the default anyway -- the main use is for the flags to @@ -405,6 +413,24 @@ These are flags which are applicable to CSV format. .fi .if n \{\ .RE +.SH "DKVP-ONLY FLAGS" +.sp + +.if n \{\ +.RS 0 +.\} +.nf +These are flags which are applicable to DKVP format. + +--incr-key Without this option, keyless DKVP fields are keyed by + field number. For example: `a=10,b=20,30,d=40,50` is + ingested as `$a=10,$b=20,$3=30,$d=40,$5=50`. With + this option, they're keyed by a running counter of + keyless fields. For example: `a=10,b=20,30,d=40,50` + is ingested as `$a=10,$b=20,$1=30,$d=40,$2=50`. +.fi +.if n \{\ +.RE .SH "FILE-FORMAT FLAGS" .sp @@ -423,9 +449,9 @@ The latter sets up input and output flags for `format1`, not all of which are overridden in all cases by setting output format to `format2`. --asv or --asvlite Use ASV format for input and output data. ---csv or -c Use CSV format for input and output data. +--csv or -c or --c2c Use CSV format for input and output data. --csvlite Use CSV-lite format for input and output data. ---dkvp Use DKVP format for input and output data. +--dkvp or --d2d Use DKVP format for input and output data. --gen-field-name Specify field name for --igen. Defaults to "i". --gen-start Specify start value for --igen. Defaults to 1. --gen-step Specify step value for --igen. Defaults to 1. @@ -440,6 +466,7 @@ are overridden in all cases by setting output format to `format2`. seqgen verb, which is more useful/intuitive. --ijson Use JSON format for input data. --ijsonl Use JSON Lines format for input data. +--imd or --imarkdown Use markdown-tabular format for input data. --inidx Use NIDX format for input data. --io {format name} Use format name for input and output data. For example: `--io csv` is the same as `--csv`. @@ -448,27 +475,27 @@ are overridden in all cases by setting output format to `format2`. --itsvlite Use TSV-lite format for input data. --iusv or --iusvlite Use USV format for input data. --ixtab Use XTAB format for input data. 
---json or -j Use JSON format for input and output data. ---jsonl Use JSON Lines format for input and output data. ---nidx Use NIDX format for input and output data. +--json or -j or --j2j Use JSON format for input and output data. +--jsonl or --l2l Use JSON Lines format for input and output data. +--nidx or --n2n Use NIDX format for input and output data. --oasv or --oasvlite Use ASV format for output data. --ocsv Use CSV format for output data. --ocsvlite Use CSV-lite format for output data. --odkvp Use DKVP format for output data. --ojson Use JSON format for output data. --ojsonl Use JSON Lines format for output data. ---omd Use markdown-tabular format for output data. +--omd or --omarkdown Use markdown-tabular format for output data. --onidx Use NIDX format for output data. --opprint Use PPRINT format for output data. --otsv Use TSV format for output data. --otsvlite Use TSV-lite format for output data. --ousv or --ousvlite Use USV format for output data. --oxtab Use XTAB format for output data. ---pprint Use PPRINT format for input and output data. ---tsv or -t Use TSV format for input and output data. +--pprint or --p2p Use PPRINT format for input and output data. +--tsv or -t or --t2t Use TSV format for input and output data. --tsvlite Use TSV-lite format for input and output data. --usv or --usvlite Use USV format for input and output data. ---xtab Use XTAB format for input and output data. +--xtab or --x2x Use XTAB format for input and output data. --xvright Right-justify values for XTAB format. -i {format name} Use format name for input data. For example: `-i csv` is the same as `--icsv`. @@ -486,7 +513,7 @@ are overridden in all cases by setting output format to `format2`. .nf These flags control how Miller converts record values which are maps or arrays, when input is JSON and output is non-JSON (flattening) or input is non-JSON and output is JSON (unflattening). -See the Flatten/unflatten doc page for more information. 
+See the flatten/unflatten doc page https://miller.readthedocs.io/en/latest/flatten-unflatten for more information. --flatsep or --jflatsep {string} Separator for flattening multi-level JSON keys, e.g. @@ -494,15 +521,14 @@ See the Flatten/unflatten doc page for more information. formats. Defaults to `.`. --no-auto-flatten When output is non-JSON, suppress the default auto-flatten behavior. Default: if `$y = [7,8,9]` - then this flattens to `y.1=7,y.2=8,y.3=9, and + then this flattens to `y.1=7,y.2=8,y.3=9`, and similarly for maps. With `--no-auto-flatten`, instead we get `$y=[1, 2, 3]`. ---no-auto-unflatten When input non-JSON and output is JSON, suppress the - default auto-unflatten behavior. Default: if the +--no-auto-unflatten When input is non-JSON and output is JSON, suppress + the default auto-unflatten behavior. Default: if the input has `y.1=7,y.2=8,y.3=9` then this unflattens to - `$y=[7,8,9]`. flattens to `y.1=7,y.2=8,y.3=9. With - `--no-auto-flatten`, instead we get - `${y.1}=7,${y.2}=8,${y.3}=9`. + `$y=[7,8,9]`. With `--no-auto-unflatten`, instead we + get `${y.1}=7,${y.2}=8,${y.3}=9`. .fi .if n \{\ .RE @@ -515,19 +541,19 @@ See the Flatten/unflatten doc page for more information. .nf As keystroke-savers for format-conversion you may use the following. The letters c, t, j, l, d, n, x, p, and m refer to formats CSV, TSV, DKVP, NIDX, -JSON, JSON Lines, XTAB, PPRINT, and markdown, respectively. Note that markdown -format is available for output only. +JSON, JSON Lines, XTAB, PPRINT, and markdown, respectively. 
-| In\eout | CSV | TSV | JSON | JSONL | DKVP | NIDX | XTAB | PPRINT | Markdown | -+--------+-------+-------+--------+--------+--------+--------+--------+----------+ -| CSV | | --c2t | --c2j | --c2l | --c2d | --c2n | --c2x | --c2p | --c2m | -| TSV | --t2c | | --t2j | --t2l | --t2d | --t2n | --t2x | --t2p | --t2m | -| JSON | --j2c | --j2t | | --j2l | --j2d | --j2n | --j2x | --j2p | --j2m | -| JSONL | --l2c | --l2t | | | --l2d | --l2n | --l2x | --l2p | --l2m | -| DKVP | --d2c | --d2t | --d2j | --d2l | | --d2n | --d2x | --d2p | --d2m | -| NIDX | --n2c | --n2t | --n2j | --n2l | --n2d | | --n2x | --n2p | --n2m | -| XTAB | --x2c | --x2t | --x2j | --x2l | --x2d | --x2n | | --x2p | --x2m | -| PPRINT | --p2c | --p2t | --p2j | --p2l | --p2d | --p2n | --p2x | | --p2m | +| In\eout | CSV | TSV | JSON | JSONL | DKVP | NIDX | XTAB | PPRINT | Markdown | ++----------+----------+----------+----------+-------+-------+-------+-------+--------+----------| +| CSV | --c2c,-c | --c2t | --c2j | --c2l | --c2d | --c2n | --c2x | --c2p | --c2m | +| TSV | --t2c | --t2t,-t | --t2j | --t2l | --t2d | --t2n | --t2x | --t2p | --t2m | +| JSON | --j2c | --j2t | --j2j,-j | --j2l | --j2d | --j2n | --j2x | --j2p | --j2m | +| JSONL | --l2c | --l2t | --l2j | --l2l | --l2d | --l2n | --l2x | --l2p | --l2m | +| DKVP | --d2c | --d2t | --d2j | --d2l | --d2d | --d2n | --d2x | --d2p | --d2m | +| NIDX | --n2c | --n2t | --n2j | --n2l | --n2d | --n2n | --n2x | --n2p | --n2m | +| XTAB | --x2c | --x2t | --x2j | --x2l | --x2d | --x2n | --x2x | --x2p | --x2m | +| PPRINT | --p2c | --p2t | --p2j | --p2l | --p2d | --p2n | --p2x | --p2p | --p2m | +| Markdown | --m2c | --m2t | --m2j | --m2l | --m2d | --m2n | --m2x | --m2p | | -p Keystroke-saver for `--nidx --fs space --repifs`. -T Keystroke-saver for `--nidx --fs tab`. @@ -550,8 +576,8 @@ These are flags which are applicable to JSON output format. --jvstack Put one key-value pair per line for JSON output (multi-line output). This is the default for JSON output format. 
---no-jlistwrap Wrap JSON output in outermost `[ ]`. This is the - default for JSON Lines output format. +--no-jlistwrap Do not wrap JSON output in outermost `[ ]`. This is + the default for JSON Lines output format. --no-jvstack Put objects/arrays all on one line for JSON output. This is the default for JSON Lines output format. .fi @@ -666,6 +692,7 @@ These are flags which don't fit into any other category. since direct-to-screen output for large files has its own overhead. --no-hash-records See --hash-records. +--norc Do not load a .mlrrc file. --nr-progress-mod {m} With m a positive integer: print filename and record count to os.Stderr every m input records. --ofmt {format} E.g. `%.18f`, `%.0f`, `%9.6e`. Please use @@ -749,8 +776,8 @@ Mechanisms for coloring: How you can control colorization: * Suppression/unsuppression: - * Environment variable `export MLR_NO_COLOR=true` means don't color - even if stdout+TTY. + * Environment variable `export MLR_NO_COLOR=true` or `export NO_COLOR=true` + means don't color even if stdout+TTY. * Environment variable `export MLR_ALWAYS_COLOR=true` means do color even if not stdout+TTY. For example, you might want to use this when piping mlr output to `less -r`. @@ -809,8 +836,10 @@ those can be joined with a "-", like "red-bold", "bold-170", "bold-underline", e .nf These are flags which are applicable to PPRINT format. ---barred Prints a border around PPRINT output (not available - for input). +--barred or --barred-output + Prints a border around PPRINT output. +--barred-input When used in conjunction with --pprint, accepts + barred input. --right Right-justifies all fields for PPRINT output. 
.fi .if n \{\ @@ -891,13 +920,13 @@ Notes about all other separators: - To avoid backslashing, you can use any of the following names: ascii_esc = "\ex1b" - ascii_etx = "\ex04" + ascii_etx = "\ex03" ascii_fs = "\ex1c" ascii_gs = "\ex1d" - ascii_null = "\ex01" + ascii_null = "\ex00" ascii_rs = "\ex1e" - ascii_soh = "\ex02" - ascii_stx = "\ex03" + ascii_soh = "\ex01" + ascii_stx = "\ex02" ascii_us = "\ex1f" asv_fs = "\ex1f" asv_rs = "\ex1e" @@ -931,6 +960,7 @@ Notes about all other separators: csv "," N/A "\en" csvlite "," N/A "\en" dkvp "," "=" "\en" + gen "," N/A "\en" json N/A N/A N/A markdown " " N/A "\en" nidx " " N/A "\en" @@ -1184,6 +1214,7 @@ Same as uniq -c. Options: -f {a,b,c} Field names for distinct count. +-x {a,b,c} Field names to exclude for distinct count: use each record's others instead. -n Show only the number of distinct values. Not compatible with -u. -o {name} Field name for output count. Default "count". Ignored with -u. @@ -1240,7 +1271,7 @@ Options: -r Treat field names as regular expressions. "ab", "a.*b" will match any field name containing the substring "ab" or matching "a.*b", respectively; anchors of the form "^ab$", "^a.*b$" may - be used. The -o flag is ignored when -r is present. + be used. -h|--help Show this message. Examples: mlr cut -f hostname,status @@ -1308,6 +1339,10 @@ Options: .\} .nf Usage: mlr filter [options] {DSL expression} +Lets you use a domain-specific language to programmatically filter which +stream records will be output. +See also: https://miller.readthedocs.io/en/latest/reference-verbs + Options: -f {file name} File containing a DSL expression (see examples below). If the filename is a directory, all *.mlr files in that directory are loaded. @@ -1320,7 +1355,7 @@ Options: Since the expression pieces are simply concatenated, please be sure to use intervening semicolons to separate expressions.) 
--s name=value: Predefines out-of-stream variable @name to have +-s name=value: Predefines out-of-stream variable @name to have Thus mlr put -s foo=97 '$column += @foo' is like mlr put 'begin {@foo = 97} $column += @foo'. The value part is subject to type-inferencing. @@ -1550,6 +1585,8 @@ for the old string and handling multiple matches, like the `gsub` DSL function. See also the `sub` and `ssub` verbs. Options: -f {a,b,c} Field names to convert. +-r {regex} Regular expression for field names to convert. +-a Convert all fields. -h|--help Show this message. .fi .if n \{\ @@ -1659,6 +1696,8 @@ Options: --lk|--left-keep-field-names {a,b,c} If supplied, this means keep only the specified field names from the left file. Automatically includes the join-field name(s). Helpful for when you only want a limited subset of information from the left file. + Tip: you can use --lk "": this means the left file becomes solely a row-selector + for the input files. --lp {text} Additional prefix for non-join output field names from the left file --rp {text} Additional prefix for non-join output field names from @@ -1693,7 +1732,7 @@ be specified CSV as well unless you override with 'mlr --csv ... join --ijson -l Likewise, if you have 'mlr --csv --implicit-csv-header ...' then the join-in file will be expected to be headerless as well unless you put '--no-implicit-csv-header' after 'join'. Please use "mlr --usage-separator-options" for information on specifying separators. -Please see https://miller.readthedocs.io/en/latest/reference-verbs.html#join for more information +Please see https://miller.readthedocs.io/en/latest/reference-verbs#join for more information including examples. 
.fi .if n \{\ @@ -1761,6 +1800,7 @@ Options: antimode Find least-frequently-occurring values for fields; first-found wins tie sum Compute sums of specified fields mean Compute averages (sample means) of specified fields + mad Compute mean absolute deviation var Compute sample variance of specified fields stddev Compute sample standard deviation of specified fields meaneb Estimate error bars for averages (assuming no sample autocorrelation) @@ -1891,6 +1931,9 @@ Options: .\} .nf Usage: mlr put [options] {DSL expression} +Lets you use a domain-specific language to programmatically alter stream records. +See also: https://miller.readthedocs.io/en/latest/reference-verbs + Options: -f {file name} File containing a DSL expression (see examples below). If the filename is a directory, all *.mlr files in that directory are loaded. @@ -1903,7 +1946,7 @@ Options: Since the expression pieces are simply concatenated, please be sure to use intervening semicolons to separate expressions.) --s name=value: Predefines out-of-stream variable @name to have +-s name=value: Predefines out-of-stream variable @name to have Thus mlr put -s foo=97 '$column += @foo' is like mlr put 'begin {@foo = 97} $column += @foo'. The value part is subject to type-inferencing. @@ -2015,9 +2058,9 @@ Options: first-match replacement. -h|--help Show this message. Examples: -mlr rename old_name,new_name' -mlr rename old_name_1,new_name_1,old_name_2,new_name_2' -mlr rename -r 'Date_[0-9]+,Date,' Rename all such fields to be "Date" +mlr rename old_name,new_name +mlr rename old_name_1,new_name_1,old_name_2,new_name_2 +mlr rename -r 'Date_[0-9]+,Date' Rename all such fields to be "Date" mlr rename -r '"Date_[0-9]+",Date' Same mlr rename -r 'Date_([0-9]+).*,\e1' Rename all such fields to be of the form 20151015 mlr rename -r '"name"i,Name' Rename "name", "Name", "NAME", etc. 
to "Name" @@ -2267,6 +2310,7 @@ Options: -nf {comma-separated field names} Same as -n -nr {comma-separated field names} Numerical descending; nulls sort first -t {comma-separated field names} Natural ascending +-b Move sort fields to start of record, as in reorder -b -tr|-rt {comma-separated field names} Natural descending -h|--help Show this message. @@ -2290,6 +2334,23 @@ Options: .fi .if n \{\ .RE +.SS "sparsify" +.if n \{\ +.RS 0 +.\} +.nf +Usage: mlr sparsify [options] +Unsets fields for which the value is the empty string (or, optionally, another +specified value). Only makes sense with output format not being CSV or TSV. +Options: +-s {filler string} What values to remove. Defaults to the empty string. +-f {a,b,c} Specify field names to be operated on; any other fields won't be + modified. The default is to modify all fields. +-h|--help Show this message. +Example: if input is a=1,b=,c=3 then output is a=1,c=3. +.fi +.if n \{\ +.RE .SS "split" .if n \{\ .RS 0 @@ -2348,6 +2409,8 @@ Replaces old string with new string in specified field(s), without regex support the old string, like the `ssub` DSL function. See also the `gsub` and `sub` verbs. Options: -f {a,b,c} Field names to convert. +-r {regex} Regular expression for field names to convert. +-a Convert all fields. -h|--help Show this message. 
.fi .if n \{\ @@ -2371,6 +2434,7 @@ Options: antimode Find least-frequently-occurring values for fields; first-found wins tie sum Compute sums of specified fields mean Compute averages (sample means) of specified fields + mad Compute mean absolute deviation var Compute sample variance of specified fields stddev Compute sample standard deviation of specified fields meaneb Estimate error bars for averages (assuming no sample autocorrelation) @@ -2405,7 +2469,7 @@ Options: Example: mlr stats1 -a min,p10,p50,p90,max -f value -g size,shape Example: mlr stats1 -a count,mode -f size Example: mlr stats1 -a count,mode -f size -g shape -Example: mlr stats1 -a count,mode --fr '^[a-h].*$' -gr '^k.*$' +Example: mlr stats1 -a count,mode --fr '^[a-h].*$' --gr '^k.*$' This computes count and mode statistics on all field names beginning with a through h, grouped by all field names starting with k. @@ -2518,6 +2582,8 @@ for the old string and not handling multiple matches, like the `sub` DSL functio See also the `gsub` and `ssub` verbs. Options: -f {a,b,c} Field names to convert. +-r {regex} Regular expression for field names to convert. +-a Convert all fields. -h|--help Show this message. .fi .if n \{\ @@ -2566,10 +2632,26 @@ Options: -a {mean,sum,etc.} Use only the specified summarizers. -x {mean,sum,etc.} Use all summarizers, except the specified ones. --all Use all available summarizers. +--transpose Show output with field names as column names. -h|--help Show this message. .fi .if n \{\ .RE +.SS "surv" +.if n \{\ +.RS 0 +.\} +.nf +Usage: mlr surv -d {duration-field} -s {status-field} + +Estimate Kaplan-Meier survival curve (right-censored). +Options: + -d {field} Name of duration field (time-to-event or censoring). + -s {field} Name of status field (0=censored, 1=event). + -h, --help Show this message. +.fi +.if n \{\ +.RE .SS "tac" .if n \{\ .RS 0 @@ -2698,6 +2780,7 @@ count-distinct. For uniq, -f is a synonym for -g. Options: -g {d,e,f} Group-by-field names for uniq counts. 
+-x {a,b,c} Field names to exclude for uniq: use each record's others instead. -c Show repeat counts in addition to unique values. -n Show only the number of distinct values. -o {name} Field name for output count. Default "count". @@ -3094,7 +3177,7 @@ Map example: apply({"a":1, "b":3, "c":5}, func(k,v) {return {toupper(k): v ** 2} .RS 0 .\} .nf - (class=string #args=1) Same as collapse_whitespace and strip. + (class=string #args=1) Same as collapse_whitespace and strip, followed by type inference. .fi .if n \{\ .RE @@ -3312,9 +3395,14 @@ $* = fmtifnum($*, "%.6f") formats numeric fields in the current record, leaving .RS 0 .\} .nf - (class=conversion #args=2) Convert int/float/bool to string using printf-style format string (https://pkg.go.dev/fmt), e.g. '$s = fmtnum($n, "%08d")' or '$t = fmtnum($n, "%.6e")'. This function recurses on array and map values. -Example: -$x = fmtnum($x, "%.6f") + (class=conversion #args=2) Convert int/float/bool to string using printf-style format string (https://pkg.go.dev/fmt), e.g. '$s = fmtnum($n, "%08d")' or '$t = fmtnum($n, "%.6e")'. Miller-specific extension: "%_d" and "%_f" for comma-separated thousands. This function recurses on array and map values. +Examples: +$y = fmtnum($x, "%.6f") +$o = fmtnum($n, "%d") +$o = fmtnum($n, "%12d") +$y = fmtnum($x, "%.6_f") +$o = fmtnum($n, "%_d") +$o = fmtnum($n, "%12_d") .fi .if n \{\ .RE @@ -4559,6 +4647,24 @@ ssub("abc.def", ".", "X") gives "abcXdef" .fi .if n \{\ .RE +.SS "stat" +.if n \{\ +.RS 0 +.\} +.nf + (class=system #args=1) Returns a map containing information about the provided path: "name" with string value, "size" as decimal int value, "mode" as octal int value, "modtime" as int-valued epoch seconds, and "isdir" as boolean value. 
+Examples: +stat("./mlr") gives { + "name": "mlr", + "size": 38391584, + "mode": 0755, + "modtime": 1715207874, + "isdir": false +} +stat("./mlr")["size"] gives 38391584 +.fi +.if n \{\ +.RE .SS "stddev" .if n \{\ .RS 0 @@ -4649,6 +4755,52 @@ strftime_local(1440768801.7, "%Y-%m-%d %H:%M:%3S %z", "Asia/Istanbul") = "2015-0 .fi .if n \{\ .RE +.SS "strmatch" +.if n \{\ +.RS 0 +.\} +.nf + (class=string #args=2) Boolean yes/no for whether the stringable first argument matches the regular-expression second argument. No regex captures are provided; please see `strmatchx`. +Examples: +strmatch("a", "abc") is false +strmatch("abc", "a") is true +strmatch("abc", "a[a-z]c") is true +strmatch("abc", "(a).(c)") is true +strmatch(12345, "34") is true +.fi +.if n \{\ +.RE +.SS "strmatchx" +.if n \{\ +.RS 0 +.\} +.nf + (class=string #args=2) Extended information for whether the stringable first argument matches the regular-expression second argument. Regex captures are provided in the return-value map; \e1, \e2, etc. are not set, in contrast to the `=~` operator. As well, while the `=~` operator limits matches to \e1 through \e9, an arbitrary number are supported here. 
+Examples: +strmatchx("a", "abc") returns: + { + "matched": false + } +strmatchx("abc", "a") returns: + { + "matched": true, + "full_capture": "a", + "full_start": 1, + "full_end": 1 + } +strmatchx("[zy:3458]", "([a-z]+):([0-9]+)") returns: + { + "matched": true, + "full_capture": "zy:3458", + "full_start": 2, + "full_end": 8, + "captures": ["zy", "3458"], + "starts": [2, 5], + "ends": [3, 8] + } +.fi +.if n \{\ +.RE .SS "strpntime" .if n \{\ .RS 0 diff --git a/miller.spec b/miller.spec index 413f6cdce..166cb35e0 100644 --- a/miller.spec +++ b/miller.spec @@ -1,6 +1,6 @@ Summary: Name-indexed data processing tool Name: miller -Version: 6.10.0 +Version: 6.16.0 Release: 1%{?dist} License: BSD Source: https://github.com/johnkerl/miller/releases/download/%{version}/miller-%{version}.tar.gz @@ -36,6 +36,24 @@ make install %{_mandir}/man1/mlr.1* %changelog +* Fri Jan 2 2026 John Kerl- 6.16.0-1 +- 6.16.0 release + +* Thu Aug 14 2025 John Kerl - 6.15.0-1 +- 6.15.0 release + +* Fri Jul 4 2025 John Kerl - 6.14.0-1 +- 6.14.0 release + +* Sat Oct 5 2024 John Kerl - 6.13.0-1 +- 6.13.0 release + +* Sat Mar 16 2024 John Kerl - 6.12.0-1 +- 6.12.0 release + +* Tue Jan 23 2024 John Kerl - 6.11.0-1 +- 6.11.0 release + * Wed Dec 13 2023 John Kerl - 6.10.0-1 - 6.10.0 release diff --git a/pkg/bifs/arithmetic.go b/pkg/bifs/arithmetic.go index f5a2b853e..bffcc5309 100644 --- a/pkg/bifs/arithmetic.go +++ b/pkg/bifs/arithmetic.go @@ -4,8 +4,8 @@ import ( "fmt" "math" - "github.com/johnkerl/miller/pkg/lib" - "github.com/johnkerl/miller/pkg/mlrval" + "github.com/johnkerl/miller/v6/pkg/lib" + "github.com/johnkerl/miller/v6/pkg/mlrval" ) // ================================================================ @@ -834,7 +834,7 @@ func min_i_ii(input1, input2 *mlrval.Mlrval) *mlrval.Mlrval { // a=F | min=a min=a // a=T | min=b min=b func min_b_bb(input1, input2 *mlrval.Mlrval) *mlrval.Mlrval { - if input1.AcquireBoolValue() == false { + if !input1.AcquireBoolValue() { return input1 } else { return 
input2 @@ -946,7 +946,7 @@ func BIF_minlen_variadic(mlrvals []*mlrval.Mlrval) *mlrval.Mlrval { } // Do the bulk arithmetic on native ints not Mlrvals, to avoid unnecessary allocation. retval := lib.UTF8Strlen(mlrvals[0].OriginalString()) - for i, _ := range mlrvals { + for i := range mlrvals { clen := lib.UTF8Strlen(mlrvals[i].OriginalString()) if clen < retval { retval = clen @@ -1004,7 +1004,7 @@ func max_i_ii(input1, input2 *mlrval.Mlrval) *mlrval.Mlrval { // a=F | max=a max=b // a=T | max=a max=b func max_b_bb(input1, input2 *mlrval.Mlrval) *mlrval.Mlrval { - if input2.AcquireBoolValue() == false { + if !input2.AcquireBoolValue() { return input1 } else { return input2 @@ -1116,7 +1116,7 @@ func BIF_maxlen_variadic(mlrvals []*mlrval.Mlrval) *mlrval.Mlrval { } // Do the bulk arithmetic on native ints not Mlrvals, to avoid unnecessary allocation. retval := lib.UTF8Strlen(mlrvals[0].OriginalString()) - for i, _ := range mlrvals { + for i := range mlrvals { clen := lib.UTF8Strlen(mlrvals[i].OriginalString()) if clen > retval { retval = clen diff --git a/pkg/bifs/arithmetic_test.go b/pkg/bifs/arithmetic_test.go index 76efd45ea..393a3a968 100644 --- a/pkg/bifs/arithmetic_test.go +++ b/pkg/bifs/arithmetic_test.go @@ -5,7 +5,7 @@ import ( "github.com/stretchr/testify/assert" - "github.com/johnkerl/miller/pkg/mlrval" + "github.com/johnkerl/miller/v6/pkg/mlrval" ) func TestBIF_plus_unary(t *testing.T) { diff --git a/pkg/bifs/base.go b/pkg/bifs/base.go index 28aa0d6bf..c259cc7fe 100644 --- a/pkg/bifs/base.go +++ b/pkg/bifs/base.go @@ -50,9 +50,9 @@ package bifs import ( "fmt" - "github.com/johnkerl/miller/pkg/lib" - "github.com/johnkerl/miller/pkg/mlrval" - "github.com/johnkerl/miller/pkg/types" + "github.com/johnkerl/miller/v6/pkg/lib" + "github.com/johnkerl/miller/v6/pkg/mlrval" + "github.com/johnkerl/miller/v6/pkg/types" ) // Function-pointer type for zary functions. 
@@ -104,6 +104,8 @@ func _zero1(input1 *mlrval.Mlrval) *mlrval.Mlrval { } // Return one (unary) +// +//lint:ignore U1000 util function might be used later func __one1(input1 *mlrval.Mlrval) *mlrval.Mlrval { return mlrval.FromInt(1) } diff --git a/pkg/bifs/bits.go b/pkg/bifs/bits.go index 5ed8cc20e..c9001c431 100644 --- a/pkg/bifs/bits.go +++ b/pkg/bifs/bits.go @@ -1,7 +1,7 @@ package bifs import ( - "github.com/johnkerl/miller/pkg/mlrval" + "github.com/johnkerl/miller/v6/pkg/mlrval" ) // ================================================================ diff --git a/pkg/bifs/bits_test.go b/pkg/bifs/bits_test.go index 96718e00d..9239d58a4 100644 --- a/pkg/bifs/bits_test.go +++ b/pkg/bifs/bits_test.go @@ -5,7 +5,7 @@ import ( "github.com/stretchr/testify/assert" - "github.com/johnkerl/miller/pkg/mlrval" + "github.com/johnkerl/miller/v6/pkg/mlrval" ) func TestBIF_bitcount(t *testing.T) { diff --git a/pkg/bifs/booleans.go b/pkg/bifs/booleans.go index c0b3bc3db..181e5cbc6 100644 --- a/pkg/bifs/booleans.go +++ b/pkg/bifs/booleans.go @@ -5,7 +5,7 @@ package bifs import ( - "github.com/johnkerl/miller/pkg/mlrval" + "github.com/johnkerl/miller/v6/pkg/mlrval" ) func BIF_logical_NOT(input1 *mlrval.Mlrval) *mlrval.Mlrval { diff --git a/pkg/bifs/cmp.go b/pkg/bifs/cmp.go index 832feab57..3be517990 100644 --- a/pkg/bifs/cmp.go +++ b/pkg/bifs/cmp.go @@ -5,8 +5,8 @@ package bifs import ( - "github.com/johnkerl/miller/pkg/lib" - "github.com/johnkerl/miller/pkg/mlrval" + "github.com/johnkerl/miller/v6/pkg/lib" + "github.com/johnkerl/miller/v6/pkg/mlrval" ) // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - @@ -246,7 +246,7 @@ func eq_b_aa(input1, input2 *mlrval.Mlrval) *mlrval.Mlrval { for i := range a { eq := BIF_equals(a[i], b[i]) lib.InternalCodingErrorIf(eq.Type() != mlrval.MT_BOOL) - if eq.AcquireBoolValue() == false { + if !eq.AcquireBoolValue() { return mlrval.FALSE } } diff --git a/pkg/bifs/collections.go b/pkg/bifs/collections.go index a734ee451..a47730702 
100644 --- a/pkg/bifs/collections.go +++ b/pkg/bifs/collections.go @@ -5,8 +5,8 @@ import ( "strconv" "strings" - "github.com/johnkerl/miller/pkg/lib" - "github.com/johnkerl/miller/pkg/mlrval" + "github.com/johnkerl/miller/v6/pkg/lib" + "github.com/johnkerl/miller/v6/pkg/mlrval" ) // ================================================================ @@ -373,7 +373,7 @@ func BIF_joink(input1, input2 *mlrval.Mlrval) *mlrval.Mlrval { // joinv([3,4,5], ",") -> "3,4,5" // joinv({"a":3,"b":4,"c":5}, ",") -> "3,4,5" func BIF_joinv(input1, input2 *mlrval.Mlrval) *mlrval.Mlrval { - if !input2.IsString() { + if !input2.IsStringOrVoid() { return mlrval.FromNotStringError("joinv", input2) } fieldSeparator := input2.AcquireStringValue() @@ -568,15 +568,16 @@ func BIF_splitnvx(input1, input2 *mlrval.Mlrval) *mlrval.Mlrval { // ---------------------------------------------------------------- // splita("3,4,5", ",") -> [3,4,5] func BIF_splita(input1, input2 *mlrval.Mlrval) *mlrval.Mlrval { - if !input1.IsStringOrVoid() { - return mlrval.FromNotStringError("splita", input1) + if !input1.IsLegit() { + return input1 } + input1String := input1.String() if !input2.IsString() { return mlrval.FromNotStringError("splita", input2) } fieldSeparator := input2.AcquireStringValue() - fields := lib.SplitString(input1.AcquireStringValue(), fieldSeparator) + fields := lib.SplitString(input1String, fieldSeparator) arrayval := make([]*mlrval.Mlrval, len(fields)) @@ -592,16 +593,16 @@ func BIF_splita(input1, input2 *mlrval.Mlrval) *mlrval.Mlrval { // BIF_splitax splits a string to an array, without type-inference: // e.g. 
splitax("3,4,5", ",") -> ["3","4","5"] func BIF_splitax(input1, input2 *mlrval.Mlrval) *mlrval.Mlrval { - if !input1.IsStringOrVoid() { - return mlrval.FromNotStringError("splitax", input1) + if !input1.IsLegit() { + return input1 } + input1String := input1.String() if !input2.IsString() { return mlrval.FromNotStringError("splitax", input2) } - input := input1.AcquireStringValue() fieldSeparator := input2.AcquireStringValue() - return bif_splitax_helper(input, fieldSeparator) + return bif_splitax_helper(input1String, fieldSeparator) } // bif_splitax_helper is split out for the benefit of BIF_splitax and diff --git a/pkg/bifs/collections_test.go b/pkg/bifs/collections_test.go index 16ffba8c6..595e8c670 100644 --- a/pkg/bifs/collections_test.go +++ b/pkg/bifs/collections_test.go @@ -5,7 +5,7 @@ import ( "github.com/stretchr/testify/assert" - "github.com/johnkerl/miller/pkg/mlrval" + "github.com/johnkerl/miller/v6/pkg/mlrval" ) func TestBIF_length(t *testing.T) { diff --git a/pkg/bifs/datetime.go b/pkg/bifs/datetime.go index 9fa11e6b0..84bd48fd1 100644 --- a/pkg/bifs/datetime.go +++ b/pkg/bifs/datetime.go @@ -5,11 +5,11 @@ import ( "regexp" "time" - strptime "github.com/johnkerl/miller/pkg/pbnjay-strptime" + strptime "github.com/johnkerl/miller/v6/pkg/pbnjay-strptime" "github.com/lestrrat-go/strftime" - "github.com/johnkerl/miller/pkg/lib" - "github.com/johnkerl/miller/pkg/mlrval" + "github.com/johnkerl/miller/v6/pkg/lib" + "github.com/johnkerl/miller/v6/pkg/mlrval" ) const ISO8601_TIME_FORMAT = "%Y-%m-%dT%H:%M:%SZ" diff --git a/pkg/bifs/hashing.go b/pkg/bifs/hashing.go index e2d09d1e4..829c67601 100644 --- a/pkg/bifs/hashing.go +++ b/pkg/bifs/hashing.go @@ -7,7 +7,7 @@ import ( "crypto/sha512" "fmt" - "github.com/johnkerl/miller/pkg/mlrval" + "github.com/johnkerl/miller/v6/pkg/mlrval" ) func BIF_md5(input1 *mlrval.Mlrval) *mlrval.Mlrval { diff --git a/pkg/bifs/hashing_test.go b/pkg/bifs/hashing_test.go index 6b44028be..5e3c177f1 100644 --- a/pkg/bifs/hashing_test.go 
+++ b/pkg/bifs/hashing_test.go @@ -5,7 +5,7 @@ import ( "github.com/stretchr/testify/assert" - "github.com/johnkerl/miller/pkg/mlrval" + "github.com/johnkerl/miller/v6/pkg/mlrval" ) func TestBIF_md5(t *testing.T) { diff --git a/pkg/bifs/mathlib.go b/pkg/bifs/mathlib.go index b415cb809..1c5395f69 100644 --- a/pkg/bifs/mathlib.go +++ b/pkg/bifs/mathlib.go @@ -7,8 +7,8 @@ package bifs import ( "math" - "github.com/johnkerl/miller/pkg/lib" - "github.com/johnkerl/miller/pkg/mlrval" + "github.com/johnkerl/miller/v6/pkg/lib" + "github.com/johnkerl/miller/v6/pkg/mlrval" ) // ---------------------------------------------------------------- diff --git a/pkg/bifs/percentiles.go b/pkg/bifs/percentiles.go index cecb98aec..ef1083879 100644 --- a/pkg/bifs/percentiles.go +++ b/pkg/bifs/percentiles.go @@ -3,7 +3,7 @@ package bifs import ( "math" - "github.com/johnkerl/miller/pkg/mlrval" + "github.com/johnkerl/miller/v6/pkg/mlrval" ) func GetPercentileLinearlyInterpolated( diff --git a/pkg/bifs/random.go b/pkg/bifs/random.go index c85509da6..007174db9 100644 --- a/pkg/bifs/random.go +++ b/pkg/bifs/random.go @@ -3,8 +3,8 @@ package bifs import ( "math" - "github.com/johnkerl/miller/pkg/lib" - "github.com/johnkerl/miller/pkg/mlrval" + "github.com/johnkerl/miller/v6/pkg/lib" + "github.com/johnkerl/miller/v6/pkg/mlrval" ) func BIF_urand() *mlrval.Mlrval { diff --git a/pkg/bifs/regex.go b/pkg/bifs/regex.go index 52cab9ac5..2095003a6 100644 --- a/pkg/bifs/regex.go +++ b/pkg/bifs/regex.go @@ -3,8 +3,8 @@ package bifs import ( "strings" - "github.com/johnkerl/miller/pkg/lib" - "github.com/johnkerl/miller/pkg/mlrval" + "github.com/johnkerl/miller/v6/pkg/lib" + "github.com/johnkerl/miller/v6/pkg/mlrval" ) // BIF_ssub implements the ssub function -- no-frills string-replace, no @@ -52,11 +52,6 @@ func bif_ssub_gssub(input1, input2, input3 *mlrval.Mlrval, doAll bool, funcname // BIF_sub implements the sub function, with support for regexes and regex captures // of the form "\1" .. "\9". 
-// -// TODO: make a variant which allows compiling the regexp once and reusing it -// on each record. Likewise for other regex-using functions in this file. But -// first, do a profiling run to see how much time would be saved, and if this -// precomputing+caching would be worthwhile. func BIF_sub(input1, input2, input3 *mlrval.Mlrval) *mlrval.Mlrval { if input1.IsErrorOrAbsent() { return input1 @@ -81,7 +76,7 @@ func BIF_sub(input1, input2, input3 *mlrval.Mlrval) *mlrval.Mlrval { sregex := input2.AcquireStringValue() replacement := input3.AcquireStringValue() - stringOutput := lib.RegexSub(input, sregex, replacement) + stringOutput := lib.RegexStringSub(input, sregex, replacement) return mlrval.FromString(stringOutput) } @@ -111,10 +106,83 @@ func BIF_gsub(input1, input2, input3 *mlrval.Mlrval) *mlrval.Mlrval { sregex := input2.AcquireStringValue() replacement := input3.AcquireStringValue() - stringOutput := lib.RegexGsub(input, sregex, replacement) + stringOutput := lib.RegexStringGsub(input, sregex, replacement) return mlrval.FromString(stringOutput) } +func BIF_strmatch(input1, input2 *mlrval.Mlrval) *mlrval.Mlrval { + if !input1.IsLegit() { + return mlrval.FromNotStringError("strmatch", input1) + } + if !input2.IsLegit() { + return mlrval.FromNotStringError("strmatch", input2) + } + input1string := input1.String() + if !input2.IsStringOrVoid() { + return mlrval.FromNotStringError("strmatch", input2) + } + + boolOutput := lib.RegexStringMatchSimple(input1string, input2.AcquireStringValue()) + + return mlrval.FromBool(boolOutput) +} + +func BIF_strmatchx(input1, input2 *mlrval.Mlrval) *mlrval.Mlrval { + if !input1.IsLegit() { + return mlrval.FromNotStringError("strmatchx", input1) + } + if !input2.IsLegit() { + return mlrval.FromNotStringError("strmatchx", input2) + } + input1string := input1.String() + if !input2.IsStringOrVoid() { + return mlrval.FromNotStringError("strmatchx", input2) + } + + boolOutput, captures, starts, ends := 
lib.RegexStringMatchWithMapResults(input1string, input2.AcquireStringValue()) + + results := mlrval.NewMlrmap() + results.PutReference("matched", mlrval.FromBool(boolOutput)) + + captures_array := make([]*mlrval.Mlrval, len(captures)) + + if len(captures) > 0 { + for i := range captures { + if i == 0 { + results.PutReference("full_capture", mlrval.FromString(captures[i])) + } else { + captures_array[i] = mlrval.FromString(captures[i]) + } + } + + starts_array := make([]*mlrval.Mlrval, len(starts)) + for i := range starts { + if i == 0 { + results.PutReference("full_start", mlrval.FromInt(int64(starts[i]))) + } else { + starts_array[i] = mlrval.FromInt(int64(starts[i])) + } + } + + ends_array := make([]*mlrval.Mlrval, len(ends)) + for i := range ends { + if i == 0 { + results.PutReference("full_end", mlrval.FromInt(int64(ends[i]))) + } else { + ends_array[i] = mlrval.FromInt(int64(ends[i])) + } + } + + if len(captures) > 1 { + results.PutReference("captures", mlrval.FromArray(captures_array[1:])) + results.PutReference("starts", mlrval.FromArray(starts_array[1:])) + results.PutReference("ends", mlrval.FromArray(ends_array[1:])) + } + } + + return mlrval.FromMap(results) +} + // BIF_string_matches_regexp implements the =~ operator, with support for // setting regex-captures for later expressions to access using "\1" .. "\9". func BIF_string_matches_regexp(input1, input2 *mlrval.Mlrval) (retval *mlrval.Mlrval, captures []string) { @@ -129,11 +197,11 @@ func BIF_string_matches_regexp(input1, input2 *mlrval.Mlrval) (retval *mlrval.Ml return mlrval.FromNotStringError("=~", input2), nil } - boolOutput, captures := lib.RegexMatches(input1string, input2.AcquireStringValue()) + boolOutput, captures := lib.RegexStringMatchWithCaptures(input1string, input2.AcquireStringValue()) return mlrval.FromBool(boolOutput), captures } -// BIF_string_matches_regexp implements the !=~ operator. +// BIF_string_does_not_match_regexp implements the !=~ operator. 
func BIF_string_does_not_match_regexp(input1, input2 *mlrval.Mlrval) (retval *mlrval.Mlrval, captures []string) { output, captures := BIF_string_matches_regexp(input1, input2) if output.IsBool() { diff --git a/pkg/bifs/relative_time.go b/pkg/bifs/relative_time.go index f36258ffe..d05ce3900 100644 --- a/pkg/bifs/relative_time.go +++ b/pkg/bifs/relative_time.go @@ -5,7 +5,7 @@ import ( "math" "strings" - "github.com/johnkerl/miller/pkg/mlrval" + "github.com/johnkerl/miller/v6/pkg/mlrval" ) func BIF_dhms2sec(input1 *mlrval.Mlrval) *mlrval.Mlrval { diff --git a/pkg/bifs/stats.go b/pkg/bifs/stats.go index ff3531a31..bc9bc6e43 100644 --- a/pkg/bifs/stats.go +++ b/pkg/bifs/stats.go @@ -4,8 +4,8 @@ import ( "math" "sort" - "github.com/johnkerl/miller/pkg/lib" - "github.com/johnkerl/miller/pkg/mlrval" + "github.com/johnkerl/miller/v6/pkg/lib" + "github.com/johnkerl/miller/v6/pkg/mlrval" ) // ---------------------------------------------------------------- @@ -636,7 +636,7 @@ func bif_percentiles_impl( outputs := make([]*mlrval.Mlrval, len(ps)) - for i, _ := range ps { + for i := range ps { p, ok := ps[i].GetNumericToFloatValue() if !ok { outputs[i] = type_error_named_argument(funcname, "numeric", "percentile", ps[i]) @@ -655,7 +655,7 @@ func bif_percentiles_impl( return mlrval.FromArray(outputs) } else { m := mlrval.NewMlrmap() - for i, _ := range ps { + for i := range ps { sp := ps[i].String() m.PutCopy(sp, outputs[i]) } diff --git a/pkg/bifs/stats_test.go b/pkg/bifs/stats_test.go index 735ceab83..a8e846897 100644 --- a/pkg/bifs/stats_test.go +++ b/pkg/bifs/stats_test.go @@ -6,7 +6,7 @@ import ( "github.com/stretchr/testify/assert" - "github.com/johnkerl/miller/pkg/mlrval" + "github.com/johnkerl/miller/v6/pkg/mlrval" ) func stats_test_array(n int) *mlrval.Mlrval { diff --git a/pkg/bifs/strings.go b/pkg/bifs/strings.go index cd68ee480..73aef62bf 100644 --- a/pkg/bifs/strings.go +++ b/pkg/bifs/strings.go @@ -7,8 +7,8 @@ import ( "strconv" "strings" - 
"github.com/johnkerl/miller/pkg/lib" - "github.com/johnkerl/miller/pkg/mlrval" + "github.com/johnkerl/miller/v6/pkg/lib" + "github.com/johnkerl/miller/v6/pkg/mlrval" ) // ================================================================ @@ -344,11 +344,12 @@ func BIF_capitalize(input1 *mlrval.Mlrval) *mlrval.Mlrval { // ---------------------------------------------------------------- func BIF_clean_whitespace(input1 *mlrval.Mlrval) *mlrval.Mlrval { - return BIF_strip( + mv := BIF_strip( BIF_collapse_whitespace_regexp( input1, _whitespace_regexp, ), ) + return mlrval.FromInferredType(mv.String()) } // ================================================================ diff --git a/pkg/bifs/system.go b/pkg/bifs/system.go index d56f0bb66..e734f2998 100644 --- a/pkg/bifs/system.go +++ b/pkg/bifs/system.go @@ -6,9 +6,9 @@ import ( "runtime" "strings" - "github.com/johnkerl/miller/pkg/mlrval" - "github.com/johnkerl/miller/pkg/platform" - "github.com/johnkerl/miller/pkg/version" + "github.com/johnkerl/miller/v6/pkg/mlrval" + "github.com/johnkerl/miller/v6/pkg/platform" + "github.com/johnkerl/miller/v6/pkg/version" ) func BIF_version() *mlrval.Mlrval { @@ -102,3 +102,25 @@ func BIF_exec(mlrvals []*mlrval.Mlrval) *mlrval.Mlrval { outputString := strings.TrimRight(string(outputBytes), "\n") return mlrval.FromString(outputString) } + +func BIF_stat(input1 *mlrval.Mlrval) *mlrval.Mlrval { + if !input1.IsStringOrVoid() { + return mlrval.FromNotStringError("stat", input1) + } + path := input1.AcquireStringValue() + + fileInfo, err := os.Stat(path) + + if err != nil { + return mlrval.FromError(err) + } + + output := mlrval.NewMlrmap() + output.PutReference("name", mlrval.FromString(fileInfo.Name())) + output.PutReference("size", mlrval.FromInt(fileInfo.Size())) + output.PutReference("mode", mlrval.FromIntShowingOctal(int64(fileInfo.Mode()))) + output.PutReference("modtime", mlrval.FromInt(fileInfo.ModTime().UTC().Unix())) + output.PutReference("isdir", 
mlrval.FromBool(fileInfo.IsDir())) + + return mlrval.FromMap(output) +} diff --git a/pkg/bifs/types.go b/pkg/bifs/types.go index 87ee80448..d3e8d61ef 100644 --- a/pkg/bifs/types.go +++ b/pkg/bifs/types.go @@ -5,9 +5,9 @@ import ( "math" "os" - "github.com/johnkerl/miller/pkg/lib" - "github.com/johnkerl/miller/pkg/mlrval" - "github.com/johnkerl/miller/pkg/types" + "github.com/johnkerl/miller/v6/pkg/lib" + "github.com/johnkerl/miller/v6/pkg/mlrval" + "github.com/johnkerl/miller/v6/pkg/types" ) // ================================================================ @@ -37,7 +37,7 @@ func float_to_int(input1 *mlrval.Mlrval) *mlrval.Mlrval { } func bool_to_int(input1 *mlrval.Mlrval) *mlrval.Mlrval { - if input1.AcquireBoolValue() == true { + if input1.AcquireBoolValue() { return mlrval.FromInt(1) } else { return mlrval.FromInt(0) @@ -92,7 +92,7 @@ func float_to_int_with_base(input1, input2 *mlrval.Mlrval) *mlrval.Mlrval { } func bool_to_int_with_base(input1, input2 *mlrval.Mlrval) *mlrval.Mlrval { - if input1.AcquireBoolValue() == true { + if input1.AcquireBoolValue() { return mlrval.FromInt(1) } else { return mlrval.FromInt(0) @@ -146,7 +146,7 @@ func int_to_float(input1 *mlrval.Mlrval) *mlrval.Mlrval { } func bool_to_float(input1 *mlrval.Mlrval) *mlrval.Mlrval { - if input1.AcquireBoolValue() == true { + if input1.AcquireBoolValue() { return mlrval.FromFloat(1.0) } else { return mlrval.FromFloat(0.0) diff --git a/pkg/cli/flag_types.go b/pkg/cli/flag_types.go index 590487d43..f1eef4772 100644 --- a/pkg/cli/flag_types.go +++ b/pkg/cli/flag_types.go @@ -42,8 +42,8 @@ import ( "sort" "strings" - "github.com/johnkerl/miller/pkg/colorizer" - "github.com/johnkerl/miller/pkg/lib" + "github.com/johnkerl/miller/v6/pkg/colorizer" + "github.com/johnkerl/miller/v6/pkg/lib" ) // ---------------------------------------------------------------- diff --git a/pkg/cli/flatten_unflatten.go b/pkg/cli/flatten_unflatten.go index cd92ec6f7..006b65f53 100644 --- a/pkg/cli/flatten_unflatten.go +++ 
b/pkg/cli/flatten_unflatten.go @@ -52,6 +52,10 @@ package cli // * If input is non-JSON and output is JSON: // o Default is to auto-unflatten at output. // o There is a --no-auto-unflatten for those who want it. +// +// * Overriding these: if the last verb the user has explicitly provided is +// flatten, don't undo that by putting an unflatten right after. +// // ================================================================ func DecideFinalFlatten(writerOptions *TWriterOptions) bool { @@ -64,7 +68,22 @@ func DecideFinalFlatten(writerOptions *TWriterOptions) bool { return false } -func DecideFinalUnflatten(options *TOptions) bool { +func DecideFinalUnflatten( + options *TOptions, + verbSequences [][]string, +) bool { + + numVerbs := len(verbSequences) + if numVerbs > 0 { + lastVerbSequence := verbSequences[numVerbs-1] + if len(lastVerbSequence) > 0 { + lastVerbName := lastVerbSequence[0] + if lastVerbName == "flatten" { + return false + } + } + } + ifmt := options.ReaderOptions.InputFileFormat ofmt := options.WriterOptions.OutputFileFormat diff --git a/pkg/cli/option_parse.go b/pkg/cli/option_parse.go index c9732b025..0070b60c8 100644 --- a/pkg/cli/option_parse.go +++ b/pkg/cli/option_parse.go @@ -8,6 +8,7 @@ package cli import ( "bufio" + "errors" "fmt" "io" "os" @@ -15,9 +16,9 @@ import ( "github.com/mattn/go-isatty" - "github.com/johnkerl/miller/pkg/colorizer" - "github.com/johnkerl/miller/pkg/lib" - "github.com/johnkerl/miller/pkg/mlrval" + "github.com/johnkerl/miller/v6/pkg/colorizer" + "github.com/johnkerl/miller/v6/pkg/lib" + "github.com/johnkerl/miller/v6/pkg/mlrval" ) // FinalizeReaderOptions does a few things. @@ -29,7 +30,7 @@ import ( // - IFS/IPS can have escapes like "\x1f" which aren't valid regex literals // so we unhex them. For example, from "\x1f" -- the four bytes '\', 'x', '1', 'f' // -- to the single byte with hex code 0x1f. 
-func FinalizeReaderOptions(readerOptions *TReaderOptions) { +func FinalizeReaderOptions(readerOptions *TReaderOptions) error { readerOptions.IFS = lib.UnhexStringLiteral(readerOptions.IFS) readerOptions.IPS = lib.UnhexStringLiteral(readerOptions.IPS) @@ -57,12 +58,17 @@ func FinalizeReaderOptions(readerOptions *TReaderOptions) { readerOptions.IFS = lib.UnbackslashStringLiteral(readerOptions.IFS) readerOptions.IPS = lib.UnbackslashStringLiteral(readerOptions.IPS) readerOptions.IRS = lib.UnbackslashStringLiteral(readerOptions.IRS) + + if readerOptions.IRS == "" { + return errors.New("empty IRS") + } + return nil } // FinalizeWriterOptions unbackslashes OPS, OFS, and ORS. This is because // because the '\n' at the command line which is Go "\\n" (a backslash and an // n) needs to become the single newline character., and likewise for "\t", etc. -func FinalizeWriterOptions(writerOptions *TWriterOptions) { +func FinalizeWriterOptions(writerOptions *TWriterOptions) error { if !writerOptions.ofsWasSpecified { writerOptions.OFS = defaultFSes[writerOptions.OutputFileFormat] } @@ -84,6 +90,8 @@ func FinalizeWriterOptions(writerOptions *TWriterOptions) { writerOptions.OFS = lib.UnbackslashStringLiteral(writerOptions.OFS) writerOptions.OPS = lib.UnbackslashStringLiteral(writerOptions.OPS) writerOptions.ORS = lib.UnbackslashStringLiteral(writerOptions.ORS) + + return nil } // ================================================================ @@ -96,6 +104,7 @@ var FLAG_TABLE = FlagTable{ &CSVTSVOnlyFlagSection, &JSONOnlyFlagSection, &PPRINTOnlyFlagSection, + &DKVPOnlyFlagSection, &CompressedDataFlagSection, &CommentsInDataFlagSection, &OutputColorizationFlagSection, @@ -452,7 +461,7 @@ var JSONOnlyFlagSection = FlagSection{ { name: "--no-jlistwrap", - help: "Wrap JSON output in outermost `[ ]`. This is the default for JSON Lines output format.", + help: "Do not wrap JSON output in outermost `[ ]`. 
This is the default for JSON Lines output format.", parser: func(args []string, argc int, pargi *int, options *TOptions) { options.WriterOptions.WrapJSONOutputInOuterList = false *pargi += 1 @@ -494,8 +503,44 @@ var PPRINTOnlyFlagSection = FlagSection{ }, { - name: "--barred", - help: "Prints a border around PPRINT output (not available for input).", + name: "--barred", + altNames: []string{"--barred-output"}, + help: "Prints a border around PPRINT output.", + parser: func(args []string, argc int, pargi *int, options *TOptions) { + options.WriterOptions.BarredPprintOutput = true + *pargi += 1 + }, + }, + + { + name: "--barred-input", + help: "When used in conjunction with --pprint, accepts barred input.", + parser: func(args []string, argc int, pargi *int, options *TOptions) { + options.ReaderOptions.BarredPprintInput = true + options.ReaderOptions.IFS = "|" + *pargi += 1 + }, + }, + }, +} + +// ================================================================ +// DKVP-ONLY FLAGS + +func DKVPOnlyPrintInfo() { + fmt.Println("These are flags which are applicable to DKVP format.") +} + +func init() { DKVPOnlyFlagSection.Sort() } + +var DKVPOnlyFlagSection = FlagSection{ + name: "DKVP-only flags", + infoPrinter: DKVPOnlyPrintInfo, + flags: []Flag{ + + { + name: "--incr-key", + help: "Without this option, keyless DKVP fields are keyed by field number. For example: `a=10,b=20,30,d=40,50` is ingested as `$a=10,$b=20,$3=30,$d=40,$5=50`. With this option, they're keyed by a running counter of keyless fields. 
For example: `a=10,b=20,30,d=40,50` is ingested as `$a=10,$b=20,$1=30,$d=40,$2=50`.", parser: func(args []string, argc int, pargi *int, options *TOptions) { options.WriterOptions.BarredPprintOutput = true *pargi += 1 @@ -751,6 +796,9 @@ var FileFormatFlagSection = FlagSection{ parser: func(args []string, argc int, pargi *int, options *TOptions) { CheckArgCount(args, *pargi, argc, 2) options.ReaderOptions.InputFileFormat = args[*pargi+1] + if options.ReaderOptions.InputFileFormat == "md" { + options.ReaderOptions.InputFileFormat = "markdown" // alias + } *pargi += 2 }, }, @@ -812,6 +860,9 @@ var FileFormatFlagSection = FlagSection{ parser: func(args []string, argc int, pargi *int, options *TOptions) { CheckArgCount(args, *pargi, argc, 2) options.WriterOptions.OutputFileFormat = args[*pargi+1] + if options.WriterOptions.OutputFileFormat == "md" { + options.WriterOptions.OutputFileFormat = "markdown" // alias + } *pargi += 2 }, }, @@ -885,8 +936,19 @@ var FileFormatFlagSection = FlagSection{ }, { - name: "--omd", - help: "Use markdown-tabular format for output data.", + name: "--imd", + altNames: []string{"--imarkdown"}, + help: "Use markdown-tabular format for input data.", + parser: func(args []string, argc int, pargi *int, options *TOptions) { + options.ReaderOptions.InputFileFormat = "markdown" + *pargi += 1 + }, + }, + + { + name: "--omd", + altNames: []string{"--omarkdown"}, + help: "Use markdown-tabular format for output data.", parser: func(args []string, argc int, pargi *int, options *TOptions) { options.WriterOptions.OutputFileFormat = "markdown" *pargi += 1 @@ -917,9 +979,7 @@ var FileFormatFlagSection = FlagSection{ name: "--ojsonl", help: "Use JSON Lines format for output data.", parser: func(args []string, argc int, pargi *int, options *TOptions) { - options.WriterOptions.OutputFileFormat = "json" - options.WriterOptions.WrapJSONOutputInOuterList = false - options.WriterOptions.JSONOutputMultiline = false + options.WriterOptions.OutputFileFormat = 
"jsonl" *pargi += 1 }, }, @@ -973,7 +1033,7 @@ var FileFormatFlagSection = FlagSection{ { name: "--csv", help: "Use CSV format for input and output data.", - altNames: []string{"-c"}, + altNames: []string{"-c", "--c2c"}, parser: func(args []string, argc int, pargi *int, options *TOptions) { options.ReaderOptions.InputFileFormat = "csv" options.WriterOptions.OutputFileFormat = "csv" @@ -994,7 +1054,7 @@ var FileFormatFlagSection = FlagSection{ { name: "--tsv", help: "Use TSV format for input and output data.", - altNames: []string{"-t"}, + altNames: []string{"-t", "--t2t"}, parser: func(args []string, argc int, pargi *int, options *TOptions) { options.ReaderOptions.InputFileFormat = "tsv" options.WriterOptions.OutputFileFormat = "tsv" @@ -1057,8 +1117,9 @@ var FileFormatFlagSection = FlagSection{ }, { - name: "--dkvp", - help: "Use DKVP format for input and output data.", + name: "--dkvp", + help: "Use DKVP format for input and output data.", + altNames: []string{"--d2d"}, parser: func(args []string, argc int, pargi *int, options *TOptions) { options.ReaderOptions.InputFileFormat = "dkvp" options.WriterOptions.OutputFileFormat = "dkvp" @@ -1069,7 +1130,7 @@ var FileFormatFlagSection = FlagSection{ { name: "--json", help: "Use JSON format for input and output data.", - altNames: []string{"-j"}, + altNames: []string{"-j", "--j2j"}, parser: func(args []string, argc int, pargi *int, options *TOptions) { options.ReaderOptions.InputFileFormat = "json" options.WriterOptions.OutputFileFormat = "json" @@ -1080,20 +1141,20 @@ var FileFormatFlagSection = FlagSection{ }, { - name: "--jsonl", - help: "Use JSON Lines format for input and output data.", + name: "--jsonl", + help: "Use JSON Lines format for input and output data.", + altNames: []string{"--l2l"}, parser: func(args []string, argc int, pargi *int, options *TOptions) { options.ReaderOptions.InputFileFormat = "json" - options.WriterOptions.OutputFileFormat = "json" - options.WriterOptions.WrapJSONOutputInOuterList = 
false - options.WriterOptions.JSONOutputMultiline = false + options.WriterOptions.OutputFileFormat = "jsonl" *pargi += 1 }, }, { - name: "--nidx", - help: "Use NIDX format for input and output data.", + name: "--nidx", + help: "Use NIDX format for input and output data.", + altNames: []string{"--n2n"}, parser: func(args []string, argc int, pargi *int, options *TOptions) { options.ReaderOptions.InputFileFormat = "nidx" options.WriterOptions.OutputFileFormat = "nidx" @@ -1102,8 +1163,9 @@ var FileFormatFlagSection = FlagSection{ }, { - name: "--xtab", - help: "Use XTAB format for input and output data.", + name: "--xtab", + help: "Use XTAB format for input and output data.", + altNames: []string{"--x2x"}, parser: func(args []string, argc int, pargi *int, options *TOptions) { options.ReaderOptions.InputFileFormat = "xtab" options.WriterOptions.OutputFileFormat = "xtab" @@ -1121,8 +1183,9 @@ var FileFormatFlagSection = FlagSection{ }, { - name: "--pprint", - help: "Use PPRINT format for input and output data.", + name: "--pprint", + help: "Use PPRINT format for input and output data.", + altNames: []string{"--p2p"}, parser: func(args []string, argc int, pargi *int, options *TOptions) { options.ReaderOptions.InputFileFormat = "pprint" options.ReaderOptions.IFS = " " @@ -1140,19 +1203,19 @@ var FileFormatFlagSection = FlagSection{ func FormatConversionKeystrokeSaverPrintInfo() { fmt.Println(`As keystroke-savers for format-conversion you may use the following. The letters c, t, j, l, d, n, x, p, and m refer to formats CSV, TSV, DKVP, NIDX, -JSON, JSON Lines, XTAB, PPRINT, and markdown, respectively. Note that markdown -format is available for output only. +JSON, JSON Lines, XTAB, PPRINT, and markdown, respectively. 
-| In\out | CSV | TSV | JSON | JSONL | DKVP | NIDX | XTAB | PPRINT | Markdown | -+--------+-------+-------+--------+--------+--------+--------+--------+----------+ -| CSV | | --c2t | --c2j | --c2l | --c2d | --c2n | --c2x | --c2p | --c2m | -| TSV | --t2c | | --t2j | --t2l | --t2d | --t2n | --t2x | --t2p | --t2m | -| JSON | --j2c | --j2t | | --j2l | --j2d | --j2n | --j2x | --j2p | --j2m | -| JSONL | --l2c | --l2t | | | --l2d | --l2n | --l2x | --l2p | --l2m | -| DKVP | --d2c | --d2t | --d2j | --d2l | | --d2n | --d2x | --d2p | --d2m | -| NIDX | --n2c | --n2t | --n2j | --n2l | --n2d | | --n2x | --n2p | --n2m | -| XTAB | --x2c | --x2t | --x2j | --x2l | --x2d | --x2n | | --x2p | --x2m | -| PPRINT | --p2c | --p2t | --p2j | --p2l | --p2d | --p2n | --p2x | | --p2m |`) +| In\out | CSV | TSV | JSON | JSONL | DKVP | NIDX | XTAB | PPRINT | Markdown | ++----------+----------+----------+----------+-------+-------+-------+-------+--------+----------| +| CSV | --c2c,-c | --c2t | --c2j | --c2l | --c2d | --c2n | --c2x | --c2p | --c2m | +| TSV | --t2c | --t2t,-t | --t2j | --t2l | --t2d | --t2n | --t2x | --t2p | --t2m | +| JSON | --j2c | --j2t | --j2j,-j | --j2l | --j2d | --j2n | --j2x | --j2p | --j2m | +| JSONL | --l2c | --l2t | --l2j | --l2l | --l2d | --l2n | --l2x | --l2p | --l2m | +| DKVP | --d2c | --d2t | --d2j | --d2l | --d2d | --d2n | --d2x | --d2p | --d2m | +| NIDX | --n2c | --n2t | --n2j | --n2l | --n2d | --n2n | --n2x | --n2p | --n2m | +| XTAB | --x2c | --x2t | --x2j | --x2l | --x2d | --x2n | --x2x | --x2p | --x2m | +| PPRINT | --p2c | --p2t | --p2j | --p2l | --p2d | --p2n | --p2x | --p2p | --p2m | +| Markdown | --m2c | --m2t | --m2j | --m2l | --m2d | --m2n | --m2x | --m2p | |`) } func init() { FormatConversionKeystrokeSaverFlagSection.Sort() } @@ -1277,6 +1340,18 @@ var FormatConversionKeystrokeSaverFlagSection = FlagSection{ *pargi += 1 }, }, + { + name: "--c2m", + help: "Use CSV for input, markdown-tabular for output.", + // For format-conversion keystroke-savers, a matrix 
is plenty -- we don't + // need to print a tedious 60-line list. + suppressFlagEnumeration: true, + parser: func(args []string, argc int, pargi *int, options *TOptions) { + options.ReaderOptions.InputFileFormat = "csv" + options.WriterOptions.OutputFileFormat = "markdown" + *pargi += 1 + }, + }, { name: "--c2b", help: "Use CSV for input, PPRINT with `--barred` for output.", @@ -1397,6 +1472,18 @@ var FormatConversionKeystrokeSaverFlagSection = FlagSection{ *pargi += 1 }, }, + { + name: "--t2m", + help: "Use TSV for input, markdown tabular for output.", + // For format-conversion keystroke-savers, a matrix is plenty -- we don't + // need to print a tedious 60-line list. + suppressFlagEnumeration: true, + parser: func(args []string, argc int, pargi *int, options *TOptions) { + options.ReaderOptions.InputFileFormat = "tsv" + options.WriterOptions.OutputFileFormat = "markdown" + *pargi += 1 + }, + }, { name: "--t2b", help: "Use TSV for input, PPRINT with `--barred` for output.", @@ -1516,6 +1603,18 @@ var FormatConversionKeystrokeSaverFlagSection = FlagSection{ *pargi += 1 }, }, + { + name: "--d2m", + help: "Use DKVP for input, markdown tabular for output.", + // For format-conversion keystroke-savers, a matrix is plenty -- we don't + // need to print a tedious 60-line list. + suppressFlagEnumeration: true, + parser: func(args []string, argc int, pargi *int, options *TOptions) { + options.ReaderOptions.InputFileFormat = "dkvp" + options.WriterOptions.OutputFileFormat = "markdown" + *pargi += 1 + }, + }, { name: "--d2b", help: "Use DKVP for input, PPRINT with `--barred` for output.", @@ -1631,6 +1730,18 @@ var FormatConversionKeystrokeSaverFlagSection = FlagSection{ *pargi += 1 }, }, + { + name: "--n2m", + help: "Use NIDX for input, markdown tabular for output.", + // For format-conversion keystroke-savers, a matrix is plenty -- we don't + // need to print a tedious 60-line list. 
+ suppressFlagEnumeration: true, + parser: func(args []string, argc int, pargi *int, options *TOptions) { + options.ReaderOptions.InputFileFormat = "nidx" + options.WriterOptions.OutputFileFormat = "markdown" + *pargi += 1 + }, + }, { name: "--n2b", help: "Use NIDX for input, PPRINT with `--barred` for output.", @@ -1745,6 +1856,18 @@ var FormatConversionKeystrokeSaverFlagSection = FlagSection{ *pargi += 1 }, }, + { + name: "--j2m", + help: "Use JSON for input, markdown-tabular for output.", + // For format-conversion keystroke-savers, a matrix is plenty -- we don't + // need to print a tedious 60-line list. + suppressFlagEnumeration: true, + parser: func(args []string, argc int, pargi *int, options *TOptions) { + options.ReaderOptions.InputFileFormat = "json" + options.WriterOptions.OutputFileFormat = "markdown" + *pargi += 1 + }, + }, { name: "--j2b", help: "Use JSON for input, PPRINT with --barred for output.", @@ -1856,6 +1979,18 @@ var FormatConversionKeystrokeSaverFlagSection = FlagSection{ *pargi += 1 }, }, + { + name: "--l2m", + help: "Use JSON Lines for input, markdown-tabular for output.", + // For format-conversion keystroke-savers, a matrix is plenty -- we don't + // need to print a tedious 60-line list. + suppressFlagEnumeration: true, + parser: func(args []string, argc int, pargi *int, options *TOptions) { + options.ReaderOptions.InputFileFormat = "json" + options.WriterOptions.OutputFileFormat = "markdown" + *pargi += 1 + }, + }, { name: "--l2b", help: "Use JSON Lines for input, PPRINT with --barred for output.", @@ -2012,6 +2147,115 @@ var FormatConversionKeystrokeSaverFlagSection = FlagSection{ }, }, + { + name: "--m2c", + help: "Use markdown-tabular for input, CSV for output.", + // For format-conversion keystroke-savers, a matrix is plenty -- we don't + // need to print a tedious 60-line list. 
+ suppressFlagEnumeration: true, + parser: func(args []string, argc int, pargi *int, options *TOptions) { + options.ReaderOptions.InputFileFormat = "markdown" + options.WriterOptions.OutputFileFormat = "csv" + options.ReaderOptions.ifsWasSpecified = true + options.WriterOptions.orsWasSpecified = true + *pargi += 1 + }, + }, + { + name: "--m2t", + help: "Use markdown-tabular for input, TSV for output.", + // For format-conversion keystroke-savers, a matrix is plenty -- we don't + // need to print a tedious 60-line list. + suppressFlagEnumeration: true, + parser: func(args []string, argc int, pargi *int, options *TOptions) { + options.ReaderOptions.InputFileFormat = "markdown" + options.WriterOptions.OutputFileFormat = "tsv" + options.ReaderOptions.ifsWasSpecified = true + *pargi += 1 + }, + }, + { + name: "--m2d", + help: "Use markdown-tabular for input, DKVP for output.", + // For format-conversion keystroke-savers, a matrix is plenty -- we don't + // need to print a tedious 60-line list. + suppressFlagEnumeration: true, + parser: func(args []string, argc int, pargi *int, options *TOptions) { + options.ReaderOptions.InputFileFormat = "markdown" + options.WriterOptions.OutputFileFormat = "dkvp" + options.ReaderOptions.ifsWasSpecified = true + *pargi += 1 + }, + }, + { + name: "--m2n", + help: "Use markdown-tabular for input, NIDX for output.", + // For format-conversion keystroke-savers, a matrix is plenty -- we don't + // need to print a tedious 60-line list. + suppressFlagEnumeration: true, + parser: func(args []string, argc int, pargi *int, options *TOptions) { + options.ReaderOptions.InputFileFormat = "markdown" + options.WriterOptions.OutputFileFormat = "nidx" + options.ReaderOptions.ifsWasSpecified = true + *pargi += 1 + }, + }, + { + name: "--m2j", + help: "Use markdown-tabular for input, JSON for output.", + // For format-conversion keystroke-savers, a matrix is plenty -- we don't + // need to print a tedious 60-line list. 
+ suppressFlagEnumeration: true, + parser: func(args []string, argc int, pargi *int, options *TOptions) { + options.ReaderOptions.InputFileFormat = "markdown" + options.WriterOptions.OutputFileFormat = "json" + options.WriterOptions.WrapJSONOutputInOuterList = true + options.WriterOptions.JSONOutputMultiline = true + options.ReaderOptions.ifsWasSpecified = true + *pargi += 1 + }, + }, + { + name: "--m2l", + help: "Use markdown-tabular for input, JSON Lines for output.", + // For format-conversion keystroke-savers, a matrix is plenty -- we don't + // need to print a tedious 60-line list. + suppressFlagEnumeration: true, + parser: func(args []string, argc int, pargi *int, options *TOptions) { + options.ReaderOptions.InputFileFormat = "markdown" + options.WriterOptions.OutputFileFormat = "json" + options.WriterOptions.WrapJSONOutputInOuterList = false + options.WriterOptions.JSONOutputMultiline = false + options.ReaderOptions.ifsWasSpecified = true + *pargi += 1 + }, + }, + { + name: "--m2x", + help: "Use markdown-tabular for input, XTAB for output.", + // For format-conversion keystroke-savers, a matrix is plenty -- we don't + // need to print a tedious 60-line list. + suppressFlagEnumeration: true, + parser: func(args []string, argc int, pargi *int, options *TOptions) { + options.ReaderOptions.InputFileFormat = "markdown" + options.WriterOptions.OutputFileFormat = "xtab" + options.ReaderOptions.ifsWasSpecified = true + *pargi += 1 + }, + }, + { + name: "--m2p", + help: "Use markdown-tabular for input, PPRINT for output.", + // For format-conversion keystroke-savers, a matrix is plenty -- we don't + // need to print a tedious 60-line list. 
+ suppressFlagEnumeration: true, + parser: func(args []string, argc int, pargi *int, options *TOptions) { + options.ReaderOptions.InputFileFormat = "markdown" + options.WriterOptions.OutputFileFormat = "pprint" + *pargi += 1 + }, + }, + { name: "--x2c", help: "Use XTAB for input, CSV for output.", @@ -2101,6 +2345,18 @@ var FormatConversionKeystrokeSaverFlagSection = FlagSection{ *pargi += 1 }, }, + { + name: "--x2m", + help: "Use XTAB for input, markdown-tabular for output.", + // For format-conversion keystroke-savers, a matrix is plenty -- we don't + // need to print a tedious 60-line list. + suppressFlagEnumeration: true, + parser: func(args []string, argc int, pargi *int, options *TOptions) { + options.ReaderOptions.InputFileFormat = "xtab" + options.WriterOptions.OutputFileFormat = "markdown" + *pargi += 1 + }, + }, { name: "--x2b", help: "Use XTAB for input, PPRINT with `--barred` for output.", @@ -2148,7 +2404,7 @@ var CSVTSVOnlyFlagSection = FlagSection{ altNames: []string{"--no-implicit-tsv-header"}, help: "Opposite of `--implicit-csv-header`. This is the default anyway -- the main use is for the flags to `mlr join` if you have main file(s) which are headerless but you want to join in on a file which does have a CSV/TSV header. Then you could use `mlr --csv --implicit-csv-header join --no-implicit-csv-header -l your-join-in-with-header.csv ... your-headerless.csv`.", parser: func(args []string, argc int, pargi *int, options *TOptions) { - options.ReaderOptions.UseImplicitCSVHeader = false + options.ReaderOptions.UseImplicitHeader = false *pargi += 1 }, }, @@ -2163,12 +2419,21 @@ var CSVTSVOnlyFlagSection = FlagSection{ }, }, + { + name: "--no-auto-unsparsify", + help: "For CSV/TSV output: if the record keys change from one row to another, emit a blank line and a new header line. 
This is non-compliant with RFC 4180 but it is helpful for heterogeneous data.", + parser: func(args []string, argc int, pargi *int, options *TOptions) { + options.WriterOptions.NoAutoUnsparsify = true + *pargi += 1 + }, + }, + { name: "--implicit-csv-header", altNames: []string{"--headerless-csv-input", "--hi", "--implicit-tsv-header"}, help: "Use 1,2,3,... as field labels, rather than from line 1 of input files. Tip: combine with `label` to recreate missing headers.", parser: func(args []string, argc int, pargi *int, options *TOptions) { - options.ReaderOptions.UseImplicitCSVHeader = true + options.ReaderOptions.UseImplicitHeader = true *pargi += 1 }, }, @@ -2178,7 +2443,7 @@ var CSVTSVOnlyFlagSection = FlagSection{ altNames: []string{"--ho", "--headerless-tsv-output"}, help: "Print only CSV/TSV data lines; do not print CSV/TSV header lines.", parser: func(args []string, argc int, pargi *int, options *TOptions) { - options.WriterOptions.HeaderlessCSVOutput = true + options.WriterOptions.HeaderlessOutput = true *pargi += 1 }, }, @@ -2187,8 +2452,8 @@ var CSVTSVOnlyFlagSection = FlagSection{ name: "-N", help: "Keystroke-saver for `--implicit-csv-header --headerless-csv-output`.", parser: func(args []string, argc int, pargi *int, options *TOptions) { - options.ReaderOptions.UseImplicitCSVHeader = true - options.WriterOptions.HeaderlessCSVOutput = true + options.ReaderOptions.UseImplicitHeader = true + options.WriterOptions.HeaderlessOutput = true *pargi += 1 }, }, @@ -2409,7 +2674,7 @@ var CommentsInDataFlagSection = FlagSection{ { name: "--skip-comments-with", arg: "{string}", - help: "Ignore commented lines within input, with specified prefix.", + help: "Ignore commented lines within input, with specified prefix.
For CSV input format, the prefix must be a single character.", parser: func(args []string, argc int, pargi *int, options *TOptions) { CheckArgCount(args, *pargi, argc, 2) options.ReaderOptions.CommentString = args[*pargi+1] @@ -2431,7 +2696,7 @@ var CommentsInDataFlagSection = FlagSection{ { name: "--pass-comments-with", arg: "{string}", - help: "Immediately print commented lines within input, with specified prefix.", + help: "Immediately print commented lines within input, with specified prefix. For CSV input format, the prefix must be a single character.", parser: func(args []string, argc int, pargi *int, options *TOptions) { CheckArgCount(args, *pargi, argc, 2) options.ReaderOptions.CommentString = args[*pargi+1] @@ -2477,8 +2742,8 @@ Mechanisms for coloring: How you can control colorization: * Suppression/unsuppression: - * Environment variable ` + "`export MLR_NO_COLOR=true`" + ` means don't color - even if stdout+TTY. + * Environment variable ` + "`export MLR_NO_COLOR=true` or `export NO_COLOR=true`" + ` + means don't color even if stdout+TTY. * Environment variable ` + "`export MLR_ALWAYS_COLOR=true`" + ` means do color even if not stdout+TTY. For example, you might want to use this when piping mlr output to ` + "`less -r`" + `. 
@@ -2639,7 +2904,7 @@ var OutputColorizationFlagSection = FlagSection{ func FlattenUnflattenPrintInfo() { fmt.Println("These flags control how Miller converts record values which are maps or arrays, when input is JSON and output is non-JSON (flattening) or input is non-JSON and output is JSON (unflattening).") fmt.Println() - fmt.Println("See the Flatten/unflatten doc page for more information.") + fmt.Println("See the flatten/unflatten doc page https://miller.readthedocs.io/en/latest/flatten-unflatten for more information.") } func init() { FlattenUnflattenFlagSection.Sort() } @@ -2663,7 +2928,7 @@ var FlattenUnflattenFlagSection = FlagSection{ { name: "--no-auto-flatten", - help: "When output is non-JSON, suppress the default auto-flatten behavior. Default: if `$y = [7,8,9]` then this flattens to `y.1=7,y.2=8,y.3=9, and similarly for maps. With `--no-auto-flatten`, instead we get `$y=[1, 2, 3]`.", + help: "When output is non-JSON, suppress the default auto-flatten behavior. Default: if `$y = [7,8,9]` then this flattens to `y.1=7,y.2=8,y.3=9`, and similarly for maps. With `--no-auto-flatten`, instead we get `$y=[1, 2, 3]`.", parser: func(args []string, argc int, pargi *int, options *TOptions) { options.WriterOptions.AutoFlatten = false *pargi += 1 @@ -2672,7 +2937,7 @@ var FlattenUnflattenFlagSection = FlagSection{ { name: "--no-auto-unflatten", - help: "When input non-JSON and output is JSON, suppress the default auto-unflatten behavior. Default: if the input has `y.1=7,y.2=8,y.3=9` then this unflattens to `$y=[7,8,9]`. flattens to `y.1=7,y.2=8,y.3=9. With `--no-auto-flatten`, instead we get `${y.1}=7,${y.2}=8,${y.3}=9`.", + help: "When input is non-JSON and output is JSON, suppress the default auto-unflatten behavior. Default: if the input has `y.1=7,y.2=8,y.3=9` then this unflattens to `$y=[7,8,9]`. 
With `--no-auto-unflatten`, instead we get `${y.1}=7,${y.2}=8,${y.3}=9`.", parser: func(args []string, argc int, pargi *int, options *TOptions) { options.WriterOptions.AutoUnflatten = false *pargi += 1 @@ -3102,5 +3367,13 @@ has its own overhead.`, *pargi += 2 }, }, + + { + name: "--norc", + help: "Do not load a .mlrrc file.", + parser: func(args []string, argc int, pargi *int, options *TOptions) { + *pargi += 1 + }, + }, }, } diff --git a/pkg/cli/option_types.go b/pkg/cli/option_types.go index d959e0c52..58917728a 100644 --- a/pkg/cli/option_types.go +++ b/pkg/cli/option_types.go @@ -9,7 +9,7 @@ package cli import ( "regexp" - "github.com/johnkerl/miller/pkg/lib" + "github.com/johnkerl/miller/v6/pkg/lib" ) type TCommentHandling int @@ -53,10 +53,12 @@ type TReaderOptions struct { irsWasSpecified bool allowRepeatIFSWasSpecified bool - UseImplicitCSVHeader bool + UseImplicitHeader bool AllowRaggedCSVInput bool CSVLazyQuotes bool CSVTrimLeadingSpace bool + BarredPprintInput bool + IncrementImplicitKey bool CommentHandling TCommentHandling CommentString string @@ -95,7 +97,7 @@ type TWriterOptions struct { opsWasSpecified bool orsWasSpecified bool - HeaderlessCSVOutput bool + HeaderlessOutput bool BarredPprintOutput bool RightAlignedPPRINTOutput bool RightAlignedXTABOutput bool @@ -133,6 +135,40 @@ type TWriterOptions struct { // (all but JSON) -- unless the user explicitly asks to suppress that. AutoFlatten bool + // Default CSV/TSV: + // a=1,b=2,c=3 + // a=4,b=5 + // leads to + // a,b,c + // 1,2,3 + // 4,5, <-- note trailing empty field + // and + // a=1,b=2,c=3 + // d=4,e=5 + // leads to + // fatal error + // + // With this flag: + // a=1,b=2,c=3 + // a=4,b=5 + // leads to + // a,b,c + // 1,2,3 + // + // a,b + // 4,5 + // + // and + // a=1,b=2,c=3 + // d=4,e=5 + // leads to + // a,b,c + // 1,2,3 + // + // d,e + // 4,5 + NoAutoUnsparsify bool + // For floating-point numbers: "" means use the Go default.
FPOFMT string @@ -213,7 +249,7 @@ func DefaultWriterOptions() TWriterOptions { FLATSEP: ".", FlushOnEveryRecord: true, - HeaderlessCSVOutput: false, + HeaderlessOutput: false, WrapJSONOutputInOuterList: true, JSONOutputMultiline: true, diff --git a/pkg/cli/separators.go b/pkg/cli/separators.go index 6a52c3f2c..26c976497 100644 --- a/pkg/cli/separators.go +++ b/pkg/cli/separators.go @@ -21,13 +21,13 @@ const TABS_REGEX = "(\\t)+" const WHITESPACE_REGEX = "([ \\t])+" const ASCII_ESC = "\\x1b" -const ASCII_ETX = "\\x04" +const ASCII_ETX = "\\x03" const ASCII_FS = "\\x1c" const ASCII_GS = "\\x1d" -const ASCII_NULL = "\\x01" +const ASCII_NULL = "\\x00" const ASCII_RS = "\\x1e" -const ASCII_SOH = "\\x02" -const ASCII_STX = "\\x03" +const ASCII_SOH = "\\x01" +const ASCII_STX = "\\x02" const ASCII_US = "\\x1f" const ASV_FS = "\\x1f" @@ -82,6 +82,7 @@ var SEPARATOR_REGEX_NAMES_TO_VALUES = map[string]string{ // E.g. if IFS isn't specified, it's space for NIDX and comma for DKVP, etc. var defaultFSes = map[string]string{ + "gen": ",", "csv": ",", "csvlite": ",", "dkvp": ",", @@ -94,6 +95,7 @@ var defaultFSes = map[string]string{ } var defaultPSes = map[string]string{ + "gen": "N/A", "csv": "N/A", "csvlite": "N/A", "dkvp": "=", @@ -106,6 +108,7 @@ var defaultPSes = map[string]string{ } var defaultRSes = map[string]string{ + "gen": "\n", "csv": "\n", "csvlite": "\n", "dkvp": "\n", @@ -118,6 +121,7 @@ var defaultRSes = map[string]string{ } var defaultAllowRepeatIFSes = map[string]bool{ + "gen": false, "csv": false, "csvlite": false, "dkvp": false, diff --git a/pkg/cli/verb_utils.go b/pkg/cli/verb_utils.go index 421af9af8..d20901998 100644 --- a/pkg/cli/verb_utils.go +++ b/pkg/cli/verb_utils.go @@ -9,7 +9,7 @@ import ( "os" "strconv" - "github.com/johnkerl/miller/pkg/lib" + "github.com/johnkerl/miller/v6/pkg/lib" ) // For flags with values, e.g. ["-n" "10"], while we're looking at the "-n" this let us see if the "10" slot exists. 
diff --git a/pkg/climain/mlrcli_mlrrc.go b/pkg/climain/mlrcli_mlrrc.go index d3c5c1401..ce0a85789 100644 --- a/pkg/climain/mlrcli_mlrrc.go +++ b/pkg/climain/mlrcli_mlrrc.go @@ -8,7 +8,7 @@ import ( "regexp" "strings" - "github.com/johnkerl/miller/pkg/cli" + "github.com/johnkerl/miller/v6/pkg/cli" ) // loadMlrrcOrDie rule: If $MLRRC is set, use it and only it. Otherwise try diff --git a/pkg/climain/mlrcli_parse.go b/pkg/climain/mlrcli_parse.go index 9e8679eef..68ba5abea 100644 --- a/pkg/climain/mlrcli_parse.go +++ b/pkg/climain/mlrcli_parse.go @@ -74,13 +74,13 @@ import ( "fmt" "os" - "github.com/johnkerl/miller/pkg/cli" - "github.com/johnkerl/miller/pkg/lib" - "github.com/johnkerl/miller/pkg/mlrval" - "github.com/johnkerl/miller/pkg/terminals" - "github.com/johnkerl/miller/pkg/terminals/help" - "github.com/johnkerl/miller/pkg/transformers" - "github.com/johnkerl/miller/pkg/version" + "github.com/johnkerl/miller/v6/pkg/cli" + "github.com/johnkerl/miller/v6/pkg/lib" + "github.com/johnkerl/miller/v6/pkg/mlrval" + "github.com/johnkerl/miller/v6/pkg/terminals" + "github.com/johnkerl/miller/v6/pkg/terminals/help" + "github.com/johnkerl/miller/v6/pkg/transformers" + "github.com/johnkerl/miller/v6/pkg/version" ) // ParseCommandLine is the entrypoint for handling the Miller command line: @@ -94,11 +94,21 @@ func ParseCommandLine( ) { // mlr -s scriptfile {data-file names ...} means take the contents of // scriptfile as if it were command-line items. + args, err = maybeInterpolateDashS(args) if err != nil { return nil, nil, err } + // Expand "-xyz" into "-x -y -z" while leaving "--xyz" intact. This is a + // keystroke-saver for the user. + // + // This is OK to do globally here since Miller is quite consistent (in + // main, verbs, and auxents) that multi-character options start with two + // dashes, e.g. "--csv". (The sole exception is the sort verb's -nf/-nr + // which are handled specially there.) 
+ args = lib.Getoptify(args) + // Pass one as described at the top of this file. flagSequences, terminalSequence, verbSequences, dataFileNames := parseCommandLinePassOne(args) @@ -151,8 +161,8 @@ func parseCommandLinePassOne( os.Exit(0) } else if args[argi] == "--norc" { - flagSequences = append(flagSequences, args[oargi:argi]) argi += 1 + flagSequences = append(flagSequences, args[oargi:argi]) } else if cli.FLAG_TABLE.Parse(args, argc, &argi, options) { flagSequences = append(flagSequences, args[oargi:argi]) @@ -192,8 +202,8 @@ func parseCommandLinePassOne( transformerSetup := transformers.LookUp(verb) if transformerSetup == nil { fmt.Fprintf(os.Stderr, - "%s: verb \"%s\" not found. Please use \"%s --help\" for a list.\n", - "mlr", verb, "mlr") + "mlr: verb \"%s\" not found. Please use \"mlr -l\" for a list.\n", + verb) os.Exit(1) } @@ -295,7 +305,7 @@ func parseCommandLinePassTwo( rc := cli.FLAG_TABLE.Parse(args, argc, &argi, options) // Should have been parsed OK in pass one. - lib.InternalCodingErrorIf(rc != true) + lib.InternalCodingErrorIf(!rc) // Make sure we consumed the entire flag sequence as parsed by pass one. lib.InternalCodingErrorIf(argi != argc) } @@ -306,8 +316,14 @@ func parseCommandLinePassTwo( return nil, nil, err } - cli.FinalizeReaderOptions(&options.ReaderOptions) - cli.FinalizeWriterOptions(&options.WriterOptions) + err = cli.FinalizeReaderOptions(&options.ReaderOptions) + if err != nil { + return nil, nil, err + } + err = cli.FinalizeWriterOptions(&options.WriterOptions) + if err != nil { + return nil, nil, err + } // Set an optional global formatter for floating-point values if options.WriterOptions.FPOFMT != "" { @@ -370,7 +386,7 @@ func parseCommandLinePassTwo( recordTransformers = append(recordTransformers, transformer) } - if cli.DecideFinalUnflatten(options) { + if cli.DecideFinalUnflatten(options, verbSequences) { // E.g. 
req.method=GET,req.path=/api/check becomes // '{"req": {"method": "GET", "path": "/api/check"}}' transformer, err := transformers.NewTransformerUnflatten(options.WriterOptions.FLATSEP, options, nil) @@ -388,7 +404,7 @@ func parseCommandLinePassTwo( options.FileNames = nil } - if options.DoInPlace && (options.FileNames == nil || len(options.FileNames) == 0) { + if options.DoInPlace && len(options.FileNames) == 0 { fmt.Fprintf(os.Stderr, "%s: -I option (in-place operation) requires input files.\n", "mlr") os.Exit(1) } diff --git a/pkg/climain/mlrcli_shebang.go b/pkg/climain/mlrcli_shebang.go index b05643bce..cfcab1b39 100644 --- a/pkg/climain/mlrcli_shebang.go +++ b/pkg/climain/mlrcli_shebang.go @@ -2,11 +2,11 @@ package climain import ( "fmt" - "io/ioutil" + "os" "regexp" "strings" - "github.com/johnkerl/miller/pkg/lib" + "github.com/johnkerl/miller/v6/pkg/lib" shellquote "github.com/kballard/go-shellquote" ) @@ -39,7 +39,7 @@ func maybeInterpolateDashS(args []string) ([]string, error) { return args, nil } if len(args) < 3 { - return nil, fmt.Errorf("mlr: -s flag requires a filename after it.") + return nil, fmt.Errorf("mlr: -s flag requires a filename after it") } // mlr -s scriptfile input1.csv input2.csv @@ -49,7 +49,7 @@ func maybeInterpolateDashS(args []string) ([]string, error) { remainingArgs := args[3:] // Read the bytes in the filename given after -s. 
- byteContents, rerr := ioutil.ReadFile(filename) + byteContents, rerr := os.ReadFile(filename) if rerr != nil { return nil, fmt.Errorf("mlr: cannot read %s: %v", filename, rerr) } @@ -68,7 +68,7 @@ func maybeInterpolateDashS(args []string) ([]string, error) { if stripComments { re := regexp.MustCompile(`#.*`) - for i, _ := range lines { + for i := range lines { lines[i] = re.ReplaceAllString(lines[i], "") } } diff --git a/pkg/colorizer/colorizer.go b/pkg/colorizer/colorizer.go index 3bc4525ad..cd012cffa 100644 --- a/pkg/colorizer/colorizer.go +++ b/pkg/colorizer/colorizer.go @@ -198,7 +198,8 @@ var stdoutIsATTY = getStdoutIsATTY() // Read environment variables at startup time. These can be overridden // afterward using command-line flags. func init() { - if os.Getenv("MLR_NO_COLOR") != "" { + if os.Getenv("MLR_NO_COLOR") != "" || os.Getenv("NO_COLOR") != "" { + colorization = ColorizeOutputNever colorization = ColorizeOutputNever } else if os.Getenv("MLR_ALWAYS_COLOR") != "" { colorization = ColorizeOutputAlways diff --git a/pkg/dsl/ast_build.go b/pkg/dsl/ast_build.go index f417998f6..bba98e1e3 100644 --- a/pkg/dsl/ast_build.go +++ b/pkg/dsl/ast_build.go @@ -8,8 +8,8 @@ package dsl import ( "fmt" - "github.com/johnkerl/miller/pkg/lib" - "github.com/johnkerl/miller/pkg/parsing/token" + "github.com/johnkerl/miller/v6/pkg/lib" + "github.com/johnkerl/miller/v6/pkg/parsing/token" ) // ---------------------------------------------------------------- diff --git a/pkg/dsl/ast_print.go b/pkg/dsl/ast_print.go index 4210e5b17..c735ef84c 100644 --- a/pkg/dsl/ast_print.go +++ b/pkg/dsl/ast_print.go @@ -157,7 +157,7 @@ func (node *ASTNode) printParexOneLineAux() { // IsLeaf determines if an AST node is a leaf node. func (node *ASTNode) IsLeaf() bool { - return node.Children == nil || len(node.Children) == 0 + return len(node.Children) == 0 } // ChildrenAreAllLeaves determines if an AST node's children are all leaf nodes. 
diff --git a/pkg/dsl/ast_types.go b/pkg/dsl/ast_types.go index 6856ce830..80fd7dcc5 100644 --- a/pkg/dsl/ast_types.go +++ b/pkg/dsl/ast_types.go @@ -5,7 +5,7 @@ package dsl import ( - "github.com/johnkerl/miller/pkg/parsing/token" + "github.com/johnkerl/miller/v6/pkg/parsing/token" ) // ---------------------------------------------------------------- diff --git a/pkg/dsl/cst/assignments.go b/pkg/dsl/cst/assignments.go index 129ec850c..81bcac85f 100644 --- a/pkg/dsl/cst/assignments.go +++ b/pkg/dsl/cst/assignments.go @@ -5,9 +5,9 @@ package cst import ( - "github.com/johnkerl/miller/pkg/dsl" - "github.com/johnkerl/miller/pkg/lib" - "github.com/johnkerl/miller/pkg/runtime" + "github.com/johnkerl/miller/v6/pkg/dsl" + "github.com/johnkerl/miller/v6/pkg/lib" + "github.com/johnkerl/miller/v6/pkg/runtime" ) // ================================================================ diff --git a/pkg/dsl/cst/block_exit.go b/pkg/dsl/cst/block_exit.go index b52b363cb..8542e88e7 100644 --- a/pkg/dsl/cst/block_exit.go +++ b/pkg/dsl/cst/block_exit.go @@ -8,9 +8,9 @@ package cst import ( "fmt" - "github.com/johnkerl/miller/pkg/dsl" - "github.com/johnkerl/miller/pkg/lib" - "github.com/johnkerl/miller/pkg/runtime" + "github.com/johnkerl/miller/v6/pkg/dsl" + "github.com/johnkerl/miller/v6/pkg/lib" + "github.com/johnkerl/miller/v6/pkg/runtime" ) // ---------------------------------------------------------------- @@ -70,7 +70,7 @@ func (root *RootNode) BuildReturnNode(astNode *dsl.ASTNode) (*ReturnNode, error) } else { lib.InternalCodingErrorIf(true) } - return nil, fmt.Errorf("internal coding error: statement should not be reached.") + return nil, fmt.Errorf("internal coding error: statement should not be reached") } func (node *ReturnNode) Execute(state *runtime.State) (*BlockExitPayload, error) { diff --git a/pkg/dsl/cst/blocks.go b/pkg/dsl/cst/blocks.go index d51c70d75..4b47bf574 100644 --- a/pkg/dsl/cst/blocks.go +++ b/pkg/dsl/cst/blocks.go @@ -6,9 +6,9 @@ package cst import ( -
"github.com/johnkerl/miller/pkg/dsl" - "github.com/johnkerl/miller/pkg/lib" - "github.com/johnkerl/miller/pkg/runtime" + "github.com/johnkerl/miller/v6/pkg/dsl" + "github.com/johnkerl/miller/v6/pkg/lib" + "github.com/johnkerl/miller/v6/pkg/runtime" ) // ---------------------------------------------------------------- diff --git a/pkg/dsl/cst/builtin_function_manager.go b/pkg/dsl/cst/builtin_function_manager.go index 7fbf60f3d..4a1d83bfa 100644 --- a/pkg/dsl/cst/builtin_function_manager.go +++ b/pkg/dsl/cst/builtin_function_manager.go @@ -19,9 +19,9 @@ import ( "sort" "strings" - "github.com/johnkerl/miller/pkg/bifs" - "github.com/johnkerl/miller/pkg/colorizer" - "github.com/johnkerl/miller/pkg/lib" + "github.com/johnkerl/miller/v6/pkg/bifs" + "github.com/johnkerl/miller/v6/pkg/colorizer" + "github.com/johnkerl/miller/v6/pkg/lib" ) type TFunctionClass string @@ -338,6 +338,50 @@ used within subsequent DSL statements. See also "Regular expressions" at ` + lib regexCaptureBinaryFunc: bifs.BIF_string_does_not_match_regexp, }, + { + name: "strmatch", + class: FUNC_CLASS_STRING, + help: `Boolean yes/no for whether the stringable first argument matches the regular-expression second argument. No regex captures are provided; please see ` + "`strmatchx`.", + examples: []string{ + `strmatch("a", "abc") is false`, + `strmatch("abc", "a") is true`, + `strmatch("abc", "a[a-z]c") is true`, + `strmatch("abc", "(a).(c)") is true`, + `strmatch(12345, "34") is true`, + }, + binaryFunc: bifs.BIF_strmatch, + }, + + { + name: "strmatchx", + class: FUNC_CLASS_STRING, + help: `Extended information for whether the stringable first argument matches the regular-expression second argument. Regex captures are provided in the return-value map; \1, \2, etc. are not set, in contrast to the ` + "`=~` operator.
As well, while the `=~` operator limits matches to \\1 through \\9, an arbitrary number are supported here.", + examples: []string{ + `strmatchx("a", "abc") returns:`, + ` {`, + ` "matched": false`, + ` }`, + `strmatchx("abc", "a") returns:`, + ` {`, + ` "matched": true,`, + ` "full_capture": "a",`, + ` "full_start": 1,`, + ` "full_end": 1`, + ` }`, + `strmatchx("[zy:3458]", "([a-z]+):([0-9]+)") returns:`, + ` {`, + ` "matched": true,`, + ` "full_capture": "zy:3458",`, + ` "full_start": 2,`, + ` "full_end": 8,`, + ` "captures": ["zy", "3458"],`, + ` "starts": [2, 5],`, + ` "ends": [3, 8]`, + ` }`, + }, + binaryFunc: bifs.BIF_strmatchx, + }, + { name: "&&", class: FUNC_CLASS_BOOLEAN, @@ -400,7 +444,7 @@ used within subsequent DSL statements. See also "Regular expressions" at ` + lib { name: "clean_whitespace", class: FUNC_CLASS_STRING, - help: "Same as collapse_whitespace and strip.", + help: "Same as collapse_whitespace and strip, followed by type inference.", unaryFunc: bifs.BIF_clean_whitespace, }, @@ -1956,10 +2000,15 @@ Note that NaN has the property that NaN != NaN, so you need 'is_nan(x)' rather t name: "fmtnum", class: FUNC_CLASS_CONVERSION, help: `Convert int/float/bool to string using printf-style format string (https://pkg.go.dev/fmt), e.g. -'$s = fmtnum($n, "%08d")' or '$t = fmtnum($n, "%.6e")'. This function recurses on array and map values.`, +'$s = fmtnum($n, "%08d")' or '$t = fmtnum($n, "%.6e")'. Miller-specific extension: "%_d" and "%_f" for comma-separated thousands. 
This function recurses on array and map values.`, binaryFunc: bifs.BIF_fmtnum, examples: []string{ - `$x = fmtnum($x, "%.6f")`, + `$y = fmtnum($x, "%.6f")`, + `$o = fmtnum($n, "%d")`, + `$o = fmtnum($n, "%12d")`, + `$y = fmtnum($x, "%.6_f")`, + `$o = fmtnum($n, "%_d")`, + `$o = fmtnum($n, "%12_d")`, }, }, @@ -2438,6 +2487,23 @@ Run a command via executable, path, args and environment, yielding its stdout mi variadicFunc: bifs.BIF_exec, }, + { + name: "stat", + class: FUNC_CLASS_SYSTEM, + help: `Returns a map containing information about the provided path: "name" with string value, "size" as decimal int value, "mode" as octal int value, "modtime" as int-valued epoch seconds, and "isdir" as boolean value.`, + unaryFunc: bifs.BIF_stat, + examples: []string{ + `stat("./mlr") gives {`, + ` "name": "mlr",`, + ` "size": 38391584,`, + ` "mode": 0755,`, + ` "modtime": 1715207874,`, + ` "isdir": false`, + `}`, + `stat("./mlr")["size"] gives 38391584`, + }, + }, + { name: "version", class: FUNC_CLASS_SYSTEM, @@ -2529,7 +2595,7 @@ func (manager *BuiltinFunctionManager) getBuiltinFunctionClasses() []string { classesList := make([]string, 0) for _, builtinFunctionInfo := range *manager.lookupTable { class := string(builtinFunctionInfo.class) - if classesSeen[class] == false { + if !classesSeen[class] { classesList = append(classesList, class) classesSeen[class] = true } diff --git a/pkg/dsl/cst/builtin_functions.go b/pkg/dsl/cst/builtin_functions.go index 397e7869c..12c16ea45 100644 --- a/pkg/dsl/cst/builtin_functions.go +++ b/pkg/dsl/cst/builtin_functions.go @@ -5,13 +5,14 @@ package cst import ( + "errors" "fmt" - "github.com/johnkerl/miller/pkg/bifs" - "github.com/johnkerl/miller/pkg/dsl" - "github.com/johnkerl/miller/pkg/lib" - "github.com/johnkerl/miller/pkg/mlrval" - "github.com/johnkerl/miller/pkg/runtime" + "github.com/johnkerl/miller/v6/pkg/bifs" + "github.com/johnkerl/miller/v6/pkg/dsl" + "github.com/johnkerl/miller/v6/pkg/lib" + 
"github.com/johnkerl/miller/v6/pkg/mlrval" + "github.com/johnkerl/miller/v6/pkg/runtime" ) // ---------------------------------------------------------------- @@ -78,7 +79,7 @@ func (root *RootNode) BuildMultipleArityFunctionCallsiteNode( return root.BuildTernaryFunctionCallsiteNode(astNode, builtinFunctionInfo) } - return nil, fmt.Errorf( + return nil, errors.New( "at CST BuildMultipleArityFunctionCallsiteNode: function name not found: " + builtinFunctionInfo.name, ) @@ -450,7 +451,7 @@ func (node *RegexCaptureBinaryFunctionCallsiteNode) Evaluate( node.evaluable1.Evaluate(state), node.evaluable2.Evaluate(state), ) - state.RegexCaptures = captures + state.SetRegexCaptures(captures) return output } @@ -599,7 +600,7 @@ func (root *RootNode) BuildVariadicFunctionCallsiteNode( if callsiteArity < builtinFunctionInfo.minimumVariadicArity { return nil, fmt.Errorf( - "mlr: function %s takes minimum argument count %d; got %d.\n", + "mlr: function %s takes minimum argument count %d; got %d", builtinFunctionInfo.name, builtinFunctionInfo.minimumVariadicArity, callsiteArity, @@ -609,7 +610,7 @@ func (root *RootNode) BuildVariadicFunctionCallsiteNode( if builtinFunctionInfo.maximumVariadicArity != 0 { if callsiteArity > builtinFunctionInfo.maximumVariadicArity { return nil, fmt.Errorf( - "mlr: function %s takes maximum argument count %d; got %d.\n", + "mlr: function %s takes maximum argument count %d; got %d", builtinFunctionInfo.name, builtinFunctionInfo.maximumVariadicArity, callsiteArity, @@ -657,7 +658,7 @@ func (root *RootNode) BuildVariadicFunctionWithStateCallsiteNode( if callsiteArity < builtinFunctionInfo.minimumVariadicArity { return nil, fmt.Errorf( - "mlr: function %s takes minimum argument count %d; got %d.\n", + "mlr: function %s takes minimum argument count %d; got %d", builtinFunctionInfo.name, builtinFunctionInfo.minimumVariadicArity, callsiteArity, @@ -667,7 +668,7 @@ func (root *RootNode) BuildVariadicFunctionWithStateCallsiteNode( if 
builtinFunctionInfo.maximumVariadicArity != 0 { if callsiteArity > builtinFunctionInfo.maximumVariadicArity { return nil, fmt.Errorf( - "mlr: function %s takes maximum argument count %d; got %d.\n", + "mlr: function %s takes maximum argument count %d; got %d", builtinFunctionInfo.name, builtinFunctionInfo.maximumVariadicArity, callsiteArity, @@ -945,7 +946,7 @@ func (node *StandardTernaryOperatorNode) Evaluate( } // Short-circuit: defer evaluation unless needed - if boolValue == true { + if boolValue { return node.b.Evaluate(state) } else { return node.c.Evaluate(state) diff --git a/pkg/dsl/cst/collections.go b/pkg/dsl/cst/collections.go index 1dcee4daf..85866f7d7 100644 --- a/pkg/dsl/cst/collections.go +++ b/pkg/dsl/cst/collections.go @@ -8,11 +8,11 @@ package cst import ( "fmt" - "github.com/johnkerl/miller/pkg/bifs" - "github.com/johnkerl/miller/pkg/dsl" - "github.com/johnkerl/miller/pkg/lib" - "github.com/johnkerl/miller/pkg/mlrval" - "github.com/johnkerl/miller/pkg/runtime" + "github.com/johnkerl/miller/v6/pkg/bifs" + "github.com/johnkerl/miller/v6/pkg/dsl" + "github.com/johnkerl/miller/v6/pkg/lib" + "github.com/johnkerl/miller/v6/pkg/mlrval" + "github.com/johnkerl/miller/v6/pkg/runtime" ) // ---------------------------------------------------------------- diff --git a/pkg/dsl/cst/cond.go b/pkg/dsl/cst/cond.go index f7f0063e3..52dd30dea 100644 --- a/pkg/dsl/cst/cond.go +++ b/pkg/dsl/cst/cond.go @@ -8,11 +8,11 @@ package cst import ( "fmt" - "github.com/johnkerl/miller/pkg/dsl" - "github.com/johnkerl/miller/pkg/lib" - "github.com/johnkerl/miller/pkg/mlrval" - "github.com/johnkerl/miller/pkg/parsing/token" - "github.com/johnkerl/miller/pkg/runtime" + "github.com/johnkerl/miller/v6/pkg/dsl" + "github.com/johnkerl/miller/v6/pkg/lib" + "github.com/johnkerl/miller/v6/pkg/mlrval" + "github.com/johnkerl/miller/v6/pkg/parsing/token" + "github.com/johnkerl/miller/v6/pkg/runtime" ) type CondBlockNode struct { @@ -61,12 +61,12 @@ func (node *CondBlockNode) Execute( 
boolValue = false } else if !isBool { return nil, fmt.Errorf( - "mlr: conditional expression did not evaluate to boolean%s.", + "mlr: conditional expression did not evaluate to boolean%s", dsl.TokenToLocationInfo(node.conditionToken), ) } - if boolValue == true { + if boolValue { blockExitPayload, err := node.statementBlockNode.Execute(state) if err != nil { return nil, err diff --git a/pkg/dsl/cst/dump.go b/pkg/dsl/cst/dump.go index 14070527d..1114043ff 100644 --- a/pkg/dsl/cst/dump.go +++ b/pkg/dsl/cst/dump.go @@ -21,11 +21,11 @@ import ( "os" "strings" - "github.com/johnkerl/miller/pkg/dsl" - "github.com/johnkerl/miller/pkg/lib" - "github.com/johnkerl/miller/pkg/output" - "github.com/johnkerl/miller/pkg/runtime" - "github.com/johnkerl/miller/pkg/types" + "github.com/johnkerl/miller/v6/pkg/dsl" + "github.com/johnkerl/miller/v6/pkg/lib" + "github.com/johnkerl/miller/v6/pkg/output" + "github.com/johnkerl/miller/v6/pkg/runtime" + "github.com/johnkerl/miller/v6/pkg/types" ) // ================================================================ @@ -137,7 +137,7 @@ func (root *RootNode) buildDumpxStatementNode( } else if redirectorNode.Type == dsl.NodeTypeRedirectPipe { retval.outputHandlerManager = output.NewPipeWriteHandlerManager(root.recordWriterOptions) } else { - return nil, fmt.Errorf("mlr: unhandled redirector node type %s.", string(redirectorNode.Type)) + return nil, fmt.Errorf("mlr: unhandled redirector node type %s", string(redirectorNode.Type)) } } } @@ -199,7 +199,7 @@ func (node *DumpStatementNode) dumpToStderr( outputString string, state *runtime.State, ) error { - fmt.Fprintf(os.Stderr, outputString) + fmt.Fprint(os.Stderr, outputString) return nil } @@ -211,7 +211,7 @@ func (node *DumpStatementNode) dumpToFileOrPipe( redirectorTarget := node.redirectorTargetEvaluable.Evaluate(state) if !redirectorTarget.IsString() { return fmt.Errorf( - "mlr: output redirection yielded %s, not string.", + "mlr: output redirection yielded %s, not string", 
redirectorTarget.GetTypeName(), ) } diff --git a/pkg/dsl/cst/emit1.go b/pkg/dsl/cst/emit1.go index a4996e312..5ea14c8f4 100644 --- a/pkg/dsl/cst/emit1.go +++ b/pkg/dsl/cst/emit1.go @@ -22,10 +22,10 @@ package cst import ( "fmt" - "github.com/johnkerl/miller/pkg/dsl" - "github.com/johnkerl/miller/pkg/lib" - "github.com/johnkerl/miller/pkg/runtime" - "github.com/johnkerl/miller/pkg/types" + "github.com/johnkerl/miller/v6/pkg/dsl" + "github.com/johnkerl/miller/v6/pkg/lib" + "github.com/johnkerl/miller/v6/pkg/runtime" + "github.com/johnkerl/miller/v6/pkg/types" ) type Emit1StatementNode struct { diff --git a/pkg/dsl/cst/emit_emitp.go b/pkg/dsl/cst/emit_emitp.go index 3552f023d..85b9e374a 100644 --- a/pkg/dsl/cst/emit_emitp.go +++ b/pkg/dsl/cst/emit_emitp.go @@ -41,13 +41,13 @@ package cst import ( "fmt" - "github.com/johnkerl/miller/pkg/cli" - "github.com/johnkerl/miller/pkg/dsl" - "github.com/johnkerl/miller/pkg/lib" - "github.com/johnkerl/miller/pkg/mlrval" - "github.com/johnkerl/miller/pkg/output" - "github.com/johnkerl/miller/pkg/runtime" - "github.com/johnkerl/miller/pkg/types" + "github.com/johnkerl/miller/v6/pkg/cli" + "github.com/johnkerl/miller/v6/pkg/dsl" + "github.com/johnkerl/miller/v6/pkg/lib" + "github.com/johnkerl/miller/v6/pkg/mlrval" + "github.com/johnkerl/miller/v6/pkg/output" + "github.com/johnkerl/miller/v6/pkg/runtime" + "github.com/johnkerl/miller/v6/pkg/types" ) // ================================================================ @@ -171,7 +171,7 @@ func (root *RootNode) buildEmitXStatementNode( } else { return nil, fmt.Errorf( - "mlr: unlashed-emit node types must be local variables, field names, oosvars, or maps; got %s.", + "mlr: unlashed-emit node types must be local variables, field names, oosvars, or maps; got %s", childNode.Type, ) } @@ -181,7 +181,7 @@ func (root *RootNode) buildEmitXStatementNode( for _, childNode := range emittablesNode.Children { if !EMITX_NAMED_NODE_TYPES[childNode.Type] { return nil, fmt.Errorf( - "mlr: lashed-emit 
node types must be local variables, field names, or oosvars; got %s.", + "mlr: lashed-emit node types must be local variables, field names, or oosvars; got %s", childNode.Type, ) } @@ -271,7 +271,7 @@ func (root *RootNode) buildEmitXStatementNode( } else if redirectorNode.Type == dsl.NodeTypeRedirectPipe { retval.outputHandlerManager = output.NewPipeWriteHandlerManager(root.recordWriterOptions) } else { - return nil, fmt.Errorf("mlr: unhandled redirector node type %s.", string(redirectorNode.Type)) + return nil, fmt.Errorf("mlr: unhandled redirector node type %s", string(redirectorNode.Type)) } } } @@ -989,7 +989,7 @@ func (node *EmitXStatementNode) emitRecordToFileOrPipe( ) error { redirectorTarget := node.redirectorTargetEvaluable.Evaluate(state) if !redirectorTarget.IsString() { - return fmt.Errorf("mlr: output redirection yielded %s, not string.", redirectorTarget.GetTypeName()) + return fmt.Errorf("mlr: output redirection yielded %s, not string", redirectorTarget.GetTypeName()) } outputFileName := redirectorTarget.String() diff --git a/pkg/dsl/cst/emitf.go b/pkg/dsl/cst/emitf.go index 97aebfe98..eb6812ef7 100644 --- a/pkg/dsl/cst/emitf.go +++ b/pkg/dsl/cst/emitf.go @@ -8,12 +8,12 @@ package cst import ( "fmt" - "github.com/johnkerl/miller/pkg/dsl" - "github.com/johnkerl/miller/pkg/lib" - "github.com/johnkerl/miller/pkg/mlrval" - "github.com/johnkerl/miller/pkg/output" - "github.com/johnkerl/miller/pkg/runtime" - "github.com/johnkerl/miller/pkg/types" + "github.com/johnkerl/miller/v6/pkg/dsl" + "github.com/johnkerl/miller/v6/pkg/lib" + "github.com/johnkerl/miller/v6/pkg/mlrval" + "github.com/johnkerl/miller/v6/pkg/output" + "github.com/johnkerl/miller/v6/pkg/runtime" + "github.com/johnkerl/miller/v6/pkg/types" ) // ================================================================ @@ -119,7 +119,7 @@ func (root *RootNode) BuildEmitFStatementNode(astNode *dsl.ASTNode) (IExecutable } else if redirectorNode.Type == dsl.NodeTypeRedirectPipe { 
retval.outputHandlerManager = output.NewPipeWriteHandlerManager(root.recordWriterOptions) } else { - return nil, fmt.Errorf("mlr: unhandled redirector node type %s.", string(redirectorNode.Type)) + return nil, fmt.Errorf("mlr: unhandled redirector node type %s", string(redirectorNode.Type)) } } } @@ -163,7 +163,7 @@ func getNameFromNamedNode(astNode *dsl.ASTNode, description string) (string, err } else if astNode.Type == dsl.NodeTypeDirectFieldValue { return string(astNode.Token.Lit), nil } - return "", fmt.Errorf("mlr: can't get name of node type \"%s\" for %s.", string(astNode.Type), description) + return "", fmt.Errorf(`mlr: can't get name of node type "%s" for %s`, string(astNode.Type), description) } // ---------------------------------------------------------------- @@ -187,7 +187,7 @@ func (node *EmitFStatementNode) emitfToFileOrPipe( ) error { redirectorTarget := node.redirectorTargetEvaluable.Evaluate(state) if !redirectorTarget.IsString() { - return fmt.Errorf("mlr: output redirection yielded %s, not string.", redirectorTarget.GetTypeName()) + return fmt.Errorf("mlr: output redirection yielded %s, not string", redirectorTarget.GetTypeName()) } outputFileName := redirectorTarget.String() diff --git a/pkg/dsl/cst/env.go b/pkg/dsl/cst/env.go index c2f038f2d..25e70a511 100644 --- a/pkg/dsl/cst/env.go +++ b/pkg/dsl/cst/env.go @@ -10,10 +10,10 @@ package cst import ( "os" - "github.com/johnkerl/miller/pkg/dsl" - "github.com/johnkerl/miller/pkg/lib" - "github.com/johnkerl/miller/pkg/mlrval" - "github.com/johnkerl/miller/pkg/runtime" + "github.com/johnkerl/miller/v6/pkg/dsl" + "github.com/johnkerl/miller/v6/pkg/lib" + "github.com/johnkerl/miller/v6/pkg/mlrval" + "github.com/johnkerl/miller/v6/pkg/runtime" ) type EnvironmentVariableNode struct { diff --git a/pkg/dsl/cst/evaluable.go b/pkg/dsl/cst/evaluable.go index 9440537ba..ea5673be9 100644 --- a/pkg/dsl/cst/evaluable.go +++ b/pkg/dsl/cst/evaluable.go @@ -10,10 +10,10 @@ import ( "fmt" "os" - 
"github.com/johnkerl/miller/pkg/dsl" - "github.com/johnkerl/miller/pkg/lib" - "github.com/johnkerl/miller/pkg/mlrval" - "github.com/johnkerl/miller/pkg/runtime" + "github.com/johnkerl/miller/v6/pkg/dsl" + "github.com/johnkerl/miller/v6/pkg/lib" + "github.com/johnkerl/miller/v6/pkg/mlrval" + "github.com/johnkerl/miller/v6/pkg/runtime" ) // ---------------------------------------------------------------- diff --git a/pkg/dsl/cst/filter.go b/pkg/dsl/cst/filter.go index 4a4d3984e..dbcbb2252 100644 --- a/pkg/dsl/cst/filter.go +++ b/pkg/dsl/cst/filter.go @@ -19,9 +19,9 @@ package cst import ( - "github.com/johnkerl/miller/pkg/dsl" - "github.com/johnkerl/miller/pkg/lib" - "github.com/johnkerl/miller/pkg/runtime" + "github.com/johnkerl/miller/v6/pkg/dsl" + "github.com/johnkerl/miller/v6/pkg/lib" + "github.com/johnkerl/miller/v6/pkg/runtime" ) // ---------------------------------------------------------------- diff --git a/pkg/dsl/cst/for.go b/pkg/dsl/cst/for.go index 34d6b808d..074f3b500 100644 --- a/pkg/dsl/cst/for.go +++ b/pkg/dsl/cst/for.go @@ -7,11 +7,11 @@ package cst import ( "fmt" - "github.com/johnkerl/miller/pkg/dsl" - "github.com/johnkerl/miller/pkg/lib" - "github.com/johnkerl/miller/pkg/mlrval" - "github.com/johnkerl/miller/pkg/parsing/token" - "github.com/johnkerl/miller/pkg/runtime" + "github.com/johnkerl/miller/v6/pkg/dsl" + "github.com/johnkerl/miller/v6/pkg/lib" + "github.com/johnkerl/miller/v6/pkg/mlrval" + "github.com/johnkerl/miller/v6/pkg/parsing/token" + "github.com/johnkerl/miller/v6/pkg/runtime" ) // ---------------------------------------------------------------- @@ -805,7 +805,7 @@ func (root *RootNode) BuildTripleForLoopNode(astNode *dsl.ASTNode) (*TripleForLo for i := 0; i < n-1; i++ { if continuationExpressionASTNode.Children[i].Type != dsl.NodeTypeAssignment { return nil, fmt.Errorf( - "mlr: the non-final triple-for continutation statements must be assignments.", + "mlr: the non-final triple-for continuation statements must be assignments", ) } 
precontinuationAssignment, err := root.BuildAssignmentNode( @@ -822,11 +822,11 @@ func (root *RootNode) BuildTripleForLoopNode(astNode *dsl.ASTNode) (*TripleForLo if bareBooleanASTNode.Type != dsl.NodeTypeBareBoolean { if n == 1 { return nil, fmt.Errorf( - "mlr: the triple-for continutation statement must be a bare boolean.", + "mlr: the triple-for continuation statement must be a bare boolean", ) } else { return nil, fmt.Errorf( - "mlr: the final triple-for continutation statement must be a bare boolean.", + "mlr: the final triple-for continuation statement must be a bare boolean", ) } } @@ -898,11 +898,11 @@ func (node *TripleForLoopNode) Execute(state *runtime.State) (*BlockExitPayload, boolValue, isBool := continuationValue.GetBoolValue() if !isBool { return nil, fmt.Errorf( - "mlr: for-loop continuation did not evaluate to boolean%s.", + "mlr: for-loop continuation did not evaluate to boolean%s", dsl.TokenToLocationInfo(node.continuationExpressionToken), ) } - if boolValue == false { + if !boolValue { break } } diff --git a/pkg/dsl/cst/functions.go b/pkg/dsl/cst/functions.go index 5aca6d397..c214cd349 100644 --- a/pkg/dsl/cst/functions.go +++ b/pkg/dsl/cst/functions.go @@ -9,8 +9,8 @@ package cst import ( - "github.com/johnkerl/miller/pkg/dsl" - "github.com/johnkerl/miller/pkg/lib" + "github.com/johnkerl/miller/v6/pkg/dsl" + "github.com/johnkerl/miller/v6/pkg/lib" ) // ---------------------------------------------------------------- diff --git a/pkg/dsl/cst/hofs.go b/pkg/dsl/cst/hofs.go index 87edc01b2..67ab64b4a 100644 --- a/pkg/dsl/cst/hofs.go +++ b/pkg/dsl/cst/hofs.go @@ -14,12 +14,12 @@ import ( "github.com/facette/natsort" - "github.com/johnkerl/miller/pkg/lib" - "github.com/johnkerl/miller/pkg/mlrval" - "github.com/johnkerl/miller/pkg/runtime" + "github.com/johnkerl/miller/v6/pkg/lib" + "github.com/johnkerl/miller/v6/pkg/mlrval" + "github.com/johnkerl/miller/v6/pkg/runtime" ) -// Most function types are in the github.com/johnkerl/miller/pkg/types 
package. These types, though, +// Most function types are in the github.com/johnkerl/miller/v6/pkg/types package. These types, though, // include functions which need to access CST state in order to call back to // user-defined functions. To avoid a package-cycle dependency, they are // defined here. @@ -875,7 +875,7 @@ func sortAF( return mlrval.FromArray(outputArray) } -// sortAF implements sort on arrays with callback UDF. +// sortMF implements sort on arrays with callback UDF. func sortMF( input1 *mlrval.Mlrval, input2 *mlrval.Mlrval, diff --git a/pkg/dsl/cst/if.go b/pkg/dsl/cst/if.go index b947c7f6e..b85f68119 100644 --- a/pkg/dsl/cst/if.go +++ b/pkg/dsl/cst/if.go @@ -7,11 +7,11 @@ package cst import ( "fmt" - "github.com/johnkerl/miller/pkg/dsl" - "github.com/johnkerl/miller/pkg/lib" - "github.com/johnkerl/miller/pkg/mlrval" - "github.com/johnkerl/miller/pkg/parsing/token" - "github.com/johnkerl/miller/pkg/runtime" + "github.com/johnkerl/miller/v6/pkg/dsl" + "github.com/johnkerl/miller/v6/pkg/lib" + "github.com/johnkerl/miller/v6/pkg/mlrval" + "github.com/johnkerl/miller/v6/pkg/parsing/token" + "github.com/johnkerl/miller/v6/pkg/runtime" ) // ---------------------------------------------------------------- @@ -130,11 +130,11 @@ func (node *IfChainNode) Execute(state *runtime.State) (*BlockExitPayload, error boolValue, isBool := condition.GetBoolValue() if !isBool { return nil, fmt.Errorf( - "mlr: conditional expression did not evaluate to boolean%s.", + "mlr: conditional expression did not evaluate to boolean%s", dsl.TokenToLocationInfo(ifItem.conditionToken), ) } - if boolValue == true { + if boolValue { blockExitPayload, err := ifItem.statementBlockNode.Execute(state) if err != nil { return nil, err diff --git a/pkg/dsl/cst/keyword_usage.go b/pkg/dsl/cst/keyword_usage.go index c5bec8f7a..0f5341980 100644 --- a/pkg/dsl/cst/keyword_usage.go +++ b/pkg/dsl/cst/keyword_usage.go @@ -4,8 +4,8 @@ import ( "fmt" "strings" - "github.com/johnkerl/miller/pkg/colorizer" 
- "github.com/johnkerl/miller/pkg/lib" + "github.com/johnkerl/miller/v6/pkg/colorizer" + "github.com/johnkerl/miller/v6/pkg/lib" ) // ---------------------------------------------------------------- diff --git a/pkg/dsl/cst/leaves.go b/pkg/dsl/cst/leaves.go index 08b3200a9..81612d85f 100644 --- a/pkg/dsl/cst/leaves.go +++ b/pkg/dsl/cst/leaves.go @@ -8,10 +8,10 @@ import ( "fmt" "math" - "github.com/johnkerl/miller/pkg/dsl" - "github.com/johnkerl/miller/pkg/lib" - "github.com/johnkerl/miller/pkg/mlrval" - "github.com/johnkerl/miller/pkg/runtime" + "github.com/johnkerl/miller/v6/pkg/dsl" + "github.com/johnkerl/miller/v6/pkg/lib" + "github.com/johnkerl/miller/v6/pkg/mlrval" + "github.com/johnkerl/miller/v6/pkg/runtime" ) // ---------------------------------------------------------------- @@ -266,7 +266,7 @@ func (root *RootNode) BuildStringLiteralNode(literal string) IEvaluable { // RegexLiteralNode. See also https://github.com/johnkerl/miller/issues/297. literal = lib.UnbackslashStringLiteral(literal) - hasCaptures, replacementCaptureMatrix := lib.RegexReplacementHasCaptures(literal) + hasCaptures, replacementCaptureMatrix := lib.ReplacementHasCaptures(literal) if !hasCaptures { return &StringLiteralNode{ literal: mlrval.FromString(literal), @@ -293,7 +293,7 @@ func (node *StringLiteralNode) Evaluate( // } // // the captures can be set (by =~ or !=~) quite far from where they are used. -// This is why we consult the state.RegexCaptures here, to see if they've been +// This is why we consult the state's regex captures here, to see if they've been // set on some previous invocation of =~ or !=~. 
func (node *RegexCaptureReplacementNode) Evaluate( state *runtime.State, @@ -302,7 +302,7 @@ func (node *RegexCaptureReplacementNode) Evaluate( lib.InterpolateCaptures( node.replacementString, node.replacementCaptureMatrix, - state.RegexCaptures, + state.GetRegexCaptures(), ), ) } diff --git a/pkg/dsl/cst/lvalues.go b/pkg/dsl/cst/lvalues.go index 073c5d991..799d8801e 100644 --- a/pkg/dsl/cst/lvalues.go +++ b/pkg/dsl/cst/lvalues.go @@ -6,13 +6,14 @@ package cst import ( + "errors" "fmt" "os" - "github.com/johnkerl/miller/pkg/dsl" - "github.com/johnkerl/miller/pkg/lib" - "github.com/johnkerl/miller/pkg/mlrval" - "github.com/johnkerl/miller/pkg/runtime" + "github.com/johnkerl/miller/v6/pkg/dsl" + "github.com/johnkerl/miller/v6/pkg/lib" + "github.com/johnkerl/miller/v6/pkg/mlrval" + "github.com/johnkerl/miller/v6/pkg/runtime" ) // ---------------------------------------------------------------- @@ -45,11 +46,11 @@ func (root *RootNode) BuildAssignableNode( case dsl.NodeTypeArrayOrMapPositionalNameAccess: return nil, fmt.Errorf( - "mlr: '[[...]]' is allowed on assignment left-hand sides only when immediately preceded by '$'.", + "mlr: '[[...]]' is allowed on assignment left-hand sides only when immediately preceded by '$'", ) case dsl.NodeTypeArrayOrMapPositionalValueAccess: return nil, fmt.Errorf( - "mlr: '[[[...]]]' is allowed on assignment left-hand sides only when immediately preceded by '$'.", + "mlr: '[[[...]]]' is allowed on assignment left-hand sides only when immediately preceded by '$'", ) case dsl.NodeTypeArrayOrMapIndexAccess: @@ -62,7 +63,7 @@ func (root *RootNode) BuildAssignableNode( return root.BuildEnvironmentVariableLvalueNode(astNode) } - return nil, fmt.Errorf( + return nil, errors.New( "at CST BuildAssignableNode: unhandled AST node " + string(astNode.Type), ) } @@ -106,7 +107,7 @@ func (node *DirectFieldValueLvalueNode) AssignIndexed( // print inrec attributes. 
Also, a UDF/UDS invoked from begin/end could try // to access the inrec, and that would get past the validator. if state.Inrec == nil { - return fmt.Errorf("there is no current record to assign to.") + return fmt.Errorf("there is no current record to assign to") } // AssignmentNode checks for absent, so we just assign whatever we get @@ -205,7 +206,7 @@ func (node *IndirectFieldValueLvalueNode) AssignIndexed( // print inrec attributes. Also, a UDF/UDS invoked from begin/end could try // to access the inrec, and that would get past the validator. if state.Inrec == nil { - return fmt.Errorf("there is no current record to assign to.") + return fmt.Errorf("there is no current record to assign to") } lhsFieldName := node.lhsFieldNameExpression.Evaluate(state) @@ -298,7 +299,7 @@ func (node *PositionalFieldNameLvalueNode) Assign( // print inrec attributes. Also, a UDF/UDS invoked from begin/end could try // to access the inrec, and that would get past the validator. if state.Inrec == nil { - return fmt.Errorf("there is no current record to assign to.") + return fmt.Errorf("there is no current record to assign to") } lhsFieldIndex := node.lhsFieldIndexExpression.Evaluate(state) @@ -310,7 +311,7 @@ func (node *PositionalFieldNameLvalueNode) Assign( return nil } else { return fmt.Errorf( - "mlr: positional index for $[[...]] assignment must be integer; got %s.", + "mlr: positional index for $[[...]] assignment must be integer; got %s", lhsFieldIndex.GetTypeName(), ) } @@ -324,7 +325,7 @@ func (node *PositionalFieldNameLvalueNode) AssignIndexed( // TODO: reconsider this if /when we decide to allow string-slice // assignments. return fmt.Errorf( - "mlr: $[[...]] = ... expressions are not indexable.", + "mlr: $[[...]] = ... expressions are not indexable", ) } @@ -416,7 +417,7 @@ func (node *PositionalFieldValueLvalueNode) AssignIndexed( // print inrec attributes. Also, a UDF/UDS invoked from begin/end could try // to access the inrec, and that would get past the validator. 
if state.Inrec == nil { - return fmt.Errorf("there is no current record to assign to.") + return fmt.Errorf("there is no current record to assign to") } lhsFieldIndex := node.lhsFieldIndexExpression.Evaluate(state) @@ -434,7 +435,7 @@ func (node *PositionalFieldValueLvalueNode) AssignIndexed( return nil } else { return fmt.Errorf( - "mlr: positional index for $[[[...]]] assignment must be integer; got %s.", + "mlr: positional index for $[[[...]]] assignment must be integer; got %s", lhsFieldIndex.GetTypeName(), ) } @@ -517,7 +518,7 @@ func (node *FullSrecLvalueNode) AssignIndexed( // print inrec attributes. Also, a UDF/UDS invoked from begin/end could try // to access the inrec, and that would get past the validator. if state.Inrec == nil { - return fmt.Errorf("there is no current record to assign to.") + return fmt.Errorf("there is no current record to assign to") } // AssignmentNode checks for absentness of the rvalue, so we just assign @@ -787,7 +788,7 @@ func (root *RootNode) BuildLocalVariableLvalueNode(astNode *dsl.ASTNode) (IAssig if astNode.Children == nil { // untyped, like 'x = 3' if root.strictMode { return nil, fmt.Errorf( - "mlr: need typedecl such as \"var\", \"str\", \"num\", etc. for variable \"%s\" in strict mode", + `mlr: need typedecl such as "var", "str", "num", etc. for variable "%s" in strict mode`, variableName, ) } @@ -1086,7 +1087,7 @@ func (node *EnvironmentVariableLvalueNode) Assign( if !name.IsString() { return fmt.Errorf( - "assignments to ENV[...] must have string names; got %s \"%s\"\n", + `assignments to ENV[...] must have string names; got %s "%s"`, name.GetTypeName(), name.String(), ) @@ -1109,7 +1110,7 @@ func (node *EnvironmentVariableLvalueNode) AssignIndexed( indices []*mlrval.Mlrval, state *runtime.State, ) error { - return fmt.Errorf("mlr: ENV[...] cannot be indexed.") + return fmt.Errorf("mlr: ENV[...] 
cannot be indexed") } func (node *EnvironmentVariableLvalueNode) Unassign( @@ -1133,5 +1134,5 @@ func (node *EnvironmentVariableLvalueNode) UnassignIndexed( state *runtime.State, ) { // TODO: needs error return - //return errors.New("mlr: ENV[...] cannot be indexed.") + //return errors.New("mlr: ENV[...] cannot be indexed") } diff --git a/pkg/dsl/cst/print.go b/pkg/dsl/cst/print.go index 8c68593be..39c84f58d 100644 --- a/pkg/dsl/cst/print.go +++ b/pkg/dsl/cst/print.go @@ -9,11 +9,11 @@ import ( "fmt" "os" - "github.com/johnkerl/miller/pkg/dsl" - "github.com/johnkerl/miller/pkg/lib" - "github.com/johnkerl/miller/pkg/output" - "github.com/johnkerl/miller/pkg/runtime" - "github.com/johnkerl/miller/pkg/types" + "github.com/johnkerl/miller/v6/pkg/dsl" + "github.com/johnkerl/miller/v6/pkg/lib" + "github.com/johnkerl/miller/v6/pkg/output" + "github.com/johnkerl/miller/v6/pkg/runtime" + "github.com/johnkerl/miller/v6/pkg/types" ) // ---------------------------------------------------------------- @@ -280,7 +280,7 @@ func (root *RootNode) buildPrintxStatementNode( } else if redirectorNode.Type == dsl.NodeTypeRedirectPipe { retval.outputHandlerManager = output.NewPipeWriteHandlerManager(root.recordWriterOptions) } else { - return nil, fmt.Errorf("mlr: unhandled redirector node type %s.", string(redirectorNode.Type)) + return nil, fmt.Errorf("mlr: unhandled redirector node type %s", string(redirectorNode.Type)) } } } @@ -356,7 +356,7 @@ func (node *PrintStatementNode) printToFileOrPipe( ) error { redirectorTarget := node.redirectorTargetEvaluable.Evaluate(state) if !redirectorTarget.IsString() { - return fmt.Errorf("mlr: output redirection yielded %s, not string.", redirectorTarget.GetTypeName()) + return fmt.Errorf("mlr: output redirection yielded %s, not string", redirectorTarget.GetTypeName()) } outputFileName := redirectorTarget.String() diff --git a/pkg/dsl/cst/root.go b/pkg/dsl/cst/root.go index f48ebc330..099301bac 100644 --- a/pkg/dsl/cst/root.go +++ 
b/pkg/dsl/cst/root.go @@ -6,18 +6,18 @@ package cst import ( - "container/list" + "errors" "fmt" "os" "strings" - "github.com/johnkerl/miller/pkg/cli" - "github.com/johnkerl/miller/pkg/dsl" - "github.com/johnkerl/miller/pkg/mlrval" - "github.com/johnkerl/miller/pkg/output" - "github.com/johnkerl/miller/pkg/parsing/lexer" - "github.com/johnkerl/miller/pkg/parsing/parser" - "github.com/johnkerl/miller/pkg/runtime" + "github.com/johnkerl/miller/v6/pkg/cli" + "github.com/johnkerl/miller/v6/pkg/dsl" + "github.com/johnkerl/miller/v6/pkg/mlrval" + "github.com/johnkerl/miller/v6/pkg/output" + "github.com/johnkerl/miller/v6/pkg/parsing/lexer" + "github.com/johnkerl/miller/v6/pkg/parsing/parser" + "github.com/johnkerl/miller/v6/pkg/runtime" ) // NewEmptyRoot sets up an empty CST, before ingesting any DSL strings. For @@ -36,9 +36,9 @@ func NewEmptyRoot( udfManager: NewUDFManager(), udsManager: NewUDSManager(), allowUDFUDSRedefinitions: false, - unresolvedFunctionCallsites: list.New(), - unresolvedSubroutineCallsites: list.New(), - outputHandlerManagers: list.New(), + unresolvedFunctionCallsites: make([]*UDFCallsite, 0), + unresolvedSubroutineCallsites: make([]*UDSCallsite, 0), + outputHandlerManagers: make([]output.OutputHandlerManager, 0), recordWriterOptions: recordWriterOptions, dslInstanceType: dslInstanceType, } @@ -163,7 +163,7 @@ func (root *RootNode) IngestAST( err = nil if ast.RootNode == nil { - return hadWarnings, fmt.Errorf("cannot build CST from nil AST root") + return hadWarnings, errors.New("cannot build CST from nil AST root") } // Check for things that are syntax errors but not done in the AST for @@ -251,7 +251,7 @@ func (root *RootNode) regexProtectPrePass(ast *dsl.AST) { func (root *RootNode) regexProtectPrePassAux(astNode *dsl.ASTNode) { - if astNode.Children == nil || len(astNode.Children) == 0 { + if len(astNode.Children) == 0 { return } @@ -363,11 +363,11 @@ func (root *RootNode) buildMainPass(ast *dsl.AST, isReplImmediate bool) error { // This is 
invoked within the buildMainPass call tree whenever a function is // called before it's defined. func (root *RootNode) rememberUnresolvedFunctionCallsite(udfCallsite *UDFCallsite) { - root.unresolvedFunctionCallsites.PushBack(udfCallsite) + root.unresolvedFunctionCallsites = append(root.unresolvedFunctionCallsites, udfCallsite) } func (root *RootNode) rememberUnresolvedSubroutineCallsite(udsCallsite *UDSCallsite) { - root.unresolvedSubroutineCallsites.PushBack(udsCallsite) + root.unresolvedSubroutineCallsites = append(root.unresolvedSubroutineCallsites, udsCallsite) } // After-pass after buildMainPass returns, in case a function was called before @@ -380,10 +380,9 @@ func (root *RootNode) rememberUnresolvedSubroutineCallsite(udsCallsite *UDSCalls // So, our error message should reflect all those options. func (root *RootNode) resolveFunctionCallsites() error { - for root.unresolvedFunctionCallsites.Len() > 0 { - unresolvedFunctionCallsite := root.unresolvedFunctionCallsites.Remove( - root.unresolvedFunctionCallsites.Front(), - ).(*UDFCallsite) + for len(root.unresolvedFunctionCallsites) > 0 { + unresolvedFunctionCallsite := root.unresolvedFunctionCallsites[0] + root.unresolvedFunctionCallsites = root.unresolvedFunctionCallsites[1:] functionName := unresolvedFunctionCallsite.udf.signature.funcOrSubrName callsiteArity := unresolvedFunctionCallsite.udf.signature.arity @@ -404,10 +403,9 @@ func (root *RootNode) resolveFunctionCallsites() error { } func (root *RootNode) resolveSubroutineCallsites() error { - for root.unresolvedSubroutineCallsites.Len() > 0 { - unresolvedSubroutineCallsite := root.unresolvedSubroutineCallsites.Remove( - root.unresolvedSubroutineCallsites.Front(), - ).(*UDSCallsite) + for len(root.unresolvedSubroutineCallsites) > 0 { + unresolvedSubroutineCallsite := root.unresolvedSubroutineCallsites[0] + root.unresolvedSubroutineCallsites = root.unresolvedSubroutineCallsites[1:] subroutineName := unresolvedSubroutineCallsite.uds.signature.funcOrSubrName 
callsiteArity := unresolvedSubroutineCallsite.uds.signature.arity @@ -417,7 +415,7 @@ func (root *RootNode) resolveSubroutineCallsites() error { return err } if uds == nil { - return fmt.Errorf("mlr: subroutine name not found: " + subroutineName) + return errors.New("mlr: subroutine name not found: " + subroutineName) } unresolvedSubroutineCallsite.uds = uds @@ -437,12 +435,11 @@ func (root *RootNode) resolveSubroutineCallsites() error { func (root *RootNode) RegisterOutputHandlerManager( outputHandlerManager output.OutputHandlerManager, ) { - root.outputHandlerManagers.PushBack(outputHandlerManager) + root.outputHandlerManagers = append(root.outputHandlerManagers, outputHandlerManager) } func (root *RootNode) ProcessEndOfStream() { - for entry := root.outputHandlerManagers.Front(); entry != nil; entry = entry.Next() { - outputHandlerManager := entry.Value.(output.OutputHandlerManager) + for _, outputHandlerManager := range root.outputHandlerManagers { errs := outputHandlerManager.Close() if len(errs) != 0 { for _, err := range errs { @@ -500,8 +497,8 @@ func (root *RootNode) ExecuteREPLImmediate(state *runtime.State) (outrec *mlrval // This is the 'and then discarded' part of that. func (root *RootNode) ResetForREPL() { root.replImmediateBlock = NewStatementBlockNode() - root.unresolvedFunctionCallsites = list.New() - root.unresolvedSubroutineCallsites = list.New() + root.unresolvedFunctionCallsites = make([]*UDFCallsite, 0) + root.unresolvedSubroutineCallsites = make([]*UDSCallsite, 0) } // This is for the REPL's context-printer command. 
diff --git a/pkg/dsl/cst/signature.go b/pkg/dsl/cst/signature.go index 1ee554763..210ac4a4e 100644 --- a/pkg/dsl/cst/signature.go +++ b/pkg/dsl/cst/signature.go @@ -6,7 +6,7 @@ package cst import ( - "github.com/johnkerl/miller/pkg/types" + "github.com/johnkerl/miller/v6/pkg/types" ) // ---------------------------------------------------------------- diff --git a/pkg/dsl/cst/statements.go b/pkg/dsl/cst/statements.go index ce42cb854..8e8edba72 100644 --- a/pkg/dsl/cst/statements.go +++ b/pkg/dsl/cst/statements.go @@ -8,7 +8,7 @@ package cst import ( "fmt" - "github.com/johnkerl/miller/pkg/dsl" + "github.com/johnkerl/miller/v6/pkg/dsl" ) // ---------------------------------------------------------------- @@ -67,9 +67,9 @@ func (root *RootNode) BuildStatementNode( return root.BuildEmitPStatementNode(astNode) case dsl.NodeTypeBeginBlock: - return nil, fmt.Errorf("mlr: begin blocks may only be declared at top level.") + return nil, fmt.Errorf("mlr: begin blocks may only be declared at top level") case dsl.NodeTypeEndBlock: - return nil, fmt.Errorf("mlr: end blocks may only be declared at top level.") + return nil, fmt.Errorf("mlr: end blocks may only be declared at top level") case dsl.NodeTypeIfChain: return root.BuildIfChainNode(astNode) @@ -89,9 +89,9 @@ func (root *RootNode) BuildStatementNode( return root.BuildTripleForLoopNode(astNode) case dsl.NodeTypeNamedFunctionDefinition: - return nil, fmt.Errorf("mlr: functions may only be declared at top level.") + return nil, fmt.Errorf("mlr: functions may only be declared at top level") case dsl.NodeTypeSubroutineDefinition: - return nil, fmt.Errorf("mlr: subroutines may only be declared at top level.") + return nil, fmt.Errorf("mlr: subroutines may only be declared at top level") case dsl.NodeTypeSubroutineCallsite: return root.BuildSubroutineCallsiteNode(astNode) @@ -104,7 +104,6 @@ func (root *RootNode) BuildStatementNode( default: return nil, fmt.Errorf("at CST BuildStatementNode: unhandled AST node %s", 
string(astNode.Type)) - break } return statement, nil } diff --git a/pkg/dsl/cst/subroutines.go b/pkg/dsl/cst/subroutines.go index 6c1b76dae..3f04de745 100644 --- a/pkg/dsl/cst/subroutines.go +++ b/pkg/dsl/cst/subroutines.go @@ -9,8 +9,8 @@ package cst import ( - "github.com/johnkerl/miller/pkg/dsl" - "github.com/johnkerl/miller/pkg/lib" + "github.com/johnkerl/miller/v6/pkg/dsl" + "github.com/johnkerl/miller/v6/pkg/lib" ) // ---------------------------------------------------------------- diff --git a/pkg/dsl/cst/tee.go b/pkg/dsl/cst/tee.go index df9b8ff0b..c0ef376f1 100644 --- a/pkg/dsl/cst/tee.go +++ b/pkg/dsl/cst/tee.go @@ -7,12 +7,12 @@ package cst import ( "fmt" - "github.com/johnkerl/miller/pkg/dsl" - "github.com/johnkerl/miller/pkg/lib" - "github.com/johnkerl/miller/pkg/mlrval" - "github.com/johnkerl/miller/pkg/output" - "github.com/johnkerl/miller/pkg/runtime" - "github.com/johnkerl/miller/pkg/types" + "github.com/johnkerl/miller/v6/pkg/dsl" + "github.com/johnkerl/miller/v6/pkg/lib" + "github.com/johnkerl/miller/v6/pkg/mlrval" + "github.com/johnkerl/miller/v6/pkg/output" + "github.com/johnkerl/miller/v6/pkg/runtime" + "github.com/johnkerl/miller/v6/pkg/types" ) // ---------------------------------------------------------------- @@ -121,7 +121,7 @@ func (root *RootNode) BuildTeeStatementNode(astNode *dsl.ASTNode) (IExecutable, } else if redirectorNode.Type == dsl.NodeTypeRedirectPipe { retval.outputHandlerManager = output.NewPipeWriteHandlerManager(root.recordWriterOptions) } else { - return nil, fmt.Errorf("mlr: unhandled redirector node type %s.", string(redirectorNode.Type)) + return nil, fmt.Errorf("mlr: unhandled redirector node type %s", string(redirectorNode.Type)) } } @@ -138,7 +138,7 @@ func (root *RootNode) BuildTeeStatementNode(astNode *dsl.ASTNode) (IExecutable, func (node *TeeStatementNode) Execute(state *runtime.State) (*BlockExitPayload, error) { expression := node.expressionEvaluable.Evaluate(state) if !expression.IsMap() { - return nil, 
fmt.Errorf("mlr: tee-evaluaiton yielded %s, not map.", expression.GetTypeName()) + return nil, fmt.Errorf("mlr: tee-evaluaiton yielded %s, not map", expression.GetTypeName()) } err := node.teeToRedirectFunc(expression.GetMap(), state) return nil, err @@ -151,7 +151,7 @@ func (node *TeeStatementNode) teeToFileOrPipe( ) error { redirectorTarget := node.redirectorTargetEvaluable.Evaluate(state) if !redirectorTarget.IsString() { - return fmt.Errorf("mlr: output redirection yielded %s, not string.", redirectorTarget.GetTypeName()) + return fmt.Errorf("mlr: output redirection yielded %s, not string", redirectorTarget.GetTypeName()) } outputFileName := redirectorTarget.String() diff --git a/pkg/dsl/cst/types.go b/pkg/dsl/cst/types.go index 4fa935edf..dea4861a6 100644 --- a/pkg/dsl/cst/types.go +++ b/pkg/dsl/cst/types.go @@ -5,12 +5,11 @@ package cst import ( - "container/list" - - "github.com/johnkerl/miller/pkg/cli" - "github.com/johnkerl/miller/pkg/dsl" - "github.com/johnkerl/miller/pkg/mlrval" - "github.com/johnkerl/miller/pkg/runtime" + "github.com/johnkerl/miller/v6/pkg/cli" + "github.com/johnkerl/miller/v6/pkg/dsl" + "github.com/johnkerl/miller/v6/pkg/mlrval" + "github.com/johnkerl/miller/v6/pkg/output" + "github.com/johnkerl/miller/v6/pkg/runtime" ) // ---------------------------------------------------------------- @@ -44,9 +43,9 @@ type RootNode struct { udfManager *UDFManager udsManager *UDSManager allowUDFUDSRedefinitions bool - unresolvedFunctionCallsites *list.List - unresolvedSubroutineCallsites *list.List - outputHandlerManagers *list.List + unresolvedFunctionCallsites []*UDFCallsite + unresolvedSubroutineCallsites []*UDSCallsite + outputHandlerManagers []output.OutputHandlerManager recordWriterOptions *cli.TWriterOptions dslInstanceType DSLInstanceType // put, filter, repl strictMode bool @@ -58,7 +57,7 @@ type RootNode struct { type NodeBuilder func(astNode *dsl.ASTNode) (IEvaluable, error) // 
---------------------------------------------------------------- -// This is for all statements and statemnt blocks within the CST. +// This is for all statements and statement blocks within the CST. type IExecutable interface { Execute(state *runtime.State) (*BlockExitPayload, error) } diff --git a/pkg/dsl/cst/udf.go b/pkg/dsl/cst/udf.go index 9be4bf59c..f3550669e 100644 --- a/pkg/dsl/cst/udf.go +++ b/pkg/dsl/cst/udf.go @@ -8,11 +8,11 @@ import ( "fmt" "os" - "github.com/johnkerl/miller/pkg/dsl" - "github.com/johnkerl/miller/pkg/lib" - "github.com/johnkerl/miller/pkg/mlrval" - "github.com/johnkerl/miller/pkg/runtime" - "github.com/johnkerl/miller/pkg/types" + "github.com/johnkerl/miller/v6/pkg/dsl" + "github.com/johnkerl/miller/v6/pkg/lib" + "github.com/johnkerl/miller/v6/pkg/mlrval" + "github.com/johnkerl/miller/v6/pkg/runtime" + "github.com/johnkerl/miller/v6/pkg/types" ) // ---------------------------------------------------------------- @@ -223,6 +223,8 @@ func (site *UDFCallsite) EvaluateWithArguments( state.Stack.PushStackFrameSet() defer state.Stack.PopStackFrameSet() } + state.PushRegexCapturesFrame() + defer state.PopRegexCapturesFrame() cacheable := !udf.isFunctionLiteral @@ -401,7 +403,7 @@ func (root *RootNode) BuildAndInstallUDF(astNode *dsl.ASTNode) error { if BuiltinFunctionManagerInstance.LookUp(functionName) != nil { return fmt.Errorf( - "mlr: function named \"%s\" must not override a built-in function of the same name.", + `mlr: function named "%s" must not override a built-in function of the same name`, functionName, ) } @@ -409,7 +411,7 @@ func (root *RootNode) BuildAndInstallUDF(astNode *dsl.ASTNode) error { if !root.allowUDFUDSRedefinitions { if root.udfManager.ExistsByName(functionName) { return fmt.Errorf( - "mlr: function named \"%s\" has already been defined.", + `mlr: function named "%s" has already been defined`, functionName, ) } @@ -493,6 +495,9 @@ func (root *RootNode) BuildUDF( "function return value", returnValueTypeName, ) + if 
err != nil { + return nil, err + } lib.InternalCodingErrorIf(parameterListASTNode.Type != dsl.NodeTypeParameterList) lib.InternalCodingErrorIf(parameterListASTNode.Children == nil) diff --git a/pkg/dsl/cst/uds.go b/pkg/dsl/cst/uds.go index 3a72e4c23..f42d5fc20 100644 --- a/pkg/dsl/cst/uds.go +++ b/pkg/dsl/cst/uds.go @@ -7,11 +7,11 @@ package cst import ( "fmt" - "github.com/johnkerl/miller/pkg/dsl" - "github.com/johnkerl/miller/pkg/lib" - "github.com/johnkerl/miller/pkg/mlrval" - "github.com/johnkerl/miller/pkg/runtime" - "github.com/johnkerl/miller/pkg/types" + "github.com/johnkerl/miller/v6/pkg/dsl" + "github.com/johnkerl/miller/v6/pkg/lib" + "github.com/johnkerl/miller/v6/pkg/mlrval" + "github.com/johnkerl/miller/v6/pkg/runtime" + "github.com/johnkerl/miller/v6/pkg/types" ) // ---------------------------------------------------------------- @@ -120,6 +120,8 @@ func (site *UDSCallsite) Execute(state *runtime.State) (*BlockExitPayload, error // Bind the arguments to the parameters state.Stack.PushStackFrameSet() defer state.Stack.PopStackFrameSet() + state.PushRegexCapturesFrame() + defer state.PopRegexCapturesFrame() for i := range arguments { err := state.Stack.DefineTypedAtScope( @@ -242,7 +244,7 @@ func (root *RootNode) BuildAndInstallUDS(astNode *dsl.ASTNode) error { if !root.allowUDFUDSRedefinitions { if root.udsManager.ExistsByName(subroutineName) { return fmt.Errorf( - "mlr: subroutine named \"%s\" has already been defined.", + `mlr: subroutine named "%s" has already been defined`, subroutineName, ) } diff --git a/pkg/dsl/cst/validate.go b/pkg/dsl/cst/validate.go index 305c1bf7f..8783c6003 100644 --- a/pkg/dsl/cst/validate.go +++ b/pkg/dsl/cst/validate.go @@ -9,8 +9,8 @@ package cst import ( "fmt" - "github.com/johnkerl/miller/pkg/dsl" - "github.com/johnkerl/miller/pkg/lib" + "github.com/johnkerl/miller/v6/pkg/dsl" + "github.com/johnkerl/miller/v6/pkg/lib" ) // ---------------------------------------------------------------- @@ -29,9 +29,9 @@ func 
ValidateAST( // They can do mlr put '': there are simply zero statements. // But filter '' is an error. - if ast.RootNode.Children == nil || len(ast.RootNode.Children) == 0 { + if len(ast.RootNode.Children) == 0 { if dslInstanceType == DSLInstanceTypeFilter { - return fmt.Errorf("mlr: filter statement must not be empty.") + return fmt.Errorf("mlr: filter statement must not be empty") } } @@ -80,7 +80,7 @@ func validateASTAux( if astNode.Type == dsl.NodeTypeFilterStatement { if dslInstanceType == DSLInstanceTypeFilter { return fmt.Errorf( - "mlr: filter expressions must not also contain the \"filter\" keyword.", + `mlr: filter expressions must not also contain the "filter" keyword`, ) } } @@ -89,21 +89,21 @@ func validateASTAux( if astNode.Type == dsl.NodeTypeBeginBlock { if !atTopLevel { return fmt.Errorf( - "mlr: begin blocks can only be at top level.", + "mlr: begin blocks can only be at top level", ) } nextLevelInBeginOrEnd = true } else if astNode.Type == dsl.NodeTypeEndBlock { if !atTopLevel { return fmt.Errorf( - "mlr: end blocks can only be at top level.", + "mlr: end blocks can only be at top level", ) } nextLevelInBeginOrEnd = true } else if astNode.Type == dsl.NodeTypeNamedFunctionDefinition { if !atTopLevel { return fmt.Errorf( - "mlr: func blocks can only be at top level.", + "mlr: func blocks can only be at top level", ) } nextLevelInUDF = true @@ -112,7 +112,7 @@ func validateASTAux( } else if astNode.Type == dsl.NodeTypeSubroutineDefinition { if !atTopLevel { return fmt.Errorf( - "mlr: subr blocks can only be at top level.", + "mlr: subr blocks can only be at top level", ) } nextLevelInUDS = true @@ -134,7 +134,7 @@ func validateASTAux( astNode.Type == dsl.NodeTypeIndirectFieldValue || astNode.Type == dsl.NodeTypeFullSrec { return fmt.Errorf( - "mlr: begin/end blocks cannot refer to records via $x, $*, etc.", + "mlr: begin/end blocks cannot refer to records via $x, $*, etc", ) } } @@ -143,7 +143,7 @@ func validateASTAux( if !inLoop { if astNode.Type 
== dsl.NodeTypeBreak { return fmt.Errorf( - "mlr: break statements are only valid within for/do/while loops.", + "mlr: break statements are only valid within for/do/while loops", ) } } @@ -151,7 +151,7 @@ func validateASTAux( if !inLoop { if astNode.Type == dsl.NodeTypeContinue { return fmt.Errorf( - "mlr: break statements are only valid within for/do/while loops.", + "mlr: break statements are only valid within for/do/while loops", ) } } @@ -169,7 +169,7 @@ func validateASTAux( if !inUDF && !inUDS { if astNode.Type == dsl.NodeTypeReturn { return fmt.Errorf( - "mlr: return statements are only valid within func/subr blocks.", + "mlr: return statements are only valid within func/subr blocks", ) } } @@ -179,14 +179,14 @@ func validateASTAux( if inUDF { if len(astNode.Children) != 1 { return fmt.Errorf( - "mlr: return statements in func blocks must return a value.", + "mlr: return statements in func blocks must return a value", ) } } if inUDS { if len(astNode.Children) != 0 { return fmt.Errorf( - "mlr: return statements in subr blocks must not return a value.", + "mlr: return statements in subr blocks must not return a value", ) } } @@ -197,7 +197,7 @@ func validateASTAux( ok := VALID_LHS_NODE_TYPES[astNode.Type] if !ok { return fmt.Errorf( - "mlr: %s is not valid on the left-hand side of an assignment.", + "mlr: %s is not valid on the left-hand side of an assignment", astNode.Type, ) } @@ -208,7 +208,7 @@ func validateASTAux( ok := VALID_LHS_NODE_TYPES[astNode.Type] if !ok { return fmt.Errorf( - "mlr: %s is not valid for unset statement.", + "mlr: %s is not valid for unset statement", astNode.Type, ) } @@ -259,7 +259,7 @@ func validateForLoopTwoVariableUniqueNames(astNode *dsl.ASTNode) error { keyVarName := string(keyVarNode.Token.Lit) valVarName := string(valVarNode.Token.Lit) if keyVarName == valVarName { - return fmt.Errorf("mlr: redefinition of variable %s in the same scope.", keyVarName) + return fmt.Errorf("mlr: redefinition of variable %s in the same scope", 
keyVarName) } else { return nil } @@ -289,14 +289,14 @@ func validateForLoopMultivariableUniqueNames(astNode *dsl.ASTNode) error { name := string(keyVarNode.Token.Lit) _, present := seen[name] if present { - return fmt.Errorf("mlr: redefinition of variable %s in the same scope.", name) + return fmt.Errorf("mlr: redefinition of variable %s in the same scope", name) } seen[name] = true } valVarName := string(valVarNode.Token.Lit) if seen[valVarName] { - return fmt.Errorf("mlr: redefinition of variable %s in the same scope.", valVarName) + return fmt.Errorf("mlr: redefinition of variable %s in the same scope", valVarName) } return nil diff --git a/pkg/dsl/cst/warn.go b/pkg/dsl/cst/warn.go index 55850c8b1..75c5d0436 100644 --- a/pkg/dsl/cst/warn.go +++ b/pkg/dsl/cst/warn.go @@ -11,8 +11,8 @@ import ( "fmt" "os" - "github.com/johnkerl/miller/pkg/dsl" - "github.com/johnkerl/miller/pkg/lib" + "github.com/johnkerl/miller/v6/pkg/dsl" + "github.com/johnkerl/miller/v6/pkg/lib" ) // ---------------------------------------------------------------- diff --git a/pkg/dsl/cst/while.go b/pkg/dsl/cst/while.go index 4e088df6c..ccdde0052 100644 --- a/pkg/dsl/cst/while.go +++ b/pkg/dsl/cst/while.go @@ -7,10 +7,10 @@ package cst import ( "fmt" - "github.com/johnkerl/miller/pkg/dsl" - "github.com/johnkerl/miller/pkg/lib" - "github.com/johnkerl/miller/pkg/parsing/token" - "github.com/johnkerl/miller/pkg/runtime" + "github.com/johnkerl/miller/v6/pkg/dsl" + "github.com/johnkerl/miller/v6/pkg/lib" + "github.com/johnkerl/miller/v6/pkg/parsing/token" + "github.com/johnkerl/miller/v6/pkg/runtime" ) // ================================================================ @@ -60,11 +60,11 @@ func (node *WhileLoopNode) Execute(state *runtime.State) (*BlockExitPayload, err boolValue, isBool := condition.GetBoolValue() if !isBool { return nil, fmt.Errorf( - "mlr: conditional expression did not evaluate to boolean%s.", + "mlr: conditional expression did not evaluate to boolean%s", 
dsl.TokenToLocationInfo(node.conditionToken), ) } - if boolValue != true { + if !boolValue { break } blockExitPayload, err := node.statementBlockNode.Execute(state) @@ -157,11 +157,11 @@ func (node *DoWhileLoopNode) Execute(state *runtime.State) (*BlockExitPayload, e boolValue, isBool := condition.GetBoolValue() if !isBool { return nil, fmt.Errorf( - "mlr: conditional expression did not evaluate to boolean%s.", + "mlr: conditional expression did not evaluate to boolean%s", dsl.TokenToLocationInfo(node.conditionToken), ) } - if boolValue == false { + if !boolValue { break } } diff --git a/pkg/dsl/token.go b/pkg/dsl/token.go index ff79d26e0..6808941d2 100644 --- a/pkg/dsl/token.go +++ b/pkg/dsl/token.go @@ -3,7 +3,7 @@ package dsl import ( "fmt" - "github.com/johnkerl/miller/pkg/parsing/token" + "github.com/johnkerl/miller/v6/pkg/parsing/token" ) // TokenToLocationInfo is used to track runtime errors back to source-code locations in DSL diff --git a/pkg/entrypoint/entrypoint.go b/pkg/entrypoint/entrypoint.go index 7f7fab711..d8c56c8cf 100644 --- a/pkg/entrypoint/entrypoint.go +++ b/pkg/entrypoint/entrypoint.go @@ -7,17 +7,16 @@ package entrypoint import ( "fmt" - "io/ioutil" "os" "path" - "github.com/johnkerl/miller/pkg/auxents" - "github.com/johnkerl/miller/pkg/cli" - "github.com/johnkerl/miller/pkg/climain" - "github.com/johnkerl/miller/pkg/lib" - "github.com/johnkerl/miller/pkg/platform" - "github.com/johnkerl/miller/pkg/stream" - "github.com/johnkerl/miller/pkg/transformers" + "github.com/johnkerl/miller/v6/pkg/auxents" + "github.com/johnkerl/miller/v6/pkg/cli" + "github.com/johnkerl/miller/v6/pkg/climain" + "github.com/johnkerl/miller/v6/pkg/lib" + "github.com/johnkerl/miller/v6/pkg/platform" + "github.com/johnkerl/miller/v6/pkg/stream" + "github.com/johnkerl/miller/v6/pkg/transformers" ) type MainReturn struct { @@ -37,15 +36,6 @@ func Main() MainReturn { // otherwise, we only raw ANSI escape sequences like ←[0;30m 0←[0m ←[0;31m 1 
platform.EnableAnsiEscapeSequences() - // Expand "-xyz" into "-x -y -z" while leaving "--xyz" intact. This is a - // keystroke-saver for the user. - // - // This is OK to do globally here since Miller is quite consistent (in - // main, verbs, and auxents) that multi-character options start with two - // dashes, e.g. "--csv". (The sole exception is the sort verb's -nf/-nr - // which are handled specially there.) - os.Args = lib.Getoptify(os.Args) - // 'mlr repl' or 'mlr lecat' or any other non-miller-per-se toolery which // is delivered (for convenience) within the mlr executable. If argv[1] is // found then this function will not return. @@ -60,7 +50,7 @@ func Main() MainReturn { if !options.DoInPlace { err = processToStdout(options, recordTransformers) } else { - err = processInPlace(options) + err = processFilesInPlace(options) } if err != nil { fmt.Fprintf(os.Stderr, "mlr: %v.\n", err) @@ -83,7 +73,7 @@ func processToStdout( } // ---------------------------------------------------------------- -// processInPlace is in-place processing without mlr -I. +// processFilesInPlace is in-place processing without mlr -I. // // For in-place mode, reconstruct the transformers on each input file. E.g. // 'mlr -I head -n 2 foo bar' should do head -n 2 on foo as well as on bar. @@ -95,7 +85,7 @@ func processToStdout( // frequently used code path, this would likely lead to latent bugs. So this // approach leads to greater code stability. -func processInPlace( +func processFilesInPlace( originalOptions *cli.TOptions, ) error { // This should have been already checked by the CLI parser when validating @@ -105,84 +95,107 @@ func processInPlace( // Save off the file names from the command line. 
fileNames := make([]string, len(originalOptions.FileNames)) - for i, fileName := range originalOptions.FileNames { - fileNames[i] = fileName - } + copy(fileNames, originalOptions.FileNames) for _, fileName := range fileNames { - - if _, err := os.Stat(fileName); os.IsNotExist(err) { - return err - } - - // Reconstruct the transformers for each file name, and allocate - // reader, mappers, and writer individually for each file name. This - // way CSV headers appear in each file, head -n 10 puts 10 rows for - // each output file, and so on. - options, recordTransformers, err := climain.ParseCommandLine(os.Args) + err := processFileInPlace(fileName, originalOptions) if err != nil { return err } - - // We can't in-place update http://, https://, etc. Also, anything with - // --prepipe or --prepipex, we won't try to guess how to invert that - // command to produce re-compressed output. - err = lib.IsUpdateableInPlace(fileName, options.ReaderOptions.Prepipe) - if err != nil { - return err - } - - containingDirectory := path.Dir(fileName) - // Names like ./mlr-in-place-2148227797 and ./mlr-in-place-1792078347, - // as revealed by printing handle.Name(). - handle, err := ioutil.TempFile(containingDirectory, "mlr-in-place-") - if err != nil { - return err - } - tempFileName := handle.Name() - - // If the input file is compressed and we'll be doing in-process - // decompression as we read the input file, try to do in-process - // compression as we write the output. - inputFileEncoding := lib.FindInputEncoding(fileName, options.ReaderOptions.FileInputEncoding) - - // Get a handle with, perhaps, a recompression wrapper around it. - wrappedHandle, isNew, err := lib.WrapOutputHandle(handle, inputFileEncoding) - if err != nil { - os.Remove(tempFileName) - return err - } - - // Run the Miller processing stream from the input file to the temp-output file. 
- err = stream.Stream([]string{fileName}, options, recordTransformers, wrappedHandle, false) - if err != nil { - os.Remove(tempFileName) - return err - } - - // Close the recompressor handle, if any recompression is being applied. - if isNew { - err = wrappedHandle.Close() - if err != nil { - os.Remove(tempFileName) - return err - } - } - - // Close the handle to the output file. This may force final writes, so - // it must be error-checked. - err = handle.Close() - if err != nil { - os.Remove(tempFileName) - return err - } - - // Rename the temp-output file on top of the input file. - err = os.Rename(tempFileName, fileName) - if err != nil { - os.Remove(tempFileName) - return err - } } return nil } + +func processFileInPlace( + fileName string, + originalOptions *cli.TOptions, +) error { + + if _, err := os.Stat(fileName); os.IsNotExist(err) { + return err + } + + // Reconstruct the transformers for each file name, and allocate + // reader, mappers, and writer individually for each file name. This + // way CSV headers appear in each file, head -n 10 puts 10 rows for + // each output file, and so on. + options, recordTransformers, err := climain.ParseCommandLine(os.Args) + if err != nil { + return err + } + + // We can't in-place update http://, https://, etc. Also, anything with + // --prepipe or --prepipex, we won't try to guess how to invert that + // command to produce re-compressed output. + err = lib.IsUpdateableInPlace(fileName, options.ReaderOptions.Prepipe) + if err != nil { + return err + } + + // Get the original file's mode so we can preserve it. + fileInfo, err := os.Stat(fileName) + if err != nil { + return err + } + originalMode := fileInfo.Mode() + + containingDirectory := path.Dir(fileName) + // Names like ./mlr-in-place-2148227797 and ./mlr-in-place-1792078347, + // as revealed by printing handle.Name(). 
+ handle, err := os.CreateTemp(containingDirectory, "mlr-in-place-") + if err != nil { + return err + } + tempFileName := handle.Name() + + // If the input file is compressed and we'll be doing in-process + // decompression as we read the input file, try to do in-process + // compression as we write the output. + inputFileEncoding := lib.FindInputEncoding(fileName, options.ReaderOptions.FileInputEncoding) + + // Get a handle with, perhaps, a recompression wrapper around it. + wrappedHandle, isNew, err := lib.WrapOutputHandle(handle, inputFileEncoding) + if err != nil { + os.Remove(tempFileName) + return err + } + + // Run the Miller processing stream from the input file to the temp-output file. + err = stream.Stream([]string{fileName}, options, recordTransformers, wrappedHandle, false) + if err != nil { + os.Remove(tempFileName) + return err + } + + // Close the recompressor handle, if any recompression is being applied. + if isNew { + err = wrappedHandle.Close() + if err != nil { + os.Remove(tempFileName) + return err + } + } + + // Close the handle to the output file. This may force final writes, so + // it must be error-checked. + err = handle.Close() + if err != nil { + os.Remove(tempFileName) + return err + } + + // Rename the temp-output file on top of the input file. + err = os.Rename(tempFileName, fileName) + if err != nil { + os.Remove(tempFileName) + return err + } + + // Set the mode to match the original. 
+ err = os.Chmod(fileName, originalMode) + if err != nil { + return err + } + + return nil +} diff --git a/pkg/go-csv/csv_reader.go b/pkg/go-csv/csv_reader.go index 507e9a94c..5a0820a01 100644 --- a/pkg/go-csv/csv_reader.go +++ b/pkg/go-csv/csv_reader.go @@ -311,15 +311,28 @@ func (r *Reader) readRecord(dst []string) ([]string, error) { var errRead error for errRead == nil { line, errRead = r.readLine() - if r.Comment != 0 && nextRune(line) == r.Comment { - line = nil - continue // Skip comment lines - } + + // MILLER-SPECIFIC UPDATE: DO NOT DO THIS + // if r.Comment != 0 && nextRune(line) == r.Comment { + // line = nil + // continue // Skip comment lines + // } + // MILLER-SPECIFIC UPDATE: DO NOT DO THIS // if errRead == nil && len(line) == lengthNL(line) { - // line = nil - // continue // Skip empty lines + // line = nil + // continue // Skip empty lines // } + + // MILLER-SPECIFIC UPDATE: If the line starts with the comment character, + // don't attempt to CSV-parse it -- just hand it back as a single field. + // This allows two things: + // * User comments get passed through as intended, without being reformatted; + // * Users can do things like `# a"b` in their comments without getting an + // imbalanced-double-quote error. + if r.Comment != 0 && nextRune(line) == r.Comment { + return []string{string(line)}, nil + } break } if errRead == io.EOF { diff --git a/pkg/input/constants.go b/pkg/input/constants.go new file mode 100644 index 000000000..42030c3eb --- /dev/null +++ b/pkg/input/constants.go @@ -0,0 +1,3 @@ +package input + +const CSV_BOM = "\xef\xbb\xbf" diff --git a/pkg/input/line_reader.go b/pkg/input/line_reader.go new file mode 100644 index 000000000..b1f965307 --- /dev/null +++ b/pkg/input/line_reader.go @@ -0,0 +1,223 @@ +// This file contains the interface for file-format-specific record-readers, as +// well as a collection of utility functions. 
+ +package input + +import ( + "bufio" + "container/list" + "io" + "strings" + + "github.com/johnkerl/miller/v6/pkg/lib" +) + +type ILineReader interface { + // Read returns the string without the final newline (or whatever terminator). + // The error condition io.EOF as non-error "error" case. + // EOF is always returned with empty line: the code here is structured so that + // we do not return a non-empty line along with an EOF indicator. + Read() (string, error) +} + +type DefaultLineReader struct { + underlying *bufio.Reader + eof bool +} + +// SingleIRSLineReader handles reading lines with a single-character terminator. +type SingleIRSLineReader struct { + underlying *bufio.Reader + end_irs byte + eof bool +} + +// MultiIRSLineReader handles reading lines which may be delimited by multi-line separators, e.g. +// "\xe2\x90\x9e" for USV. +type MultiIRSLineReader struct { + underlying *bufio.Reader + irs string + irs_len int + end_irs byte + eof bool +} + +func NewLineReader(handle io.Reader, irs string) ILineReader { + underlying := bufio.NewReader(handle) + + irs_len := len(irs) + + // Not worth complicating the API by adding an error return. + // Empty IRS is checked elsewhere. + if irs_len < 1 { + panic("Empty IRS") + + } else if irs == "\n" || irs == "\r\n" { + return &DefaultLineReader{ + underlying: underlying, + } + + } else if irs_len == 1 { + return &SingleIRSLineReader{ + underlying: underlying, + end_irs: irs[0], + } + + } else { + return &MultiIRSLineReader{ + underlying: underlying, + irs: irs, + irs_len: irs_len, + end_irs: irs[irs_len-1], + } + } +} + +func (r *DefaultLineReader) Read() (string, error) { + + if r.eof { + return "", io.EOF + } + + line, err := r.underlying.ReadString('\n') + + // If we have EOF and a non-empty line, defer the EOF return to the next Read call. 
+ if len(line) > 0 && lib.IsEOF(err) { + r.eof = true + err = nil + } + + n := len(line) + if strings.HasSuffix(line, "\r\n") { + line = line[:n-2] + } else if strings.HasSuffix(line, "\n") { + line = line[:n-1] + } + + return line, err +} + +func (r *SingleIRSLineReader) Read() (string, error) { + + if r.eof { + return "", io.EOF + } + + line, err := r.underlying.ReadString(r.end_irs) + + // If we have EOF and a non-empty line, defer the EOF return to the next Read call. + if len(line) > 0 && lib.IsEOF(err) { + r.eof = true + err = nil + } + + n := len(line) + if n > 0 && line[n-1] == r.end_irs { + line = line[:n-1] + } + + return line, err +} + +func (r *MultiIRSLineReader) Read() (string, error) { + + // bufio.Reader.ReadString supports only a single-character terminator. So we read lines ending + // in the final character, until we get a line that ends in the entire sequence or EOF. + // + // Note that bufio.Scanner has a very nice bufio.Scanner.Split method which can be overridden to + // support custom line-ending logic. Sadly, though, bufio.Scanner _only_ supports a fixed + // maximum line length, and misbehaves badly when presented with longer lines. So we cannot use + // bufio.Scanner. See also https://github.com/johnkerl/miller/issues/1501. + + if r.eof { + return "", io.EOF + } + + line := "" + + for { + + piece, err := r.underlying.ReadString(r.end_irs) + + // If we have EOF and a non-empty line, defer the EOF return to the next Read call. + if len(piece) > 0 && lib.IsEOF(err) { + r.eof = true + err = nil + } + + if err != nil { + return line, err // includes io.EOF as a non-error "error" case + } + + if strings.HasSuffix(piece, r.irs) { + piece = piece[:len(piece)-r.irs_len] + line += piece + break + } + + if r.eof { + line += piece + break + } + + } + + return line, nil +} + +// channelizedLineReader puts the line reading/splitting into its own goroutine in order to pipeline +// the I/O with regard to further processing. 
Used by record-readers for multiple file formats. +// +// Lines are written to the channel with their trailing newline (or whatever +// IRS) stripped off. So, callers get "a=1,b=2" rather than "a=1,b=2\n". +func channelizedLineReader( + lineReader ILineReader, + linesChannel chan<- *list.List, + downstreamDoneChannel <-chan bool, // for mlr head + recordsPerBatch int64, +) { + i := int64(0) + done := false + + lines := list.New() + + for { + line, err := lineReader.Read() + if err != nil { + if lib.IsEOF(err) { + done = true + break + } else { + break + } + } + + i++ + + lines.PushBack(line) + + // See if downstream processors will be ignoring further data (e.g. mlr + // head). If so, stop reading. This makes 'mlr head hugefile' exit + // quickly, as it should. + if i%recordsPerBatch == 0 { + select { + case <-downstreamDoneChannel: + done = true + break + default: + break + } + if done { + break + } + linesChannel <- lines + lines = list.New() + } + + if done { + break + } + } + linesChannel <- lines + close(linesChannel) // end-of-stream marker +} diff --git a/pkg/input/pseudo_reader_gen.go b/pkg/input/pseudo_reader_gen.go index 6479cb4d7..e847b59ab 100644 --- a/pkg/input/pseudo_reader_gen.go +++ b/pkg/input/pseudo_reader_gen.go @@ -4,10 +4,10 @@ import ( "container/list" "fmt" - "github.com/johnkerl/miller/pkg/bifs" - "github.com/johnkerl/miller/pkg/cli" - "github.com/johnkerl/miller/pkg/mlrval" - "github.com/johnkerl/miller/pkg/types" + "github.com/johnkerl/miller/v6/pkg/bifs" + "github.com/johnkerl/miller/v6/pkg/cli" + "github.com/johnkerl/miller/v6/pkg/mlrval" + "github.com/johnkerl/miller/v6/pkg/types" ) type PseudoReaderGen struct { @@ -96,7 +96,7 @@ func (reader *PseudoReaderGen) process( // avoid goroutine-scheduler thrash. 
eof := false select { - case _ = <-downstreamDoneChannel: + case <-downstreamDoneChannel: eof = true break default: @@ -113,7 +113,6 @@ func (reader *PseudoReaderGen) process( if recordsAndContexts.Len() > 0 { readerChannel <- recordsAndContexts - recordsAndContexts = list.New() } } diff --git a/pkg/input/record_reader.go b/pkg/input/record_reader.go index 280201936..3ad932f2f 100644 --- a/pkg/input/record_reader.go +++ b/pkg/input/record_reader.go @@ -4,19 +4,11 @@ package input import ( - "bufio" "container/list" - "io" - "regexp" - "strings" - "github.com/johnkerl/miller/pkg/cli" - "github.com/johnkerl/miller/pkg/lib" - "github.com/johnkerl/miller/pkg/types" + "github.com/johnkerl/miller/v6/pkg/types" ) -const CSV_BOM = "\xef\xbb\xbf" - // Since Go is concurrent, the context struct (AWK-like variables such as // FILENAME, NF, NF, FNR, etc.) needs to be duplicated and passed through the // channels along with each record. Hence the initial context, which readers @@ -32,166 +24,3 @@ type IRecordReader interface { downstreamDoneChannel <-chan bool, // for mlr head ) } - -// NewLineScanner handles read lines which may be delimited by multi-line separators, -// e.g. "\xe2\x90\x9e" for USV. -func NewLineScanner(handle io.Reader, irs string) *bufio.Scanner { - scanner := bufio.NewScanner(handle) - - // Handled by default scanner. - if irs == "\n" || irs == "\r\n" { - return scanner - } - - irsbytes := []byte(irs) - irslen := len(irsbytes) - - // Custom splitter - recordSplitter := func( - data []byte, - atEOF bool, - ) ( - advance int, - token []byte, - err error, - ) { - datalen := len(data) - end := datalen - irslen - for i := 0; i <= end; i++ { - if data[i] == irsbytes[0] { - match := true - for j := 1; j < irslen; j++ { - if data[i+j] != irsbytes[j] { - match = false - break - } - } - if match { - return i + irslen, data[:i], nil - } - } - } - if !atEOF { - return 0, nil, nil - } - // There is one final token to be delivered, which may be the empty string. 
- // Returning bufio.ErrFinalToken here tells Scan there are no more tokens after this - // but does not trigger an error to be returned from Scan itself. - return 0, data, bufio.ErrFinalToken - } - - scanner.Split(recordSplitter) - - return scanner -} - -// TODO: comment copiously -// -// Lines are written to the channel with their trailing newline (or whatever -// IRS) stripped off. So, callers get "a=1,b=2" rather than "a=1,b=2\n". -func channelizedLineScanner( - lineScanner *bufio.Scanner, - linesChannel chan<- *list.List, - downstreamDoneChannel <-chan bool, // for mlr head - recordsPerBatch int64, -) { - i := int64(0) - done := false - - lines := list.New() - - for lineScanner.Scan() { - i++ - - lines.PushBack(lineScanner.Text()) - - // See if downstream processors will be ignoring further data (e.g. mlr - // head). If so, stop reading. This makes 'mlr head hugefile' exit - // quickly, as it should. - if i%recordsPerBatch == 0 { - select { - case _ = <-downstreamDoneChannel: - done = true - break - default: - break - } - if done { - break - } - linesChannel <- lines - lines = list.New() - } - - if done { - break - } - } - linesChannel <- lines - close(linesChannel) // end-of-stream marker -} - -// IPairSplitter splits a string into left and right, e.g. for IPS. -// This helps us reuse code for splitting by IPS string, or IPS regex. 
-type iPairSplitter interface { - Split(input string) []string -} - -func newPairSplitter(options *cli.TReaderOptions) iPairSplitter { - if options.IPSRegex == nil { - return &tIPSSplitter{ips: options.IPS} - } else { - return &tIPSRegexSplitter{ipsRegex: options.IPSRegex} - } -} - -type tIPSSplitter struct { - ips string -} - -func (s *tIPSSplitter) Split(input string) []string { - return strings.SplitN(input, s.ips, 2) -} - -type tIPSRegexSplitter struct { - ipsRegex *regexp.Regexp -} - -func (s *tIPSRegexSplitter) Split(input string) []string { - return lib.RegexSplitString(s.ipsRegex, input, 2) -} - -// IFieldSplitter splits a string into pieces, e.g. for IFS. -// This helps us reuse code for splitting by IFS string, or IFS regex. -type iFieldSplitter interface { - Split(input string) []string -} - -func newFieldSplitter(options *cli.TReaderOptions) iFieldSplitter { - if options.IFSRegex == nil { - return &tIFSSplitter{ifs: options.IFS, allowRepeatIFS: options.AllowRepeatIFS} - } else { - return &tIFSRegexSplitter{ifsRegex: options.IFSRegex} - } -} - -type tIFSSplitter struct { - ifs string - allowRepeatIFS bool -} - -func (s *tIFSSplitter) Split(input string) []string { - fields := lib.SplitString(input, s.ifs) - if s.allowRepeatIFS { - fields = lib.StripEmpties(fields) // left/right trim - } - return fields -} - -type tIFSRegexSplitter struct { - ifsRegex *regexp.Regexp -} - -func (s *tIFSRegexSplitter) Split(input string) []string { - return lib.RegexSplitString(s.ifsRegex, input, -1) -} diff --git a/pkg/input/record_reader_benchmark_test.go b/pkg/input/record_reader_benchmark_test.go index 9d2352983..7b79c32ad 100644 --- a/pkg/input/record_reader_benchmark_test.go +++ b/pkg/input/record_reader_benchmark_test.go @@ -5,10 +5,10 @@ import ( "github.com/stretchr/testify/assert" - "github.com/johnkerl/miller/pkg/cli" + "github.com/johnkerl/miller/v6/pkg/cli" ) -// go test -run=nonesuch -bench=. github.com/johnkerl/miller/pkg/input/... 
+// go test -run=nonesuch -bench=. github.com/johnkerl/miller/v6/pkg/input/... func BenchmarkDKVPParse(b *testing.B) { readerOptions := &cli.TReaderOptions{ diff --git a/pkg/input/record_reader_csv.go b/pkg/input/record_reader_csv.go index e7135e2fc..aa7dec084 100644 --- a/pkg/input/record_reader_csv.go +++ b/pkg/input/record_reader_csv.go @@ -1,19 +1,18 @@ package input import ( - "bytes" "container/list" "fmt" "io" "strconv" "strings" - csv "github.com/johnkerl/miller/pkg/go-csv" + csv "github.com/johnkerl/miller/v6/pkg/go-csv" - "github.com/johnkerl/miller/pkg/cli" - "github.com/johnkerl/miller/pkg/lib" - "github.com/johnkerl/miller/pkg/mlrval" - "github.com/johnkerl/miller/pkg/types" + "github.com/johnkerl/miller/v6/pkg/cli" + "github.com/johnkerl/miller/v6/pkg/lib" + "github.com/johnkerl/miller/v6/pkg/mlrval" + "github.com/johnkerl/miller/v6/pkg/types" ) // ---------------------------------------------------------------- @@ -40,6 +39,11 @@ func NewRecordReaderCSV( if len(readerOptions.IFS) != 1 { return nil, fmt.Errorf("for CSV, IFS can only be a single character") } + if readerOptions.CommentHandling != cli.CommentsAreData { + if len(readerOptions.CommentString) != 1 { + return nil, fmt.Errorf("for CSV, the comment prefix must be a single character") + } + } return &RecordReaderCSV{ readerOptions: readerOptions, ifs0: readerOptions.IFS[0], @@ -65,8 +69,9 @@ func (reader *RecordReaderCSV) Read( ) if err != nil { errorChannel <- err + } else { + reader.processHandle(handle, "(stdin)", &context, readerChannel, errorChannel, downstreamDoneChannel) } - reader.processHandle(handle, "(stdin)", &context, readerChannel, errorChannel, downstreamDoneChannel) } else { for _, filename := range filenames { handle, err := lib.OpenFileForRead( @@ -101,13 +106,21 @@ func (reader *RecordReaderCSV) processHandle( // Reset state for start of next input file reader.filename = filename reader.rowNumber = 0 - reader.needHeader = !reader.readerOptions.UseImplicitCSVHeader + 
reader.needHeader = !reader.readerOptions.UseImplicitHeader reader.header = nil csvReader := csv.NewReader(NewBOMStrippingReader(handle)) csvReader.Comma = rune(reader.ifs0) csvReader.LazyQuotes = reader.csvLazyQuotes csvReader.TrimLeadingSpace = reader.csvTrimLeadingSpace + + if reader.readerOptions.CommentHandling != cli.CommentsAreData { + if len(reader.readerOptions.CommentString) == 1 { + // Use our modified fork of the go-csv package + csvReader.Comment = rune(reader.readerOptions.CommentString[0]) + } + } + csvRecordsChannel := make(chan *list.List, recordsPerBatch) go channelizedCSVRecordScanner(csvReader, csvRecordsChannel, downstreamDoneChannel, errorChannel, recordsPerBatch) @@ -157,7 +170,7 @@ func channelizedCSVRecordScanner( // quickly, as it should. if i%recordsPerBatch == 0 { select { - case _ = <-downstreamDoneChannel: + case <-downstreamDoneChannel: done = true break default: @@ -243,8 +256,7 @@ func (reader *RecordReaderCSV) getRecordBatch( } else { if !reader.readerOptions.AllowRaggedCSVInput { err := fmt.Errorf( - "mlr: CSV header/data length mismatch %d != %d "+ - "at filename %s row %d.\n", + "mlr: CSV header/data length mismatch %d != %d at filename %s row %d", nh, nd, reader.filename, reader.rowNumber, ) errorChannel <- err @@ -318,46 +330,21 @@ func (reader *RecordReaderCSV) maybeConsumeComment( // However, sadly, bytes.Buffer does not implement io.Writer because // its Write method has pointer receiver. So we have a WorkaroundBuffer // struct below which has non-pointer receiver. - buffer := NewWorkaroundBuffer() - csvWriter := csv.NewWriter(buffer) - csvWriter.Comma = rune(reader.ifs0) - csvWriter.Write(csvRecord) - csvWriter.Flush() - recordsAndContexts.PushBack(types.NewOutputString(buffer.String(), context)) + + // Contract with our fork of the go-csv CSV Reader, and, our own constructor. 
+ lib.InternalCodingErrorIf(len(csvRecord) != 1) + recordsAndContexts.PushBack(types.NewOutputString(csvRecord[0], context)) + } else /* reader.readerOptions.CommentHandling == cli.SkipComments */ { // discard entirely } return false } -// ---------------------------------------------------------------- -// As noted above: wraps a bytes.Buffer, whose Write method has pointer -// receiver, in a struct with non-pointer receiver so that it implements -// io.Writer. - -type WorkaroundBuffer struct { - pbuffer *bytes.Buffer -} - -func NewWorkaroundBuffer() WorkaroundBuffer { - var buffer bytes.Buffer - return WorkaroundBuffer{ - pbuffer: &buffer, - } -} - -func (wb WorkaroundBuffer) Write(p []byte) (n int, err error) { - return wb.pbuffer.Write(p) -} - -func (wb WorkaroundBuffer) String() string { - return wb.pbuffer.String() -} - // ---------------------------------------------------------------- // BOM-stripping // -// Some CSVs start with a "byte-order mark" which is the 3-byte sequene +// Some CSVs start with a "byte-order mark" which is the 3-byte sequence // \xef\xbb\xbf". 
Any file with such contents trips up csv.Reader: // // * If a header line is not double-quoted then we can simply look at the first diff --git a/pkg/input/record_reader_csvlite.go b/pkg/input/record_reader_csvlite.go index 222064358..d658a4f99 100644 --- a/pkg/input/record_reader_csvlite.go +++ b/pkg/input/record_reader_csvlite.go @@ -25,10 +25,10 @@ import ( "strconv" "strings" - "github.com/johnkerl/miller/pkg/cli" - "github.com/johnkerl/miller/pkg/lib" - "github.com/johnkerl/miller/pkg/mlrval" - "github.com/johnkerl/miller/pkg/types" + "github.com/johnkerl/miller/v6/pkg/cli" + "github.com/johnkerl/miller/v6/pkg/lib" + "github.com/johnkerl/miller/v6/pkg/mlrval" + "github.com/johnkerl/miller/v6/pkg/types" ) // recordBatchGetterCSV points to either an explicit-CSV-header or @@ -70,27 +70,7 @@ func NewRecordReaderCSVLite( useVoidRep: false, voidRep: "", } - if reader.readerOptions.UseImplicitCSVHeader { - reader.recordBatchGetter = getRecordBatchImplicitCSVHeader - } else { - reader.recordBatchGetter = getRecordBatchExplicitCSVHeader - } - return reader, nil -} - -func NewRecordReaderPPRINT( - readerOptions *cli.TReaderOptions, - recordsPerBatch int64, -) (*RecordReaderCSVLite, error) { - reader := &RecordReaderCSVLite{ - readerOptions: readerOptions, - recordsPerBatch: recordsPerBatch, - fieldSplitter: newFieldSplitter(readerOptions), - - useVoidRep: true, - voidRep: "-", - } - if reader.readerOptions.UseImplicitCSVHeader { + if reader.readerOptions.UseImplicitHeader { reader.recordBatchGetter = getRecordBatchImplicitCSVHeader } else { reader.recordBatchGetter = getRecordBatchExplicitCSVHeader @@ -114,16 +94,16 @@ func (reader *RecordReaderCSVLite) Read( ) if err != nil { errorChannel <- err - return + } else { + reader.processHandle( + handle, + "(stdin)", + &context, + readerChannel, + errorChannel, + downstreamDoneChannel, + ) } - reader.processHandle( - handle, - "(stdin)", - &context, - readerChannel, - errorChannel, - downstreamDoneChannel, - ) } else { for 
_, filename := range filenames { handle, err := lib.OpenFileForRead( @@ -134,17 +114,17 @@ func (reader *RecordReaderCSVLite) Read( ) if err != nil { errorChannel <- err - return + } else { + reader.processHandle( + handle, + filename, + &context, + readerChannel, + errorChannel, + downstreamDoneChannel, + ) + handle.Close() } - reader.processHandle( - handle, - filename, - &context, - readerChannel, - errorChannel, - downstreamDoneChannel, - ) - handle.Close() } } } @@ -164,9 +144,9 @@ func (reader *RecordReaderCSVLite) processHandle( reader.headerStrings = nil recordsPerBatch := reader.recordsPerBatch - lineScanner := NewLineScanner(handle, reader.readerOptions.IRS) + lineReader := NewLineReader(handle, reader.readerOptions.IRS) linesChannel := make(chan *list.List, recordsPerBatch) - go channelizedLineScanner(lineScanner, linesChannel, downstreamDoneChannel, recordsPerBatch) + go channelizedLineReader(lineReader, linesChannel, downstreamDoneChannel, recordsPerBatch) for { recordsAndContexts, eof := reader.recordBatchGetter(reader, linesChannel, filename, context, errorChannel) @@ -237,8 +217,7 @@ func getRecordBatchExplicitCSVHeader( } else { if !reader.readerOptions.AllowRaggedCSVInput && len(reader.headerStrings) != len(fields) { err := fmt.Errorf( - "mlr: CSV header/data length mismatch %d != %d "+ - "at filename %s line %d.\n", + "mlr: CSV header/data length mismatch %d != %d at filename %s line %d", len(reader.headerStrings), len(fields), filename, reader.inputLineNumber, ) errorChannel <- err @@ -362,8 +341,7 @@ func getRecordBatchImplicitCSVHeader( } else { if !reader.readerOptions.AllowRaggedCSVInput && len(reader.headerStrings) != len(fields) { err := fmt.Errorf( - "mlr: CSV header/data length mismatch %d != %d "+ - "at filename %s line %d.\n", + "mlr: CSV header/data length mismatch %d != %d at filename %s line %d", len(reader.headerStrings), len(fields), filename, reader.inputLineNumber, ) errorChannel <- err diff --git 
a/pkg/input/record_reader_dkvp_nidx.go b/pkg/input/record_reader_dkvp_nidx.go index 5cd92f77d..6a53c8c26 100644 --- a/pkg/input/record_reader_dkvp_nidx.go +++ b/pkg/input/record_reader_dkvp_nidx.go @@ -8,13 +8,13 @@ import ( "strconv" "strings" - "github.com/johnkerl/miller/pkg/cli" - "github.com/johnkerl/miller/pkg/lib" - "github.com/johnkerl/miller/pkg/mlrval" - "github.com/johnkerl/miller/pkg/types" + "github.com/johnkerl/miller/v6/pkg/cli" + "github.com/johnkerl/miller/v6/pkg/lib" + "github.com/johnkerl/miller/v6/pkg/mlrval" + "github.com/johnkerl/miller/v6/pkg/types" ) -// splitter_DKVP_NIDX is a function type for the one bit of code differing +// line_splitter_DKVP_NIDX is a function type for the one bit of code differing // between the DKVP reader and the NIDX reader, namely, how it splits lines. type line_splitter_DKVP_NIDX func(reader *RecordReaderDKVPNIDX, line string) (*mlrval.Mlrmap, error) @@ -68,8 +68,9 @@ func (reader *RecordReaderDKVPNIDX) Read( ) if err != nil { errorChannel <- err + } else { + reader.processHandle(handle, "(stdin)", &context, readerChannel, errorChannel, downstreamDoneChannel) } - reader.processHandle(handle, "(stdin)", &context, readerChannel, errorChannel, downstreamDoneChannel) } else { for _, filename := range filenames { handle, err := lib.OpenFileForRead( @@ -101,9 +102,9 @@ func (reader *RecordReaderDKVPNIDX) processHandle( context.UpdateForStartOfFile(filename) recordsPerBatch := reader.recordsPerBatch - lineScanner := NewLineScanner(handle, reader.readerOptions.IRS) + lineReader := NewLineReader(handle, reader.readerOptions.IRS) linesChannel := make(chan *list.List, recordsPerBatch) - go channelizedLineScanner(lineScanner, linesChannel, downstreamDoneChannel, recordsPerBatch) + go channelizedLineReader(lineReader, linesChannel, downstreamDoneChannel, recordsPerBatch) for { recordsAndContexts, eof := reader.getRecordBatch(linesChannel, errorChannel, context) @@ -168,25 +169,42 @@ func recordFromDKVPLine(reader 
*RecordReaderDKVPNIDX, line string) (*mlrval.Mlrm pairs := reader.fieldSplitter.Split(line) + // Without --incr-key: + // echo 'a,z=b,c' | mlr cat gives 1=a,z=b,3=c + // I.e. implicit keys are taken from the 1-up field counter. + // With it: + // echo 'a,z=b,c' | mlr cat gives 1=a,z=b,2=c + // I.e. implicit keys are taken from a 1-up count of fields lacking explicit keys. + incr_key := 0 + for i, pair := range pairs { kv := reader.pairSplitter.Split(pair) if len(kv) == 0 || (len(kv) == 1 && kv[0] == "") { // Ignore. This is expected when splitting with repeated IFS. } else if len(kv) == 1 { - // E.g the pair has no equals sign: "a" rather than "a=1" or + // E.g. the pair has no equals sign: "a" rather than "a=1" or // "a=". Here we use the positional index as the key. This way // DKVP is a generalization of NIDX. - key := strconv.Itoa(i + 1) // Miller userspace indices are 1-up + // + // Also: recall that Miller userspace indices are 1-up. + var int_key int + if reader.readerOptions.IncrementImplicitKey { + int_key = incr_key + } else { + int_key = i + } + str_key := strconv.Itoa(int_key + 1) + incr_key++ value := mlrval.FromDeferredType(kv[0]) - _, err := record.PutReferenceMaybeDedupe(key, value, dedupeFieldNames) + _, err := record.PutReferenceMaybeDedupe(str_key, value, dedupeFieldNames) if err != nil { return nil, err } } else { - key := kv[0] + str_key := kv[0] value := mlrval.FromDeferredType(kv[1]) - _, err := record.PutReferenceMaybeDedupe(key, value, dedupeFieldNames) + _, err := record.PutReferenceMaybeDedupe(str_key, value, dedupeFieldNames) if err != nil { return nil, err } @@ -203,9 +221,9 @@ func recordFromNIDXLine(reader *RecordReaderDKVPNIDX, line string) (*mlrval.Mlrm var i int = 0 for _, value := range values { i++ - key := strconv.Itoa(i) + str_key := strconv.Itoa(i) mval := mlrval.FromDeferredType(value) - record.PutReference(key, mval) + record.PutReference(str_key, mval) } return record, nil } diff --git a/pkg/input/record_reader_dkvp_test.go 
b/pkg/input/record_reader_dkvp_test.go index 77e0e557f..b73b97103 100644 --- a/pkg/input/record_reader_dkvp_test.go +++ b/pkg/input/record_reader_dkvp_test.go @@ -5,7 +5,7 @@ import ( "github.com/stretchr/testify/assert" - "github.com/johnkerl/miller/pkg/cli" + "github.com/johnkerl/miller/v6/pkg/cli" ) func TestRecordFromDKVPLine(t *testing.T) { diff --git a/pkg/input/record_reader_factory.go b/pkg/input/record_reader_factory.go index e8328fd51..c4fd13934 100644 --- a/pkg/input/record_reader_factory.go +++ b/pkg/input/record_reader_factory.go @@ -3,7 +3,7 @@ package input import ( "fmt" - "github.com/johnkerl/miller/pkg/cli" + "github.com/johnkerl/miller/v6/pkg/cli" ) func Create(readerOptions *cli.TReaderOptions, recordsPerBatch int64) (IRecordReader, error) { @@ -18,6 +18,10 @@ func Create(readerOptions *cli.TReaderOptions, recordsPerBatch int64) (IRecordRe return NewRecordReaderJSON(readerOptions, recordsPerBatch) case "nidx": return NewRecordReaderNIDX(readerOptions, recordsPerBatch) + case "md": + return NewRecordReaderMarkdown(readerOptions, recordsPerBatch) + case "markdown": + return NewRecordReaderMarkdown(readerOptions, recordsPerBatch) case "pprint": return NewRecordReaderPPRINT(readerOptions, recordsPerBatch) case "tsv": diff --git a/pkg/input/record_reader_json.go b/pkg/input/record_reader_json.go index 27b9b8e2c..63d9f7368 100644 --- a/pkg/input/record_reader_json.go +++ b/pkg/input/record_reader_json.go @@ -1,7 +1,6 @@ package input import ( - "bufio" "container/list" "fmt" "io" @@ -9,15 +8,17 @@ import ( "encoding/json" - "github.com/johnkerl/miller/pkg/cli" - "github.com/johnkerl/miller/pkg/lib" - "github.com/johnkerl/miller/pkg/mlrval" - "github.com/johnkerl/miller/pkg/types" + "github.com/johnkerl/miller/v6/pkg/cli" + "github.com/johnkerl/miller/v6/pkg/lib" + "github.com/johnkerl/miller/v6/pkg/mlrval" + "github.com/johnkerl/miller/v6/pkg/types" ) type RecordReaderJSON struct { readerOptions *cli.TReaderOptions recordsPerBatch int64 // distinct 
from readerOptions.RecordsPerBatch for join/repl + // XXX 1513 + sawBrackets bool } func NewRecordReaderJSON( @@ -46,8 +47,9 @@ func (reader *RecordReaderJSON) Read( ) if err != nil { errorChannel <- err + } else { + reader.processHandle(handle, "(stdin)", &context, readerChannel, errorChannel, downstreamDoneChannel) } - reader.processHandle(handle, "(stdin)", &context, readerChannel, errorChannel, downstreamDoneChannel) } else { for _, filename := range filenames { handle, err := lib.OpenFileForRead( @@ -65,6 +67,7 @@ func (reader *RecordReaderJSON) Read( } } } + context.JSONHadBrackets = reader.sawBrackets readerChannel <- types.NewEndOfStreamMarkerList(&context) } @@ -96,7 +99,7 @@ func (reader *RecordReaderJSON) processHandle( i++ if i%recordsPerBatch == 0 { select { - case _ = <-downstreamDoneChannel: + case <-downstreamDoneChannel: eof = true break default: @@ -137,6 +140,9 @@ func (reader *RecordReaderJSON) processHandle( } } else if mlrval.IsArray() { + + reader.sawBrackets = true + records := mlrval.GetArray() if records == nil { errorChannel <- fmt.Errorf("internal coding error detected in JSON record-reader") @@ -147,7 +153,7 @@ func (reader *RecordReaderJSON) processHandle( if !mlrval.IsMap() { // TODO: more context errorChannel <- fmt.Errorf( - "valid but unmillerable JSON. Expected map (JSON object); got %s.", + "valid but unmillerable JSON. Expected map (JSON object); got %s", mlrval.GetTypeName(), ) return @@ -168,7 +174,7 @@ func (reader *RecordReaderJSON) processHandle( } else { errorChannel <- fmt.Errorf( - "valid but unmillerable JSON. Expected map (JSON object); got %s.", + "valid but unmillerable JSON. Expected map (JSON object); got %s", mlrval.GetTypeName(), ) return @@ -203,7 +209,7 @@ func (reader *RecordReaderJSON) processHandle( // JSONCommentEnabledReader implements io.Reader to strip comment lines // off of CSV data. 
type JSONCommentEnabledReader struct { - lineScanner *bufio.Scanner + lineReader ILineReader readerOptions *cli.TReaderOptions context *types.Context // Needed for channelized stdout-printing logic readerChannel chan<- *list.List // list of *types.RecordAndContext @@ -220,7 +226,7 @@ func NewJSONCommentEnabledReader( readerChannel chan<- *list.List, // list of *types.RecordAndContext ) *JSONCommentEnabledReader { return &JSONCommentEnabledReader{ - lineScanner: bufio.NewScanner(underlying), + lineReader: NewLineReader(underlying, "\n"), readerOptions: readerOptions, context: types.NewNilContext(), readerChannel: readerChannel, @@ -234,13 +240,15 @@ func (bsr *JSONCommentEnabledReader) Read(p []byte) (n int, err error) { return bsr.populateFromLine(p), nil } + done := false + // Loop until we can get a non-comment line to pass on, or end of file. - for { + for !done { // EOF - if !bsr.lineScanner.Scan() { - return 0, io.EOF + line, err := bsr.lineReader.Read() + if err != nil { + return 0, err } - line := bsr.lineScanner.Text() // Non-comment line if !strings.HasPrefix(line, bsr.readerOptions.CommentString) { @@ -256,7 +264,12 @@ func (bsr *JSONCommentEnabledReader) Read(p []byte) (n int, err error) { ell.PushBack(types.NewOutputString(line+"\n", bsr.context)) bsr.readerChannel <- ell } + + if done { + break + } } + return 0, nil } // populateFromLine is a helper for Read. 
It takes a full line from the @@ -268,9 +281,7 @@ func (bsr *JSONCommentEnabledReader) Read(p []byte) (n int, err error) { func (bsr *JSONCommentEnabledReader) populateFromLine(p []byte) int { numBytesWritten := 0 if len(bsr.lineBytes) < len(p) { - for i := 0; i < len(bsr.lineBytes); i++ { - p[i] = bsr.lineBytes[i] - } + copy(p, bsr.lineBytes) numBytesWritten = len(bsr.lineBytes) bsr.lineBytes = nil } else { diff --git a/pkg/input/record_reader_markdown.go b/pkg/input/record_reader_markdown.go new file mode 100644 index 000000000..1766967fc --- /dev/null +++ b/pkg/input/record_reader_markdown.go @@ -0,0 +1,30 @@ +package input + +import ( + "regexp" + + "github.com/johnkerl/miller/v6/pkg/cli" +) + +func NewRecordReaderMarkdown( + readerOptions *cli.TReaderOptions, + recordsPerBatch int64, +) (IRecordReader, error) { + + readerOptions.IFS = "|" + readerOptions.AllowRepeatIFS = false + + reader := &RecordReaderPprintBarredOrMarkdown{ + readerOptions: readerOptions, + recordsPerBatch: recordsPerBatch, + separatorMatcher: regexp.MustCompile(`^\|[-\| ]+\|$`), + fieldSplitter: newFieldSplitter(readerOptions), + } + if reader.readerOptions.UseImplicitHeader { + reader.recordBatchGetter = getRecordBatchImplicitPprintHeader + } else { + reader.recordBatchGetter = getRecordBatchExplicitPprintHeader + } + return reader, nil + +} diff --git a/pkg/input/record_reader_pprint.go b/pkg/input/record_reader_pprint.go new file mode 100644 index 000000000..aad87769c --- /dev/null +++ b/pkg/input/record_reader_pprint.go @@ -0,0 +1,443 @@ +package input + +import ( + "container/list" + "fmt" + "io" + "regexp" + "strconv" + "strings" + + "github.com/johnkerl/miller/v6/pkg/cli" + "github.com/johnkerl/miller/v6/pkg/lib" + "github.com/johnkerl/miller/v6/pkg/mlrval" + "github.com/johnkerl/miller/v6/pkg/types" +) + +func NewRecordReaderPPRINT( + readerOptions *cli.TReaderOptions, + recordsPerBatch int64, +) (IRecordReader, error) { + if readerOptions.BarredPprintInput { + // Implemented in 
this file + + readerOptions.IFS = "|" + readerOptions.AllowRepeatIFS = false + + reader := &RecordReaderPprintBarredOrMarkdown{ + readerOptions: readerOptions, + recordsPerBatch: recordsPerBatch, + separatorMatcher: regexp.MustCompile(`^\+[-+]*\+$`), + fieldSplitter: newFieldSplitter(readerOptions), + } + if reader.readerOptions.UseImplicitHeader { + reader.recordBatchGetter = getRecordBatchImplicitPprintHeader + } else { + reader.recordBatchGetter = getRecordBatchExplicitPprintHeader + } + return reader, nil + + } else { + // Use the CSVLite record-reader, which is implemented in another file, + // with multiple spaces instead of commas + reader := &RecordReaderCSVLite{ + readerOptions: readerOptions, + recordsPerBatch: recordsPerBatch, + fieldSplitter: newFieldSplitter(readerOptions), + + useVoidRep: true, + voidRep: "-", + } + if reader.readerOptions.UseImplicitHeader { + reader.recordBatchGetter = getRecordBatchImplicitCSVHeader + } else { + reader.recordBatchGetter = getRecordBatchExplicitCSVHeader + } + return reader, nil + } +} + +type RecordReaderPprintBarredOrMarkdown struct { + readerOptions *cli.TReaderOptions + recordsPerBatch int64 // distinct from readerOptions.RecordsPerBatch for join/repl + + separatorMatcher *regexp.Regexp + fieldSplitter iFieldSplitter + recordBatchGetter recordBatchGetterPprint + + inputLineNumber int64 + headerStrings []string +} + +// recordBatchGetterPprint points to either an explicit-PPRINT-header or +// implicit-PPRINT-header record-batch getter. 
+type recordBatchGetterPprint func( + reader *RecordReaderPprintBarredOrMarkdown, + linesChannel <-chan *list.List, + filename string, + context *types.Context, + errorChannel chan error, +) ( + recordsAndContexts *list.List, + eof bool, +) + +func (reader *RecordReaderPprintBarredOrMarkdown) Read( + filenames []string, + context types.Context, + readerChannel chan<- *list.List, // list of *types.RecordAndContext + errorChannel chan error, + downstreamDoneChannel <-chan bool, // for mlr head +) { + if filenames != nil { // nil for mlr -n + if len(filenames) == 0 { // read from stdin + handle, err := lib.OpenStdin( + reader.readerOptions.Prepipe, + reader.readerOptions.PrepipeIsRaw, + reader.readerOptions.FileInputEncoding, + ) + if err != nil { + errorChannel <- err + } else { + reader.processHandle( + handle, + "(stdin)", + &context, + readerChannel, + errorChannel, + downstreamDoneChannel, + ) + } + } else { + for _, filename := range filenames { + handle, err := lib.OpenFileForRead( + filename, + reader.readerOptions.Prepipe, + reader.readerOptions.PrepipeIsRaw, + reader.readerOptions.FileInputEncoding, + ) + if err != nil { + errorChannel <- err + } else { + reader.processHandle( + handle, + filename, + &context, + readerChannel, + errorChannel, + downstreamDoneChannel, + ) + handle.Close() + } + } + } + } + readerChannel <- types.NewEndOfStreamMarkerList(&context) +} + +func (reader *RecordReaderPprintBarredOrMarkdown) processHandle( + handle io.Reader, + filename string, + context *types.Context, + readerChannel chan<- *list.List, // list of *types.RecordAndContext + errorChannel chan error, + downstreamDoneChannel <-chan bool, // for mlr head +) { + context.UpdateForStartOfFile(filename) + reader.inputLineNumber = 0 + reader.headerStrings = nil + + recordsPerBatch := reader.recordsPerBatch + lineReader := NewLineReader(handle, reader.readerOptions.IRS) + linesChannel := make(chan *list.List, recordsPerBatch) + go channelizedLineReader(lineReader, 
linesChannel, downstreamDoneChannel, recordsPerBatch) + + for { + recordsAndContexts, eof := reader.recordBatchGetter(reader, linesChannel, filename, context, errorChannel) + if recordsAndContexts.Len() > 0 { + readerChannel <- recordsAndContexts + } + if eof { + break + } + } +} + +func getRecordBatchExplicitPprintHeader( + reader *RecordReaderPprintBarredOrMarkdown, + linesChannel <-chan *list.List, + filename string, + context *types.Context, + errorChannel chan error, +) ( + recordsAndContexts *list.List, + eof bool, +) { + recordsAndContexts = list.New() + dedupeFieldNames := reader.readerOptions.DedupeFieldNames + + lines, more := <-linesChannel + if !more { + return recordsAndContexts, true + } + + for e := lines.Front(); e != nil; e = e.Next() { + line := e.Value.(string) + + reader.inputLineNumber++ + + // Check for comments-in-data feature + // TODO: function-pointer this away + if reader.readerOptions.CommentHandling != cli.CommentsAreData { + if strings.HasPrefix(line, reader.readerOptions.CommentString) { + if reader.readerOptions.CommentHandling == cli.PassComments { + recordsAndContexts.PushBack(types.NewOutputString(line+"\n", context)) + continue + } else if reader.readerOptions.CommentHandling == cli.SkipComments { + continue + } + // else comments are data + } + } + + if line == "" { + // Reset to new schema + reader.headerStrings = nil + continue + } + + // Example input: + // +-----+-----+----+---------------------+---------------------+ + // | a | b | i | x | y | + // +-----+-----+----+---------------------+---------------------+ + // | pan | pan | 1 | 0.3467901443380824 | 0.7268028627434533 | + // | eks | pan | 2 | 0.7586799647899636 | 0.5221511083334797 | + // +-----+-----+----+---------------------+---------------------+ + + // Skip lines like + // +-----+-----+----+---------------------+---------------------+ + if reader.separatorMatcher.MatchString(line) { + continue + } + + // Skip the leading and trailing pipes + paddedFields := 
reader.fieldSplitter.Split(line) + npad := len(paddedFields) + if npad < 2 { + continue + } + fields := make([]string, npad-2) + for i := range paddedFields { + if i == 0 || i == npad-1 { + continue + } + fields[i-1] = strings.TrimSpace(paddedFields[i]) + } + + if reader.headerStrings == nil { + reader.headerStrings = fields + // Get data lines on subsequent loop iterations + } else { + if !reader.readerOptions.AllowRaggedCSVInput && len(reader.headerStrings) != len(fields) { + err := fmt.Errorf( + "mlr: PPRINT-barred header/data length mismatch %d != %d at filename %s line %d", + len(reader.headerStrings), len(fields), filename, reader.inputLineNumber, + ) + errorChannel <- err + return + } + + record := mlrval.NewMlrmapAsRecord() + if !reader.readerOptions.AllowRaggedCSVInput { + for i, field := range fields { + value := mlrval.FromDeferredType(field) + _, err := record.PutReferenceMaybeDedupe(reader.headerStrings[i], value, dedupeFieldNames) + if err != nil { + errorChannel <- err + return + } + } + } else { + nh := int64(len(reader.headerStrings)) + nd := int64(len(fields)) + n := lib.IntMin2(nh, nd) + var i int64 + for i = 0; i < n; i++ { + field := fields[i] + value := mlrval.FromDeferredType(field) + _, err := record.PutReferenceMaybeDedupe(reader.headerStrings[i], value, dedupeFieldNames) + if err != nil { + errorChannel <- err + return + } + } + if nh < nd { + // if header shorter than data: use 1-up itoa keys + for i = nh; i < nd; i++ { + key := strconv.FormatInt(i+1, 10) + value := mlrval.FromDeferredType(fields[i]) + _, err := record.PutReferenceMaybeDedupe(key, value, dedupeFieldNames) + if err != nil { + errorChannel <- err + return + } + } + } + if nh > nd { + // if header longer than data: use "" values + for i = nd; i < nh; i++ { + record.PutCopy(reader.headerStrings[i], mlrval.VOID) + } + } + } + + context.UpdateForInputRecord() + recordsAndContexts.PushBack(types.NewRecordAndContext(record, context)) + + } + } + + return recordsAndContexts, false 
+} + +func getRecordBatchImplicitPprintHeader( + reader *RecordReaderPprintBarredOrMarkdown, + linesChannel <-chan *list.List, + filename string, + context *types.Context, + errorChannel chan error, +) ( + recordsAndContexts *list.List, + eof bool, +) { + recordsAndContexts = list.New() + dedupeFieldNames := reader.readerOptions.DedupeFieldNames + + lines, more := <-linesChannel + if !more { + return recordsAndContexts, true + } + + for e := lines.Front(); e != nil; e = e.Next() { + line := e.Value.(string) + + reader.inputLineNumber++ + + // Check for comments-in-data feature + // TODO: function-pointer this away + if reader.readerOptions.CommentHandling != cli.CommentsAreData { + if strings.HasPrefix(line, reader.readerOptions.CommentString) { + if reader.readerOptions.CommentHandling == cli.PassComments { + recordsAndContexts.PushBack(types.NewOutputString(line+"\n", context)) + continue + } else if reader.readerOptions.CommentHandling == cli.SkipComments { + continue + } + // else comments are data + } + } + + if line == "" { + // Reset to new schema + reader.headerStrings = nil + continue + } + + // Example input: + // +-----+-----+----+---------------------+---------------------+ + // | a | b | i | x | y | + // +-----+-----+----+---------------------+---------------------+ + // | pan | pan | 1 | 0.3467901443380824 | 0.7268028627434533 | + // | eks | pan | 2 | 0.7586799647899636 | 0.5221511083334797 | + // +-----+-----+----+---------------------+---------------------+ + + // Skip lines like + // +-----+-----+----+---------------------+---------------------+ + if reader.separatorMatcher.MatchString(line) { + continue + } + + // Skip the leading and trailing pipes + paddedFields := reader.fieldSplitter.Split(line) + npad := len(paddedFields) + fields := make([]string, npad-2) + for i := range paddedFields { + if i == 0 || i == npad-1 { + continue + } + fields[i-1] = strings.TrimSpace(paddedFields[i]) + } + + if reader.headerStrings == nil { + n := len(fields) + 
reader.headerStrings = make([]string, n) + for i := 0; i < n; i++ { + reader.headerStrings[i] = strconv.Itoa(i + 1) + } + } else { + if !reader.readerOptions.AllowRaggedCSVInput && len(reader.headerStrings) != len(fields) { + err := fmt.Errorf( + "mlr: CSV header/data length mismatch %d != %d at filename %s line %d", + len(reader.headerStrings), len(fields), filename, reader.inputLineNumber, + ) + errorChannel <- err + return + } + } + + record := mlrval.NewMlrmapAsRecord() + if !reader.readerOptions.AllowRaggedCSVInput { + for i, field := range fields { + value := mlrval.FromDeferredType(field) + _, err := record.PutReferenceMaybeDedupe(reader.headerStrings[i], value, dedupeFieldNames) + if err != nil { + errorChannel <- err + return + } + } + } else { + nh := int64(len(reader.headerStrings)) + nd := int64(len(fields)) + n := lib.IntMin2(nh, nd) + var i int64 + for i = 0; i < n; i++ { + field := fields[i] + value := mlrval.FromDeferredType(field) + _, err := record.PutReferenceMaybeDedupe(reader.headerStrings[i], value, dedupeFieldNames) + if err != nil { + errorChannel <- err + return + } + } + if nh < nd { + // if header shorter than data: use 1-up itoa keys + key := strconv.FormatInt(i+1, 10) + value := mlrval.FromDeferredType(fields[i]) + _, err := record.PutReferenceMaybeDedupe(key, value, dedupeFieldNames) + if err != nil { + errorChannel <- err + return + } + } + if nh > nd { + // if header longer than data: use "" values + for i = nd; i < nh; i++ { + _, err := record.PutReferenceMaybeDedupe( + reader.headerStrings[i], + mlrval.VOID.Copy(), + dedupeFieldNames, + ) + if err != nil { + errorChannel <- err + return + } + } + } + } + + context.UpdateForInputRecord() + recordsAndContexts.PushBack(types.NewRecordAndContext(record, context)) + } + + return recordsAndContexts, false +} diff --git a/pkg/input/record_reader_tsv.go b/pkg/input/record_reader_tsv.go index d3b9d75a3..f70042bbe 100644 --- a/pkg/input/record_reader_tsv.go +++ 
b/pkg/input/record_reader_tsv.go @@ -7,10 +7,10 @@ import ( "strconv" "strings" - "github.com/johnkerl/miller/pkg/cli" - "github.com/johnkerl/miller/pkg/lib" - "github.com/johnkerl/miller/pkg/mlrval" - "github.com/johnkerl/miller/pkg/types" + "github.com/johnkerl/miller/v6/pkg/cli" + "github.com/johnkerl/miller/v6/pkg/lib" + "github.com/johnkerl/miller/v6/pkg/mlrval" + "github.com/johnkerl/miller/v6/pkg/types" ) // recordBatchGetterTSV points to either an explicit-TSV-header or @@ -52,7 +52,7 @@ func NewRecordReaderTSV( recordsPerBatch: recordsPerBatch, fieldSplitter: newFieldSplitter(readerOptions), } - if reader.readerOptions.UseImplicitCSVHeader { + if reader.readerOptions.UseImplicitHeader { reader.recordBatchGetter = getRecordBatchImplicitTSVHeader } else { reader.recordBatchGetter = getRecordBatchExplicitTSVHeader @@ -76,16 +76,16 @@ func (reader *RecordReaderTSV) Read( ) if err != nil { errorChannel <- err - return + } else { + reader.processHandle( + handle, + "(stdin)", + &context, + readerChannel, + errorChannel, + downstreamDoneChannel, + ) } - reader.processHandle( - handle, - "(stdin)", - &context, - readerChannel, - errorChannel, - downstreamDoneChannel, - ) } else { for _, filename := range filenames { handle, err := lib.OpenFileForRead( @@ -96,17 +96,17 @@ func (reader *RecordReaderTSV) Read( ) if err != nil { errorChannel <- err - return + } else { + reader.processHandle( + handle, + filename, + &context, + readerChannel, + errorChannel, + downstreamDoneChannel, + ) + handle.Close() } - reader.processHandle( - handle, - filename, - &context, - readerChannel, - errorChannel, - downstreamDoneChannel, - ) - handle.Close() } } } @@ -126,9 +126,9 @@ func (reader *RecordReaderTSV) processHandle( reader.headerStrings = nil recordsPerBatch := reader.recordsPerBatch - lineScanner := NewLineScanner(handle, reader.readerOptions.IRS) + lineReader := NewLineReader(handle, reader.readerOptions.IRS) linesChannel := make(chan *list.List, recordsPerBatch) - go 
channelizedLineScanner(lineScanner, linesChannel, downstreamDoneChannel, recordsPerBatch) + go channelizedLineReader(lineReader, linesChannel, downstreamDoneChannel, recordsPerBatch) for { recordsAndContexts, eof := reader.recordBatchGetter(reader, linesChannel, filename, context, errorChannel) @@ -186,8 +186,7 @@ func getRecordBatchExplicitTSVHeader( } else { if !reader.readerOptions.AllowRaggedCSVInput && len(reader.headerStrings) != len(fields) { err := fmt.Errorf( - "mlr: TSV header/data length mismatch %d != %d "+ - "at filename %s line %d.\n", + "mlr: TSV header/data length mismatch %d != %d at filename %s line %d", len(reader.headerStrings), len(fields), filename, reader.inputLineNumber, ) errorChannel <- err @@ -307,8 +306,7 @@ func getRecordBatchImplicitTSVHeader( } else { if !reader.readerOptions.AllowRaggedCSVInput && len(reader.headerStrings) != len(fields) { err := fmt.Errorf( - "mlr: TSV header/data length mismatch %d != %d "+ - "at filename %s line %d.\n", + "mlr: TSV header/data length mismatch %d != %d at filename %s line %d", len(reader.headerStrings), len(fields), filename, reader.inputLineNumber, ) errorChannel <- err diff --git a/pkg/input/record_reader_xtab.go b/pkg/input/record_reader_xtab.go index 0cfc74b25..5d1530007 100644 --- a/pkg/input/record_reader_xtab.go +++ b/pkg/input/record_reader_xtab.go @@ -1,17 +1,17 @@ package input import ( - "bufio" "container/list" "fmt" "io" + "os" "regexp" "strings" - "github.com/johnkerl/miller/pkg/cli" - "github.com/johnkerl/miller/pkg/lib" - "github.com/johnkerl/miller/pkg/mlrval" - "github.com/johnkerl/miller/pkg/types" + "github.com/johnkerl/miller/v6/pkg/cli" + "github.com/johnkerl/miller/v6/pkg/lib" + "github.com/johnkerl/miller/v6/pkg/mlrval" + "github.com/johnkerl/miller/v6/pkg/types" ) type iXTABPairSplitter interface { @@ -71,8 +71,9 @@ func (reader *RecordReaderXTAB) Read( ) if err != nil { errorChannel <- err + } else { + reader.processHandle(handle, "(stdin)", &context, readerChannel, 
errorChannel, downstreamDoneChannel) } - reader.processHandle(handle, "(stdin)", &context, readerChannel, errorChannel, downstreamDoneChannel) } else { for _, filename := range filenames { handle, err := lib.OpenFileForRead( @@ -105,10 +106,10 @@ func (reader *RecordReaderXTAB) processHandle( recordsPerBatch := reader.recordsPerBatch // XTAB uses repeated IFS, rather than IRS, to delimit records - lineScanner := NewLineScanner(handle, reader.readerOptions.IFS) + lineReader := NewLineReader(handle, reader.readerOptions.IFS) stanzasChannel := make(chan *list.List, recordsPerBatch) - go channelizedStanzaScanner(lineScanner, reader.readerOptions, stanzasChannel, downstreamDoneChannel, + go channelizedStanzaScanner(lineReader, reader.readerOptions, stanzasChannel, downstreamDoneChannel, recordsPerBatch) for { @@ -137,7 +138,7 @@ func (reader *RecordReaderXTAB) processHandle( // start or end of file. A single stanza, once parsed, will become a single // record. func channelizedStanzaScanner( - lineScanner *bufio.Scanner, + lineReader ILineReader, readerOptions *cli.TReaderOptions, stanzasChannel chan<- *list.List, // list of list of string downstreamDoneChannel <-chan bool, // for mlr head @@ -150,8 +151,17 @@ func channelizedStanzaScanner( stanzas := list.New() stanza := newStanza() - for lineScanner.Scan() { - line := lineScanner.Text() + for { + line, err := lineReader.Read() + if err != nil { + if lib.IsEOF(err) { + done = true + break + } else { + fmt.Fprintf(os.Stderr, "mlr: %#v\n", err) + break + } + } // Check for comments-in-data feature // TODO: function-pointer this away @@ -192,7 +202,7 @@ func channelizedStanzaScanner( // quickly, as it should. if numStanzasSeen%recordsPerBatch == 0 { select { - case _ = <-downstreamDoneChannel: + case <-downstreamDoneChannel: done = true break default: @@ -304,7 +314,7 @@ type tXTABIPSSplitter struct { // which we need to produce just a pair of items -- a key and a value -- delimited by one or more // IPS. 
For exaemple, with IPS being a space, in 'abc 123' we need to get key 'abc' and value // '123'; for 'abc 123 456' we need key 'abc' and value '123 456'. It's super-elegant to simply -// regex-split the line like 'kv = lib.RegexSplitString(reader.readerOptions.IPSRegex, line, 2)' -- +// regex-split the line like 'kv = lib.RegexCompiledSplitString(reader.readerOptions.IPSRegex, line, 2)' -- // however, that's 3x slower than the current implementation. It turns out regexes are great // but we should use them only when we must, since they are expensive. func (s *tXTABIPSSplitter) Split(input string) (key, value string, err error) { @@ -358,7 +368,7 @@ type tXTABIPSRegexSplitter struct { } func (s *tXTABIPSRegexSplitter) Split(input string) (key, value string, err error) { - kv := lib.RegexSplitString(s.ipsRegex, input, 2) + kv := lib.RegexCompiledSplitString(s.ipsRegex, input, 2) if len(kv) == 0 { return "", "", fmt.Errorf("internal coding error in XTAB reader") } else if len(kv) == 1 { diff --git a/pkg/input/splitters.go b/pkg/input/splitters.go new file mode 100644 index 000000000..5e24e0b73 --- /dev/null +++ b/pkg/input/splitters.go @@ -0,0 +1,77 @@ +// This file contains the interface for file-format-specific record-readers, as +// well as a collection of utility functions. + +package input + +import ( + "regexp" + "strings" + + "github.com/johnkerl/miller/v6/pkg/cli" + "github.com/johnkerl/miller/v6/pkg/lib" +) + +// IPairSplitter splits a string into left and right, e.g. for IPS. +// This helps us reuse code for splitting by IPS string, or IPS regex. 
+type iPairSplitter interface { + Split(input string) []string +} + +func newPairSplitter(options *cli.TReaderOptions) iPairSplitter { + if options.IPSRegex == nil { + return &tIPSSplitter{ips: options.IPS} + } else { + return &tIPSRegexSplitter{ipsRegex: options.IPSRegex} + } +} + +type tIPSSplitter struct { + ips string +} + +func (s *tIPSSplitter) Split(input string) []string { + return strings.SplitN(input, s.ips, 2) +} + +type tIPSRegexSplitter struct { + ipsRegex *regexp.Regexp +} + +func (s *tIPSRegexSplitter) Split(input string) []string { + return lib.RegexCompiledSplitString(s.ipsRegex, input, 2) +} + +// IFieldSplitter splits a string into pieces, e.g. for IFS. +// This helps us reuse code for splitting by IFS string, or IFS regex. +type iFieldSplitter interface { + Split(input string) []string +} + +func newFieldSplitter(options *cli.TReaderOptions) iFieldSplitter { + if options.IFSRegex == nil { + return &tIFSSplitter{ifs: options.IFS, allowRepeatIFS: options.AllowRepeatIFS} + } else { + return &tIFSRegexSplitter{ifsRegex: options.IFSRegex} + } +} + +type tIFSSplitter struct { + ifs string + allowRepeatIFS bool +} + +func (s *tIFSSplitter) Split(input string) []string { + fields := lib.SplitString(input, s.ifs) + if s.allowRepeatIFS { + fields = lib.StripEmpties(fields) // left/right trim + } + return fields +} + +type tIFSRegexSplitter struct { + ifsRegex *regexp.Regexp +} + +func (s *tIFSRegexSplitter) Split(input string) []string { + return lib.RegexCompiledSplitString(s.ifsRegex, input, -1) +} diff --git a/pkg/lib/file_readers.go b/pkg/lib/file_readers.go index a348ff900..d370d616f 100644 --- a/pkg/lib/file_readers.go +++ b/pkg/lib/file_readers.go @@ -128,21 +128,24 @@ func openPrepipedHandleForRead( // Avoids shell-injection cases by replacing single-quote with backslash // single-quote and double-quote with backslack double-quote, then wrapping the // entire result in initial and final single-quote. -// -// TODO: test on Windows. 
Maybe needs move to pkg/platform. +// Also wraps in single quotes in case the filename has whitespace in it func escapeFileNameForPopen(filename string) string { var buffer bytes.Buffer - foundQuote := false + foundQuoteOrSpace := false for _, c := range filename { if c == '\'' || c == '"' { buffer.WriteRune('\'') buffer.WriteRune(c) buffer.WriteRune('\'') + foundQuoteOrSpace = true + } else if c == ' ' { + buffer.WriteRune(c) + foundQuoteOrSpace = true } else { buffer.WriteRune(c) } } - if foundQuote { + if foundQuoteOrSpace { return "'" + buffer.String() + "'" } else { return buffer.String() @@ -266,10 +269,10 @@ func IsUpdateableInPlace( if strings.HasPrefix(filename, "http://") || strings.HasPrefix(filename, "https://") || strings.HasPrefix(filename, "file://") { - return fmt.Errorf("http://, https://, and file:// URLs are not updateable in place.") + return fmt.Errorf("http://, https://, and file:// URLs are not updateable in place") } if prepipe != "" { - return fmt.Errorf("input with --prepipe or --prepipex is not updateable in place.") + return fmt.Errorf("input with --prepipe or --prepipex is not updateable in place") } return nil } @@ -311,7 +314,7 @@ func WrapOutputHandle( ) (io.WriteCloser, bool, error) { switch inputFileEncoding { case FileInputEncodingBzip2: - return fileWriteHandle, false, fmt.Errorf("bzip2 is not currently supported for in-place mode.") + return fileWriteHandle, false, fmt.Errorf("bzip2 is not currently supported for in-place mode") case FileInputEncodingGzip: return gzip.NewWriter(fileWriteHandle), true, nil case FileInputEncodingZlib: diff --git a/pkg/lib/halfpipe.go b/pkg/lib/halfpipe.go index 276b2893b..3b926ef3f 100644 --- a/pkg/lib/halfpipe.go +++ b/pkg/lib/halfpipe.go @@ -4,7 +4,7 @@ import ( "fmt" "os" - "github.com/johnkerl/miller/pkg/platform" + "github.com/johnkerl/miller/v6/pkg/platform" ) // OpenOutboundHalfPipe returns a handle to a process. 
Writing to that handle @@ -21,6 +21,9 @@ import ( func OpenOutboundHalfPipe(commandString string) (*os.File, error) { readPipe, writePipe, err := os.Pipe() + if err != nil { + return nil, err + } var procAttr os.ProcAttr procAttr.Files = []*os.File{ @@ -56,6 +59,9 @@ func OpenOutboundHalfPipe(commandString string) (*os.File, error) { func OpenInboundHalfPipe(commandString string) (*os.File, error) { readPipe, writePipe, err := os.Pipe() + if err != nil { + return nil, err + } var procAttr os.ProcAttr procAttr.Files = []*os.File{ diff --git a/pkg/lib/ordered_map.go b/pkg/lib/ordered_map.go index 093c1ca84..a3d54bd50 100644 --- a/pkg/lib/ordered_map.go +++ b/pkg/lib/ordered_map.go @@ -111,6 +111,29 @@ func (omap *OrderedMap) GetWithCheck(key string) (interface{}, bool) { } } +func (omap *OrderedMap) GetKeys() []string { + keys := make([]string, omap.FieldCount) + i := 0 + for pe := omap.Head; pe != nil; pe = pe.Next { + keys[i] = pe.Key + i++ + } + return keys +} + +// Returns an array of keys, not including the ones specified. The ones +// specified are to be passed in as a map from string to bool, as Go +// doesn't have hash-sets. 
+func (omap *OrderedMap) GetKeysExcept(exceptions map[string]bool) []string { + keys := make([]string, 0) + for pe := omap.Head; pe != nil; pe = pe.Next { + if _, present := exceptions[pe.Key]; !present { + keys = append(keys, pe.Key) + } + } + return keys +} + // ---------------------------------------------------------------- func (omap *OrderedMap) Clear() { omap.FieldCount = 0 diff --git a/pkg/lib/readfiles.go b/pkg/lib/readfiles.go index 53ce49cc2..6eaaa0d17 100644 --- a/pkg/lib/readfiles.go +++ b/pkg/lib/readfiles.go @@ -6,11 +6,10 @@ package lib import ( - "io/ioutil" "os" "strings" - csv "github.com/johnkerl/miller/pkg/go-csv" + csv "github.com/johnkerl/miller/v6/pkg/go-csv" ) // LoadStringsFromFileOrDir calls LoadStringFromFile if path exists and is a @@ -34,10 +33,10 @@ func LoadStringsFromFileOrDir(path string, extension string) ([]string, error) { } } -// LoadStringFromFile is just a wrapper around ioutil.ReadFile, +// LoadStringFromFile is just a wrapper around os.ReadFile, // with a cast from []byte to string. 
func LoadStringFromFile(filename string) (string, error) { - data, err := ioutil.ReadFile(filename) + data, err := os.ReadFile(filename) if err != nil { return "", err } @@ -51,14 +50,18 @@ func LoadStringFromFile(filename string) (string, error) { func LoadStringsFromDir(dirname string, extension string) ([]string, error) { dslStrings := make([]string, 0) - entries, err := ioutil.ReadDir(dirname) + f, err := os.Open(dirname) + if err != nil { + return nil, err + } + defer f.Close() + + names, err := f.Readdirnames(-1) if err != nil { return nil, err } - for i := range entries { - entry := &entries[i] - name := (*entry).Name() + for _, name := range names { if !strings.HasSuffix(name, extension) { continue } diff --git a/pkg/lib/regex.go b/pkg/lib/regex.go index 3bab04036..b810a4b01 100644 --- a/pkg/lib/regex.go +++ b/pkg/lib/regex.go @@ -1,5 +1,5 @@ // ================================================================ -// Support for regexes in Miller. +// Support for regular expressions in Miller. // // * By and large we use the Go library. // @@ -13,17 +13,24 @@ // $y = "\2:\1"; // } // where the '=~' sets the captures and the "\2:\1" uses them. (Note that -// https://github.com/johnkerl/miller/issues/388 has a better suggestion -// which would make the captures explicit as variables, rather than implicit -// within CST state -- regardless, the current syntax will still be supported -// for backward compatibility and so is here to stay.) Here we make use of Go -// regexp-library functions to write to, and then later interpolate from, a -// captures array which is stored within CST state. (See the `runtime.State` -// object.) +// https://github.com/johnkerl/miller/issues/388 has a better suggestion which would make the +// captures explicit as variables, rather than implicit within CST state: this is implemented by +// the `strmatch` and `strmatchx` DSL functions. 
Regardless, the `=~` syntax will still be supported +// for backward compatibility and so is here to stay.) Here we make use of Go regexp-library +// functions to write to, and then later interpolate from, a captures array which is stored within +// CST state. (See the `runtime.State` object.) // // * "\0" is for a full match; "\1" .. "\9" are for submatch cqptures. E.g. // if $x is "foobarbaz" and the regex is "foo(.)(..)baz", then "\0" is // "foobarbaz", "\1" is "b", "\2" is "ar", and "\3".."\9" are "". +// +// * Naming: +// +// o "regexp" and "Regexp" are used for the Go library and its data structure, respectively; +// +// o "regex" is used for regular-expression strings following Miller's idiosyncratic syntax and +// semantics as described above. +// // ================================================================ package lib @@ -34,6 +41,7 @@ import ( "os" "regexp" "strings" + "sync" ) // captureDetector is used to see if a string literal interpolates previous @@ -44,20 +52,54 @@ var captureDetector = regexp.MustCompile(`\\[0-9]`) // "\2:\1" so they don't need to be recomputed on every record. var captureSplitter = regexp.MustCompile(`(\\[0-9])`) -// CompileMillerRegex wraps Go regex-compile with some Miller-specific syntax -// which predate the port of Miller from C to Go. Miller regexes use a final -// 'i' to indicate case-insensitivity; Go regexes use an initial "(?i)". +// See regexpCompileCached +var regexpCache map[string]*regexp.Regexp + +const cacheMaxSize = 1000 + +var cacheMutex sync.Mutex + +// regexpCompileCached keeps a cache of compiled regexes, so that the caller has the flexibility to +// only pass in strings while getting the benefits of compilation avoidance. // -// (See also mlr.bnf where we specify which things can be backslash-escaped -// without a syntax error at parse time.) +// Regarding cache size: in nominal use, regexp strings are within Miller DSL code statements, and +// there will be a handful. 
These will all get re-used after their first application, and the cache +// will remain bounded by the size of the user's DSL code. However, it is possible to have regex +// strings contained within Miller record-field data. // -// * If the regex_string is of the form a.*b, compiles it case-sensisitively. -// * If the regex_string is of the form "a.*b", compiles a.*b case-sensisitively. +// We could solve this by using an LRU cache. However, for simplicity, we limit the number of +// cached compiles, and for any extras that appear during record processing, we simply recompile +// each time. +func regexpCompileCached(s string) (*regexp.Regexp, error) { + if len(regexpCache) > cacheMaxSize { + return regexp.Compile(s) + } + r, err := regexp.Compile(s) + if err == nil { + cacheMutex.Lock() + if regexpCache == nil { + regexpCache = make(map[string]*regexp.Regexp) + } + regexpCache[s] = r + cacheMutex.Unlock() + } + return r, err +} + +// CompileMillerRegex wraps Go regex-compile with some Miller-specific syntax which predates the +// port of Miller from C to Go. Miller regexes use a final 'i' to indicate case-insensitivity; Go +// regexes use an initial "(?i)". +// +// (See also mlr.bnf where we specify which things can be backslash-escaped without a syntax error +// at parse time.) +// +// * If the regex_string is of the form a.*b, compiles it case-sensitively. +// * If the regex_string is of the form "a.*b", compiles a.*b case-sensitively. // * If the regex_string is of the form "a.*b"i, compiles a.*b case-insensitively. func CompileMillerRegex(regexString string) (*regexp.Regexp, error) { n := len(regexString) if n < 2 { - return regexp.Compile(regexString) + return regexpCompileCached(regexString) } // TODO: rethink this. This will strip out things people have entered, e.g. "\"...\"". @@ -68,20 +110,20 @@ func CompileMillerRegex(regexString string) (*regexp.Regexp, error) { // literals) and from verbs (like cut -r or having-fields). 
if strings.HasPrefix(regexString, "\"") && strings.HasSuffix(regexString, "\"") { - return regexp.Compile(regexString[1 : n-1]) + return regexpCompileCached(regexString[1 : n-1]) } if strings.HasPrefix(regexString, "/") && strings.HasSuffix(regexString, "/") { - return regexp.Compile(regexString[1 : n-1]) + return regexpCompileCached(regexString[1 : n-1]) } if strings.HasPrefix(regexString, "\"") && strings.HasSuffix(regexString, "\"i") { - return regexp.Compile("(?i)" + regexString[1:n-2]) + return regexpCompileCached("(?i)" + regexString[1:n-2]) } if strings.HasPrefix(regexString, "/") && strings.HasSuffix(regexString, "/i") { - return regexp.Compile("(?i)" + regexString[1:n-2]) + return regexpCompileCached("(?i)" + regexString[1:n-2]) } - return regexp.Compile(regexString) + return regexpCompileCached(regexString) } // CompileMillerRegexOrDie wraps CompileMillerRegex. Usually in Go we want to @@ -110,7 +152,7 @@ func CompileMillerRegexesOrDie(regexStrings []string) []*regexp.Regexp { // In Go as in all languages I'm aware of with a string-split, "a,b,c" splits // on "," to ["a", "b", "c" and "a" splits to ["a"], both of which are fine -- // but "" splits to [""] when I wish it were []. This function does the latter. -func RegexSplitString(regex *regexp.Regexp, input string, n int) []string { +func RegexCompiledSplitString(regex *regexp.Regexp, input string, n int) []string { if input == "" { return make([]string, 0) } else { @@ -118,201 +160,50 @@ func RegexSplitString(regex *regexp.Regexp, input string, n int) []string { } } -// MakeEmptyRegexCaptures is for initial CST state at the start of executing -// the DSL expression for the current record. Even if '$x =~ "(..)_(...)" set -// "\1" and "\2" on the previous record, at start of processing for the current -// record we need to start with a clean slate. 
-func MakeEmptyRegexCaptures() []string { - return nil -} - -// RegexReplacementHasCaptures is used by the CST builder to see if -// string-literal is like "foo bar" or "foo \1 bar" -- in the latter case it -// needs to retain the compiled offsets-matrix information. -func RegexReplacementHasCaptures( - replacement string, -) ( - hasCaptures bool, - matrix [][]int, -) { - if captureDetector.MatchString(replacement) { - return true, captureSplitter.FindAllSubmatchIndex([]byte(replacement), -1) - } else { - return false, nil - } -} - -// RegexMatches implements the =~ DSL operator. The captures are stored in DSL -// state and may be used by a DSL statement after the =~. For example, in -// -// sub($a, "(..)_(...)", "\1:\2") -// -// the replacement string is an argument to sub and therefore the captures are -// confined to the implementation of the sub function. Similarly for gsub. But -// for the match operator, people can do -// -// if ($x =~ "(..)_(...)") { -// ... other lines of code ... -// $y = "\2:\1" -// } -// -// and the =~ callsite doesn't know if captures will be used or not. So, -// RegexMatches always returns the captures array. It is stored within the CST -// state. -func RegexMatches( - input string, - sregex string, -) ( - matches bool, - capturesOneUp []string, -) { - regex := CompileMillerRegexOrDie(sregex) - return RegexMatchesCompiled(input, regex) -} - -// RegexMatchesCompiled is the implementation for the =~ operator. Without -// Miller-style regex captures this would a simple one-line -// regex.MatchString(input). However, we return the captures array for the -// benefit of subsequent references to "\0".."\9". -func RegexMatchesCompiled( - input string, - regex *regexp.Regexp, -) (bool, []string) { - matrix := regex.FindAllSubmatchIndex([]byte(input), -1) - if matrix == nil || len(matrix) == 0 { - // Set all captures to "" - return false, make([]string, 10) - } - - // "\0" .. 
"\9" - captures := make([]string, 10) - - // If there are multiple matches -- e.g. input is - // - // "...ab_cde...fg_hij..." - // - // with regex - // - // "(..)_(...)" - // - // -- then we only consider the first match: boolean return value is true - // (the input string matched the regex), and the captures array will map - // "\1" to "ab" and "\2" to "cde". - row := matrix[0] - n := len(row) - - // Example return value from FindAllSubmatchIndex with input - // "...ab_cde...fg_hij..." and regex "(..)_(...)": - // - // Matrix is [][]int{ - // []int{3, 9, 3, 5, 6, 9}, - // []int{12, 18, 12, 14, 15, 18}, - // } - // - // As noted above we look at only the first row. - // - // * 3-9 is for the entire match "ab_cde" - // * 3-5 is for the first capture "ab" - // * 6-9 is for the second capture "cde" - - di := 0 - for si := 0; si < n && di <= 9; si += 2 { - start := row[si] - end := row[si+1] - if start >= 0 && end >= 0 { - captures[di] = input[start:end] - } - di += 1 - } - - return true, captures -} - -// InterpolateCaptures example: -// - Input $x is "ab_cde" -// - DSL expression -// if ($x =~ "(..)_(...)") { -// ... other lines of code ... -// $y = "\2:\1"; -// } -// - InterpolateCaptures is used on the evaluation of "\2:\1" -// - replacementString is "\2:\1" -// - replacementMatrix contains precomputed/cached offsets for the "\2" and -// "\1" substrings within "\2:\1" -// - captures has slot 0 being "ab_cde" (for "\0"), slot 1 being "ab" (for "\1"), -// slot 2 being "cde" (for "\2"), and slots 3-9 being "". 
-func InterpolateCaptures( - replacementString string, - replacementMatrix [][]int, - captures []string, -) string { - if replacementMatrix == nil || captures == nil { - return replacementString - } - var buffer bytes.Buffer - - nonMatchStartIndex := 0 - - for _, row := range replacementMatrix { - start := row[0] - buffer.WriteString(replacementString[nonMatchStartIndex:row[0]]) - - // Map "\0".."\9" to integer index 0..9 - index := replacementString[start+1] - '0' - buffer.WriteString(captures[index]) - - nonMatchStartIndex = row[1] - } - - buffer.WriteString(replacementString[nonMatchStartIndex:]) - - return buffer.String() -} - -// RegexSub implements the sub DSL function. -func RegexSub( +// RegexStringSub implements the sub DSL function. +func RegexStringSub( input string, sregex string, replacement string, ) string { regex := CompileMillerRegexOrDie(sregex) - _, replacementCaptureMatrix := RegexReplacementHasCaptures(replacement) - return RegexSubCompiled(input, regex, replacement, replacementCaptureMatrix) + _, replacementCaptureMatrix := ReplacementHasCaptures(replacement) + return RegexCompiledSub(input, regex, replacement, replacementCaptureMatrix) } -// RegexSubCompiled is the same as RegexSub but with compiled regex and +// RegexCompiledSub is the same as RegexStringSub but with compiled regex and // replacement strings. -func RegexSubCompiled( +func RegexCompiledSub( input string, regex *regexp.Regexp, replacement string, replacementCaptureMatrix [][]int, ) string { - return regexSubGsubCompiled(input, regex, replacement, replacementCaptureMatrix, true) + return regexCompiledSubOrGsub(input, regex, replacement, replacementCaptureMatrix, true) } -// RegexGsub implements the gsub DSL function. -func RegexGsub( +// RegexStringGsub implements the `gsub` DSL function. 
+func RegexStringGsub( input string, sregex string, replacement string, ) string { regex := CompileMillerRegexOrDie(sregex) - _, replacementCaptureMatrix := RegexReplacementHasCaptures(replacement) - return regexSubGsubCompiled(input, regex, replacement, replacementCaptureMatrix, false) + _, replacementCaptureMatrix := ReplacementHasCaptures(replacement) + return regexCompiledSubOrGsub(input, regex, replacement, replacementCaptureMatrix, false) } -// regexSubGsubCompiled is the implementation for sub/gsub with compilex regex +// regexCompiledSubOrGsub is the implementation for `sub`/`gsub` with compilex regex // and replacement strings. -func regexSubGsubCompiled( +func regexCompiledSubOrGsub( input string, regex *regexp.Regexp, replacement string, replacementCaptureMatrix [][]int, breakOnFirst bool, ) string { - matrix := regex.FindAllSubmatchIndex([]byte(input), -1) - if matrix == nil || len(matrix) == 0 { + matrix := regex.FindAllStringSubmatchIndex(input, -1) + if len(matrix) == 0 { return input } @@ -384,3 +275,254 @@ func regexSubGsubCompiled( buffer.WriteString(input[nonMatchStartIndex:]) return buffer.String() } + +// RegexStringMatchSimple is for simple boolean return without any substring captures. +func RegexStringMatchSimple( + input string, + sregex string, +) bool { + regex := CompileMillerRegexOrDie(sregex) + return RegexCompiledMatchSimple(input, regex) +} + +// RegexCompiledMatchSimple is for simple boolean return without any substring captures. +func RegexCompiledMatchSimple( + input string, + regex *regexp.Regexp, +) bool { + return regex.MatchString(input) +} + +// RegexStringMatchWithMapResults implements much of the `strmatchx` DSL function. This returns +// captures via return values. This is distinct from RegexStringMatchWithCaptures which is for the +// `=~` DSL operator. 
+func RegexStringMatchWithMapResults( + input string, + sregex string, +) ( + matches bool, + captures []string, + starts []int, + ends []int, +) { + regex := CompileMillerRegexOrDie(sregex) + return RegexCompiledMatchWithMapResults(input, regex) +} + +// RegexCompiledMatchWithMapResults does the work for RegexStringMatchWithMapResults once +// a compiled regexp is available. Array slot 0 is for the full match; slots 1 and up +// are for the capture-matches such as "\([0-9]+\):\([a-z]+\)". +func RegexCompiledMatchWithMapResults( + input string, + regex *regexp.Regexp, +) (bool, []string, []int, []int) { + captures := make([]string, 0, 10) + starts := make([]int, 0, 10) + ends := make([]int, 0, 10) + + matrix := regex.FindAllStringSubmatchIndex(input, -1) + if len(matrix) == 0 { + return false, captures, starts, ends + } + + // If there are multiple matches -- e.g. input is + // + // "...ab_cde...fg_hij..." + // + // with regex + // + // "(..)_(...)" + // + // -- then we only consider the first match: boolean return value is true + // (the input string matched the regex), and the captures array will map + // slot 1 to "ab" and slot 2 to "cde". + row := matrix[0] + n := len(row) + + // Example return value from FindAllSubmatchIndex with input + // "...ab_cde...fg_hij..." and regex "(..)_(...)": + // + // Matrix is [][]int{ + // []int{3, 9, 3, 5, 6, 9}, + // []int{12, 18, 12, 14, 15, 18}, + // } + // + // As noted above we look at only the first row. 
+ // + // * 3-9 is for the entire match "ab_cde" + // * 3-5 is for the first capture "ab" + // * 6-9 is for the second capture "cde" + + for si := 0; si < n; si += 2 { + start := row[si] + end := row[si+1] + if start >= 0 && end >= 0 { + captures = append(captures, input[start:end]) + starts = append(starts, start+1) + ends = append(ends, end) + } else { + captures = append(captures, "") + starts = append(starts, -1) + ends = append(ends, -1) + } + } + + return true, captures, starts, ends +} + +// RegexStringMatchWithCaptures implements the =~ DSL operator. The captures are stored in DSL +// state and may be used by a DSL statement after the =~. For example, in +// +// sub($a, "(..)_(...)", "\1:\2") +// +// the replacement string is an argument to sub and therefore the captures are +// confined to the implementation of the sub function. Similarly for gsub. But +// for the match operator, people can do +// +// if ($x =~ "(..)_(...)") { +// ... other lines of code ... +// $y = "\2:\1" +// } +// +// and the =~ callsite doesn't know if captures will be used or not. So, +// RegexStringMatchWithCaptures always returns the captures array. It is stored within the CST +// state. +func RegexStringMatchWithCaptures( + input string, + sregex string, +) ( + matches bool, + capturesOneUp []string, +) { + regex := CompileMillerRegexOrDie(sregex) + return RegexCompiledMatchWithCaptures(input, regex) +} + +// RegexCompiledMatchWithCaptures is the implementation for the =~ operator. Without +// Miller-style regex captures this would a simple one-line +// regex.MatchString(input). However, we return the captures array for the +// benefit of subsequent references to "\0".."\9". +func RegexCompiledMatchWithCaptures( + input string, + regex *regexp.Regexp, +) (bool, []string) { + matrix := regex.FindAllStringSubmatchIndex(input, -1) + if len(matrix) == 0 { + // Set all captures to "" + return false, make([]string, 10) + } + + // "\0" .. 
"\9" + captures := make([]string, 10) + + // If there are multiple matches -- e.g. input is + // + // "...ab_cde...fg_hij..." + // + // with regex + // + // "(..)_(...)" + // + // -- then we only consider the first match: boolean return value is true + // (the input string matched the regex), and the captures array will map + // "\1" to "ab" and "\2" to "cde". + row := matrix[0] + n := len(row) + + // Example return value from FindAllSubmatchIndex with input + // "...ab_cde...fg_hij..." and regex "(..)_(...)": + // + // Matrix is [][]int{ + // []int{3, 9, 3, 5, 6, 9}, + // []int{12, 18, 12, 14, 15, 18}, + // } + // + // As noted above we look at only the first row. + // + // * 3-9 is for the entire match "ab_cde" + // * 3-5 is for the first capture "ab" + // * 6-9 is for the second capture "cde" + + di := 0 + for si := 0; si < n && di <= 9; si += 2 { + start := row[si] + end := row[si+1] + if start >= 0 && end >= 0 { + captures[di] = input[start:end] + } + di += 1 + } + + return true, captures +} + +// MakeEmptyCaptures is for initial CST state at the start of executing the DSL expression for the +// current record. Even if '$x =~ "(..)_(...)" set "\1" and "\2" on the previous record, at start +// of processing for the current record we need to start with a clean slate. This is in support of +// CST state, which `=~` semantics requires. +func MakeEmptyCaptures() []string { + return nil +} + +// ReplacementHasCaptures is used by the CST builder to see if string-literal is like "foo bar" or +// "foo \1 bar" -- in the latter case it needs to retain the compiled offsets-matrix information. +// This is in support of CST state, which `=~` semantics requires. 
+func ReplacementHasCaptures( + replacement string, +) ( + hasCaptures bool, + matrix [][]int, +) { + if captureDetector.MatchString(replacement) { + return true, captureSplitter.FindAllStringSubmatchIndex(replacement, -1) + } else { + return false, nil + } +} + +// InterpolateCaptures example: +// +// * Input $x is "ab_cde" +// +// - DSL expression +// if ($x =~ "(..)_(...)") { +// ... other lines of code ... +// $y = "\2:\1"; +// } +// +// * InterpolateCaptures is used on the evaluation of "\2:\1" +// +// * replacementString is "\2:\1" +// +// - replacementMatrix contains precomputed/cached offsets for the "\2" and +// "\1" substrings within "\2:\1" +// +// - captures has slot 0 being "ab_cde" (for "\0"), slot 1 being "ab" (for "\1"), +// slot 2 being "cde" (for "\2"), and slots 3-9 being "". +func InterpolateCaptures( + replacementString string, + replacementMatrix [][]int, + captures []string, +) string { + if replacementMatrix == nil || captures == nil { + return replacementString + } + var buffer bytes.Buffer + + nonMatchStartIndex := 0 + + for _, row := range replacementMatrix { + start := row[0] + buffer.WriteString(replacementString[nonMatchStartIndex:row[0]]) + + // Map "\0".."\9" to integer index 0..9 + index := replacementString[start+1] - '0' + buffer.WriteString(captures[index]) + + nonMatchStartIndex = row[1] + } + + buffer.WriteString(replacementString[nonMatchStartIndex:]) + + return buffer.String() +} diff --git a/pkg/lib/regex_test.go b/pkg/lib/regex_test.go index 961d73f8d..d2a8f5f70 100644 --- a/pkg/lib/regex_test.go +++ b/pkg/lib/regex_test.go @@ -88,7 +88,7 @@ var dataForMatches = []tDataForMatches{ func TestRegexReplacementHasCaptures(t *testing.T) { for i, entry := range dataForHasCaptures { - actualHasCaptures, actualMatrix := RegexReplacementHasCaptures(entry.replacement) + actualHasCaptures, actualMatrix := ReplacementHasCaptures(entry.replacement) if actualHasCaptures != entry.expectedHasCaptures { t.Fatalf("case %d replacement \"%s\" 
expected %v got %v\n", i, entry.replacement, entry.expectedHasCaptures, actualHasCaptures, @@ -104,7 +104,7 @@ func TestRegexReplacementHasCaptures(t *testing.T) { func TestRegexSub(t *testing.T) { for i, entry := range dataForSub { - actualOutput := RegexSub(entry.input, entry.sregex, entry.replacement) + actualOutput := RegexStringSub(entry.input, entry.sregex, entry.replacement) if actualOutput != entry.expectedOutput { t.Fatalf("case %d input \"%s\" sregex \"%s\" replacement \"%s\" expected \"%s\" got \"%s\"\n", i, entry.input, entry.sregex, entry.replacement, entry.expectedOutput, actualOutput, @@ -115,7 +115,7 @@ func TestRegexSub(t *testing.T) { func TestRegexGsub(t *testing.T) { for i, entry := range dataForGsub { - actualOutput := RegexGsub(entry.input, entry.sregex, entry.replacement) + actualOutput := RegexStringGsub(entry.input, entry.sregex, entry.replacement) if actualOutput != entry.expectedOutput { t.Fatalf("case %d input \"%s\" sregex \"%s\" replacement \"%s\" expected \"%s\" got \"%s\"\n", i, entry.input, entry.sregex, entry.replacement, entry.expectedOutput, actualOutput, @@ -126,7 +126,7 @@ func TestRegexGsub(t *testing.T) { func TestRegexMatches(t *testing.T) { for i, entry := range dataForMatches { - actualOutput, actualCaptures := RegexMatches(entry.input, entry.sregex) + actualOutput, actualCaptures := RegexStringMatchWithCaptures(entry.input, entry.sregex) if actualOutput != entry.expectedOutput { t.Fatalf("case %d input \"%s\" sregex \"%s\" expected %v got %v\n", i, entry.input, entry.sregex, entry.expectedOutput, actualOutput, diff --git a/pkg/lib/time.go b/pkg/lib/time.go index 4fa6818c5..8ceae8760 100644 --- a/pkg/lib/time.go +++ b/pkg/lib/time.go @@ -16,6 +16,9 @@ import ( // statement does 'ENV["TZ"] = Asia/Istanbul'. 
func SetTZFromEnv() error { tzenv := os.Getenv("TZ") + if tzenv == "" { + return nil + } location, err := time.LoadLocation(tzenv) if err != nil { return fmt.Errorf("TZ environment variable appears malformed: \"%s\"", tzenv) diff --git a/pkg/lib/util.go b/pkg/lib/util.go index 4a8faa86d..f37194ccb 100644 --- a/pkg/lib/util.go +++ b/pkg/lib/util.go @@ -2,7 +2,6 @@ package lib import ( "fmt" - "io/ioutil" "os" "sort" "strconv" @@ -15,7 +14,7 @@ func BooleanXOR(a, b bool) bool { } func BoolToInt(b bool) int64 { - if b == false { + if !b { return 0 } else { return 1 @@ -186,9 +185,9 @@ func GetArrayKeysSorted(input map[string]string) []string { // WriteTempFile places the contents string into a temp file, which the caller // must remove. func WriteTempFileOrDie(contents string) string { - // Use "" as first argument to ioutil.TempFile to use default directory. + // Use "" as first argument to os.CreateTemp to use default directory. // Nominally "/tmp" or somesuch on all unix-like systems, but not for Windows. - handle, err := ioutil.TempFile("", "mlr-temp") + handle, err := os.CreateTemp("", "mlr-temp") if err != nil { fmt.Printf("mlr: could not create temp file.\n") os.Exit(1) @@ -209,6 +208,9 @@ func WriteTempFileOrDie(contents string) string { } func CopyStringArray(input []string) []string { + if input == nil { + return nil + } output := make([]string, len(input)) copy(output, input) return output diff --git a/pkg/mlrval/mlrmap.go b/pkg/mlrval/mlrmap.go index e2596f09d..562a91def 100644 --- a/pkg/mlrval/mlrmap.go +++ b/pkg/mlrval/mlrmap.go @@ -10,8 +10,8 @@ // // * It keeps a doubly-linked list of key-value pairs. // -// * By default, no hash functions are computed when the map is written to or -// read from. +// * With hash-records set to false, no hash functions are computed when the map +// is written to or read from. 
// // * Gets are implemented by sequential scan through the list: given a key, // the key-value pairs are scanned through until a match is (or is not) found. @@ -20,6 +20,10 @@ // was found in the Go implementation. Test data was million-line CSV and // DKVP, with a dozen columns or so. // +// * However, with higher column-count (see https://github.com/johnkerl/miller/issues/1506 +// and https://github.com/johnkerl/miller/pull/1507), non-hashing becomes +// a substantial penalty. +// // Note however that an auxiliary constructor is provided which does use // a key-to-entry hashmap in place of linear search for get/put/has/delete. // This may be useful in certain contexts, even though it's not the default @@ -53,12 +57,11 @@ package mlrval -// For the C port having this off was a noticeable performance improvement (10-15%). -// For the Go port having it off is a less-noticeable performance improvement (5%). -// Both these figures are for just doing mlr cat. At the moment I'm leaving this -// default-on pending more profiling on more complex record-processing operations -// such as mlr sort. -var hashRecords = false +// As noted above, hashing has a minor penalty for low column count: computing +// hashmaps takes more time than is saved later on. But for higher column-count, +// non-hashing has a huge penalty. Therefore we default to on. And users can +// use `mlr --no-hash-records` or `mlr --hash-records` to flip the behavior. +var hashRecords = true func HashRecords(onOff bool) { hashRecords = onOff @@ -70,9 +73,7 @@ type Mlrmap struct { Head *MlrmapEntry Tail *MlrmapEntry - // Surprisingly, using this costs about 25% for cat/cut/etc tests - // on million-line data files (CSV, DKVP) with a dozen or so columns. - // So, the constructor allows callsites to use it, or not. + // This can be nil if hashRecords is off. 
keysToEntries map[string]*MlrmapEntry } diff --git a/pkg/mlrval/mlrmap_accessors.go b/pkg/mlrval/mlrmap_accessors.go index befb5f726..9552efe6c 100644 --- a/pkg/mlrval/mlrmap_accessors.go +++ b/pkg/mlrval/mlrmap_accessors.go @@ -5,7 +5,7 @@ import ( "fmt" "strconv" - "github.com/johnkerl/miller/pkg/lib" + "github.com/johnkerl/miller/v6/pkg/lib" ) // IsEmpty determines if a map is empty. @@ -74,7 +74,7 @@ func (mlrmap *Mlrmap) PutReferenceMaybeDedupe(key string, value *Mlrval, dedupe return key, nil } - for i := 2; i < 1000; i++ { + for i := 2; ; i++ { newKey := key + "_" + strconv.Itoa(i) pe := mlrmap.findEntry(newKey) if pe == nil { @@ -82,7 +82,6 @@ func (mlrmap *Mlrmap) PutReferenceMaybeDedupe(key string, value *Mlrval, dedupe return newKey, nil } } - return key, fmt.Errorf("record has too many input fields named \"%s\"", key) } // PutCopy copies the key and value (deep-copying in case the value is array/map). @@ -281,6 +280,19 @@ func (mlrmap *Mlrmap) GetKeys() []string { return keys } +// Returns an array of keys, not including the ones specified. The ones +// specified are to be passed in as a map from string to bool, as Go +// doesn't have hash-sets. 
+func (mlrmap *Mlrmap) GetKeysExcept(exceptions map[string]bool) []string { + keys := make([]string, 0) + for pe := mlrmap.Head; pe != nil; pe = pe.Next { + if _, present := exceptions[pe.Key]; !present { + keys = append(keys, pe.Key) + } + } + return keys +} + // ---------------------------------------------------------------- // TODO: put error-return into this API func (mlrmap *Mlrmap) PutNameWithPositionalIndex(position int64, name *Mlrval) { @@ -348,7 +360,7 @@ func (mlrmap *Mlrmap) getWithMlrvalArrayIndex(index *Mlrval) (*Mlrval, error) { } if i < n-1 { if !next.IsMap() { - return nil, fmt.Errorf("mlr: cannot multi-index non-map.") + return nil, fmt.Errorf("mlr: cannot multi-index non-map") } current = next.intf.(*Mlrmap) } else { @@ -366,7 +378,7 @@ func (mlrmap *Mlrmap) getWithMlrvalSingleIndex(index *Mlrval) (*Mlrval, error) { return mlrmap.Get(index.String()), nil } else { return nil, fmt.Errorf( - "Record/map indices must be string, int, or array thereof; got %s", index.GetTypeName(), + "record/map indices must be string, int, or array thereof; got %s", index.GetTypeName(), ) } } diff --git a/pkg/mlrval/mlrmap_accessors_test.go b/pkg/mlrval/mlrmap_accessors_test.go index 890ac6a9e..6bae83f89 100644 --- a/pkg/mlrval/mlrmap_accessors_test.go +++ b/pkg/mlrval/mlrmap_accessors_test.go @@ -47,3 +47,22 @@ func TestPutReference(t *testing.T) { } // TODO: TestPrependReference + +func TestGetKeysExcept(t *testing.T) { + mlrmap := NewMlrmap() + mlrmap.PutReference("a", FromInt(1)) + mlrmap.PutReference("b", FromInt(2)) + + exceptions := make(map[string]bool) + exceptions["x"] = true + exceptions["y"] = true + + assert.Equal(t, mlrmap.GetKeys(), []string{"a", "b"}) + assert.Equal(t, mlrmap.GetKeysExcept(exceptions), []string{"a", "b"}) + + exceptions["a"] = true + assert.Equal(t, mlrmap.GetKeysExcept(exceptions), []string{"b"}) + + exceptions["b"] = true + assert.Equal(t, mlrmap.GetKeysExcept(exceptions), []string{}) +} diff --git 
a/pkg/mlrval/mlrmap_flatten_unflatten.go b/pkg/mlrval/mlrmap_flatten_unflatten.go index 8e48ba515..4e5d117d2 100644 --- a/pkg/mlrval/mlrmap_flatten_unflatten.go +++ b/pkg/mlrval/mlrmap_flatten_unflatten.go @@ -25,7 +25,7 @@ package mlrval import ( "strings" - "github.com/johnkerl/miller/pkg/lib" + "github.com/johnkerl/miller/v6/pkg/lib" ) // ---------------------------------------------------------------- @@ -106,7 +106,18 @@ func (mlrmap *Mlrmap) isFlattenable() bool { // For mlr unflatten without -f. This undoes Unflatten. This is for conversion // from non-JSON to JSON. If there are fields x.a, x.b, x.c, etc. they're put // into a single field x with map-valued value keyed by "a", "b", "c". - +// +// There is a heurtistic here though. Miller is (wildly) multi-format and needs +// to accommodate all manner of data. In the JSON world, "." is the default +// delimiter for nested data, and we're here to handle that. But in the R world, +// "." is just like "_" in other languages: witness "data.frame" rather than +// "data_frame". If the "." was intended as punctuation, in a say a field named +// "a.b" with value 3, then unflatten-to-JSON will make `{"a": {"b": 3}}`. This +// is just our default behavior; users can use --no-auto-unflatten. Weirder +// are field names like ".", ".x", "x.", "x..y", etc. The heuristic here +// is that when we split on "." and any of the pieces around/between the dots +// are empty string, we don't try to unflatten that field. +// // Special case: if the resulting string keys are string representations of 1, // 2, 3, etc -- without gaps -- then the map is converted to an array. // @@ -134,22 +145,38 @@ func (mlrmap *Mlrmap) CopyUnflattened( // We'll come through this loop once for x.a, another for x.b, etc. for pe := mlrmap.Head; pe != nil; pe = pe.Next { - // Is the field name something dot something? 
- if strings.Contains(pe.Key, separator) { - arrayOfIndices := SplitAXHelper(pe.Key, separator) - arrayval := arrayOfIndices.intf.([]*Mlrval) - lib.InternalCodingErrorIf(len(arrayval) < 1) - // If the input field name was "x.a" then remember the "x". - baseIndex := arrayval[0].String() - affectedBaseIndices[baseIndex] = true - // Use PutIndexed to assign $x["a"] = 7, or $x["b"] = 8, etc. - other.PutIndexed( - CopyMlrvalArray(arrayval), - unflattenTerminal(pe.Value).Copy(), - ) - } else { + // If there are no dots in the field name, treat it as a terminal. + if !strings.Contains(pe.Key, separator) { other.PutReference(pe.Key, unflattenTerminal(pe.Value)) + continue } + + arrayOfIndices := SplitAXHelper(pe.Key, separator) + arrayval := arrayOfIndices.intf.([]*Mlrval) + lib.InternalCodingErrorIf(len(arrayval) < 1) + + // Check for "" in any of the split pieces; treat the field as terminal if so. + legitDots := true + for i, _ := range arrayval { + piece := arrayval[i].String() + if piece == "" { + legitDots = false + break + } + } + if !legitDots { + other.PutReference(pe.Key, unflattenTerminal(pe.Value)) + continue + } + + // If the input field name was "x.a" then remember the "x". + baseIndex := arrayval[0].String() + affectedBaseIndices[baseIndex] = true + // Use PutIndexed to assign $x["a"] = 7, or $x["b"] = 8, etc. + other.PutIndexed( + CopyMlrvalArray(arrayval), + unflattenTerminal(pe.Value).Copy(), + ) } // Go through all the field names which were turned into maps -- e.g. 
"x" diff --git a/pkg/mlrval/mlrmap_json.go b/pkg/mlrval/mlrmap_json.go index a985098eb..2db38dd9d 100644 --- a/pkg/mlrval/mlrmap_json.go +++ b/pkg/mlrval/mlrmap_json.go @@ -7,8 +7,8 @@ package mlrval import ( "bytes" - "github.com/johnkerl/miller/pkg/colorizer" - "github.com/johnkerl/miller/pkg/lib" + "github.com/johnkerl/miller/v6/pkg/colorizer" + "github.com/johnkerl/miller/v6/pkg/lib" ) // ---------------------------------------------------------------- diff --git a/pkg/mlrval/mlrval_accessors.go b/pkg/mlrval/mlrval_accessors.go index f788cc35b..e6d8a44e3 100644 --- a/pkg/mlrval/mlrval_accessors.go +++ b/pkg/mlrval/mlrval_accessors.go @@ -3,7 +3,7 @@ package mlrval import ( "strconv" - "github.com/johnkerl/miller/pkg/lib" + "github.com/johnkerl/miller/v6/pkg/lib" ) func (mv *Mlrval) GetArrayLength() (int, bool) { diff --git a/pkg/mlrval/mlrval_benchmark_test.go b/pkg/mlrval/mlrval_benchmark_test.go index 8d7c576b2..9cb79f349 100644 --- a/pkg/mlrval/mlrval_benchmark_test.go +++ b/pkg/mlrval/mlrval_benchmark_test.go @@ -4,7 +4,7 @@ import ( "testing" ) -// go test -run=nonesuch -bench=. github.com/johnkerl/miller/pkg/mlrval/... +// go test -run=nonesuch -bench=. github.com/johnkerl/miller/v6/pkg/mlrval/... 
func BenchmarkFromDeferredType(b *testing.B) { for i := 0; i < b.N; i++ { diff --git a/pkg/mlrval/mlrval_cmp.go b/pkg/mlrval/mlrval_cmp.go index cebd3af25..f631420a2 100644 --- a/pkg/mlrval/mlrval_cmp.go +++ b/pkg/mlrval/mlrval_cmp.go @@ -14,7 +14,7 @@ package mlrval import ( - "github.com/johnkerl/miller/pkg/lib" + "github.com/johnkerl/miller/v6/pkg/lib" ) type CmpFuncBool func(input1, input2 *Mlrval) bool diff --git a/pkg/mlrval/mlrval_collections.go b/pkg/mlrval/mlrval_collections.go index 5f4e305a6..2813fe099 100644 --- a/pkg/mlrval/mlrval_collections.go +++ b/pkg/mlrval/mlrval_collections.go @@ -74,7 +74,7 @@ import ( "os" "strconv" - "github.com/johnkerl/miller/pkg/lib" + "github.com/johnkerl/miller/v6/pkg/lib" ) // ================================================================ @@ -396,9 +396,9 @@ func putIndexedOnArray( if inBounds { (*baseArray)[zindex] = rvalue.Copy() } else if mindex.intf.(int64) == 0 { - return errors.New("mlr: zero indices are not supported. Indices are 1-up.") + return errors.New("mlr: zero indices are not supported. Indices are 1-up") } else if mindex.intf.(int64) < 0 { - return errors.New("mlr: Cannot use negative indices to auto-lengthen arrays.") + return errors.New("mlr: Cannot use negative indices to auto-lengthen arrays") } else { // Array is [a,b,c] with mindices 1,2,3. Length is 3. Zindices are 0,1,2. // Given mindex is 4. @@ -431,9 +431,9 @@ func putIndexedOnArray( return (*baseArray)[zindex].PutIndexed(indices[1:], rvalue) } else if mindex.intf.(int64) == 0 { - return errors.New("mlr: zero indices are not supported. Indices are 1-up.") + return errors.New("mlr: zero indices are not supported. 
Indices are 1-up") } else if mindex.intf.(int64) < 0 { - return errors.New("mlr: Cannot use negative indices to auto-lengthen arrays.") + return errors.New("mlr: Cannot use negative indices to auto-lengthen arrays") } else { // Already allocated but needs to be longer LengthenMlrvalArray(baseArray, int(mindex.intf.(int64))) @@ -458,7 +458,7 @@ func (mv *Mlrval) RemoveIndexed(indices []*Mlrval) error { } else { return errors.New( - "mlr: cannot unset index variable which is neither map nor array.", + "mlr: cannot unset index variable which is neither map nor array", ) } } @@ -527,20 +527,20 @@ func removeIndexedOnArray( rightSlice := (*baseArray)[zindex+1 : len((*baseArray))] *baseArray = append(leftSlice, rightSlice...) } else if mindex.intf.(int64) == 0 { - return errors.New("mlr: zero indices are not supported. Indices are 1-up.") + return errors.New("mlr: zero indices are not supported. Indices are 1-up") } else { // TODO: improve wording - return errors.New("mlr: array index out of bounds for unset.") + return errors.New("mlr: array index out of bounds for unset") } } else { // More indices remain; recurse if inBounds { return (*baseArray)[zindex].RemoveIndexed(indices[1:]) } else if mindex.intf.(int64) == 0 { - return errors.New("mlr: zero indices are not supported. Indices are 1-up.") + return errors.New("mlr: zero indices are not supported. 
Indices are 1-up") } else { // TODO: improve wording - return errors.New("mlr: array index out of bounds for unset.") + return errors.New("mlr: array index out of bounds for unset") } } diff --git a/pkg/mlrval/mlrval_format.go b/pkg/mlrval/mlrval_format.go index 9f4cb2fe6..661cad153 100644 --- a/pkg/mlrval/mlrval_format.go +++ b/pkg/mlrval/mlrval_format.go @@ -2,8 +2,12 @@ package mlrval import ( "fmt" + "os" "strconv" "strings" + + "golang.org/x/text/language" + "golang.org/x/text/message" ) //---------------------------------------------------------------- @@ -103,9 +107,14 @@ func newFormatter( goFormatString = strings.ReplaceAll(goFormatString, "le", "e") goFormatString = strings.ReplaceAll(goFormatString, "lg", "g") - // MIller 5 and below required C format strings compatible with 64-bit ints + // Miller 5 and below required C format strings compatible with 64-bit ints // and double-precision floats: e.g. "%08lld" and "%9.6lf". For Miller 6, - // We must still accept these for backward compatibility. + // we must still accept these for backward compatibility. 
+ if strings.HasSuffix(goFormatString, "_d") { + // Special sub-case of "d"; must be checked first + n := len(goFormatString) + return newFormatterToSeparatedInt(goFormatString[:n-2] + "d"), nil + } if strings.HasSuffix(goFormatString, "d") { return newFormatterToInt(goFormatString), nil } @@ -113,6 +122,11 @@ func newFormatter( return newFormatterToInt(goFormatString), nil } + if strings.HasSuffix(goFormatString, "_f") { + // Special sub-case of "f"; must be checked first + n := len(goFormatString) + return newFormatterToSeparatedFloat(goFormatString[:n-2] + "f"), nil + } if strings.HasSuffix(goFormatString, "f") { return newFormatterToFloat(goFormatString), nil } @@ -164,6 +178,81 @@ func (formatter *formatterToFloat) FormatFloat(floatValue float64) string { // ---------------------------------------------------------------- +func getLanguageTag() language.Tag { + v, ok := os.LookupEnv("LANG") + if ok { + return language.Make(v) + } else { + return language.Make("en") + } +} + +// ---------------------------------------------------------------- + +type formatterToSeparatedInt struct { + goFormatString string + printer *message.Printer +} + +func newFormatterToSeparatedInt(goFormatString string) IFormatter { + return &formatterToSeparatedInt{ + goFormatString: goFormatString, + printer: message.NewPrinter(getLanguageTag()), + } +} + +func (formatter *formatterToSeparatedInt) Format(mv *Mlrval) *Mlrval { + intValue, isInt := mv.GetIntValue() + if isInt { + formatted := formatter.printer.Sprintf(formatter.goFormatString, intValue) + return TryFromIntString(formatted) + } + floatValue, isFloat := mv.GetFloatValue() + if isFloat { + formatted := formatter.printer.Sprintf(formatter.goFormatString, int(floatValue)) + return TryFromIntString(formatted) + } + return mv +} + +func (formatter *formatterToSeparatedInt) FormatFloat(floatValue float64) string { + return formatter.printer.Sprintf(formatter.goFormatString, int(floatValue)) +} + +// 
---------------------------------------------------------------- + +type formatterToSeparatedFloat struct { + goFormatString string + printer *message.Printer +} + +func newFormatterToSeparatedFloat(goFormatString string) IFormatter { + return &formatterToSeparatedFloat{ + goFormatString: goFormatString, + printer: message.NewPrinter(getLanguageTag()), + } +} + +func (formatter *formatterToSeparatedFloat) Format(mv *Mlrval) *Mlrval { + floatValue, isFloat := mv.GetFloatValue() + if isFloat { + formatted := formatter.printer.Sprintf(formatter.goFormatString, floatValue) + return TryFromFloatString(formatted) + } + intValue, isInt := mv.GetIntValue() + if isInt { + formatted := formatter.printer.Sprintf(formatter.goFormatString, float64(intValue)) + return TryFromFloatString(formatted) + } + return mv +} + +func (formatter *formatterToSeparatedFloat) FormatFloat(floatValue float64) string { + return formatter.printer.Sprintf(formatter.goFormatString, floatValue) +} + +// ---------------------------------------------------------------- + type formatterToInt struct { goFormatString string } diff --git a/pkg/mlrval/mlrval_get.go b/pkg/mlrval/mlrval_get.go index 2eb6bfb66..9c681229d 100644 --- a/pkg/mlrval/mlrval_get.go +++ b/pkg/mlrval/mlrval_get.go @@ -4,7 +4,7 @@ import ( "fmt" "os" - "github.com/johnkerl/miller/pkg/lib" + "github.com/johnkerl/miller/v6/pkg/lib" ) // It's essential that we use mv.Type() not mv.mvtype, or use an Is...() diff --git a/pkg/mlrval/mlrval_get_test.go b/pkg/mlrval/mlrval_get_test.go index 9107fbdc9..90abc8598 100644 --- a/pkg/mlrval/mlrval_get_test.go +++ b/pkg/mlrval/mlrval_get_test.go @@ -12,23 +12,23 @@ import ( func TestGetString(t *testing.T) { mv := FromInferredType("234") - stringval, ok := mv.GetStringValue() + _, ok := mv.GetStringValue() assert.False(t, ok) mv = FromDeferredType("234") - stringval, ok = mv.GetStringValue() + _, ok = mv.GetStringValue() assert.False(t, ok) mv = FromInferredType("234.5") - stringval, ok = 
mv.GetStringValue() + _, ok = mv.GetStringValue() assert.False(t, ok) mv = FromDeferredType("234.5") - stringval, ok = mv.GetStringValue() + _, ok = mv.GetStringValue() assert.False(t, ok) mv = FromInferredType("abc") - stringval, ok = mv.GetStringValue() + stringval, ok := mv.GetStringValue() assert.Equal(t, "abc", stringval) assert.True(t, ok) @@ -60,33 +60,33 @@ func TestGetIntValue(t *testing.T) { assert.True(t, ok) mv = FromInferredType("123.4") - intval, ok = mv.GetIntValue() + _, ok = mv.GetIntValue() assert.False(t, ok) mv = FromDeferredType("123.4") - intval, ok = mv.GetIntValue() + _, ok = mv.GetIntValue() assert.False(t, ok) mv = FromInferredType("abc") - intval, ok = mv.GetIntValue() + _, ok = mv.GetIntValue() assert.False(t, ok) mv = FromDeferredType("abc") - intval, ok = mv.GetIntValue() + _, ok = mv.GetIntValue() assert.False(t, ok) } func TestGetFloatValue(t *testing.T) { mv := FromInferredType("234") - floatval, ok := mv.GetFloatValue() + _, ok := mv.GetFloatValue() assert.False(t, ok) mv = FromDeferredType("234") - floatval, ok = mv.GetFloatValue() + _, ok = mv.GetFloatValue() assert.False(t, ok) mv = FromInferredType("234.5") - floatval, ok = mv.GetFloatValue() + floatval, ok := mv.GetFloatValue() assert.Equal(t, 234.5, floatval) assert.True(t, ok) @@ -96,11 +96,11 @@ func TestGetFloatValue(t *testing.T) { assert.True(t, ok) mv = FromInferredType("abc") - floatval, ok = mv.GetFloatValue() + _, ok = mv.GetFloatValue() assert.False(t, ok) mv = FromDeferredType("abc") - floatval, ok = mv.GetFloatValue() + _, ok = mv.GetFloatValue() assert.False(t, ok) } @@ -126,38 +126,38 @@ func TestGetNumericToFloatValue(t *testing.T) { assert.True(t, ok) mv = FromInferredType("abc") - floatval, ok = mv.GetNumericToFloatValue() + _, ok = mv.GetNumericToFloatValue() assert.False(t, ok) mv = FromDeferredType("abc") - floatval, ok = mv.GetNumericToFloatValue() + _, ok = mv.GetNumericToFloatValue() assert.False(t, ok) } func TestGetBoolValue(t *testing.T) { mv := 
FromInferredType("234") - boolval, ok := mv.GetBoolValue() + _, ok := mv.GetBoolValue() assert.False(t, ok) mv = FromDeferredType("234") - boolval, ok = mv.GetBoolValue() + _, ok = mv.GetBoolValue() assert.False(t, ok) mv = FromInferredType("abc") - boolval, ok = mv.GetBoolValue() + _, ok = mv.GetBoolValue() assert.False(t, ok) mv = FromDeferredType("abc") - boolval, ok = mv.GetBoolValue() + _, ok = mv.GetBoolValue() assert.False(t, ok) mv = FromInferredType("true") - boolval, ok = mv.GetBoolValue() + boolval, ok := mv.GetBoolValue() assert.True(t, boolval) assert.True(t, ok) mv = FromDeferredType("false") - boolval, ok = mv.GetBoolValue() + _, ok = mv.GetBoolValue() assert.False(t, ok, "from-data-file \"false\" should infer to string") } diff --git a/pkg/mlrval/mlrval_infer.go b/pkg/mlrval/mlrval_infer.go index ada3792bd..2c9a20064 100644 --- a/pkg/mlrval/mlrval_infer.go +++ b/pkg/mlrval/mlrval_infer.go @@ -3,7 +3,7 @@ package mlrval import ( "strconv" - "github.com/johnkerl/miller/pkg/scan" + "github.com/johnkerl/miller/v6/pkg/scan" ) // TODO: comment no infer-bool from data files. Always false in this path. @@ -34,7 +34,7 @@ func SetInferrerOctalAsInt() { packageLevelInferrer = inferWithOctalAsInt } -// SetInferrerStringOnly is for mlr -A. +// SetInferrerIntAsFloat is for mlr -F. 
func SetInferrerIntAsFloat() { packageLevelInferrer = inferWithIntAsFloat } diff --git a/pkg/mlrval/mlrval_is.go b/pkg/mlrval/mlrval_is.go index 5b3fcd9e8..0cdfdadde 100644 --- a/pkg/mlrval/mlrval_is.go +++ b/pkg/mlrval/mlrval_is.go @@ -1,7 +1,7 @@ package mlrval import ( - "github.com/johnkerl/miller/pkg/lib" + "github.com/johnkerl/miller/v6/pkg/lib" ) // It's essential that we use mv.Type() not mv.mvtype since types are @@ -112,10 +112,10 @@ func (mv *Mlrval) IsBool() bool { } func (mv *Mlrval) IsTrue() bool { - return mv.Type() == MT_BOOL && mv.intf.(bool) == true + return mv.Type() == MT_BOOL && mv.intf.(bool) } func (mv *Mlrval) IsFalse() bool { - return mv.Type() == MT_BOOL && mv.intf.(bool) == false + return mv.Type() == MT_BOOL && !mv.intf.(bool) } func (mv *Mlrval) IsArray() bool { diff --git a/pkg/mlrval/mlrval_json.go b/pkg/mlrval/mlrval_json.go index c657815ec..1a193aa14 100644 --- a/pkg/mlrval/mlrval_json.go +++ b/pkg/mlrval/mlrval_json.go @@ -13,9 +13,10 @@ import ( "encoding/json" "fmt" "io" + "strconv" - "github.com/johnkerl/miller/pkg/colorizer" - "github.com/johnkerl/miller/pkg/lib" + "github.com/johnkerl/miller/v6/pkg/colorizer" + "github.com/johnkerl/miller/v6/pkg/lib" ) const JSON_INDENT_STRING string = " " @@ -105,7 +106,7 @@ func (mv *Mlrval) UnmarshalJSON(inputBytes []byte) error { decoder := json.NewDecoder(bytes.NewReader(inputBytes)) pmv, eof, err := MlrvalDecodeFromJSON(decoder) if eof { - return fmt.Errorf("mlr: JSON parser: unexpected premature EOF.") + return fmt.Errorf("mlr: JSON parser: unexpected premature EOF") } if err != nil { return err @@ -119,7 +120,7 @@ func TryUnmarshalJSON(inputBytes []byte) (pmv *Mlrval, err error) { decoder := json.NewDecoder(bytes.NewReader(inputBytes)) pmv, eof, err := MlrvalDecodeFromJSON(decoder) if eof { - err = fmt.Errorf("mlr: JSON parser: unexpected premature EOF.") + err = fmt.Errorf("mlr: JSON parser: unexpected premature EOF") } return pmv, err } @@ -188,7 +189,9 @@ func 
MlrvalDecodeFromJSON(decoder *json.Decoder) ( ) } - mv := FromPending() + // Will be assigned as an array or a map + var mv *Mlrval + if isArray { mv = FromEmptyArray() @@ -196,14 +199,13 @@ func MlrvalDecodeFromJSON(decoder *json.Decoder) ( element, eof, err := MlrvalDecodeFromJSON(decoder) if eof { // xxx constify - return nil, false, fmt.Errorf("mlr: JSON parser: unexpected premature EOF.") + return nil, false, fmt.Errorf("mlr: JSON parser: unexpected premature EOF") } if err != nil { return nil, false, err } mv.ArrayAppend(element) } - } else { mv = FromEmptyMap() @@ -211,7 +213,7 @@ func MlrvalDecodeFromJSON(decoder *json.Decoder) ( key, eof, err := MlrvalDecodeFromJSON(decoder) if eof { // xxx constify - return nil, false, fmt.Errorf("mlr: JSON parser: unexpected premature EOF.") + return nil, false, fmt.Errorf("mlr: JSON parser: unexpected premature EOF") } if err != nil { return nil, false, err @@ -219,14 +221,14 @@ func MlrvalDecodeFromJSON(decoder *json.Decoder) ( if !key.IsString() { return nil, false, fmt.Errorf( // TODO: print out what was gotten - "mlr JSON reader: object keys must be string-valued.", + "mlr JSON reader: object keys must be string-valued", ) } value, eof, err := MlrvalDecodeFromJSON(decoder) if eof { // xxx constify - return nil, false, fmt.Errorf("mlr: JSON parser: unexpected premature EOF.") + return nil, false, fmt.Errorf("mlr: JSON parser: unexpected premature EOF") } if err != nil { return nil, false, err @@ -245,7 +247,7 @@ func MlrvalDecodeFromJSON(decoder *json.Decoder) ( endToken, err := decoder.Token() if err == io.EOF { - return nil, false, fmt.Errorf("mlr: JSON parser: unexpected premature EOF.") + return nil, false, fmt.Errorf("mlr: JSON parser: unexpected premature EOF") } if err != nil { return nil, false, err @@ -352,9 +354,17 @@ func (mv *Mlrval) marshalJSONString(outputIsStdout bool) (string, error) { } // Wraps with double-quotes and escape-encoded JSON-special characters. 
+// +// Per https://www.json.org/json-en.html: +// +// * Escapes: \b \f \n \r \t \u +// * Acceptable ranges: 0x20..0x10FFFF +// +// Since these are bytes here, we only need to check < 0x20, and special-case the five valid +// escapes, and then \u the rest. + func millerJSONEncodeString(input string) string { var buffer bytes.Buffer - buffer.WriteByte('"') for _, b := range []byte(input) { @@ -362,15 +372,15 @@ func millerJSONEncodeString(input string) string { case '\\': buffer.WriteByte('\\') buffer.WriteByte('\\') - case '\n': - buffer.WriteByte('\\') - buffer.WriteByte('n') case '\b': buffer.WriteByte('\\') buffer.WriteByte('b') case '\f': buffer.WriteByte('\\') buffer.WriteByte('f') + case '\n': + buffer.WriteByte('\\') + buffer.WriteByte('n') case '\r': buffer.WriteByte('\\') buffer.WriteByte('r') @@ -381,19 +391,32 @@ func millerJSONEncodeString(input string) string { buffer.WriteByte('\\') buffer.WriteByte('"') default: - buffer.WriteByte(b) + if b < 0x20 { + s := fmt.Sprintf("\\u%04x", b) + buffer.WriteString(s) + } else { + buffer.WriteByte(b) + } } } buffer.WriteByte('"') - return buffer.String() } // ---------------------------------------------------------------- func (mv *Mlrval) marshalJSONInt(outputIsStdout bool) (string, error) { lib.InternalCodingErrorIf(mv.mvtype != MT_INT) - return colorizer.MaybeColorizeValue(mv.String(), outputIsStdout), nil + // Other formats would use mv.String(): for example, if the user used hex + // format, we would emit whatever they set. However, for JSON, we are + // required to disrespect the user's formatting, and only emit decimal. + // See also https://github.com/johnkerl/miller/issues/1761. 
+ ival, ok := mv.GetIntValue() + if !ok { + panic("Internal coding error: int-typed mlrval denied int access") + } + s := strconv.FormatInt(ival, 10) + return colorizer.MaybeColorizeValue(s, outputIsStdout), nil } // ---------------------------------------------------------------- diff --git a/pkg/mlrval/mlrval_new.go b/pkg/mlrval/mlrval_new.go index eafea9afd..9d63989b1 100644 --- a/pkg/mlrval/mlrval_new.go +++ b/pkg/mlrval/mlrval_new.go @@ -8,7 +8,7 @@ import ( "errors" "fmt" - "github.com/johnkerl/miller/pkg/lib" + "github.com/johnkerl/miller/v6/pkg/lib" ) // TODO: comment for JSON-scanner context. @@ -132,7 +132,7 @@ func FromNotFunctionError(funcname string, v *Mlrval) *Mlrval { func FromNotNamedTypeError(funcname string, v *Mlrval, expected_type_name string) *Mlrval { return FromError( fmt.Errorf( - "%s: unacceptable non-array value %s with type %s; needed type %s", + "%s: unacceptable value %s with type %s; needed type %s", funcname, v.StringMaybeQuoted(), v.GetTypeName(), @@ -197,6 +197,15 @@ func FromInt(input int64) *Mlrval { } } +func FromIntShowingOctal(input int64) *Mlrval { + return &Mlrval{ + mvtype: MT_INT, + printrepValid: true, + printrep: fmt.Sprintf("0%o", input), + intf: input, + } +} + // TryFromIntString is used by the mlrval Formatter (fmtnum DSL function, // format-values verb, etc). Each mlrval has printrep and a printrepValid for // its original string, then a type-code like MT_INT or MT_FLOAT, and @@ -280,7 +289,7 @@ func FromPrevalidatedFloatString(input string, floatval float64) *Mlrval { } func FromBool(input bool) *Mlrval { - if input == true { + if input { return TRUE } else { return FALSE @@ -309,7 +318,7 @@ func (mv *Mlrval) SetFromPrevalidatedBoolString(input string, boolval bool) *Mlr // The user-defined function is of type 'interface{}' here to avoid what would // otherwise be a package-dependency cycle between this package and -// github.com/johnkerl/miller/pkg/dsl/cst. +// github.com/johnkerl/miller/v6/pkg/dsl/cst. 
// // Nominally the name argument is the user-specified name if `func f(a, b) { // ... }`, or some autogenerated UUID like `fl0052` if `func (a, b) { ... }`. diff --git a/pkg/mlrval/mlrval_output.go b/pkg/mlrval/mlrval_output.go index d864806b3..b6fd84e0f 100644 --- a/pkg/mlrval/mlrval_output.go +++ b/pkg/mlrval/mlrval_output.go @@ -87,7 +87,7 @@ func (mv *Mlrval) setPrintRep() { mv.printrep = strconv.FormatFloat(mv.intf.(float64), 'f', -1, 64) case MT_BOOL: - if mv.intf.(bool) == true { + if mv.intf.(bool) { mv.printrep = "true" } else { mv.printrep = "false" @@ -120,7 +120,7 @@ func (mv *Mlrval) StringifyValuesRecursively() { switch mv.mvtype { case MT_ARRAY: - for i, _ := range mv.intf.([]*Mlrval) { + for i := range mv.intf.([]*Mlrval) { mv.intf.([]*Mlrval)[i].StringifyValuesRecursively() } diff --git a/pkg/output/channel_writer.go b/pkg/output/channel_writer.go index 6805ad890..ac025398b 100644 --- a/pkg/output/channel_writer.go +++ b/pkg/output/channel_writer.go @@ -6,8 +6,8 @@ import ( "fmt" "os" - "github.com/johnkerl/miller/pkg/cli" - "github.com/johnkerl/miller/pkg/types" + "github.com/johnkerl/miller/v6/pkg/cli" + "github.com/johnkerl/miller/v6/pkg/types" ) func ChannelWriter( @@ -66,10 +66,11 @@ func channelWriterHandleBatch( if !recordAndContext.EndOfStream { record := recordAndContext.Record + context := &recordAndContext.Context // XXX more // XXX also make sure this results in exit 1 & goroutine cleanup - if writerOptions.FailOnDataError { + if writerOptions.FailOnDataError && record != nil { ok := true for pe := record.Head; pe != nil; pe = pe.Next { if pe.Value.IsError() { @@ -94,7 +95,11 @@ func channelWriterHandleBatch( } if record != nil { - recordWriter.Write(record, bufferedOutputStream, outputIsStdout) + err := recordWriter.Write(record, context, bufferedOutputStream, outputIsStdout) + if err != nil { + fmt.Fprintf(os.Stderr, "mlr: %v\n", err) + return true, true + } } outputString := recordAndContext.OutputString @@ -111,8 +116,14 @@ func 
channelWriterHandleBatch( // queued up. For example, PPRINT needs to see all same-schema // records before printing any, since it needs to compute max width // down columns. - recordWriter.Write(nil, bufferedOutputStream, outputIsStdout) - return true, false + context := &recordAndContext.Context + err := recordWriter.Write(nil, context, bufferedOutputStream, outputIsStdout) + if err != nil { + fmt.Fprintf(os.Stderr, "mlr: %v\n", err) + return true, true + } else { + return true, false + } } } return false, false diff --git a/pkg/output/file_output_handlers.go b/pkg/output/file_output_handlers.go index d21cfb812..31f6b89a0 100644 --- a/pkg/output/file_output_handlers.go +++ b/pkg/output/file_output_handlers.go @@ -20,9 +20,9 @@ import ( "io" "os" - "github.com/johnkerl/miller/pkg/cli" - "github.com/johnkerl/miller/pkg/lib" - "github.com/johnkerl/miller/pkg/types" + "github.com/johnkerl/miller/v6/pkg/cli" + "github.com/johnkerl/miller/v6/pkg/lib" + "github.com/johnkerl/miller/v6/pkg/types" ) // ================================================================ @@ -300,7 +300,7 @@ func NewPipeWriteOutputHandler( ) (*FileOutputHandler, error) { writePipe, err := lib.OpenOutboundHalfPipe(commandString) if err != nil { - return nil, fmt.Errorf("could not launch command \"%s\" for pipe-to.", commandString) + return nil, fmt.Errorf(`could not launch command "%s" for pipe-to`, commandString) } return newOutputHandlerCommon( @@ -399,13 +399,11 @@ func (handler *FileOutputHandler) Close() (retval error) { done := false for !done { select { - case _ = <-handler.recordErroredChannel: + case <-handler.recordErroredChannel: done = true retval = errors.New("exiting due to data error") // details already printed - break - case _ = <-handler.recordDoneChannel: + case <-handler.recordDoneChannel: done = true - break } } } diff --git a/pkg/output/record_writer.go b/pkg/output/record_writer.go index 37d8a7780..e3c224667 100644 --- a/pkg/output/record_writer.go +++ 
b/pkg/output/record_writer.go @@ -3,7 +3,8 @@ package output import ( "bufio" - "github.com/johnkerl/miller/pkg/mlrval" + "github.com/johnkerl/miller/v6/pkg/mlrval" + "github.com/johnkerl/miller/v6/pkg/types" ) // IRecordWriter is the abstract interface for all record-writers. They are @@ -18,7 +19,8 @@ import ( type IRecordWriter interface { Write( outrec *mlrval.Mlrmap, + context *types.Context, bufferedOutputStream *bufio.Writer, outputIsStdout bool, - ) + ) error } diff --git a/pkg/output/record_writer_csv.go b/pkg/output/record_writer_csv.go index fd4801d29..ca51cd325 100644 --- a/pkg/output/record_writer_csv.go +++ b/pkg/output/record_writer_csv.go @@ -5,22 +5,20 @@ import ( "fmt" "strings" - csv "github.com/johnkerl/miller/pkg/go-csv" + csv "github.com/johnkerl/miller/v6/pkg/go-csv" - "github.com/johnkerl/miller/pkg/cli" - "github.com/johnkerl/miller/pkg/mlrval" + "github.com/johnkerl/miller/v6/pkg/cli" + "github.com/johnkerl/miller/v6/pkg/mlrval" + "github.com/johnkerl/miller/v6/pkg/types" ) type RecordWriterCSV struct { - writerOptions *cli.TWriterOptions - ofs0 byte // Go's CSV library only lets its 'Comma' be a single character - csvWriter *csv.Writer - // For reporting schema changes: we print a newline and the new header - lastJoinedHeader *string - // Only write one blank line for schema changes / blank input lines - justWroteEmptyLine bool - // For double-quote around all fields - quoteAll bool + writerOptions *cli.TWriterOptions + csvWriter *csv.Writer + needToPrintHeader bool + firstRecordKeys []string + firstRecordNF int64 + quoteAll bool // For double-quote around all fields } func NewRecordWriterCSV(writerOptions *cli.TWriterOptions) (*RecordWriterCSV, error) { @@ -30,23 +28,26 @@ func NewRecordWriterCSV(writerOptions *cli.TWriterOptions) (*RecordWriterCSV, er if writerOptions.ORS != "\n" && writerOptions.ORS != "\r\n" { return nil, fmt.Errorf("for CSV, ORS cannot be altered") } - return &RecordWriterCSV{ - writerOptions: writerOptions, - 
csvWriter: nil, // will be set on first Write() wherein we have the output stream - lastJoinedHeader: nil, - justWroteEmptyLine: false, - quoteAll: writerOptions.CSVQuoteAll, - }, nil + writer := &RecordWriterCSV{ + writerOptions: writerOptions, + csvWriter: nil, // will be set on first Write() wherein we have the output stream + needToPrintHeader: !writerOptions.HeaderlessOutput, + firstRecordKeys: nil, + firstRecordNF: -1, + quoteAll: writerOptions.CSVQuoteAll, + } + return writer, nil } func (writer *RecordWriterCSV) Write( outrec *mlrval.Mlrmap, + _ *types.Context, bufferedOutputStream *bufio.Writer, outputIsStdout bool, -) { - // End of record stream: nothing special for this output format +) error { if outrec == nil { - return + // End of record stream: nothing special for this output format + return nil } if writer.csvWriter == nil { @@ -54,46 +55,46 @@ func (writer *RecordWriterCSV) Write( writer.csvWriter.Comma = rune(writer.writerOptions.OFS[0]) // xxx temp } - if outrec.IsEmpty() { - if !writer.justWroteEmptyLine { - bufferedOutputStream.WriteString("\n") - } - joinedHeader := "" - writer.lastJoinedHeader = &joinedHeader - writer.justWroteEmptyLine = true - return + if writer.firstRecordKeys == nil { + writer.firstRecordKeys = outrec.GetKeys() + writer.firstRecordNF = int64(len(writer.firstRecordKeys)) } - needToPrintHeader := false - joinedHeader := strings.Join(outrec.GetKeys(), ",") - if writer.lastJoinedHeader == nil || *writer.lastJoinedHeader != joinedHeader { - if writer.lastJoinedHeader != nil { - if !writer.justWroteEmptyLine { - bufferedOutputStream.WriteString("\n") - } - writer.justWroteEmptyLine = true - } - writer.lastJoinedHeader = &joinedHeader - needToPrintHeader = true - } - - if needToPrintHeader && !writer.writerOptions.HeaderlessCSVOutput { + if writer.needToPrintHeader { fields := make([]string, outrec.FieldCount) i := 0 for pe := outrec.Head; pe != nil; pe = pe.Next { fields[i] = pe.Key i++ } - //////writer.csvWriter.Write(fields) 
writer.WriteCSVRecordMaybeColorized(fields, bufferedOutputStream, outputIsStdout, true, writer.quoteAll) + writer.needToPrintHeader = false } - fields := make([]string, outrec.FieldCount) - i := 0 + var outputNF int64 = outrec.FieldCount + if outputNF < writer.firstRecordNF { + outputNF = writer.firstRecordNF + } + + fields := make([]string, outputNF) + var i int64 = 0 for pe := outrec.Head; pe != nil; pe = pe.Next { + if i < writer.firstRecordNF && pe.Key != writer.firstRecordKeys[i] { + return fmt.Errorf( + "CSV schema change: first keys \"%s\"; current keys \"%s\"", + strings.Join(writer.firstRecordKeys, writer.writerOptions.OFS), + strings.Join(outrec.GetKeys(), writer.writerOptions.OFS), + ) + } fields[i] = pe.Value.String() i++ } + + for ; i < outputNF; i++ { + fields[i] = "" + } + writer.WriteCSVRecordMaybeColorized(fields, bufferedOutputStream, outputIsStdout, false, writer.quoteAll) - writer.justWroteEmptyLine = false + + return nil } diff --git a/pkg/output/record_writer_csv_colorizer.go b/pkg/output/record_writer_csv_colorizer.go index 9cb103b6a..a9efd0b30 100644 --- a/pkg/output/record_writer_csv_colorizer.go +++ b/pkg/output/record_writer_csv_colorizer.go @@ -47,7 +47,7 @@ import ( "strings" "unicode/utf8" - "github.com/johnkerl/miller/pkg/colorizer" + "github.com/johnkerl/miller/v6/pkg/colorizer" ) var errInvalidDelim = errors.New("csv: invalid field or comment delimiter") diff --git a/pkg/output/record_writer_csvlite.go b/pkg/output/record_writer_csvlite.go index 251cf9580..ac36a8270 100644 --- a/pkg/output/record_writer_csvlite.go +++ b/pkg/output/record_writer_csvlite.go @@ -4,9 +4,10 @@ import ( "bufio" "strings" - "github.com/johnkerl/miller/pkg/cli" - "github.com/johnkerl/miller/pkg/colorizer" - "github.com/johnkerl/miller/pkg/mlrval" + "github.com/johnkerl/miller/v6/pkg/cli" + "github.com/johnkerl/miller/v6/pkg/colorizer" + "github.com/johnkerl/miller/v6/pkg/mlrval" + "github.com/johnkerl/miller/v6/pkg/types" ) type RecordWriterCSVLite struct { 
@@ -27,12 +28,13 @@ func NewRecordWriterCSVLite(writerOptions *cli.TWriterOptions) (*RecordWriterCSV func (writer *RecordWriterCSVLite) Write( outrec *mlrval.Mlrmap, + _ *types.Context, bufferedOutputStream *bufio.Writer, outputIsStdout bool, -) { - // End of record stream: nothing special for this output format +) error { if outrec == nil { - return + // End of record stream: nothing special for this output format + return nil } if outrec.IsEmpty() { @@ -42,7 +44,7 @@ func (writer *RecordWriterCSVLite) Write( joinedHeader := "" writer.lastJoinedHeader = &joinedHeader writer.justWroteEmptyLine = true - return + return nil } needToPrintHeader := false @@ -58,7 +60,7 @@ func (writer *RecordWriterCSVLite) Write( needToPrintHeader = true } - if needToPrintHeader && !writer.writerOptions.HeaderlessCSVOutput { + if needToPrintHeader && !writer.writerOptions.HeaderlessOutput { for pe := outrec.Head; pe != nil; pe = pe.Next { bufferedOutputStream.WriteString(colorizer.MaybeColorizeKey(pe.Key, outputIsStdout)) @@ -79,4 +81,6 @@ func (writer *RecordWriterCSVLite) Write( bufferedOutputStream.WriteString(writer.writerOptions.ORS) writer.justWroteEmptyLine = false + + return nil } diff --git a/pkg/output/record_writer_dkvp.go b/pkg/output/record_writer_dkvp.go index bc60868ca..692fa9480 100644 --- a/pkg/output/record_writer_dkvp.go +++ b/pkg/output/record_writer_dkvp.go @@ -3,9 +3,10 @@ package output import ( "bufio" - "github.com/johnkerl/miller/pkg/cli" - "github.com/johnkerl/miller/pkg/colorizer" - "github.com/johnkerl/miller/pkg/mlrval" + "github.com/johnkerl/miller/v6/pkg/cli" + "github.com/johnkerl/miller/v6/pkg/colorizer" + "github.com/johnkerl/miller/v6/pkg/mlrval" + "github.com/johnkerl/miller/v6/pkg/types" ) type RecordWriterDKVP struct { @@ -20,17 +21,18 @@ func NewRecordWriterDKVP(writerOptions *cli.TWriterOptions) (*RecordWriterDKVP, func (writer *RecordWriterDKVP) Write( outrec *mlrval.Mlrmap, + _ *types.Context, bufferedOutputStream *bufio.Writer, outputIsStdout 
bool, -) { - // End of record stream: nothing special for this output format +) error { if outrec == nil { - return + // End of record stream: nothing special for this output format + return nil } if outrec.IsEmpty() { bufferedOutputStream.WriteString(writer.writerOptions.ORS) - return + return nil } for pe := outrec.Head; pe != nil; pe = pe.Next { @@ -42,4 +44,6 @@ func (writer *RecordWriterDKVP) Write( } } bufferedOutputStream.WriteString(writer.writerOptions.ORS) + + return nil } diff --git a/pkg/output/record_writer_factory.go b/pkg/output/record_writer_factory.go index b706f21bd..bb6aba5fa 100644 --- a/pkg/output/record_writer_factory.go +++ b/pkg/output/record_writer_factory.go @@ -3,7 +3,7 @@ package output import ( "fmt" - "github.com/johnkerl/miller/pkg/cli" + "github.com/johnkerl/miller/v6/pkg/cli" ) func Create(writerOptions *cli.TWriterOptions) (IRecordWriter, error) { @@ -16,6 +16,10 @@ func Create(writerOptions *cli.TWriterOptions) (IRecordWriter, error) { return NewRecordWriterDKVP(writerOptions) case "json": return NewRecordWriterJSON(writerOptions) + case "jsonl": + return NewRecordWriterJSONLines(writerOptions) + case "md": + return NewRecordWriterMarkdown(writerOptions) case "markdown": return NewRecordWriterMarkdown(writerOptions) case "nidx": diff --git a/pkg/output/record_writer_json.go b/pkg/output/record_writer_json_jsonl.go similarity index 64% rename from pkg/output/record_writer_json.go rename to pkg/output/record_writer_json_jsonl.go index 578e9f8ba..8c43d43ff 100644 --- a/pkg/output/record_writer_json.go +++ b/pkg/output/record_writer_json_jsonl.go @@ -5,8 +5,9 @@ import ( "fmt" "os" - "github.com/johnkerl/miller/pkg/cli" - "github.com/johnkerl/miller/pkg/mlrval" + "github.com/johnkerl/miller/v6/pkg/cli" + "github.com/johnkerl/miller/v6/pkg/mlrval" + "github.com/johnkerl/miller/v6/pkg/types" ) // ---------------------------------------------------------------- @@ -17,7 +18,7 @@ type RecordWriterJSON struct { jvQuoteAll bool // State: - 
onFirst bool + wroteAnyRecords bool } // ---------------------------------------------------------------- @@ -27,38 +28,54 @@ func NewRecordWriterJSON(writerOptions *cli.TWriterOptions) (*RecordWriterJSON, jsonFormatting = mlrval.JSON_MULTILINE } return &RecordWriterJSON{ - writerOptions: writerOptions, - jsonFormatting: jsonFormatting, - jvQuoteAll: writerOptions.JVQuoteAll, - onFirst: true, + writerOptions: writerOptions, + jsonFormatting: jsonFormatting, + jvQuoteAll: writerOptions.JVQuoteAll, + wroteAnyRecords: false, + }, nil +} + +// ---------------------------------------------------------------- +func NewRecordWriterJSONLines(writerOptions *cli.TWriterOptions) (*RecordWriterJSON, error) { + wopt := *writerOptions + wopt.WrapJSONOutputInOuterList = false + wopt.JSONOutputMultiline = false + return &RecordWriterJSON{ + writerOptions: &wopt, + jsonFormatting: mlrval.JSON_SINGLE_LINE, + jvQuoteAll: writerOptions.JVQuoteAll, + wroteAnyRecords: false, }, nil } // ---------------------------------------------------------------- func (writer *RecordWriterJSON) Write( outrec *mlrval.Mlrmap, + context *types.Context, bufferedOutputStream *bufio.Writer, outputIsStdout bool, -) { +) error { if outrec != nil && writer.jvQuoteAll { outrec.StringifyValuesRecursively() } if writer.writerOptions.WrapJSONOutputInOuterList { - writer.writeWithListWrap(outrec, bufferedOutputStream, outputIsStdout) + writer.writeWithListWrap(outrec, context, bufferedOutputStream, outputIsStdout) } else { - writer.writeWithoutListWrap(outrec, bufferedOutputStream, outputIsStdout) + writer.writeWithoutListWrap(outrec, context, bufferedOutputStream, outputIsStdout) } + return nil } // ---------------------------------------------------------------- func (writer *RecordWriterJSON) writeWithListWrap( outrec *mlrval.Mlrmap, + context *types.Context, bufferedOutputStream *bufio.Writer, outputIsStdout bool, ) { if outrec != nil { // Not end of record stream - if writer.onFirst { + if 
!writer.wroteAnyRecords { bufferedOutputStream.WriteString("[\n") } @@ -70,25 +87,32 @@ func (writer *RecordWriterJSON) writeWithListWrap( os.Exit(1) } - if !writer.onFirst { + if writer.wroteAnyRecords { bufferedOutputStream.WriteString(",\n") } bufferedOutputStream.WriteString(s) - writer.onFirst = false + writer.wroteAnyRecords = true } else { // End of record stream - if writer.onFirst { // zero records in the entire output stream - bufferedOutputStream.WriteString("[") + + if !writer.wroteAnyRecords { + if context.JSONHadBrackets { + bufferedOutputStream.WriteString("[") + bufferedOutputStream.WriteString("\n]\n") + } + } else { + bufferedOutputStream.WriteString("\n]\n") } - bufferedOutputStream.WriteString("\n]\n") + } } // ---------------------------------------------------------------- func (writer *RecordWriterJSON) writeWithoutListWrap( outrec *mlrval.Mlrmap, + _ *types.Context, bufferedOutputStream *bufio.Writer, outputIsStdout bool, ) { diff --git a/pkg/output/record_writer_markdown.go b/pkg/output/record_writer_markdown.go index 2688c2962..b3b96089a 100644 --- a/pkg/output/record_writer_markdown.go +++ b/pkg/output/record_writer_markdown.go @@ -4,14 +4,14 @@ import ( "bufio" "strings" - "github.com/johnkerl/miller/pkg/cli" - "github.com/johnkerl/miller/pkg/colorizer" - "github.com/johnkerl/miller/pkg/mlrval" + "github.com/johnkerl/miller/v6/pkg/cli" + "github.com/johnkerl/miller/v6/pkg/colorizer" + "github.com/johnkerl/miller/v6/pkg/mlrval" + "github.com/johnkerl/miller/v6/pkg/types" ) type RecordWriterMarkdown struct { writerOptions *cli.TWriterOptions - ors string numHeaderLinesOutput int lastJoinedHeader string @@ -29,11 +29,12 @@ func NewRecordWriterMarkdown(writerOptions *cli.TWriterOptions) (*RecordWriterMa // ---------------------------------------------------------------- func (writer *RecordWriterMarkdown) Write( outrec *mlrval.Mlrmap, + _ *types.Context, bufferedOutputStream *bufio.Writer, outputIsStdout bool, -) { +) error { if outrec == 
nil { // end of record stream - return + return nil } currentJoinedHeader := outrec.GetKeysJoined() @@ -73,4 +74,6 @@ func (writer *RecordWriterMarkdown) Write( bufferedOutputStream.WriteString(" |") } bufferedOutputStream.WriteString(writer.writerOptions.ORS) + + return nil } diff --git a/pkg/output/record_writer_nidx.go b/pkg/output/record_writer_nidx.go index d3babd35a..45d01c45b 100644 --- a/pkg/output/record_writer_nidx.go +++ b/pkg/output/record_writer_nidx.go @@ -3,8 +3,9 @@ package output import ( "bufio" - "github.com/johnkerl/miller/pkg/cli" - "github.com/johnkerl/miller/pkg/mlrval" + "github.com/johnkerl/miller/v6/pkg/cli" + "github.com/johnkerl/miller/v6/pkg/mlrval" + "github.com/johnkerl/miller/v6/pkg/types" ) type RecordWriterNIDX struct { @@ -19,12 +20,13 @@ func NewRecordWriterNIDX(writerOptions *cli.TWriterOptions) (*RecordWriterNIDX, func (writer *RecordWriterNIDX) Write( outrec *mlrval.Mlrmap, + _ *types.Context, bufferedOutputStream *bufio.Writer, outputIsStdout bool, -) { - // End of record stream: nothing special for this output format +) error { if outrec == nil { - return + // End of record stream: nothing special for this output format + return nil } for pe := outrec.Head; pe != nil; pe = pe.Next { @@ -34,4 +36,6 @@ func (writer *RecordWriterNIDX) Write( } } bufferedOutputStream.WriteString(writer.writerOptions.ORS) + + return nil } diff --git a/pkg/output/record_writer_pprint.go b/pkg/output/record_writer_pprint.go index 79d49b316..acb3366e2 100644 --- a/pkg/output/record_writer_pprint.go +++ b/pkg/output/record_writer_pprint.go @@ -7,9 +7,10 @@ import ( "strings" "unicode/utf8" - "github.com/johnkerl/miller/pkg/cli" - "github.com/johnkerl/miller/pkg/colorizer" - "github.com/johnkerl/miller/pkg/mlrval" + "github.com/johnkerl/miller/v6/pkg/cli" + "github.com/johnkerl/miller/v6/pkg/colorizer" + "github.com/johnkerl/miller/v6/pkg/mlrval" + "github.com/johnkerl/miller/v6/pkg/types" ) type RecordWriterPPRINT struct { @@ -35,9 +36,10 @@ func 
NewRecordWriterPPRINT(writerOptions *cli.TWriterOptions) (*RecordWriterPPRI // ---------------------------------------------------------------- func (writer *RecordWriterPPRINT) Write( outrec *mlrval.Mlrmap, + _ *types.Context, bufferedOutputStream *bufio.Writer, outputIsStdout bool, -) { +) error { // Group records by have-same-schema or not. Pretty-print each // homoegeneous sublist, or "batch". // @@ -83,6 +85,8 @@ func (writer *RecordWriterPPRINT) Write( bufferedOutputStream, outputIsStdout) } } + + return nil } // ---------------------------------------------------------------- @@ -155,7 +159,7 @@ func (writer *RecordWriterPPRINT) writeHeterogenousListNonBarred( outrec := e.Value.(*mlrval.Mlrmap) // Print header line - if onFirst && !writer.writerOptions.HeaderlessCSVOutput { + if onFirst && !writer.writerOptions.HeaderlessOutput { for pe := outrec.Head; pe != nil; pe = pe.Next { if !writer.writerOptions.RightAlignedPPRINTOutput { // left-align if pe.Next != nil { @@ -257,7 +261,7 @@ func (writer *RecordWriterPPRINT) writeHeterogenousListBarred( outrec := e.Value.(*mlrval.Mlrmap) // Print header line - if onFirst && !writer.writerOptions.HeaderlessCSVOutput { + if onFirst && !writer.writerOptions.HeaderlessOutput { bufferedOutputStream.WriteString(horizontalStart) for pe := outrec.Head; pe != nil; pe = pe.Next { bufferedOutputStream.WriteString(horizontalBars[pe.Key]) diff --git a/pkg/output/record_writer_tsv.go b/pkg/output/record_writer_tsv.go index 149ac9530..17f1ce563 100644 --- a/pkg/output/record_writer_tsv.go +++ b/pkg/output/record_writer_tsv.go @@ -5,18 +5,18 @@ import ( "fmt" "strings" - "github.com/johnkerl/miller/pkg/cli" - "github.com/johnkerl/miller/pkg/colorizer" - "github.com/johnkerl/miller/pkg/lib" - "github.com/johnkerl/miller/pkg/mlrval" + "github.com/johnkerl/miller/v6/pkg/cli" + "github.com/johnkerl/miller/v6/pkg/colorizer" + "github.com/johnkerl/miller/v6/pkg/lib" + "github.com/johnkerl/miller/v6/pkg/mlrval" + 
"github.com/johnkerl/miller/v6/pkg/types" ) type RecordWriterTSV struct { - writerOptions *cli.TWriterOptions - // For reporting schema changes: we print a newline and the new header - lastJoinedHeader *string - // Only write one blank line for schema changes / blank input lines - justWroteEmptyLine bool + writerOptions *cli.TWriterOptions + needToPrintHeader bool + firstRecordKeys []string + firstRecordNF int64 } func NewRecordWriterTSV(writerOptions *cli.TWriterOptions) (*RecordWriterTSV, error) { @@ -27,52 +27,40 @@ func NewRecordWriterTSV(writerOptions *cli.TWriterOptions) (*RecordWriterTSV, er return nil, fmt.Errorf("for CSV, ORS cannot be altered") } return &RecordWriterTSV{ - writerOptions: writerOptions, - lastJoinedHeader: nil, - justWroteEmptyLine: false, + writerOptions: writerOptions, + needToPrintHeader: !writerOptions.HeaderlessOutput, + firstRecordKeys: nil, + firstRecordNF: -1, }, nil } func (writer *RecordWriterTSV) Write( outrec *mlrval.Mlrmap, + _ *types.Context, bufferedOutputStream *bufio.Writer, outputIsStdout bool, -) { - // End of record stream: nothing special for this output format +) error { if outrec == nil { - return + // End of record stream: nothing special for this output format + return nil } - if outrec.IsEmpty() { - if !writer.justWroteEmptyLine { - bufferedOutputStream.WriteString(writer.writerOptions.ORS) + if writer.firstRecordKeys == nil { + writer.firstRecordKeys = outrec.GetKeys() + writer.firstRecordNF = int64(len(writer.firstRecordKeys)) + } + + if writer.needToPrintHeader { + fields := make([]string, outrec.FieldCount) + i := 0 + for pe := outrec.Head; pe != nil; pe = pe.Next { + fields[i] = pe.Key + i++ } - joinedHeader := "" - writer.lastJoinedHeader = &joinedHeader - writer.justWroteEmptyLine = true - return - } - - needToPrintHeader := false - joinedHeader := strings.Join(outrec.GetKeys(), ",") - if writer.lastJoinedHeader == nil || *writer.lastJoinedHeader != joinedHeader { - if writer.lastJoinedHeader != nil { - if 
!writer.justWroteEmptyLine { - bufferedOutputStream.WriteString(writer.writerOptions.ORS) - } - writer.justWroteEmptyLine = true - } - writer.lastJoinedHeader = &joinedHeader - needToPrintHeader = true - } - - if needToPrintHeader && !writer.writerOptions.HeaderlessCSVOutput { for pe := outrec.Head; pe != nil; pe = pe.Next { bufferedOutputStream.WriteString( colorizer.MaybeColorizeKey( - lib.TSVEncodeField( - pe.Key, - ), + lib.TSVEncodeField(pe.Key), outputIsStdout, ), ) @@ -83,22 +71,44 @@ func (writer *RecordWriterTSV) Write( } bufferedOutputStream.WriteString(writer.writerOptions.ORS) + + writer.needToPrintHeader = false } + var outputNF int64 = outrec.FieldCount + if outputNF < writer.firstRecordNF { + outputNF = writer.firstRecordNF + } + + fields := make([]string, outputNF) + var i int64 = 0 for pe := outrec.Head; pe != nil; pe = pe.Next { - bufferedOutputStream.WriteString( - colorizer.MaybeColorizeValue( - lib.TSVEncodeField( - pe.Value.String(), - ), - outputIsStdout, - ), + if i < writer.firstRecordNF && pe.Key != writer.firstRecordKeys[i] { + return fmt.Errorf( + "TSV schema change: first keys \"%s\"; current keys \"%s\"", + strings.Join(writer.firstRecordKeys, writer.writerOptions.OFS), + strings.Join(outrec.GetKeys(), writer.writerOptions.OFS), + ) + } + fields[i] = colorizer.MaybeColorizeValue( + lib.TSVEncodeField(pe.Value.String()), + outputIsStdout, ) - if pe.Next != nil { + i++ + } + + for ; i < outputNF; i++ { + fields[i] = "" + } + + for j, field := range fields { + if j > 0 { bufferedOutputStream.WriteString(writer.writerOptions.OFS) } + bufferedOutputStream.WriteString(field) } + bufferedOutputStream.WriteString(writer.writerOptions.ORS) - writer.justWroteEmptyLine = false + return nil } diff --git a/pkg/output/record_writer_xtab.go b/pkg/output/record_writer_xtab.go index 9093935e9..5d1b52fa0 100644 --- a/pkg/output/record_writer_xtab.go +++ b/pkg/output/record_writer_xtab.go @@ -5,9 +5,10 @@ import ( "fmt" "unicode/utf8" - 
"github.com/johnkerl/miller/pkg/cli" - "github.com/johnkerl/miller/pkg/colorizer" - "github.com/johnkerl/miller/pkg/mlrval" + "github.com/johnkerl/miller/v6/pkg/cli" + "github.com/johnkerl/miller/v6/pkg/colorizer" + "github.com/johnkerl/miller/v6/pkg/mlrval" + "github.com/johnkerl/miller/v6/pkg/types" ) // ---------------------------------------------------------------- @@ -43,12 +44,13 @@ func NewRecordWriterXTAB(writerOptions *cli.TWriterOptions) (*RecordWriterXTAB, func (writer *RecordWriterXTAB) Write( outrec *mlrval.Mlrmap, + _ *types.Context, bufferedOutputStream *bufio.Writer, outputIsStdout bool, -) { - // End of record stream: nothing special for this output format +) error { if outrec == nil { - return + // End of record stream: nothing special for this output format + return nil } maxKeyLength := 1 @@ -64,6 +66,8 @@ func (writer *RecordWriterXTAB) Write( } else { writer.writeWithLeftAlignedValues(outrec, bufferedOutputStream, outputIsStdout, maxKeyLength) } + + return nil } func (writer *RecordWriterXTAB) writeWithLeftAlignedValues( diff --git a/pkg/parsing/errors.go.template b/pkg/parsing/errors.go.template index 5c39ede1d..11d8a4539 100644 --- a/pkg/parsing/errors.go.template +++ b/pkg/parsing/errors.go.template @@ -13,7 +13,7 @@ import ( "fmt" "strings" - "github.com/johnkerl/miller/pkg/parsing/token" + "github.com/johnkerl/miller/v6/pkg/parsing/token" ) type ErrorSymbol interface { diff --git a/pkg/parsing/errors/errors.go b/pkg/parsing/errors/errors.go index 5c39ede1d..11d8a4539 100644 --- a/pkg/parsing/errors/errors.go +++ b/pkg/parsing/errors/errors.go @@ -13,7 +13,7 @@ import ( "fmt" "strings" - "github.com/johnkerl/miller/pkg/parsing/token" + "github.com/johnkerl/miller/v6/pkg/parsing/token" ) type ErrorSymbol interface { diff --git a/pkg/parsing/lexer/acttab.go b/pkg/parsing/lexer/acttab.go index 18917879f..b8150ad23 100644 --- a/pkg/parsing/lexer/acttab.go +++ b/pkg/parsing/lexer/acttab.go @@ -5,7 +5,7 @@ package lexer import ( "fmt" - 
"github.com/johnkerl/miller/pkg/parsing/token" + "github.com/johnkerl/miller/v6/pkg/parsing/token" ) type ActionTable [NumStates]ActionRow diff --git a/pkg/parsing/lexer/lexer.go b/pkg/parsing/lexer/lexer.go index d106a08a3..74ac942b1 100644 --- a/pkg/parsing/lexer/lexer.go +++ b/pkg/parsing/lexer/lexer.go @@ -6,7 +6,7 @@ import ( "os" "unicode/utf8" - "github.com/johnkerl/miller/pkg/parsing/token" + "github.com/johnkerl/miller/v6/pkg/parsing/token" ) const ( diff --git a/pkg/parsing/mlr.bnf b/pkg/parsing/mlr.bnf index 39d6c0c3b..bd9602f81 100644 --- a/pkg/parsing/mlr.bnf +++ b/pkg/parsing/mlr.bnf @@ -7,7 +7,7 @@ // GRAMMAR FOR THE MILLER DOMAIN-SPECIFIC LANGUAGE // // This is the Miller DSL's BNF grammar, using the awesome GOCC tool framework -// from https://github.com/goccmack/gocc. +// from https://github.com/goccmack/gocc (forked at https://github.com/johnkerl/gocc). // // The first section is lexical elements and the second section is syntactical // elements. These are the analogs of lex and yacc, respectively, using a @@ -347,7 +347,7 @@ panic : '%' '%' '%' 'p' 'a' 'n' 'i' 'c' '%' '%' '%' ; // ================================================================ // Import the AST/ASTNode types and functions -<< import "github.com/johnkerl/miller/pkg/dsl" >> +<< import "github.com/johnkerl/miller/v6/pkg/dsl" >> // ================================================================ // TOP-LEVEL PRODUCTION RULE FOR THE MILLER DSL diff --git a/pkg/parsing/parser/parser.go b/pkg/parsing/parser/parser.go index 444e9f495..b984087af 100644 --- a/pkg/parsing/parser/parser.go +++ b/pkg/parsing/parser/parser.go @@ -6,8 +6,8 @@ import ( "fmt" "strings" - parseError "github.com/johnkerl/miller/pkg/parsing/errors" - "github.com/johnkerl/miller/pkg/parsing/token" + parseError "github.com/johnkerl/miller/v6/pkg/parsing/errors" + "github.com/johnkerl/miller/v6/pkg/parsing/token" ) const ( diff --git a/pkg/parsing/parser/productionstable.go b/pkg/parsing/parser/productionstable.go 
index f4b61fd50..93ea03996 100644 --- a/pkg/parsing/parser/productionstable.go +++ b/pkg/parsing/parser/productionstable.go @@ -2,7 +2,7 @@ package parser -import "github.com/johnkerl/miller/pkg/dsl" +import "github.com/johnkerl/miller/v6/pkg/dsl" type ( ProdTab [numProductions]ProdTabEntry diff --git a/pkg/parsing/token/token.go b/pkg/parsing/token/token.go index 50282b2c3..69fe0404c 100644 --- a/pkg/parsing/token/token.go +++ b/pkg/parsing/token/token.go @@ -123,7 +123,7 @@ func (t *Token) Int64Value() (int64, error) { func (t *Token) UTF8Rune() (rune, error) { r, _ := utf8.DecodeRune(t.Lit) if r == utf8.RuneError { - err := fmt.Errorf("Invalid rune") + err := fmt.Errorf("invalid rune") return r, err } return r, nil diff --git a/pkg/platform/getargs_windows.go b/pkg/platform/getargs_windows.go index 4349e4346..7a2f1af3d 100644 --- a/pkg/platform/getargs_windows.go +++ b/pkg/platform/getargs_windows.go @@ -79,7 +79,7 @@ func GetArgs() []string { //printArgs(retargs, "NEW") globbed := make([]string, 0) - for i, _ := range retargs { + for i := range retargs { // Expand things like *.csv matches, err := filepath.Glob(retargs[i]) if matches != nil && err == nil { diff --git a/pkg/runtime/stack.go b/pkg/runtime/stack.go index b32cd06dd..a71f83379 100644 --- a/pkg/runtime/stack.go +++ b/pkg/runtime/stack.go @@ -26,12 +26,11 @@ package runtime import ( - "container/list" "fmt" - "github.com/johnkerl/miller/pkg/lib" - "github.com/johnkerl/miller/pkg/mlrval" - "github.com/johnkerl/miller/pkg/types" + "github.com/johnkerl/miller/v6/pkg/lib" + "github.com/johnkerl/miller/v6/pkg/mlrval" + "github.com/johnkerl/miller/v6/pkg/types" ) // ================================================================ @@ -68,7 +67,7 @@ func (sv *StackVariable) GetName() string { type Stack struct { // list of *StackFrameSet - stackFrameSets *list.List + stackFrameSets []*StackFrameSet // Invariant: equal to the head of the stackFrameSets list. 
This is cached // since all sets/gets in between frameset-push and frameset-pop will all @@ -77,9 +76,9 @@ type Stack struct { } func NewStack() *Stack { - stackFrameSets := list.New() + stackFrameSets := make([]*StackFrameSet, 1) head := newStackFrameSet() - stackFrameSets.PushFront(head) + stackFrameSets[0] = head return &Stack{ stackFrameSets: stackFrameSets, head: head, @@ -89,13 +88,13 @@ func NewStack() *Stack { // For when a user-defined function/subroutine is being entered func (stack *Stack) PushStackFrameSet() { stack.head = newStackFrameSet() - stack.stackFrameSets.PushFront(stack.head) + stack.stackFrameSets = append([]*StackFrameSet{stack.head}, stack.stackFrameSets...) } // For when a user-defined function/subroutine is being exited func (stack *Stack) PopStackFrameSet() { - stack.stackFrameSets.Remove(stack.stackFrameSets.Front()) - stack.head = stack.stackFrameSets.Front().Value.(*StackFrameSet) + stack.stackFrameSets = stack.stackFrameSets[1:] + stack.head = stack.stackFrameSets[0] } // ---------------------------------------------------------------- @@ -180,9 +179,8 @@ func (stack *Stack) UnsetIndexed( } func (stack *Stack) Dump() { - fmt.Printf("STACK FRAMESETS (count %d):\n", stack.stackFrameSets.Len()) - for entry := stack.stackFrameSets.Front(); entry != nil; entry = entry.Next() { - stackFrameSet := entry.Value.(*StackFrameSet) + fmt.Printf("STACK FRAMESETS (count %d):\n", len(stack.stackFrameSets)) + for _, stackFrameSet := range stack.stackFrameSets { stackFrameSet.dump() } } @@ -407,7 +405,7 @@ func (frame *StackFrame) defineTyped( return nil } else { return fmt.Errorf( - "%s: variable %s has already been defined in the same scope.", + "%s: variable %s has already been defined in the same scope", "mlr", stackVariable.name, ) } @@ -429,7 +427,7 @@ func (frame *StackFrame) setIndexed( return frame.set(stackVariable, newval) } else { return fmt.Errorf( - "%s: map indices must be int or string; got %s.\n", + "%s: map indices must be int or 
string; got %s", "mlr", leadingIndex.GetTypeName(), ) } diff --git a/pkg/runtime/state.go b/pkg/runtime/state.go index e94fd4ce5..3fe93aa18 100644 --- a/pkg/runtime/state.go +++ b/pkg/runtime/state.go @@ -9,10 +9,10 @@ package runtime import ( "container/list" - "github.com/johnkerl/miller/pkg/cli" - "github.com/johnkerl/miller/pkg/lib" - "github.com/johnkerl/miller/pkg/mlrval" - "github.com/johnkerl/miller/pkg/types" + "github.com/johnkerl/miller/v6/pkg/cli" + "github.com/johnkerl/miller/v6/pkg/lib" + "github.com/johnkerl/miller/v6/pkg/mlrval" + "github.com/johnkerl/miller/v6/pkg/types" ) type State struct { @@ -25,27 +25,42 @@ type State struct { // For holding "\0".."\9" between where they are set via things like // '$x =~ "(..)_(...)"', and interpolated via things like '$y = "\2:\1"'. - RegexCaptures []string - Options *cli.TOptions + // + // Each top-level block and user-defined function has its own captures. + // + // For example, in function `f()`, one can do `somevar =~ someregex`, then + // call some function `g()` which also uses `=~`, and then when `g()` returns, + // `f()` will have its "\1", "\2", etc intact. + // + // This is necessary for the stateful semantics of `=~` and "\1", "\2", etc. + // Those are avoided when the user calls `matchx`, which is newer, and + // stateless. However, `=~` exists in the Miller DSL and we must support it. + regexCapturesByFrame [][]string + + Options *cli.TOptions // StrictMode allows for runtime handling of absent-reads and untyped assignments. StrictMode bool } func NewEmptyState(options *cli.TOptions, strictMode bool) *State { + + // See lib.MakeEmptyCaptures for context. 
+ regexCapturesByFrame := make([][]string, 1) + regexCapturesByFrame[0] = lib.MakeEmptyCaptures() + oosvars := mlrval.NewMlrmap() return &State{ - Inrec: nil, - Context: nil, - Oosvars: oosvars, - FilterExpression: mlrval.TRUE, - Stack: NewStack(), + Inrec: nil, + Context: nil, + Oosvars: oosvars, + FilterExpression: mlrval.TRUE, + Stack: NewStack(), + regexCapturesByFrame: regexCapturesByFrame, // OutputRecordsAndContexts is assigned after construction - // See lib.MakeEmptyRegexCaptures for context. - RegexCaptures: lib.MakeEmptyRegexCaptures(), - Options: options, + Options: options, StrictMode: strictMode, } @@ -57,5 +72,24 @@ func (state *State) Update( ) { state.Inrec = inrec state.Context = context - state.RegexCaptures = lib.MakeEmptyRegexCaptures() + state.regexCapturesByFrame[0] = lib.MakeEmptyCaptures() +} + +func (state *State) SetRegexCaptures( + captures []string, +) { + state.regexCapturesByFrame[0] = lib.CopyStringArray(captures) +} + +func (state *State) GetRegexCaptures() []string { + regexCaptures := state.regexCapturesByFrame[0] + return lib.CopyStringArray(regexCaptures) +} + +func (state *State) PushRegexCapturesFrame() { + state.regexCapturesByFrame = append([][]string{lib.MakeEmptyCaptures()}, state.regexCapturesByFrame...) +} + +func (state *State) PopRegexCapturesFrame() { + state.regexCapturesByFrame = state.regexCapturesByFrame[1:] } diff --git a/pkg/scan/find_benchmark_test.go b/pkg/scan/find_benchmark_test.go index e905bb735..ee3d2b744 100644 --- a/pkg/scan/find_benchmark_test.go +++ b/pkg/scan/find_benchmark_test.go @@ -4,7 +4,7 @@ import ( "testing" ) -// go test -run=nonesuch -bench=. github.com/johnkerl/miller/pkg/scan/... +// go test -run=nonesuch -bench=. github.com/johnkerl/miller/v6/pkg/scan/... 
func BenchmarkFromNormalCases(b *testing.B) { diff --git a/pkg/stream/stream.go b/pkg/stream/stream.go index 9f2cbe805..1aafe95c9 100644 --- a/pkg/stream/stream.go +++ b/pkg/stream/stream.go @@ -6,11 +6,11 @@ import ( "errors" "io" - "github.com/johnkerl/miller/pkg/cli" - "github.com/johnkerl/miller/pkg/input" - "github.com/johnkerl/miller/pkg/output" - "github.com/johnkerl/miller/pkg/transformers" - "github.com/johnkerl/miller/pkg/types" + "github.com/johnkerl/miller/v6/pkg/cli" + "github.com/johnkerl/miller/v6/pkg/input" + "github.com/johnkerl/miller/v6/pkg/output" + "github.com/johnkerl/miller/v6/pkg/transformers" + "github.com/johnkerl/miller/v6/pkg/types" ) // Since Go is concurrent, the context struct (AWK-like variables such as @@ -95,13 +95,10 @@ func Stream( select { case ierr := <-inputErrorChannel: retval = ierr - break - case _ = <-dataProcessingErrorChannel: + case <-dataProcessingErrorChannel: retval = errors.New("exiting due to data error") // details already printed - break - case _ = <-doneWritingChannel: + case <-doneWritingChannel: done = true - break } } diff --git a/pkg/terminals/help/entry.go b/pkg/terminals/help/entry.go index a9148c385..47d3f6e9d 100644 --- a/pkg/terminals/help/entry.go +++ b/pkg/terminals/help/entry.go @@ -10,14 +10,14 @@ import ( "github.com/mattn/go-isatty" - "github.com/johnkerl/miller/pkg/auxents" - "github.com/johnkerl/miller/pkg/bifs" - "github.com/johnkerl/miller/pkg/cli" - "github.com/johnkerl/miller/pkg/dsl/cst" - "github.com/johnkerl/miller/pkg/lib" - "github.com/johnkerl/miller/pkg/mlrval" - "github.com/johnkerl/miller/pkg/runtime" - "github.com/johnkerl/miller/pkg/transformers" + "github.com/johnkerl/miller/v6/pkg/auxents" + "github.com/johnkerl/miller/v6/pkg/bifs" + "github.com/johnkerl/miller/v6/pkg/cli" + "github.com/johnkerl/miller/v6/pkg/dsl/cst" + "github.com/johnkerl/miller/v6/pkg/lib" + "github.com/johnkerl/miller/v6/pkg/mlrval" + "github.com/johnkerl/miller/v6/pkg/runtime" + 
"github.com/johnkerl/miller/v6/pkg/transformers" ) // ================================================================ @@ -408,7 +408,7 @@ PPRINT: pretty-printed tabular | 4 5 6 | Record 2: "apple":"4", "bat":"5", "cog":"6" +---------------------+ -Markdown tabular (supported for output only): +Markdown tabular: +-----------------------+ | | apple | bat | cog | | | | --- | --- | --- | | diff --git a/pkg/terminals/regtest/invoker.go b/pkg/terminals/regtest/invoker.go index febbbbfa3..7f58d7d9e 100644 --- a/pkg/terminals/regtest/invoker.go +++ b/pkg/terminals/regtest/invoker.go @@ -6,8 +6,8 @@ import ( "os/exec" "strings" - "github.com/johnkerl/miller/pkg/lib" - "github.com/johnkerl/miller/pkg/platform" + "github.com/johnkerl/miller/v6/pkg/lib" + "github.com/johnkerl/miller/v6/pkg/platform" ) // RunMillerCommand runs a string like 'mlr cat foo.dat', with specified mlr diff --git a/pkg/terminals/regtest/regtester.go b/pkg/terminals/regtest/regtester.go index ec58bdf73..749002b89 100644 --- a/pkg/terminals/regtest/regtester.go +++ b/pkg/terminals/regtest/regtester.go @@ -56,16 +56,14 @@ package regtest import ( - "container/list" "fmt" - "io/ioutil" "os" "path/filepath" "runtime" "strings" - "github.com/johnkerl/miller/pkg/colorizer" - "github.com/johnkerl/miller/pkg/lib" + "github.com/johnkerl/miller/v6/pkg/colorizer" + "github.com/johnkerl/miller/v6/pkg/lib" ) const CmdName = "cmd" @@ -111,8 +109,8 @@ type RegTester struct { casePassCount int caseFailCount int - failDirNames *list.List - failCaseNames *list.List + failDirNames []string + failCaseNames []string firstNFailsToShow int } @@ -133,8 +131,8 @@ func NewRegTester( directoryFailCount: 0, casePassCount: 0, caseFailCount: 0, - failDirNames: list.New(), - failCaseNames: list.New(), + failDirNames: make([]string, 0), + failCaseNames: make([]string, 0), firstNFailsToShow: firstNFailsToShow, } } @@ -153,7 +151,6 @@ func (regtester *RegTester) resetCounts() { func (regtester *RegTester) Execute( casePaths []string, ) 
bool { - // Don't let the current user's settings affect expected results for _, name := range envVarsToUnset { os.Unsetenv(name) @@ -184,13 +181,13 @@ func (regtester *RegTester) Execute( regtester.executeSinglePath(path) } - if regtester.failCaseNames.Len() > 0 && regtester.firstNFailsToShow > 0 { + if len(regtester.failCaseNames) > 0 && regtester.firstNFailsToShow > 0 { fmt.Println() fmt.Println("RERUNS OF FIRST FAILED CASE FILES:") verbosityLevel := 3 i := 0 - for e := regtester.failCaseNames.Front(); e != nil; e = e.Next() { - regtester.executeSingleCmdFile(e.Value.(string), verbosityLevel) + for _, e := range regtester.failCaseNames { + regtester.executeSingleCmdFile(e, verbosityLevel) i++ if i >= regtester.firstNFailsToShow { break @@ -198,11 +195,11 @@ func (regtester *RegTester) Execute( } } - if !regtester.plainMode && regtester.failDirNames.Len() > 0 { + if !regtester.plainMode && len(regtester.failDirNames) > 0 { fmt.Println() fmt.Println("FAILED CASE DIRECTORIES:") - for e := regtester.failDirNames.Front(); e != nil; e = e.Next() { - fmt.Printf(" %s/\n", e.Value.(string)) + for _, e := range regtester.failDirNames { + fmt.Printf(" %s/\n", e) } } @@ -250,7 +247,7 @@ func (regtester *RegTester) executeSinglePath( regtester.directoryPassCount++ } else { regtester.directoryFailCount++ - regtester.failDirNames.PushBack(path) + regtester.failDirNames = append(regtester.failDirNames, path) } } return passed @@ -262,7 +259,7 @@ func (regtester *RegTester) executeSinglePath( regtester.casePassCount++ } else { regtester.caseFailCount++ - regtester.failCaseNames.PushBack(path) + regtester.failCaseNames = append(regtester.failCaseNames, path) } return passed } @@ -279,7 +276,7 @@ func (regtester *RegTester) executeSingleDirectory( ) (bool, bool) { passed := true // TODO: comment - hasCaseSubdirectories := regtester.hasCaseSubdirectories(dirName) + fileNames, hasCaseSubdirectories := regtester.hasCaseSubdirectories(dirName) if !regtester.plainMode { if 
hasCaseSubdirectories && regtester.verbosityLevel >= 2 { @@ -287,34 +284,26 @@ func (regtester *RegTester) executeSingleDirectory( } } - entries, err := ioutil.ReadDir(dirName) - if err != nil { - fmt.Printf("%s: %v\n", dirName, err) - passed = false - } else { + for _, name := range fileNames { + path := dirName + "/" + name - for i := range entries { - entry := &entries[i] - path := dirName + "/" + (*entry).Name() - - ok := regtester.executeSinglePath(path) - if !ok { - passed = false - } + ok := regtester.executeSinglePath(path) + if !ok { + passed = false } + } - // Only print if there are .cmd files directly in this directory. - // Otherwise it's just a directory-of-directories and we don't need to - // multiply announce. - if hasCaseSubdirectories { - if passed { - if !regtester.plainMode { - fmt.Printf("%s %s\n", colorizer.MaybeColorizePass("PASS", true), dirName) - } - } else { - if !regtester.plainMode { - fmt.Printf("%s %s\n", colorizer.MaybeColorizeFail("FAIL", true), dirName) - } + // Only print if there are .cmd files directly in this directory. + // Otherwise it's just a directory-of-directories and we don't need to + // multiply announce. 
+ if hasCaseSubdirectories { + if passed { + if !regtester.plainMode { + fmt.Printf("%s %s\n", colorizer.MaybeColorizePass("PASS", true), dirName) + } + } else { + if !regtester.plainMode { + fmt.Printf("%s %s\n", colorizer.MaybeColorizeFail("FAIL", true), dirName) } } } @@ -340,22 +329,27 @@ func (regtester *RegTester) executeSingleDirectory( func (regtester *RegTester) hasCaseSubdirectories( dirName string, -) bool { +) ([]string, bool) { + f, err := os.Open(dirName) + if err != nil { + fmt.Printf("%s: %v\n", dirName, err) + os.Exit(1) + } + defer f.Close() - entries, err := ioutil.ReadDir(dirName) + names, err := f.Readdirnames(-1) if err != nil { fmt.Printf("%s: %v\n", dirName, err) os.Exit(1) } - for i := range entries { - entry := &entries[i] - path := dirName + string(filepath.Separator) + (*entry).Name() + for _, name := range names { + path := dirName + string(filepath.Separator) + name if regtester.isCaseDirectory(path) { - return true + return names, true } } - return false + return names, false } func (regtester *RegTester) isCaseDirectory( @@ -483,8 +477,7 @@ func (regtester *RegTester) executeSingleCmdFile( // Copy any files requested by the test. (Most don't; some do, e.g. those // which test the write-in-place logic of mlr -I.) - for pe := preCopySrcDestPairs.Front(); pe != nil; pe = pe.Next() { - pair := pe.Value.(stringPair) + for _, pair := range preCopySrcDestPairs { src := pair.first dst := pair.second if verbosityLevel >= 3 { @@ -569,8 +562,7 @@ func (regtester *RegTester) executeSingleCmdFile( } } - for pe := postCompareExpectedActualPairs.Front(); pe != nil; pe = pe.Next() { - pair := pe.Value.(stringPair) + for _, pair := range postCompareExpectedActualPairs { expectedFileName := pair.first actualFileName := pair.second @@ -691,8 +683,7 @@ func (regtester *RegTester) executeSingleCmdFile( // Compare any additional output files. 
Most test cases don't have // these (just stdout/stderr), but some do: for example, those which // test the tee verb/function. - for pe := postCompareExpectedActualPairs.Front(); pe != nil; pe = pe.Next() { - pair := pe.Value.(stringPair) + for _, pair := range postCompareExpectedActualPairs { expectedFileName := pair.first actualFileName := pair.second ok, expectedContents, actualContents, err := regtester.compareFiles(expectedFileName, actualFileName, caseDir) @@ -730,8 +721,7 @@ func (regtester *RegTester) executeSingleCmdFile( } // Clean up any requested file-copies so that we're git-clean after the regression-test run. - for pe := preCopySrcDestPairs.Front(); pe != nil; pe = pe.Next() { - pair := pe.Value.(stringPair) + for _, pair := range preCopySrcDestPairs { dst := pair.second os.Remove(dst) if verbosityLevel >= 3 { @@ -740,8 +730,7 @@ func (regtester *RegTester) executeSingleCmdFile( } // Clean up any extra output files so that we're git-clean after the regression-test run. - for pe := postCompareExpectedActualPairs.Front(); pe != nil; pe = pe.Next() { - pair := pe.Value.(stringPair) + for _, pair := range postCompareExpectedActualPairs { actualFileName := pair.second os.Remove(actualFileName) if verbosityLevel >= 3 { @@ -774,7 +763,7 @@ func (regtester *RegTester) loadFile( fileName string, caseDir string, ) (string, error) { - byteContents, err := ioutil.ReadFile(fileName) + byteContents, err := os.ReadFile(fileName) if err != nil { return "", err } @@ -789,7 +778,7 @@ func (regtester *RegTester) storeFile( fileName string, contents string, ) error { - err := ioutil.WriteFile(fileName, []byte(contents), 0666) + err := os.WriteFile(fileName, []byte(contents), 0o666) if err != nil { return err } @@ -860,7 +849,7 @@ func (regtester *RegTester) loadEnvFile( fields := strings.SplitN(line, "=", 2) if len(fields) != 2 { return nil, fmt.Errorf( - "mlr: could not parse line \"%s\" from file \"%s\".\n", + `mlr: could not parse line "%s" from file "%s"`, line, 
filename, ) } @@ -873,12 +862,13 @@ func (regtester *RegTester) loadEnvFile( func (regtester *RegTester) loadStringPairFile( filename string, caseDir string, -) (*list.List, error) { +) ([]stringPair, error) { + pairs := make([]stringPair, 0) // If the file doesn't exist that's the normal case -- most cases do not // have a .precopy or .postcmp file. _, err := os.Stat(filename) if os.IsNotExist(err) { - return list.New(), nil + return pairs, nil } // If the file does exist and isn't loadable, that's an error. @@ -887,7 +877,7 @@ func (regtester *RegTester) loadStringPairFile( return nil, err } - pairs := list.New() + pairs = make([]stringPair, 0) lines := strings.Split(contents, "\n") for _, line := range lines { line = strings.TrimSuffix(line, "\r") @@ -897,12 +887,12 @@ func (regtester *RegTester) loadStringPairFile( fields := strings.SplitN(line, " ", 2) // TODO: split on multi-space if len(fields) != 2 { return nil, fmt.Errorf( - "mlr: could not parse line \"%s\" from file \"%s\".\n", + `mlr: could not parse line "%s" from file "%s"`, line, filename, ) } pair := stringPair{first: fields[0], second: fields[1]} - pairs.PushBack(pair) + pairs = append(pairs, pair) } return pairs, nil } diff --git a/pkg/terminals/repl/dsl.go b/pkg/terminals/repl/dsl.go index 8f3a2a046..78f3b98bb 100644 --- a/pkg/terminals/repl/dsl.go +++ b/pkg/terminals/repl/dsl.go @@ -23,9 +23,9 @@ import ( "fmt" "strings" - "github.com/johnkerl/miller/pkg/dsl" - "github.com/johnkerl/miller/pkg/dsl/cst" - "github.com/johnkerl/miller/pkg/mlrval" + "github.com/johnkerl/miller/v6/pkg/dsl" + "github.com/johnkerl/miller/v6/pkg/dsl/cst" + "github.com/johnkerl/miller/v6/pkg/mlrval" ) // ---------------------------------------------------------------- diff --git a/pkg/terminals/repl/entry.go b/pkg/terminals/repl/entry.go index d2403ccdf..95d01f385 100644 --- a/pkg/terminals/repl/entry.go +++ b/pkg/terminals/repl/entry.go @@ -27,7 +27,7 @@ import ( "path" "strings" - "github.com/johnkerl/miller/pkg/cli" + 
"github.com/johnkerl/miller/v6/pkg/cli" ) // ================================================================ @@ -156,7 +156,7 @@ func ReplMain(args []string) int { // --auto-flatten is on by default. But if input and output formats are both JSON, // then we don't need to actually do anything. See also mlrcli_parse.go. options.WriterOptions.AutoFlatten = cli.DecideFinalFlatten(&options.WriterOptions) - options.WriterOptions.AutoUnflatten = cli.DecideFinalUnflatten(options) + options.WriterOptions.AutoUnflatten = cli.DecideFinalUnflatten(options, [][]string{}) recordOutputFileName := "(stdout)" recordOutputStream := os.Stdout diff --git a/pkg/terminals/repl/prompt.go b/pkg/terminals/repl/prompt.go index bfcb46d2c..be2de0e10 100644 --- a/pkg/terminals/repl/prompt.go +++ b/pkg/terminals/repl/prompt.go @@ -11,9 +11,9 @@ import ( "golang.org/x/term" - "github.com/johnkerl/miller/pkg/colorizer" - "github.com/johnkerl/miller/pkg/lib" - "github.com/johnkerl/miller/pkg/version" + "github.com/johnkerl/miller/v6/pkg/colorizer" + "github.com/johnkerl/miller/v6/pkg/lib" + "github.com/johnkerl/miller/v6/pkg/version" ) const ENV_PRIMARY_PROMPT = "MLR_REPL_PS1" diff --git a/pkg/terminals/repl/session.go b/pkg/terminals/repl/session.go index 03ef0f6b4..2f25beb31 100644 --- a/pkg/terminals/repl/session.go +++ b/pkg/terminals/repl/session.go @@ -25,14 +25,14 @@ import ( "strings" "syscall" - "github.com/johnkerl/miller/pkg/cli" - "github.com/johnkerl/miller/pkg/dsl/cst" - "github.com/johnkerl/miller/pkg/input" - "github.com/johnkerl/miller/pkg/lib" - "github.com/johnkerl/miller/pkg/mlrval" - "github.com/johnkerl/miller/pkg/output" - "github.com/johnkerl/miller/pkg/runtime" - "github.com/johnkerl/miller/pkg/types" + "github.com/johnkerl/miller/v6/pkg/cli" + "github.com/johnkerl/miller/v6/pkg/dsl/cst" + "github.com/johnkerl/miller/v6/pkg/input" + "github.com/johnkerl/miller/v6/pkg/lib" + "github.com/johnkerl/miller/v6/pkg/mlrval" + "github.com/johnkerl/miller/v6/pkg/output" + 
"github.com/johnkerl/miller/v6/pkg/runtime" + "github.com/johnkerl/miller/v6/pkg/types" ) // ---------------------------------------------------------------- @@ -176,7 +176,7 @@ func (repl *Repl) handleSession(istream *os.File) error { doneDraining := false for { select { - case _ = <-repl.appSignalNotificationChannel: + case <-repl.appSignalNotificationChannel: line = "" // Ignore any partially-entered line -- a ^C should do that default: doneDraining = true @@ -268,7 +268,7 @@ func (repl *Repl) closeBufferedOutputStream() error { if repl.recordOutputStream != os.Stdout { err := repl.recordOutputStream.Close() if err != nil { - return fmt.Errorf("mlr repl: error on redirect close of %s: %v\n", + return fmt.Errorf("mlr repl: error on redirect close of %s: %v", repl.recordOutputFileName, err, ) } diff --git a/pkg/terminals/repl/types.go b/pkg/terminals/repl/types.go index b0da1b9d3..76f8507f0 100644 --- a/pkg/terminals/repl/types.go +++ b/pkg/terminals/repl/types.go @@ -9,11 +9,11 @@ import ( "container/list" "os" - "github.com/johnkerl/miller/pkg/cli" - "github.com/johnkerl/miller/pkg/dsl/cst" - "github.com/johnkerl/miller/pkg/input" - "github.com/johnkerl/miller/pkg/output" - "github.com/johnkerl/miller/pkg/runtime" + "github.com/johnkerl/miller/v6/pkg/cli" + "github.com/johnkerl/miller/v6/pkg/dsl/cst" + "github.com/johnkerl/miller/v6/pkg/input" + "github.com/johnkerl/miller/v6/pkg/output" + "github.com/johnkerl/miller/v6/pkg/runtime" ) // ================================================================ diff --git a/pkg/terminals/repl/verbs.go b/pkg/terminals/repl/verbs.go index 92d9046ff..c2b992746 100644 --- a/pkg/terminals/repl/verbs.go +++ b/pkg/terminals/repl/verbs.go @@ -10,12 +10,12 @@ import ( "os" "strings" - "github.com/johnkerl/miller/pkg/colorizer" - "github.com/johnkerl/miller/pkg/dsl" - "github.com/johnkerl/miller/pkg/dsl/cst" - "github.com/johnkerl/miller/pkg/lib" - "github.com/johnkerl/miller/pkg/mlrval" - "github.com/johnkerl/miller/pkg/types" + 
"github.com/johnkerl/miller/v6/pkg/colorizer" + "github.com/johnkerl/miller/v6/pkg/dsl" + "github.com/johnkerl/miller/v6/pkg/dsl/cst" + "github.com/johnkerl/miller/v6/pkg/lib" + "github.com/johnkerl/miller/v6/pkg/mlrval" + "github.com/johnkerl/miller/v6/pkg/types" ) // ---------------------------------------------------------------- @@ -442,11 +442,8 @@ func handleSkipOrProcessN(repl *Repl, n int64, processingNotSkipping bool) { for i := int64(1); i <= n; i++ { select { case recordsAndContexts = <-repl.readerChannel: - break case err = <-repl.errorChannel: - break - case _ = <-repl.appSignalNotificationChannel: // user typed control-C - break + case <-repl.appSignalNotificationChannel: // user typed control-C } if err != nil { @@ -505,13 +502,11 @@ func handleSkipOrProcessUntil(repl *Repl, dslString string, processingNotSkippin doubleBreak := false select { case recordsAndContexts = <-repl.readerChannel: - break case err = <-repl.errorChannel: - break - case _ = <-repl.appSignalNotificationChannel: // user typed control-C + case <-repl.appSignalNotificationChannel: // user typed control-C doubleBreak = true - break } + if doubleBreak { break } @@ -566,7 +561,7 @@ func skipOrProcessRecord( repl.runtimeState.Update(recordAndContext.Record, &recordAndContext.Context) // End-of-stream marker - if recordAndContext.EndOfStream == true { + if recordAndContext.EndOfStream { fmt.Println("End of record stream") repl.readerChannel = nil repl.errorChannel = nil @@ -639,7 +634,8 @@ func writeRecord(repl *Repl, outrec *mlrval.Mlrmap) { outrec.Unflatten(repl.options.WriterOptions.FLATSEP) } } - repl.recordWriter.Write(outrec, repl.bufferedRecordOutputStream, true /*outputIsStdout*/) + // XXX TEMP + repl.recordWriter.Write(outrec, nil, repl.bufferedRecordOutputStream, true /*outputIsStdout*/) repl.bufferedRecordOutputStream.Flush() } diff --git a/pkg/terminals/terminals.go b/pkg/terminals/terminals.go index 78ec0dd3f..9e11e1e71 100644 --- a/pkg/terminals/terminals.go +++ 
b/pkg/terminals/terminals.go @@ -10,10 +10,10 @@ import ( "os" "runtime" - "github.com/johnkerl/miller/pkg/terminals/help" - "github.com/johnkerl/miller/pkg/terminals/regtest" - "github.com/johnkerl/miller/pkg/terminals/repl" - "github.com/johnkerl/miller/pkg/version" + "github.com/johnkerl/miller/v6/pkg/terminals/help" + "github.com/johnkerl/miller/v6/pkg/terminals/regtest" + "github.com/johnkerl/miller/v6/pkg/terminals/repl" + "github.com/johnkerl/miller/v6/pkg/version" ) // tTerminalMain is a function-pointer type for the entrypoint handler for a given terminal, diff --git a/pkg/transformers/aaa_chain_transformer.go b/pkg/transformers/aaa_chain_transformer.go index e367ab883..a4137b763 100644 --- a/pkg/transformers/aaa_chain_transformer.go +++ b/pkg/transformers/aaa_chain_transformer.go @@ -3,8 +3,8 @@ package transformers import ( "container/list" "fmt" - "github.com/johnkerl/miller/pkg/cli" - "github.com/johnkerl/miller/pkg/types" + "github.com/johnkerl/miller/v6/pkg/cli" + "github.com/johnkerl/miller/v6/pkg/types" "os" ) @@ -265,7 +265,7 @@ func runSingleTransformerBatch( // the output channel without involving the record-transformer, since // there is no record to be transformed. 
- if inputRecordAndContext.EndOfStream == true || inputRecordAndContext.Record != nil { + if inputRecordAndContext.EndOfStream || inputRecordAndContext.Record != nil { recordTransformer.Transform( inputRecordAndContext, outputRecordsAndContexts, diff --git a/pkg/transformers/aaa_record_transformer.go b/pkg/transformers/aaa_record_transformer.go index 1f9bae7dd..516a11a31 100644 --- a/pkg/transformers/aaa_record_transformer.go +++ b/pkg/transformers/aaa_record_transformer.go @@ -4,8 +4,8 @@ import ( "container/list" "os" - "github.com/johnkerl/miller/pkg/cli" - "github.com/johnkerl/miller/pkg/types" + "github.com/johnkerl/miller/v6/pkg/cli" + "github.com/johnkerl/miller/v6/pkg/types" ) // IRecordTransformer is the interface satisfied by all transformers, i.e., @@ -27,6 +27,12 @@ type RecordTransformerFunc func( outputDownstreamDoneChannel chan<- bool, ) +// Used within some verbs +type RecordTransformerHelperFunc func( + inrecAndContext *types.RecordAndContext, + outputRecordsAndContexts *list.List, // list of *types.RecordAndContext +) + type TransformerUsageFunc func( ostream *os.File, ) diff --git a/pkg/transformers/aaa_transformer_table.go b/pkg/transformers/aaa_transformer_table.go index ed98af07f..b1a41ae17 100644 --- a/pkg/transformers/aaa_transformer_table.go +++ b/pkg/transformers/aaa_transformer_table.go @@ -5,8 +5,8 @@ import ( "os" "strings" - "github.com/johnkerl/miller/pkg/colorizer" - "github.com/johnkerl/miller/pkg/lib" + "github.com/johnkerl/miller/v6/pkg/colorizer" + "github.com/johnkerl/miller/v6/pkg/lib" ) // ---------------------------------------------------------------- @@ -62,6 +62,7 @@ var TRANSFORMER_LOOKUP_TABLE = []TransformerSetup{ SkipTrivialRecordsSetup, SortSetup, SortWithinRecordsSetup, + SparsifySetup, SplitSetup, SsubSetup, Stats1Setup, @@ -69,6 +70,7 @@ var TRANSFORMER_LOOKUP_TABLE = []TransformerSetup{ StepSetup, SubSetup, SummarySetup, + SurvSetup, TacSetup, TailSetup, TeeSetup, diff --git a/pkg/transformers/altkv.go 
b/pkg/transformers/altkv.go index a97c3127e..cb1d5c8d2 100644 --- a/pkg/transformers/altkv.go +++ b/pkg/transformers/altkv.go @@ -7,9 +7,9 @@ import ( "strconv" "strings" - "github.com/johnkerl/miller/pkg/cli" - "github.com/johnkerl/miller/pkg/mlrval" - "github.com/johnkerl/miller/pkg/types" + "github.com/johnkerl/miller/v6/pkg/cli" + "github.com/johnkerl/miller/v6/pkg/mlrval" + "github.com/johnkerl/miller/v6/pkg/types" ) // ---------------------------------------------------------------- diff --git a/pkg/transformers/bar.go b/pkg/transformers/bar.go index 0aaafd8f1..09713c272 100644 --- a/pkg/transformers/bar.go +++ b/pkg/transformers/bar.go @@ -7,9 +7,9 @@ import ( "os" "strings" - "github.com/johnkerl/miller/pkg/cli" - "github.com/johnkerl/miller/pkg/mlrval" - "github.com/johnkerl/miller/pkg/types" + "github.com/johnkerl/miller/v6/pkg/cli" + "github.com/johnkerl/miller/v6/pkg/mlrval" + "github.com/johnkerl/miller/v6/pkg/types" ) const barDefaultFillString = "*" diff --git a/pkg/transformers/bootstrap.go b/pkg/transformers/bootstrap.go index 9450a425e..47a200499 100644 --- a/pkg/transformers/bootstrap.go +++ b/pkg/transformers/bootstrap.go @@ -6,9 +6,9 @@ import ( "os" "strings" - "github.com/johnkerl/miller/pkg/cli" - "github.com/johnkerl/miller/pkg/lib" - "github.com/johnkerl/miller/pkg/types" + "github.com/johnkerl/miller/v6/pkg/cli" + "github.com/johnkerl/miller/v6/pkg/lib" + "github.com/johnkerl/miller/v6/pkg/types" ) // ---------------------------------------------------------------- diff --git a/pkg/transformers/case.go b/pkg/transformers/case.go index 4d02617c4..1020876e4 100644 --- a/pkg/transformers/case.go +++ b/pkg/transformers/case.go @@ -9,10 +9,10 @@ import ( "golang.org/x/text/cases" "golang.org/x/text/language" - "github.com/johnkerl/miller/pkg/cli" - "github.com/johnkerl/miller/pkg/lib" - "github.com/johnkerl/miller/pkg/mlrval" - "github.com/johnkerl/miller/pkg/types" + "github.com/johnkerl/miller/v6/pkg/cli" + 
"github.com/johnkerl/miller/v6/pkg/lib" + "github.com/johnkerl/miller/v6/pkg/mlrval" + "github.com/johnkerl/miller/v6/pkg/types" ) // ---------------------------------------------------------------- @@ -157,7 +157,7 @@ func NewTransformerCase( tr.caserFunc = cases.Title(language.Und).String default: return nil, fmt.Errorf( - "mlr %s: case option must be specified using one of -u, -l, -s, -t.", + "mlr %s: case option must be specified using one of -u, -l, -s, -t", verbNameCase, ) } diff --git a/pkg/transformers/cat.go b/pkg/transformers/cat.go index c065aa536..74df80eca 100644 --- a/pkg/transformers/cat.go +++ b/pkg/transformers/cat.go @@ -6,9 +6,9 @@ import ( "os" "strings" - "github.com/johnkerl/miller/pkg/cli" - "github.com/johnkerl/miller/pkg/mlrval" - "github.com/johnkerl/miller/pkg/types" + "github.com/johnkerl/miller/v6/pkg/cli" + "github.com/johnkerl/miller/v6/pkg/mlrval" + "github.com/johnkerl/miller/v6/pkg/types" ) // ---------------------------------------------------------------- diff --git a/pkg/transformers/check.go b/pkg/transformers/check.go index 9f3600190..ed68d0afa 100644 --- a/pkg/transformers/check.go +++ b/pkg/transformers/check.go @@ -6,8 +6,8 @@ import ( "os" "strings" - "github.com/johnkerl/miller/pkg/cli" - "github.com/johnkerl/miller/pkg/types" + "github.com/johnkerl/miller/v6/pkg/cli" + "github.com/johnkerl/miller/v6/pkg/types" ) // ---------------------------------------------------------------- diff --git a/pkg/transformers/clean_whitespace.go b/pkg/transformers/clean_whitespace.go index 3ffdd3862..bdb032f66 100644 --- a/pkg/transformers/clean_whitespace.go +++ b/pkg/transformers/clean_whitespace.go @@ -6,10 +6,10 @@ import ( "os" "strings" - "github.com/johnkerl/miller/pkg/bifs" - "github.com/johnkerl/miller/pkg/cli" - "github.com/johnkerl/miller/pkg/mlrval" - "github.com/johnkerl/miller/pkg/types" + "github.com/johnkerl/miller/v6/pkg/bifs" + "github.com/johnkerl/miller/v6/pkg/cli" + "github.com/johnkerl/miller/v6/pkg/mlrval" + 
"github.com/johnkerl/miller/v6/pkg/types" ) // ---------------------------------------------------------------- diff --git a/pkg/transformers/count.go b/pkg/transformers/count.go index 623855ac9..107dbec6d 100644 --- a/pkg/transformers/count.go +++ b/pkg/transformers/count.go @@ -6,10 +6,10 @@ import ( "os" "strings" - "github.com/johnkerl/miller/pkg/cli" - "github.com/johnkerl/miller/pkg/lib" - "github.com/johnkerl/miller/pkg/mlrval" - "github.com/johnkerl/miller/pkg/types" + "github.com/johnkerl/miller/v6/pkg/cli" + "github.com/johnkerl/miller/v6/pkg/lib" + "github.com/johnkerl/miller/v6/pkg/mlrval" + "github.com/johnkerl/miller/v6/pkg/types" ) // ---------------------------------------------------------------- diff --git a/pkg/transformers/count_similar.go b/pkg/transformers/count_similar.go index 2fafe2d70..b8c0a1819 100644 --- a/pkg/transformers/count_similar.go +++ b/pkg/transformers/count_similar.go @@ -6,10 +6,10 @@ import ( "os" "strings" - "github.com/johnkerl/miller/pkg/cli" - "github.com/johnkerl/miller/pkg/lib" - "github.com/johnkerl/miller/pkg/mlrval" - "github.com/johnkerl/miller/pkg/types" + "github.com/johnkerl/miller/v6/pkg/cli" + "github.com/johnkerl/miller/v6/pkg/lib" + "github.com/johnkerl/miller/v6/pkg/mlrval" + "github.com/johnkerl/miller/v6/pkg/types" ) // ---------------------------------------------------------------- diff --git a/pkg/transformers/cut.go b/pkg/transformers/cut.go index f3039f65a..e9e57d438 100644 --- a/pkg/transformers/cut.go +++ b/pkg/transformers/cut.go @@ -1,16 +1,18 @@ package transformers import ( + "cmp" "container/list" "fmt" "os" "regexp" + "slices" "strings" - "github.com/johnkerl/miller/pkg/cli" - "github.com/johnkerl/miller/pkg/lib" - "github.com/johnkerl/miller/pkg/mlrval" - "github.com/johnkerl/miller/pkg/types" + "github.com/johnkerl/miller/v6/pkg/cli" + "github.com/johnkerl/miller/v6/pkg/lib" + "github.com/johnkerl/miller/v6/pkg/mlrval" + "github.com/johnkerl/miller/v6/pkg/types" ) // 
---------------------------------------------------------------- @@ -36,7 +38,7 @@ func transformerCutUsage( fmt.Fprintf(o, " -r Treat field names as regular expressions. \"ab\", \"a.*b\" will\n") fmt.Fprintf(o, " match any field name containing the substring \"ab\" or matching\n") fmt.Fprintf(o, " \"a.*b\", respectively; anchors of the form \"^ab$\", \"^a.*b$\" may\n") - fmt.Fprintf(o, " be used. The -o flag is ignored when -r is present.\n") + fmt.Fprintf(o, " be used.\n") fmt.Fprintf(o, "-h|--help Show this message.\n") fmt.Fprintf(o, "Examples:\n") fmt.Fprintf(o, " %s %s -f hostname,status\n", "mlr", verbNameCut) @@ -129,6 +131,7 @@ type TransformerCut struct { fieldNameSet map[string]bool doComplement bool + doArgOrder bool regexes []*regexp.Regexp recordTransformerFunc RecordTransformerFunc @@ -143,6 +146,8 @@ func NewTransformerCut( tr := &TransformerCut{} + tr.doArgOrder = doArgOrder + if !doRegexes { tr.fieldNameList = fieldNames tr.fieldNameSet = lib.StringListToSet(fieldNames) @@ -257,6 +262,11 @@ func (tr *TransformerCut) exclude( outputRecordsAndContexts.PushBack(inrecAndContext) } +type entryIndex struct { + index int + entry *mlrval.MlrmapEntry +} + // ---------------------------------------------------------------- func (tr *TransformerCut) processWithRegexes( inrecAndContext *types.RecordAndContext, @@ -267,11 +277,14 @@ func (tr *TransformerCut) processWithRegexes( if !inrecAndContext.EndOfStream { inrec := inrecAndContext.Record newrec := mlrval.NewMlrmapAsRecord() + var entries []entryIndex for pe := inrec.Head; pe != nil; pe = pe.Next { matchesAny := false - for _, regex := range tr.regexes { + var index int + for i, regex := range tr.regexes { if regex.MatchString(pe.Key) { matchesAny = true + index = i break } } @@ -279,7 +292,19 @@ func (tr *TransformerCut) processWithRegexes( if matchesAny != tr.doComplement { // Pointer-motion is OK since the inrec is being hereby discarded. // We're simply transferring ownership to the newrec. 
- newrec.PutReference(pe.Key, pe.Value) + if tr.doArgOrder { + entries = append(entries, entryIndex{index, pe}) + } else { + newrec.PutReference(pe.Key, pe.Value) + } + } + } + if tr.doArgOrder { + slices.SortStableFunc(entries, func(a, b entryIndex) int { + return cmp.Compare(a.index, b.index) + }) + for _, ei := range entries { + newrec.PutReference(ei.entry.Key, ei.entry.Value) } } outputRecordsAndContexts.PushBack(types.NewRecordAndContext(newrec, &inrecAndContext.Context)) diff --git a/pkg/transformers/decimate.go b/pkg/transformers/decimate.go index 8535c1d4b..112d9395e 100644 --- a/pkg/transformers/decimate.go +++ b/pkg/transformers/decimate.go @@ -6,8 +6,8 @@ import ( "os" "strings" - "github.com/johnkerl/miller/pkg/cli" - "github.com/johnkerl/miller/pkg/types" + "github.com/johnkerl/miller/v6/pkg/cli" + "github.com/johnkerl/miller/v6/pkg/types" ) // ---------------------------------------------------------------- diff --git a/pkg/transformers/fill_down.go b/pkg/transformers/fill_down.go index b9ab079dc..cf779e336 100644 --- a/pkg/transformers/fill_down.go +++ b/pkg/transformers/fill_down.go @@ -6,9 +6,9 @@ import ( "os" "strings" - "github.com/johnkerl/miller/pkg/cli" - "github.com/johnkerl/miller/pkg/mlrval" - "github.com/johnkerl/miller/pkg/types" + "github.com/johnkerl/miller/v6/pkg/cli" + "github.com/johnkerl/miller/v6/pkg/mlrval" + "github.com/johnkerl/miller/v6/pkg/types" ) // ---------------------------------------------------------------- @@ -116,7 +116,6 @@ func transformerFillDownParseCLI( type TransformerFillDown struct { // input fillDownFieldNames []string - doAll bool onlyIfAbsent bool // state diff --git a/pkg/transformers/fill_empty.go b/pkg/transformers/fill_empty.go index e893ef334..833ed5b12 100644 --- a/pkg/transformers/fill_empty.go +++ b/pkg/transformers/fill_empty.go @@ -6,9 +6,9 @@ import ( "os" "strings" - "github.com/johnkerl/miller/pkg/cli" - "github.com/johnkerl/miller/pkg/mlrval" - "github.com/johnkerl/miller/pkg/types" + 
"github.com/johnkerl/miller/v6/pkg/cli" + "github.com/johnkerl/miller/v6/pkg/mlrval" + "github.com/johnkerl/miller/v6/pkg/types" ) // ---------------------------------------------------------------- diff --git a/pkg/transformers/flatten.go b/pkg/transformers/flatten.go index a2f4d1a97..86c4c0fd1 100644 --- a/pkg/transformers/flatten.go +++ b/pkg/transformers/flatten.go @@ -6,9 +6,9 @@ import ( "os" "strings" - "github.com/johnkerl/miller/pkg/cli" - "github.com/johnkerl/miller/pkg/lib" - "github.com/johnkerl/miller/pkg/types" + "github.com/johnkerl/miller/v6/pkg/cli" + "github.com/johnkerl/miller/v6/pkg/lib" + "github.com/johnkerl/miller/v6/pkg/types" ) // ---------------------------------------------------------------- diff --git a/pkg/transformers/format_values.go b/pkg/transformers/format_values.go index d91041c27..edaf0389f 100644 --- a/pkg/transformers/format_values.go +++ b/pkg/transformers/format_values.go @@ -6,9 +6,9 @@ import ( "os" "strings" - "github.com/johnkerl/miller/pkg/cli" - "github.com/johnkerl/miller/pkg/mlrval" - "github.com/johnkerl/miller/pkg/types" + "github.com/johnkerl/miller/v6/pkg/cli" + "github.com/johnkerl/miller/v6/pkg/mlrval" + "github.com/johnkerl/miller/v6/pkg/types" ) // ---------------------------------------------------------------- diff --git a/pkg/transformers/fraction.go b/pkg/transformers/fraction.go index fb90ab2bf..432d600cf 100644 --- a/pkg/transformers/fraction.go +++ b/pkg/transformers/fraction.go @@ -7,11 +7,11 @@ import ( "os" "strings" - "github.com/johnkerl/miller/pkg/bifs" - "github.com/johnkerl/miller/pkg/cli" - "github.com/johnkerl/miller/pkg/lib" - "github.com/johnkerl/miller/pkg/mlrval" - "github.com/johnkerl/miller/pkg/types" + "github.com/johnkerl/miller/v6/pkg/bifs" + "github.com/johnkerl/miller/v6/pkg/cli" + "github.com/johnkerl/miller/v6/pkg/lib" + "github.com/johnkerl/miller/v6/pkg/mlrval" + "github.com/johnkerl/miller/v6/pkg/types" ) // ---------------------------------------------------------------- @@ 
-264,9 +264,12 @@ func (tr *TransformerFraction) Transform( } else { numerator = value } - denominator := sumsForGroup[fractionFieldName] - if !mlrval.Equals(value, tr.zero) { + + // Return 0 for 0/n + if mlrval.Equals(numerator, tr.zero) { + outputValue = tr.zero + } else if !mlrval.Equals(denominator, tr.zero) { outputValue = bifs.BIF_divide(numerator, denominator) outputValue = bifs.BIF_times(outputValue, tr.multiplier) } else { diff --git a/pkg/transformers/gap.go b/pkg/transformers/gap.go index 345780f4e..9bec0b2ab 100644 --- a/pkg/transformers/gap.go +++ b/pkg/transformers/gap.go @@ -6,9 +6,9 @@ import ( "os" "strings" - "github.com/johnkerl/miller/pkg/cli" - "github.com/johnkerl/miller/pkg/mlrval" - "github.com/johnkerl/miller/pkg/types" + "github.com/johnkerl/miller/v6/pkg/cli" + "github.com/johnkerl/miller/v6/pkg/mlrval" + "github.com/johnkerl/miller/v6/pkg/types" ) // ---------------------------------------------------------------- diff --git a/pkg/transformers/grep.go b/pkg/transformers/grep.go index 46d01244c..38942d650 100644 --- a/pkg/transformers/grep.go +++ b/pkg/transformers/grep.go @@ -7,8 +7,8 @@ import ( "regexp" "strings" - "github.com/johnkerl/miller/pkg/cli" - "github.com/johnkerl/miller/pkg/types" + "github.com/johnkerl/miller/v6/pkg/cli" + "github.com/johnkerl/miller/v6/pkg/types" ) // ---------------------------------------------------------------- diff --git a/pkg/transformers/group_by.go b/pkg/transformers/group_by.go index f210969ad..bf5441cb5 100644 --- a/pkg/transformers/group_by.go +++ b/pkg/transformers/group_by.go @@ -6,9 +6,9 @@ import ( "os" "strings" - "github.com/johnkerl/miller/pkg/cli" - "github.com/johnkerl/miller/pkg/lib" - "github.com/johnkerl/miller/pkg/types" + "github.com/johnkerl/miller/v6/pkg/cli" + "github.com/johnkerl/miller/v6/pkg/lib" + "github.com/johnkerl/miller/v6/pkg/types" ) // ---------------------------------------------------------------- diff --git a/pkg/transformers/group_like.go 
b/pkg/transformers/group_like.go index a7ede9a59..73a7dc347 100644 --- a/pkg/transformers/group_like.go +++ b/pkg/transformers/group_like.go @@ -6,9 +6,9 @@ import ( "os" "strings" - "github.com/johnkerl/miller/pkg/cli" - "github.com/johnkerl/miller/pkg/lib" - "github.com/johnkerl/miller/pkg/types" + "github.com/johnkerl/miller/v6/pkg/cli" + "github.com/johnkerl/miller/v6/pkg/lib" + "github.com/johnkerl/miller/v6/pkg/types" ) // ---------------------------------------------------------------- diff --git a/pkg/transformers/gsub.go b/pkg/transformers/gsub.go deleted file mode 100644 index 0b188505b..000000000 --- a/pkg/transformers/gsub.go +++ /dev/null @@ -1,157 +0,0 @@ -package transformers - -import ( - "container/list" - "fmt" - "os" - "strings" - - "github.com/johnkerl/miller/pkg/bifs" - "github.com/johnkerl/miller/pkg/cli" - "github.com/johnkerl/miller/pkg/mlrval" - "github.com/johnkerl/miller/pkg/types" -) - -// ---------------------------------------------------------------- -const verbNameGsub = "gsub" - -var GsubSetup = TransformerSetup{ - Verb: verbNameGsub, - UsageFunc: transformerGsubUsage, - ParseCLIFunc: transformerGsubParseCLI, - IgnoresInput: false, -} - -func transformerGsubUsage( - o *os.File, -) { - fmt.Fprintf(o, "Usage: %s %s [options]\n", "mlr", verbNameGsub) - fmt.Fprintf(o, "Replaces old string with new string in specified field(s), with regex support\n") - fmt.Fprintf(o, "for the old string and handling multiple matches, like the `gsub` DSL function.\n") - fmt.Fprintf(o, "See also the `sub` and `ssub` verbs.\n") - fmt.Fprintf(o, "Options:\n") - fmt.Fprintf(o, "-f {a,b,c} Field names to convert.\n") - fmt.Fprintf(o, "-h|--help Show this message.\n") -} - -func transformerGsubParseCLI( - pargi *int, - argc int, - args []string, - _ *cli.TOptions, - doConstruct bool, // false for first pass of CLI-parse, true for second pass -) IRecordTransformer { - - // Skip the verb name from the current spot in the mlr command line - argi := *pargi - verb 
:= args[argi] - argi++ - - // Parse local flags - var fieldNames []string = nil - var oldText string - var newText string - - for argi < argc /* variable increment: 1 or 2 depending on flag */ { - opt := args[argi] - if !strings.HasPrefix(opt, "-") { - break // No more flag options to process - } - if args[argi] == "--" { - break // All transformers must do this so main-flags can follow verb-flags - } - argi++ - - if opt == "-h" || opt == "--help" { - transformerGsubUsage(os.Stdout) - os.Exit(0) - - } else if opt == "-f" { - fieldNames = cli.VerbGetStringArrayArgOrDie(verb, opt, args, &argi, argc) - } else { - transformerGsubUsage(os.Stderr) - os.Exit(1) - } - } - - if fieldNames == nil { - transformerGsubUsage(os.Stderr) - os.Exit(1) - } - - // Get the old and new text from the command line - if (argc - argi) < 2 { - transformerGsubUsage(os.Stderr) - os.Exit(1) - } - oldText = args[argi] - newText = args[argi+1] - - argi += 2 - - *pargi = argi - if !doConstruct { // All transformers must do this for main command-line parsing - return nil - } - - transformer, err := NewTransformerGsub( - fieldNames, - oldText, - newText, - ) - if err != nil { - fmt.Fprintln(os.Stderr, err) - os.Exit(1) - } - - return transformer -} - -// ---------------------------------------------------------------- -type TransformerGsub struct { - fieldNames []string - oldText *mlrval.Mlrval - newText *mlrval.Mlrval -} - -// ---------------------------------------------------------------- -func NewTransformerGsub( - fieldNames []string, - oldText string, - newText string, -) (*TransformerGsub, error) { - tr := &TransformerGsub{ - fieldNames: fieldNames, - oldText: mlrval.FromString(oldText), - newText: mlrval.FromString(newText), - } - return tr, nil -} - -func (tr *TransformerGsub) Transform( - inrecAndContext *types.RecordAndContext, - outputRecordsAndContexts *list.List, // list of *types.RecordAndContext - inputDownstreamDoneChannel <-chan bool, - outputDownstreamDoneChannel chan<- bool, -) 
{ - HandleDefaultDownstreamDone(inputDownstreamDoneChannel, outputDownstreamDoneChannel) - - if !inrecAndContext.EndOfStream { - inrec := inrecAndContext.Record - - for _, fieldName := range tr.fieldNames { - oldValue := inrec.Get(fieldName) - if oldValue == nil { - continue - } - - newValue := bifs.BIF_gsub(oldValue, tr.oldText, tr.newText) - - inrec.PutReference(fieldName, newValue) - } - - outputRecordsAndContexts.PushBack(inrecAndContext) - } else { - outputRecordsAndContexts.PushBack(inrecAndContext) // emit end-of-stream marker - } -} diff --git a/pkg/transformers/having_fields.go b/pkg/transformers/having_fields.go index b17e6cecc..467c0a7d3 100644 --- a/pkg/transformers/having_fields.go +++ b/pkg/transformers/having_fields.go @@ -7,9 +7,9 @@ import ( "regexp" "strings" - "github.com/johnkerl/miller/pkg/cli" - "github.com/johnkerl/miller/pkg/lib" - "github.com/johnkerl/miller/pkg/types" + "github.com/johnkerl/miller/v6/pkg/cli" + "github.com/johnkerl/miller/v6/pkg/lib" + "github.com/johnkerl/miller/v6/pkg/types" ) type tHavingFieldsCriterion int diff --git a/pkg/transformers/head.go b/pkg/transformers/head.go index 4626ddead..6f7ff5a19 100644 --- a/pkg/transformers/head.go +++ b/pkg/transformers/head.go @@ -6,8 +6,8 @@ import ( "os" "strings" - "github.com/johnkerl/miller/pkg/cli" - "github.com/johnkerl/miller/pkg/types" + "github.com/johnkerl/miller/v6/pkg/cli" + "github.com/johnkerl/miller/v6/pkg/types" ) // ---------------------------------------------------------------- diff --git a/pkg/transformers/histogram.go b/pkg/transformers/histogram.go index a7d6241fb..32f581107 100644 --- a/pkg/transformers/histogram.go +++ b/pkg/transformers/histogram.go @@ -6,10 +6,10 @@ import ( "os" "strings" - "github.com/johnkerl/miller/pkg/cli" - "github.com/johnkerl/miller/pkg/lib" - "github.com/johnkerl/miller/pkg/mlrval" - "github.com/johnkerl/miller/pkg/types" + "github.com/johnkerl/miller/v6/pkg/cli" + "github.com/johnkerl/miller/v6/pkg/lib" + 
"github.com/johnkerl/miller/v6/pkg/mlrval" + "github.com/johnkerl/miller/v6/pkg/types" ) // ---------------------------------------------------------------- diff --git a/pkg/transformers/join.go b/pkg/transformers/join.go index 3d8547a12..61d8a47a5 100644 --- a/pkg/transformers/join.go +++ b/pkg/transformers/join.go @@ -6,12 +6,12 @@ import ( "os" "strings" - "github.com/johnkerl/miller/pkg/cli" - "github.com/johnkerl/miller/pkg/input" - "github.com/johnkerl/miller/pkg/lib" - "github.com/johnkerl/miller/pkg/mlrval" - "github.com/johnkerl/miller/pkg/transformers/utils" - "github.com/johnkerl/miller/pkg/types" + "github.com/johnkerl/miller/v6/pkg/cli" + "github.com/johnkerl/miller/v6/pkg/input" + "github.com/johnkerl/miller/v6/pkg/lib" + "github.com/johnkerl/miller/v6/pkg/mlrval" + "github.com/johnkerl/miller/v6/pkg/transformers/utils" + "github.com/johnkerl/miller/v6/pkg/types" ) // ---------------------------------------------------------------- @@ -92,6 +92,8 @@ func transformerJoinUsage( fmt.Fprintf(o, " --lk|--left-keep-field-names {a,b,c} If supplied, this means keep only the specified field\n") fmt.Fprintf(o, " names from the left file. Automatically includes the join-field name(s). 
Helpful\n") fmt.Fprintf(o, " for when you only want a limited subset of information from the left file.\n") + fmt.Fprintf(o, " Tip: you can use --lk \"\": this means the left file becomes solely a row-selector\n") + fmt.Fprintf(o, " for the input files.\n") fmt.Fprintf(o, " --lp {text} Additional prefix for non-join output field names from\n") fmt.Fprintf(o, " the left file\n") fmt.Fprintf(o, " --rp {text} Additional prefix for non-join output field names from\n") @@ -128,7 +130,7 @@ func transformerJoinUsage( fmt.Fprintf(o, "expected to be headerless as well unless you put '--no-implicit-csv-header' after 'join'.\n") fmt.Fprintf(o, "Please use \"%s --usage-separator-options\" for information on specifying separators.\n", "mlr") - fmt.Fprintf(o, "Please see https://miller.readthedocs.io/en/latest/reference-verbs.html#join for more information\n") + fmt.Fprintf(o, "Please see https://miller.readthedocs.io/en/latest/reference-verbs#join for more information\n") fmt.Fprintf(o, "including examples.\n") } diff --git a/pkg/transformers/json_parse.go b/pkg/transformers/json_parse.go index 1a00ccf08..0690b410e 100644 --- a/pkg/transformers/json_parse.go +++ b/pkg/transformers/json_parse.go @@ -6,9 +6,9 @@ import ( "os" "strings" - "github.com/johnkerl/miller/pkg/cli" - "github.com/johnkerl/miller/pkg/lib" - "github.com/johnkerl/miller/pkg/types" + "github.com/johnkerl/miller/v6/pkg/cli" + "github.com/johnkerl/miller/v6/pkg/lib" + "github.com/johnkerl/miller/v6/pkg/types" ) // ---------------------------------------------------------------- diff --git a/pkg/transformers/json_stringify.go b/pkg/transformers/json_stringify.go index ca515f0a0..c6b5642a8 100644 --- a/pkg/transformers/json_stringify.go +++ b/pkg/transformers/json_stringify.go @@ -6,10 +6,10 @@ import ( "os" "strings" - "github.com/johnkerl/miller/pkg/cli" - "github.com/johnkerl/miller/pkg/lib" - "github.com/johnkerl/miller/pkg/mlrval" - "github.com/johnkerl/miller/pkg/types" + 
"github.com/johnkerl/miller/v6/pkg/cli" + "github.com/johnkerl/miller/v6/pkg/lib" + "github.com/johnkerl/miller/v6/pkg/mlrval" + "github.com/johnkerl/miller/v6/pkg/types" ) // ---------------------------------------------------------------- diff --git a/pkg/transformers/label.go b/pkg/transformers/label.go index b14be1b6d..3ad653595 100644 --- a/pkg/transformers/label.go +++ b/pkg/transformers/label.go @@ -6,9 +6,9 @@ import ( "os" "strings" - "github.com/johnkerl/miller/pkg/cli" - "github.com/johnkerl/miller/pkg/lib" - "github.com/johnkerl/miller/pkg/types" + "github.com/johnkerl/miller/v6/pkg/cli" + "github.com/johnkerl/miller/v6/pkg/lib" + "github.com/johnkerl/miller/v6/pkg/types" ) // ---------------------------------------------------------------- @@ -104,7 +104,7 @@ func NewTransformerLabel( for _, newName := range newNames { _, ok := uniquenessChecker[newName] if ok { - return nil, fmt.Errorf("mlr label: labels must be unique; got duplicate \"%s\"\n", newName) + return nil, fmt.Errorf(`mlr label: labels must be unique; got duplicate "%s"`, newName) } uniquenessChecker[newName] = true } diff --git a/pkg/transformers/latin1_to_utf8.go b/pkg/transformers/latin1_to_utf8.go index b3dca48b5..0dd509f39 100644 --- a/pkg/transformers/latin1_to_utf8.go +++ b/pkg/transformers/latin1_to_utf8.go @@ -6,10 +6,10 @@ import ( "os" "strings" - "github.com/johnkerl/miller/pkg/cli" - "github.com/johnkerl/miller/pkg/lib" - "github.com/johnkerl/miller/pkg/mlrval" - "github.com/johnkerl/miller/pkg/types" + "github.com/johnkerl/miller/v6/pkg/cli" + "github.com/johnkerl/miller/v6/pkg/lib" + "github.com/johnkerl/miller/v6/pkg/mlrval" + "github.com/johnkerl/miller/v6/pkg/types" ) // ---------------------------------------------------------------- diff --git a/pkg/transformers/merge_fields.go b/pkg/transformers/merge_fields.go index de1a555c3..f16a9d31e 100644 --- a/pkg/transformers/merge_fields.go +++ b/pkg/transformers/merge_fields.go @@ -7,10 +7,10 @@ import ( "regexp" "strings" - 
"github.com/johnkerl/miller/pkg/cli" - "github.com/johnkerl/miller/pkg/lib" - "github.com/johnkerl/miller/pkg/transformers/utils" - "github.com/johnkerl/miller/pkg/types" + "github.com/johnkerl/miller/v6/pkg/cli" + "github.com/johnkerl/miller/v6/pkg/lib" + "github.com/johnkerl/miller/v6/pkg/transformers/utils" + "github.com/johnkerl/miller/v6/pkg/types" ) // ---------------------------------------------------------------- @@ -254,7 +254,7 @@ func NewTransformerMergeFields( for _, accumulatorName := range accumulatorNameList { if !utils.ValidateStats1AccumulatorName(accumulatorName) { return nil, fmt.Errorf( - "mlr %s: accumulator \"%s\" not found.\n", + `mlr %s: accumulator "%s" not found`, verbNameMergeFields, accumulatorName, ) } @@ -479,7 +479,7 @@ func (tr *TransformerMergeFields) transformByCollapsing( matched = valueFieldNameRegex.MatchString(pe.Key) if matched { // TODO: comment re matrix - shortName = lib.RegexSubCompiled(valueFieldName, valueFieldNameRegex, "", nil) + shortName = lib.RegexCompiledSub(valueFieldName, valueFieldNameRegex, "", nil) break } } diff --git a/pkg/transformers/most_or_least_frequent.go b/pkg/transformers/most_or_least_frequent.go index 1381e81d9..a47ba80dc 100644 --- a/pkg/transformers/most_or_least_frequent.go +++ b/pkg/transformers/most_or_least_frequent.go @@ -7,10 +7,10 @@ import ( "sort" "strings" - "github.com/johnkerl/miller/pkg/cli" - "github.com/johnkerl/miller/pkg/lib" - "github.com/johnkerl/miller/pkg/mlrval" - "github.com/johnkerl/miller/pkg/types" + "github.com/johnkerl/miller/v6/pkg/cli" + "github.com/johnkerl/miller/v6/pkg/lib" + "github.com/johnkerl/miller/v6/pkg/mlrval" + "github.com/johnkerl/miller/v6/pkg/types" ) // ---------------------------------------------------------------- diff --git a/pkg/transformers/nest.go b/pkg/transformers/nest.go index bced869df..31d58bf3e 100644 --- a/pkg/transformers/nest.go +++ b/pkg/transformers/nest.go @@ -9,10 +9,10 @@ import ( "strconv" "strings" - 
"github.com/johnkerl/miller/pkg/cli" - "github.com/johnkerl/miller/pkg/lib" - "github.com/johnkerl/miller/pkg/mlrval" - "github.com/johnkerl/miller/pkg/types" + "github.com/johnkerl/miller/v6/pkg/cli" + "github.com/johnkerl/miller/v6/pkg/lib" + "github.com/johnkerl/miller/v6/pkg/mlrval" + "github.com/johnkerl/miller/v6/pkg/types" ) // ---------------------------------------------------------------- diff --git a/pkg/transformers/nothing.go b/pkg/transformers/nothing.go index 9c5b72f1a..b64688352 100644 --- a/pkg/transformers/nothing.go +++ b/pkg/transformers/nothing.go @@ -6,8 +6,8 @@ import ( "os" "strings" - "github.com/johnkerl/miller/pkg/cli" - "github.com/johnkerl/miller/pkg/types" + "github.com/johnkerl/miller/v6/pkg/cli" + "github.com/johnkerl/miller/v6/pkg/types" ) // ---------------------------------------------------------------- diff --git a/pkg/transformers/put_or_filter.go b/pkg/transformers/put_or_filter.go index 1437c8b15..648595ce2 100644 --- a/pkg/transformers/put_or_filter.go +++ b/pkg/transformers/put_or_filter.go @@ -6,13 +6,13 @@ import ( "os" "strings" - "github.com/johnkerl/miller/pkg/cli" - "github.com/johnkerl/miller/pkg/dsl" - "github.com/johnkerl/miller/pkg/dsl/cst" - "github.com/johnkerl/miller/pkg/lib" - "github.com/johnkerl/miller/pkg/mlrval" - "github.com/johnkerl/miller/pkg/runtime" - "github.com/johnkerl/miller/pkg/types" + "github.com/johnkerl/miller/v6/pkg/cli" + "github.com/johnkerl/miller/v6/pkg/dsl" + "github.com/johnkerl/miller/v6/pkg/dsl/cst" + "github.com/johnkerl/miller/v6/pkg/lib" + "github.com/johnkerl/miller/v6/pkg/mlrval" + "github.com/johnkerl/miller/v6/pkg/runtime" + "github.com/johnkerl/miller/v6/pkg/types" ) // ---------------------------------------------------------------- @@ -52,6 +52,14 @@ func transformerPutOrFilterUsage( verb string, ) { fmt.Fprintf(o, "Usage: %s %s [options] {DSL expression}\n", "mlr", verb) + if verb == "put" { + fmt.Fprintf(o, "Lets you use a domain-specific language to programmatically 
alter stream records.\n") + } else if verb == "filter" { + fmt.Fprintf(o, "Lets you use a domain-specific language to programmatically filter which\n") + fmt.Fprintf(o, "stream records will be output.\n") + } + fmt.Fprintf(o, "See also: https://miller.readthedocs.io/en/latest/reference-verbs\n") + fmt.Fprintf(o, "\n") fmt.Fprintf(o, "Options:\n") fmt.Fprintf(o, `-f {file name} File containing a DSL expression (see examples below). If the filename @@ -65,7 +73,7 @@ func transformerPutOrFilterUsage( Since the expression pieces are simply concatenated, please be sure to use intervening semicolons to separate expressions.) --s name=value: Predefines out-of-stream variable @name to have +-s name=value: Predefines out-of-stream variable @name to have Thus mlr put -s foo=97 '$column += @foo' is like mlr put 'begin {@foo = 97} $column += @foo'. The value part is subject to type-inferencing. @@ -239,14 +247,28 @@ func transformerPutOrFilterParseCLI( } else if opt == "-f" { // Get a DSL string from the user-specified filename filename := cli.VerbGetStringArgOrDie(verb, opt, args, &argi, argc) - theseDSLStrings, err := lib.LoadStringsFromFileOrDir(filename, ".mlr") - if err != nil { - fmt.Fprintf(os.Stderr, "%s %s: cannot load DSL expression from file \"%s\": ", - "mlr", verb, filename) - fmt.Println(err) - os.Exit(1) + + // Miller has a two-pass command-line parser. If the user does + // `mlr put -f foo.mlr` + // then that file can be parsed both times. But if the user does + // `mlr put -f <( echo 'some expression goes here' )` + // that will read stdin. (The filename will come in as "dev/fd/63" or what have you.) + // But this file _cannot_ be read twice. So, if doConstruct==false -- we're + // on the first pass of the command-line parser -- don't bother to parse + // the DSL-contents file. 
+ // + // See also https://github.com/johnkerl/miller/issues/1515 + + if doConstruct { + theseDSLStrings, err := lib.LoadStringsFromFileOrDir(filename, ".mlr") + if err != nil { + fmt.Fprintf(os.Stderr, "%s %s: cannot load DSL expression from file \"%s\": ", + "mlr", verb, filename) + fmt.Println(err) + os.Exit(1) + } + dslStrings = append(dslStrings, theseDSLStrings...) } - dslStrings = append(dslStrings, theseDSLStrings...) haveDSLStringsHere = true } else if opt == "-e" { @@ -451,7 +473,7 @@ func NewTransformerPut( for _, preset := range presets { pair := strings.SplitN(preset, "=", 2) if len(pair) != 2 { - return nil, fmt.Errorf("missing \"=\" in preset expression \"%s\".", preset) + return nil, fmt.Errorf(`missing "=" in preset expression "%s"`, preset) } key := pair[0] svalue := pair[1] @@ -520,7 +542,7 @@ func (tr *TransformerPut) Transform( // If there were no input records then we never executed the // begin-blocks. Do so now. - if tr.executedBeginBlocks == false { + if !tr.executedBeginBlocks { err := tr.cstRootNode.ExecuteBeginBlocks(tr.runtimeState) if err != nil { fmt.Fprintln(os.Stderr, err) diff --git a/pkg/transformers/regularize.go b/pkg/transformers/regularize.go index c75d261f0..d39ffbad8 100644 --- a/pkg/transformers/regularize.go +++ b/pkg/transformers/regularize.go @@ -6,10 +6,10 @@ import ( "os" "strings" - "github.com/johnkerl/miller/pkg/cli" - "github.com/johnkerl/miller/pkg/lib" - "github.com/johnkerl/miller/pkg/mlrval" - "github.com/johnkerl/miller/pkg/types" + "github.com/johnkerl/miller/v6/pkg/cli" + "github.com/johnkerl/miller/v6/pkg/lib" + "github.com/johnkerl/miller/v6/pkg/mlrval" + "github.com/johnkerl/miller/v6/pkg/types" ) // ---------------------------------------------------------------- diff --git a/pkg/transformers/remove_empty_columns.go b/pkg/transformers/remove_empty_columns.go index ce9b5a5dc..3b6b74263 100644 --- a/pkg/transformers/remove_empty_columns.go +++ b/pkg/transformers/remove_empty_columns.go @@ -6,9 +6,9 @@ 
import ( "os" "strings" - "github.com/johnkerl/miller/pkg/cli" - "github.com/johnkerl/miller/pkg/mlrval" - "github.com/johnkerl/miller/pkg/types" + "github.com/johnkerl/miller/v6/pkg/cli" + "github.com/johnkerl/miller/v6/pkg/mlrval" + "github.com/johnkerl/miller/v6/pkg/types" ) // ---------------------------------------------------------------- diff --git a/pkg/transformers/rename.go b/pkg/transformers/rename.go index e5f0658b8..39f252827 100644 --- a/pkg/transformers/rename.go +++ b/pkg/transformers/rename.go @@ -7,9 +7,9 @@ import ( "regexp" "strings" - "github.com/johnkerl/miller/pkg/cli" - "github.com/johnkerl/miller/pkg/lib" - "github.com/johnkerl/miller/pkg/types" + "github.com/johnkerl/miller/v6/pkg/cli" + "github.com/johnkerl/miller/v6/pkg/lib" + "github.com/johnkerl/miller/v6/pkg/types" ) // ---------------------------------------------------------------- @@ -43,9 +43,9 @@ func transformerRenameUsage( fmt.Fprintf(o, " first-match replacement.\n") fmt.Fprintf(o, "-h|--help Show this message.\n") fmt.Fprintf(o, "Examples:\n") - fmt.Fprintf(o, "%s %s old_name,new_name'\n", exeName, verb) - fmt.Fprintf(o, "%s %s old_name_1,new_name_1,old_name_2,new_name_2'\n", exeName, verb) - fmt.Fprintf(o, "%s %s -r 'Date_[0-9]+,Date,' Rename all such fields to be \"Date\"\n", exeName, verb) + fmt.Fprintf(o, "%s %s old_name,new_name\n", exeName, verb) + fmt.Fprintf(o, "%s %s old_name_1,new_name_1,old_name_2,new_name_2\n", exeName, verb) + fmt.Fprintf(o, "%s %s -r 'Date_[0-9]+,Date' Rename all such fields to be \"Date\"\n", exeName, verb) fmt.Fprintf(o, "%s %s -r '\"Date_[0-9]+\",Date' Same\n", exeName, verb) fmt.Fprintf(o, "%s %s -r 'Date_([0-9]+).*,\\1' Rename all such fields to be of the form 20151015\n", exeName, verb) fmt.Fprintf(o, "%s %s -r '\"name\"i,Name' Rename \"name\", \"Name\", \"NAME\", etc. 
to \"Name\"\n", exeName, verb) @@ -169,7 +169,7 @@ func NewTransformerRename( regexString := pe.Key regex := lib.CompileMillerRegexOrDie(regexString) replacement := pe.Value.(string) - _, replacementCaptureMatrix := lib.RegexReplacementHasCaptures(replacement) + _, replacementCaptureMatrix := lib.ReplacementHasCaptures(replacement) regexAndReplacement := tRegexAndReplacement{ regex: regex, replacement: replacement, @@ -241,7 +241,7 @@ func (tr *TransformerRename) transformWithRegexes( inrec.Rename(oldName, newName) } } else { - newName := lib.RegexSubCompiled(oldName, regex, replacement, replacementCaptureMatrix) + newName := lib.RegexCompiledSub(oldName, regex, replacement, replacementCaptureMatrix) if newName != oldName { inrec.Rename(oldName, newName) } diff --git a/pkg/transformers/reorder.go b/pkg/transformers/reorder.go index 216dd714d..44e915a80 100644 --- a/pkg/transformers/reorder.go +++ b/pkg/transformers/reorder.go @@ -4,12 +4,13 @@ import ( "container/list" "fmt" "os" + "regexp" "strings" - "github.com/johnkerl/miller/pkg/cli" - "github.com/johnkerl/miller/pkg/lib" - "github.com/johnkerl/miller/pkg/mlrval" - "github.com/johnkerl/miller/pkg/types" + "github.com/johnkerl/miller/v6/pkg/cli" + "github.com/johnkerl/miller/v6/pkg/lib" + "github.com/johnkerl/miller/v6/pkg/mlrval" + "github.com/johnkerl/miller/v6/pkg/types" ) // ---------------------------------------------------------------- @@ -61,9 +62,9 @@ func transformerReorderParseCLI( argi++ var fieldNames []string = nil - putAtEnd := false - beforeFieldName := "" - afterFieldName := "" + doRegexes := false + putAfter := false + centerFieldName := "" for argi < argc /* variable increment: 1 or 2 depending on flag */ { opt := args[argi] @@ -81,21 +82,23 @@ func transformerReorderParseCLI( } else if opt == "-f" { fieldNames = cli.VerbGetStringArrayArgOrDie(verb, opt, args, &argi, argc) + doRegexes = false + + } else if opt == "-r" { + fieldNames = cli.VerbGetStringArrayArgOrDie(verb, opt, args, &argi, 
argc) + doRegexes = true } else if opt == "-b" { - beforeFieldName = cli.VerbGetStringArgOrDie(verb, opt, args, &argi, argc) - afterFieldName = "" - putAtEnd = false + centerFieldName = cli.VerbGetStringArgOrDie(verb, opt, args, &argi, argc) + putAfter = false } else if opt == "-a" { - afterFieldName = cli.VerbGetStringArgOrDie(verb, opt, args, &argi, argc) - beforeFieldName = "" - putAtEnd = false + centerFieldName = cli.VerbGetStringArgOrDie(verb, opt, args, &argi, argc) + putAfter = true } else if opt == "-e" { - putAtEnd = true - beforeFieldName = "" - afterFieldName = "" + putAfter = true + centerFieldName = "" } else { transformerReorderUsage(os.Stderr) @@ -115,9 +118,9 @@ func transformerReorderParseCLI( transformer, err := NewTransformerReorder( fieldNames, - putAtEnd, - beforeFieldName, - afterFieldName, + doRegexes, + putAfter, + centerFieldName, ) if err != nil { fmt.Fprintln(os.Stderr, err) @@ -132,43 +135,71 @@ type TransformerReorder struct { // input fieldNames []string fieldNamesSet map[string]bool - beforeFieldName string - afterFieldName string + regexes []*regexp.Regexp + centerFieldName string + putAfter bool // state - recordTransformerFunc RecordTransformerFunc + recordTransformerFunc RecordTransformerHelperFunc } func NewTransformerReorder( fieldNames []string, - putAtEnd bool, - beforeFieldName string, - afterFieldName string, + doRegexes bool, + putAfter bool, + centerFieldName string, ) (*TransformerReorder, error) { tr := &TransformerReorder{ fieldNames: fieldNames, fieldNamesSet: lib.StringListToSet(fieldNames), - beforeFieldName: beforeFieldName, - afterFieldName: afterFieldName, + centerFieldName: centerFieldName, + putAfter: putAfter, } - if putAtEnd { - tr.recordTransformerFunc = tr.reorderToEnd - } else if beforeFieldName != "" { - tr.recordTransformerFunc = tr.reorderBefore - } else if afterFieldName != "" { - tr.recordTransformerFunc = tr.reorderAfter + if centerFieldName == "" { + if putAfter { + if doRegexes { + 
tr.recordTransformerFunc = tr.reorderToEndWithRegex + } else { + tr.recordTransformerFunc = tr.reorderToEndNoRegex + } + } else { + if doRegexes { + tr.recordTransformerFunc = tr.reorderToStartWithRegex + } else { + tr.recordTransformerFunc = tr.reorderToStartNoRegex + lib.ReverseStringList(tr.fieldNames) + } + } } else { - tr.recordTransformerFunc = tr.reorderToStart - lib.ReverseStringList(tr.fieldNames) + if doRegexes { + tr.recordTransformerFunc = tr.reorderBeforeOrAfterWithRegex + } else { + tr.recordTransformerFunc = tr.reorderBeforeOrAfterNoRegex + } + } + + if doRegexes { + tr.regexes = make([]*regexp.Regexp, len(fieldNames)) + for i, regexString := range fieldNames { + // Handles "a.*b"i Miller case-insensitive-regex specification + regex, err := lib.CompileMillerRegex(regexString) + if err != nil { + fmt.Fprintf( + os.Stderr, + "%s %s: cannot compile regex [%s]\n", + "mlr", verbNameCut, regexString, + ) + os.Exit(1) + } + tr.regexes[i] = regex + } } return tr, nil } -// ---------------------------------------------------------------- - func (tr *TransformerReorder) Transform( inrecAndContext *types.RecordAndContext, outputRecordsAndContexts *list.List, // list of *types.RecordAndContext @@ -176,156 +207,198 @@ func (tr *TransformerReorder) Transform( outputDownstreamDoneChannel chan<- bool, ) { HandleDefaultDownstreamDone(inputDownstreamDoneChannel, outputDownstreamDoneChannel) - tr.recordTransformerFunc(inrecAndContext, outputRecordsAndContexts, inputDownstreamDoneChannel, outputDownstreamDoneChannel) -} - -// ---------------------------------------------------------------- -func (tr *TransformerReorder) reorderToStart( - inrecAndContext *types.RecordAndContext, - outputRecordsAndContexts *list.List, // list of *types.RecordAndContext - inputDownstreamDoneChannel <-chan bool, - outputDownstreamDoneChannel chan<- bool, -) { if !inrecAndContext.EndOfStream { - inrec := inrecAndContext.Record - for _, fieldName := range tr.fieldNames { - 
inrec.MoveToHead(fieldName) - } - outputRecordsAndContexts.PushBack(inrecAndContext) - + tr.recordTransformerFunc( + inrecAndContext, + outputRecordsAndContexts, + ) } else { outputRecordsAndContexts.PushBack(inrecAndContext) // end-of-stream marker } } -// ---------------------------------------------------------------- -func (tr *TransformerReorder) reorderToEnd( +func (tr *TransformerReorder) reorderToStartNoRegex( inrecAndContext *types.RecordAndContext, outputRecordsAndContexts *list.List, // list of *types.RecordAndContext - inputDownstreamDoneChannel <-chan bool, - outputDownstreamDoneChannel chan<- bool, ) { - if !inrecAndContext.EndOfStream { - inrec := inrecAndContext.Record - for _, fieldName := range tr.fieldNames { - inrec.MoveToTail(fieldName) - } - outputRecordsAndContexts.PushBack(inrecAndContext) - } else { - outputRecordsAndContexts.PushBack(inrecAndContext) // end-of-stream marker + inrec := inrecAndContext.Record + for _, fieldName := range tr.fieldNames { + inrec.MoveToHead(fieldName) } + outputRecordsAndContexts.PushBack(inrecAndContext) } -// ---------------------------------------------------------------- -func (tr *TransformerReorder) reorderBefore( +func (tr *TransformerReorder) reorderToStartWithRegex( inrecAndContext *types.RecordAndContext, outputRecordsAndContexts *list.List, // list of *types.RecordAndContext - inputDownstreamDoneChannel <-chan bool, - outputDownstreamDoneChannel chan<- bool, ) { - if !inrecAndContext.EndOfStream { - inrec := inrecAndContext.Record - if inrec.Get(tr.beforeFieldName) == nil { - outputRecordsAndContexts.PushBack(inrecAndContext) - return - } + inrec := inrecAndContext.Record - outrec := mlrval.NewMlrmapAsRecord() - pe := inrec.Head - - // * inrec will be GC'ed - // * We will use outrec.PutReference not output.PutCopy since inrec will be GC'ed - - for ; pe != nil; pe = pe.Next { - if pe.Key == tr.beforeFieldName { + outrec := mlrval.NewMlrmapAsRecord() + atEnds := list.New() + for pe := inrec.Head; pe != 
nil; pe = pe.Next { + found := false + for _, regex := range tr.regexes { + if regex.MatchString(pe.Key) { + outrec.PutReference(pe.Key, pe.Value) + found = true break } - if !tr.fieldNamesSet[pe.Key] { - outrec.PutReference(pe.Key, pe.Value) + } + if !found { + atEnds.PushBack(pe) + } + } + + for atEnd := atEnds.Front(); atEnd != nil; atEnd = atEnd.Next() { + // Ownership transfer; no copy needed + pe := atEnd.Value.(*mlrval.MlrmapEntry) + outrec.PutReference(pe.Key, pe.Value) + } + + outrecAndContext := types.NewRecordAndContext(outrec, &inrecAndContext.Context) + outputRecordsAndContexts.PushBack(outrecAndContext) +} + +func (tr *TransformerReorder) reorderToEndNoRegex( + inrecAndContext *types.RecordAndContext, + outputRecordsAndContexts *list.List, // list of *types.RecordAndContext +) { + inrec := inrecAndContext.Record + for _, fieldName := range tr.fieldNames { + inrec.MoveToTail(fieldName) + } + outputRecordsAndContexts.PushBack(inrecAndContext) + +} + +func (tr *TransformerReorder) reorderToEndWithRegex( + inrecAndContext *types.RecordAndContext, + outputRecordsAndContexts *list.List, // list of *types.RecordAndContext +) { + inrec := inrecAndContext.Record + outrec := mlrval.NewMlrmapAsRecord() + atEnds := list.New() + for pe := inrec.Head; pe != nil; pe = pe.Next { + found := false + for _, regex := range tr.regexes { + if regex.MatchString(pe.Key) { + atEnds.PushBack(pe) + found = true + break } } + if !found { + outrec.PutReference(pe.Key, pe.Value) + } + } + for atEnd := atEnds.Front(); atEnd != nil; atEnd = atEnd.Next() { + // Ownership transfer; no copy needed + pe := atEnd.Value.(*mlrval.MlrmapEntry) + outrec.PutReference(pe.Key, pe.Value) + } + + outrecAndContext := types.NewRecordAndContext(outrec, &inrecAndContext.Context) + outputRecordsAndContexts.PushBack(outrecAndContext) +} + +func (tr *TransformerReorder) reorderBeforeOrAfterNoRegex( + inrecAndContext *types.RecordAndContext, + outputRecordsAndContexts *list.List, // list of 
*types.RecordAndContext +) { + inrec := inrecAndContext.Record + if inrec.Get(tr.centerFieldName) == nil { + outputRecordsAndContexts.PushBack(inrecAndContext) + return + } + + outrec := mlrval.NewMlrmapAsRecord() + pe := inrec.Head + + // We use outrec.PutReference not output.PutCopy since inrec will be GC'ed + + for ; pe != nil; pe = pe.Next { + if pe.Key == tr.centerFieldName { + break + } + if !tr.fieldNamesSet[pe.Key] { + outrec.PutReference(pe.Key, pe.Value) + } + } + + if !tr.putAfter { for _, fieldName := range tr.fieldNames { value := inrec.Get(fieldName) if value != nil { outrec.PutReference(fieldName, value) } } - - value := inrec.Get(tr.beforeFieldName) - if value != nil { - outrec.PutReference(tr.beforeFieldName, value) - } - - for ; pe != nil; pe = pe.Next { - if pe.Key != tr.beforeFieldName && !tr.fieldNamesSet[pe.Key] { - outrec.PutReference(pe.Key, pe.Value) - } - } - - for _, fieldName := range tr.fieldNames { - inrec.MoveToHead(fieldName) - } - outputRecordsAndContexts.PushBack(types.NewRecordAndContext(outrec, &inrecAndContext.Context)) - - } else { - outputRecordsAndContexts.PushBack(inrecAndContext) // end-of-stream marker } -} -// ---------------------------------------------------------------- -func (tr *TransformerReorder) reorderAfter( - inrecAndContext *types.RecordAndContext, - outputRecordsAndContexts *list.List, // list of *types.RecordAndContext - inputDownstreamDoneChannel <-chan bool, - outputDownstreamDoneChannel chan<- bool, -) { - if !inrecAndContext.EndOfStream { - inrec := inrecAndContext.Record - if inrec.Get(tr.afterFieldName) == nil { - outputRecordsAndContexts.PushBack(inrecAndContext) - return - } - - outrec := mlrval.NewMlrmapAsRecord() - pe := inrec.Head - - // * inrec will be GC'ed - // * We will use outrec.PutReference not output.PutCopy since inrec will be GC'ed - - for ; pe != nil; pe = pe.Next { - if pe.Key == tr.afterFieldName { - break - } - if !tr.fieldNamesSet[pe.Key] { - outrec.PutReference(pe.Key, pe.Value) - 
} - } - - value := inrec.Get(tr.afterFieldName) - if value != nil { - outrec.PutReference(tr.afterFieldName, value) - } + value := inrec.Get(tr.centerFieldName) + if value != nil { + outrec.PutReference(tr.centerFieldName, value) + } + if tr.putAfter { for _, fieldName := range tr.fieldNames { value := inrec.Get(fieldName) if value != nil { outrec.PutReference(fieldName, value) } } + } - for ; pe != nil; pe = pe.Next { - if pe.Key != tr.afterFieldName && !tr.fieldNamesSet[pe.Key] { - outrec.PutReference(pe.Key, pe.Value) + for ; pe != nil; pe = pe.Next { + if pe.Key != tr.centerFieldName && !tr.fieldNamesSet[pe.Key] { + outrec.PutReference(pe.Key, pe.Value) + } + } + + outputRecordsAndContexts.PushBack(types.NewRecordAndContext(outrec, &inrecAndContext.Context)) + +} + +func (tr *TransformerReorder) reorderBeforeOrAfterWithRegex( + inrecAndContext *types.RecordAndContext, + outputRecordsAndContexts *list.List, // list of *types.RecordAndContext +) { + inrec := inrecAndContext.Record + if inrec.Get(tr.centerFieldName) == nil { + outputRecordsAndContexts.PushBack(inrecAndContext) + return + } + + matchingFieldNamesSet := lib.NewOrderedMap() + for pe := inrec.Head; pe != nil; pe = pe.Next { + for _, regex := range tr.regexes { + if regex.MatchString(pe.Key) { + if pe.Key != tr.centerFieldName { + matchingFieldNamesSet.Put(pe.Key, pe.Value) + break + } } } - - for _, fieldName := range tr.fieldNames { - inrec.MoveToHead(fieldName) - } - outputRecordsAndContexts.PushBack(types.NewRecordAndContext(outrec, &inrecAndContext.Context)) - - } else { - outputRecordsAndContexts.PushBack(inrecAndContext) // end-of-stream marker } + + // We use outrec.PutReference not output.PutCopy since inrec will be GC'ed + outrec := mlrval.NewMlrmapAsRecord() + for pe := inrec.Head; pe != nil; pe = pe.Next { + if pe.Key == tr.centerFieldName { + if tr.putAfter { + outrec.PutReference(pe.Key, pe.Value) + } + for pf := matchingFieldNamesSet.Head; pf != nil; pf = pf.Next { + 
outrec.PutReference(pf.Key, pf.Value.(*mlrval.Mlrval)) + } + if !tr.putAfter { + outrec.PutReference(pe.Key, pe.Value) + } + } else if !matchingFieldNamesSet.Has(pe.Key) { + outrec.PutReference(pe.Key, pe.Value) + } + } + + outputRecordsAndContexts.PushBack(types.NewRecordAndContext(outrec, &inrecAndContext.Context)) } diff --git a/pkg/transformers/repeat.go b/pkg/transformers/repeat.go index eab1725f4..a9761415a 100644 --- a/pkg/transformers/repeat.go +++ b/pkg/transformers/repeat.go @@ -6,8 +6,8 @@ import ( "os" "strings" - "github.com/johnkerl/miller/pkg/cli" - "github.com/johnkerl/miller/pkg/types" + "github.com/johnkerl/miller/v6/pkg/cli" + "github.com/johnkerl/miller/v6/pkg/types" ) type tRepeatCountSource int diff --git a/pkg/transformers/reshape.go b/pkg/transformers/reshape.go index 4c0cffc07..1cc96f64f 100644 --- a/pkg/transformers/reshape.go +++ b/pkg/transformers/reshape.go @@ -34,10 +34,10 @@ import ( "regexp" "strings" - "github.com/johnkerl/miller/pkg/cli" - "github.com/johnkerl/miller/pkg/lib" - "github.com/johnkerl/miller/pkg/mlrval" - "github.com/johnkerl/miller/pkg/types" + "github.com/johnkerl/miller/v6/pkg/cli" + "github.com/johnkerl/miller/v6/pkg/lib" + "github.com/johnkerl/miller/v6/pkg/mlrval" + "github.com/johnkerl/miller/v6/pkg/types" ) // ---------------------------------------------------------------- diff --git a/pkg/transformers/sample.go b/pkg/transformers/sample.go index b1eef576b..b8e798c65 100644 --- a/pkg/transformers/sample.go +++ b/pkg/transformers/sample.go @@ -6,9 +6,9 @@ import ( "os" "strings" - "github.com/johnkerl/miller/pkg/cli" - "github.com/johnkerl/miller/pkg/lib" - "github.com/johnkerl/miller/pkg/types" + "github.com/johnkerl/miller/v6/pkg/cli" + "github.com/johnkerl/miller/v6/pkg/lib" + "github.com/johnkerl/miller/v6/pkg/types" ) // ---------------------------------------------------------------- diff --git a/pkg/transformers/sec2gmt.go b/pkg/transformers/sec2gmt.go index 0dfbbe5df..33cbad387 100644 --- 
a/pkg/transformers/sec2gmt.go +++ b/pkg/transformers/sec2gmt.go @@ -5,10 +5,10 @@ import ( "fmt" "os" - "github.com/johnkerl/miller/pkg/cli" - "github.com/johnkerl/miller/pkg/lib" - "github.com/johnkerl/miller/pkg/mlrval" - "github.com/johnkerl/miller/pkg/types" + "github.com/johnkerl/miller/v6/pkg/cli" + "github.com/johnkerl/miller/v6/pkg/lib" + "github.com/johnkerl/miller/v6/pkg/mlrval" + "github.com/johnkerl/miller/v6/pkg/types" ) // ---------------------------------------------------------------- diff --git a/pkg/transformers/sec2gmtdate.go b/pkg/transformers/sec2gmtdate.go index d09defffa..ee440f607 100644 --- a/pkg/transformers/sec2gmtdate.go +++ b/pkg/transformers/sec2gmtdate.go @@ -5,10 +5,10 @@ import ( "fmt" "os" - "github.com/johnkerl/miller/pkg/bifs" - "github.com/johnkerl/miller/pkg/cli" - "github.com/johnkerl/miller/pkg/lib" - "github.com/johnkerl/miller/pkg/types" + "github.com/johnkerl/miller/v6/pkg/bifs" + "github.com/johnkerl/miller/v6/pkg/cli" + "github.com/johnkerl/miller/v6/pkg/lib" + "github.com/johnkerl/miller/v6/pkg/types" ) // ---------------------------------------------------------------- diff --git a/pkg/transformers/seqgen.go b/pkg/transformers/seqgen.go index 2d1c3b188..9d77e7173 100644 --- a/pkg/transformers/seqgen.go +++ b/pkg/transformers/seqgen.go @@ -6,10 +6,10 @@ import ( "os" "strings" - "github.com/johnkerl/miller/pkg/bifs" - "github.com/johnkerl/miller/pkg/cli" - "github.com/johnkerl/miller/pkg/mlrval" - "github.com/johnkerl/miller/pkg/types" + "github.com/johnkerl/miller/v6/pkg/bifs" + "github.com/johnkerl/miller/v6/pkg/cli" + "github.com/johnkerl/miller/v6/pkg/mlrval" + "github.com/johnkerl/miller/v6/pkg/types" ) // ---------------------------------------------------------------- @@ -156,7 +156,7 @@ func NewTransformerSeqgen( if fstart == fstop { doneComparator = bifs.BIF_equals } else { - return nil, fmt.Errorf("mlr seqgen: step must not be zero unless start == stop.") + return nil, fmt.Errorf("mlr seqgen: step must not be 
zero unless start == stop") } } @@ -192,9 +192,7 @@ func (tr *TransformerSeqgen) Transform( case b := <-inputDownstreamDoneChannel: outputDownstreamDoneChannel <- b keepGoing = false - break default: - break } if !keepGoing { break diff --git a/pkg/transformers/shuffle.go b/pkg/transformers/shuffle.go index 7aad23641..77659e5a8 100644 --- a/pkg/transformers/shuffle.go +++ b/pkg/transformers/shuffle.go @@ -6,9 +6,9 @@ import ( "os" "strings" - "github.com/johnkerl/miller/pkg/cli" - "github.com/johnkerl/miller/pkg/lib" - "github.com/johnkerl/miller/pkg/types" + "github.com/johnkerl/miller/v6/pkg/cli" + "github.com/johnkerl/miller/v6/pkg/lib" + "github.com/johnkerl/miller/v6/pkg/types" ) // ---------------------------------------------------------------- diff --git a/pkg/transformers/skip_trivial_records.go b/pkg/transformers/skip_trivial_records.go index 91c8bd242..4a0245edb 100644 --- a/pkg/transformers/skip_trivial_records.go +++ b/pkg/transformers/skip_trivial_records.go @@ -6,8 +6,8 @@ import ( "os" "strings" - "github.com/johnkerl/miller/pkg/cli" - "github.com/johnkerl/miller/pkg/types" + "github.com/johnkerl/miller/v6/pkg/cli" + "github.com/johnkerl/miller/v6/pkg/types" ) // ---------------------------------------------------------------- diff --git a/pkg/transformers/sort.go b/pkg/transformers/sort.go index d28009ae4..6342192bb 100644 --- a/pkg/transformers/sort.go +++ b/pkg/transformers/sort.go @@ -48,10 +48,10 @@ import ( "sort" "strings" - "github.com/johnkerl/miller/pkg/cli" - "github.com/johnkerl/miller/pkg/lib" - "github.com/johnkerl/miller/pkg/mlrval" - "github.com/johnkerl/miller/pkg/types" + "github.com/johnkerl/miller/v6/pkg/cli" + "github.com/johnkerl/miller/v6/pkg/lib" + "github.com/johnkerl/miller/v6/pkg/mlrval" + "github.com/johnkerl/miller/v6/pkg/types" ) // ---------------------------------------------------------------- @@ -83,6 +83,7 @@ func transformerSortUsage( fmt.Fprintf(o, "-nf {comma-separated field names} Same as -n\n") fmt.Fprintf(o, 
"-nr {comma-separated field names} Numerical descending; nulls sort first\n") fmt.Fprintf(o, "-t {comma-separated field names} Natural ascending\n") + fmt.Fprintf(o, "-b Move sort fields to start of record, as in reorder -b\n") fmt.Fprintf(o, "-tr|-rt {comma-separated field names} Natural descending\n") fmt.Fprintf(o, "-h|--help Show this message.\n") fmt.Fprintf(o, "\n") @@ -107,6 +108,7 @@ func transformerSortParseCLI( groupByFieldNames := make([]string, 0) comparatorFuncs := make([]mlrval.CmpFuncInt, 0) + doMoveToHead := false for argi < argc /* variable increment: 1 or 2 depending on flag */ { opt := args[argi] @@ -255,6 +257,9 @@ func transformerSortParseCLI( comparatorFuncs = append(comparatorFuncs, mlrval.NumericDescendingComparator) } + } else if opt == "-b" { + doMoveToHead = true + } else { transformerSortUsage(os.Stderr) os.Exit(1) @@ -274,6 +279,7 @@ func transformerSortParseCLI( transformer, err := NewTransformerSort( groupByFieldNames, comparatorFuncs, + doMoveToHead, ) if err != nil { fmt.Fprintln(os.Stderr, err) @@ -304,6 +310,7 @@ type TransformerSort struct { // -- Input groupByFieldNames []string comparatorFuncs []mlrval.CmpFuncInt + doMoveToHead bool // -- State // Map from string to *list.List: @@ -316,11 +323,13 @@ type TransformerSort struct { func NewTransformerSort( groupByFieldNames []string, comparatorFuncs []mlrval.CmpFuncInt, + doMoveToHead bool, ) (*TransformerSort, error) { tr := &TransformerSort{ groupByFieldNames: groupByFieldNames, comparatorFuncs: comparatorFuncs, + doMoveToHead: doMoveToHead, recordListsByGroup: lib.NewOrderedMap(), groupHeads: lib.NewOrderedMap(), @@ -346,6 +355,13 @@ func (tr *TransformerSort) Transform( if !inrecAndContext.EndOfStream { inrec := inrecAndContext.Record + if tr.doMoveToHead { + n := len(tr.groupByFieldNames) + for i := n - 1; i >= 0; i-- { + inrec.MoveToHead(tr.groupByFieldNames[i]) + } + } + groupingKey, selectedValues, ok := inrec.GetSelectedValuesAndJoined( tr.groupByFieldNames, ) diff --git 
a/pkg/transformers/sort_within_records.go b/pkg/transformers/sort_within_records.go index 398ca5372..50ce51bc2 100644 --- a/pkg/transformers/sort_within_records.go +++ b/pkg/transformers/sort_within_records.go @@ -6,8 +6,8 @@ import ( "os" "strings" - "github.com/johnkerl/miller/pkg/cli" - "github.com/johnkerl/miller/pkg/types" + "github.com/johnkerl/miller/v6/pkg/cli" + "github.com/johnkerl/miller/v6/pkg/types" ) // ---------------------------------------------------------------- diff --git a/pkg/transformers/sparsify.go b/pkg/transformers/sparsify.go new file mode 100644 index 000000000..6d6212a33 --- /dev/null +++ b/pkg/transformers/sparsify.go @@ -0,0 +1,192 @@ +package transformers + +import ( + "container/list" + "fmt" + "os" + "strings" + + "github.com/johnkerl/miller/v6/pkg/cli" + "github.com/johnkerl/miller/v6/pkg/lib" + "github.com/johnkerl/miller/v6/pkg/mlrval" + "github.com/johnkerl/miller/v6/pkg/types" +) + +// ---------------------------------------------------------------- +const verbNameSparsify = "sparsify" + +var SparsifySetup = TransformerSetup{ + Verb: verbNameSparsify, + UsageFunc: transformerSparsifyUsage, + ParseCLIFunc: transformerSparsifyParseCLI, + IgnoresInput: false, +} + +func transformerSparsifyUsage( + o *os.File, +) { + fmt.Fprintf(o, "Usage: %s %s [options]\n", "mlr", verbNameSparsify) + fmt.Fprint(o, + `Unsets fields for which the key is the empty string (or, optionally, another +specified value). Only makes sense with output format not being CSV or TSV. +`) + + fmt.Fprintf(o, "Options:\n") + fmt.Fprintf(o, "-s {filler string} What values to remove. Defaults to the empty string.\n") + fmt.Fprintf(o, "-f {a,b,c} Specify field names to be operated on; any other fields won't be\n") + fmt.Fprintf(o, " modified. The default is to modify all fields.\n") + fmt.Fprintf(o, "-h|--help Show this message.\n") + + fmt.Fprint(o, + `Example: if input is a=1,b=,c=3 then output is a=1,c=3. 
+`) +} + +func transformerSparsifyParseCLI( + pargi *int, + argc int, + args []string, + _ *cli.TOptions, + doConstruct bool, // false for first pass of CLI-parse, true for second pass +) IRecordTransformer { + + // Skip the verb name from the current spot in the mlr command line + argi := *pargi + verb := args[argi] + argi++ + + fillerString := "" + var specifiedFieldNames []string = nil + + for argi < argc /* variable increment: 1 or 2 depending on flag */ { + opt := args[argi] + if !strings.HasPrefix(opt, "-") { + break // No more flag options to process + } + if args[argi] == "--" { + break // All transformers must do this so main-flags can follow verb-flags + } + argi++ + + if opt == "-h" || opt == "--help" { + transformerSparsifyUsage(os.Stdout) + os.Exit(0) + + } else if opt == "-s" { + fillerString = cli.VerbGetStringArgOrDie(verb, opt, args, &argi, argc) + + } else if opt == "-f" { + specifiedFieldNames = cli.VerbGetStringArrayArgOrDie(verb, opt, args, &argi, argc) + + } else { + transformerSparsifyUsage(os.Stderr) + os.Exit(1) + } + } + + *pargi = argi + if !doConstruct { // All transformers must do this for main command-line parsing + return nil + } + + transformer, err := NewTransformerSparsify( + fillerString, + specifiedFieldNames, + ) + if err != nil { + fmt.Fprintln(os.Stderr, err) + os.Exit(1) + } + + return transformer +} + +// ---------------------------------------------------------------- +type TransformerSparsify struct { + fillerString string + fieldNamesSet map[string]bool + recordTransformerFunc RecordTransformerFunc +} + +func NewTransformerSparsify( + fillerString string, + specifiedFieldNames []string, +) (*TransformerSparsify, error) { + + tr := &TransformerSparsify{ + fillerString: fillerString, + fieldNamesSet: lib.StringListToSet(specifiedFieldNames), + } + if specifiedFieldNames == nil { + tr.recordTransformerFunc = tr.transformAll + } else { + tr.recordTransformerFunc = tr.transformSome + } + + return tr, nil +} + +func (tr 
*TransformerSparsify) Transform( + inrecAndContext *types.RecordAndContext, + outputRecordsAndContexts *list.List, // list of *types.RecordAndContext + inputDownstreamDoneChannel <-chan bool, + outputDownstreamDoneChannel chan<- bool, +) { + HandleDefaultDownstreamDone(inputDownstreamDoneChannel, outputDownstreamDoneChannel) + + if !inrecAndContext.EndOfStream { + tr.recordTransformerFunc( + inrecAndContext, + outputRecordsAndContexts, + inputDownstreamDoneChannel, + outputDownstreamDoneChannel, + ) + } else { + outputRecordsAndContexts.PushBack(inrecAndContext) // end-of-stream marker + } +} + +func (tr *TransformerSparsify) transformAll( + inrecAndContext *types.RecordAndContext, + outputRecordsAndContexts *list.List, // list of *types.RecordAndContext + inputDownstreamDoneChannel <-chan bool, + outputDownstreamDoneChannel chan<- bool, +) { + inrec := inrecAndContext.Record + outrec := mlrval.NewMlrmapAsRecord() + + for pe := inrec.Head; pe != nil; pe = pe.Next { + if pe.Value.String() != tr.fillerString { + // Reference OK because ownership transfer + outrec.PutReference(pe.Key, pe.Value) + } + } + + outrecAndContext := types.NewRecordAndContext(outrec, &inrecAndContext.Context) + outputRecordsAndContexts.PushBack(outrecAndContext) +} + +// ---------------------------------------------------------------- +func (tr *TransformerSparsify) transformSome( + inrecAndContext *types.RecordAndContext, + outputRecordsAndContexts *list.List, // list of *types.RecordAndContext + inputDownstreamDoneChannel <-chan bool, + outputDownstreamDoneChannel chan<- bool, +) { + inrec := inrecAndContext.Record + outrec := mlrval.NewMlrmapAsRecord() + + for pe := inrec.Head; pe != nil; pe = pe.Next { + if tr.fieldNamesSet[pe.Key] { + if pe.Value.String() != tr.fillerString { + // Reference OK because ownership transfer + outrec.PutReference(pe.Key, pe.Value) + } + } else { + outrec.PutReference(pe.Key, pe.Value) + } + } + + outrecAndContext := types.NewRecordAndContext(outrec, 
&inrecAndContext.Context) + outputRecordsAndContexts.PushBack(outrecAndContext) +} diff --git a/pkg/transformers/split.go b/pkg/transformers/split.go index 7834931e9..7295fa174 100644 --- a/pkg/transformers/split.go +++ b/pkg/transformers/split.go @@ -7,10 +7,10 @@ import ( "os" "strings" - "github.com/johnkerl/miller/pkg/cli" - "github.com/johnkerl/miller/pkg/mlrval" - "github.com/johnkerl/miller/pkg/output" - "github.com/johnkerl/miller/pkg/types" + "github.com/johnkerl/miller/v6/pkg/cli" + "github.com/johnkerl/miller/v6/pkg/mlrval" + "github.com/johnkerl/miller/v6/pkg/output" + "github.com/johnkerl/miller/v6/pkg/types" ) // ---------------------------------------------------------------- diff --git a/pkg/transformers/ssub.go b/pkg/transformers/ssub.go deleted file mode 100644 index a31864711..000000000 --- a/pkg/transformers/ssub.go +++ /dev/null @@ -1,156 +0,0 @@ -package transformers - -import ( - "container/list" - "fmt" - "os" - "strings" - - "github.com/johnkerl/miller/pkg/bifs" - "github.com/johnkerl/miller/pkg/cli" - "github.com/johnkerl/miller/pkg/mlrval" - "github.com/johnkerl/miller/pkg/types" -) - -// ---------------------------------------------------------------- -const verbNameSsub = "ssub" - -var SsubSetup = TransformerSetup{ - Verb: verbNameSsub, - UsageFunc: transformerSsubUsage, - ParseCLIFunc: transformerSsubParseCLI, - IgnoresInput: false, -} - -func transformerSsubUsage( - o *os.File, -) { - fmt.Fprintf(o, "Usage: %s %s [options]\n", "mlr", verbNameSsub) - fmt.Fprintf(o, "Replaces old string with new string in specified field(s), without regex support for\n") - fmt.Fprintf(o, "the old string, like the `ssub` DSL function. 
See also the `gsub` and `sub` verbs.\n") - fmt.Fprintf(o, "Options:\n") - fmt.Fprintf(o, "-f {a,b,c} Field names to convert.\n") - fmt.Fprintf(o, "-h|--help Show this message.\n") -} - -func transformerSsubParseCLI( - pargi *int, - argc int, - args []string, - _ *cli.TOptions, - doConstruct bool, // false for first pass of CLI-parse, true for second pass -) IRecordTransformer { - - // Skip the verb name from the current spot in the mlr command line - argi := *pargi - verb := args[argi] - argi++ - - // Parse local flags - var fieldNames []string = nil - var oldText string - var newText string - - for argi < argc /* variable increment: 1 or 2 depending on flag */ { - opt := args[argi] - if !strings.HasPrefix(opt, "-") { - break // No more flag options to process - } - if args[argi] == "--" { - break // All transformers must do this so main-flags can follow verb-flags - } - argi++ - - if opt == "-h" || opt == "--help" { - transformerSsubUsage(os.Stdout) - os.Exit(0) - - } else if opt == "-f" { - fieldNames = cli.VerbGetStringArrayArgOrDie(verb, opt, args, &argi, argc) - } else { - transformerSsubUsage(os.Stderr) - os.Exit(1) - } - } - - if fieldNames == nil { - transformerSsubUsage(os.Stderr) - os.Exit(1) - } - - // Get the old and new text from the command line - if (argc - argi) < 2 { - transformerSsubUsage(os.Stderr) - os.Exit(1) - } - oldText = args[argi] - newText = args[argi+1] - - argi += 2 - - *pargi = argi - if !doConstruct { // All transformers must do this for main command-line parsing - return nil - } - - transformer, err := NewTransformerSsub( - fieldNames, - oldText, - newText, - ) - if err != nil { - fmt.Fprintln(os.Stderr, err) - os.Exit(1) - } - - return transformer -} - -// ---------------------------------------------------------------- -type TransformerSsub struct { - fieldNames []string - oldText *mlrval.Mlrval - newText *mlrval.Mlrval -} - -// ---------------------------------------------------------------- -func NewTransformerSsub( - fieldNames 
[]string, - oldText string, - newText string, -) (*TransformerSsub, error) { - tr := &TransformerSsub{ - fieldNames: fieldNames, - oldText: mlrval.FromString(oldText), - newText: mlrval.FromString(newText), - } - return tr, nil -} - -func (tr *TransformerSsub) Transform( - inrecAndContext *types.RecordAndContext, - outputRecordsAndContexts *list.List, // list of *types.RecordAndContext - inputDownstreamDoneChannel <-chan bool, - outputDownstreamDoneChannel chan<- bool, -) { - HandleDefaultDownstreamDone(inputDownstreamDoneChannel, outputDownstreamDoneChannel) - - if !inrecAndContext.EndOfStream { - inrec := inrecAndContext.Record - - for _, fieldName := range tr.fieldNames { - oldValue := inrec.Get(fieldName) - if oldValue == nil { - continue - } - - newValue := bifs.BIF_ssub(oldValue, tr.oldText, tr.newText) - - inrec.PutReference(fieldName, newValue) - } - - outputRecordsAndContexts.PushBack(inrecAndContext) - } else { - outputRecordsAndContexts.PushBack(inrecAndContext) // emit end-of-stream marker - } -} diff --git a/pkg/transformers/stats1.go b/pkg/transformers/stats1.go index b58129691..1f924aa2d 100644 --- a/pkg/transformers/stats1.go +++ b/pkg/transformers/stats1.go @@ -8,11 +8,11 @@ import ( "regexp" "strings" - "github.com/johnkerl/miller/pkg/cli" - "github.com/johnkerl/miller/pkg/lib" - "github.com/johnkerl/miller/pkg/mlrval" - "github.com/johnkerl/miller/pkg/transformers/utils" - "github.com/johnkerl/miller/pkg/types" + "github.com/johnkerl/miller/v6/pkg/cli" + "github.com/johnkerl/miller/v6/pkg/lib" + "github.com/johnkerl/miller/v6/pkg/mlrval" + "github.com/johnkerl/miller/v6/pkg/transformers/utils" + "github.com/johnkerl/miller/v6/pkg/types" ) // ---------------------------------------------------------------- @@ -69,7 +69,7 @@ Options: fmt.Fprintln(o, "Example: mlr stats1 -a count,mode -f size -g shape") fmt.Fprintln(o, - "Example: mlr stats1 -a count,mode --fr '^[a-h].*$' -gr '^k.*$'") + "Example: mlr stats1 -a count,mode --fr '^[a-h].*$' --gr 
'^k.*$'") fmt.Fprintln(o, ` This computes count and mode statistics on all field names beginning with a through h, grouped by all field names starting with k.`) @@ -312,7 +312,7 @@ func NewTransformerStats1( ) (*TransformerStats1, error) { for _, name := range accumulatorNameList { if !utils.ValidateStats1AccumulatorName(name) { - return nil, fmt.Errorf("mlr stats1: accumulator \"%s\" not found.", name) + return nil, fmt.Errorf(`mlr stats1: accumulator "%s" not found`, name) } } diff --git a/pkg/transformers/stats2.go b/pkg/transformers/stats2.go index a30141021..f1e9d94de 100644 --- a/pkg/transformers/stats2.go +++ b/pkg/transformers/stats2.go @@ -6,11 +6,11 @@ import ( "os" "strings" - "github.com/johnkerl/miller/pkg/cli" - "github.com/johnkerl/miller/pkg/lib" - "github.com/johnkerl/miller/pkg/mlrval" - "github.com/johnkerl/miller/pkg/transformers/utils" - "github.com/johnkerl/miller/pkg/types" + "github.com/johnkerl/miller/v6/pkg/cli" + "github.com/johnkerl/miller/v6/pkg/lib" + "github.com/johnkerl/miller/v6/pkg/mlrval" + "github.com/johnkerl/miller/v6/pkg/transformers/utils" + "github.com/johnkerl/miller/v6/pkg/types" ) // ---------------------------------------------------------------- @@ -207,7 +207,7 @@ func NewTransformerStats2( ) (*TransformerStats2, error) { for _, name := range accumulatorNameList { if !utils.ValidateStats2AccumulatorName(name) { - return nil, fmt.Errorf("mlr stats2: accumulator \"%s\" not found.", name) + return nil, fmt.Errorf(`mlr stats2: accumulator "%s" not found`, name) } } diff --git a/pkg/transformers/step.go b/pkg/transformers/step.go index 56539cb90..e003aaf3f 100644 --- a/pkg/transformers/step.go +++ b/pkg/transformers/step.go @@ -73,12 +73,12 @@ import ( "os" "strings" - "github.com/johnkerl/miller/pkg/bifs" - "github.com/johnkerl/miller/pkg/cli" - "github.com/johnkerl/miller/pkg/lib" - "github.com/johnkerl/miller/pkg/mlrval" - "github.com/johnkerl/miller/pkg/transformers/utils" - "github.com/johnkerl/miller/pkg/types" + 
"github.com/johnkerl/miller/v6/pkg/bifs" + "github.com/johnkerl/miller/v6/pkg/cli" + "github.com/johnkerl/miller/v6/pkg/lib" + "github.com/johnkerl/miller/v6/pkg/mlrval" + "github.com/johnkerl/miller/v6/pkg/transformers/utils" + "github.com/johnkerl/miller/v6/pkg/types" ) // For EWMA @@ -260,7 +260,6 @@ type TransformerStep struct { // STATE // Scratch space used per-record - valueFieldValues []mlrval.Mlrval // Map from group-by field names to value-field names to stepper name to stepper object. See // the Transform method below for more details. groups map[string]map[string]map[string]tStepper @@ -282,12 +281,12 @@ func NewTransformerStep( ) (*TransformerStep, error) { if len(stepperInputs) == 0 || len(valueFieldNames) == 0 { - return nil, fmt.Errorf("mlr %s: -a and -f are both required arguments.", verbNameStep) + return nil, fmt.Errorf("mlr %s: -a and -f are both required arguments", verbNameStep) } if len(stringAlphas) != 0 && len(ewmaSuffixes) != 0 { if len(ewmaSuffixes) != len(stringAlphas) { return nil, fmt.Errorf( - "mlr %s: If -d and -o are provided, their values must have the same length.", verbNameStep, + "mlr %s: If -d and -o are provided, their values must have the same length", verbNameStep, ) } } diff --git a/pkg/transformers/sub.go b/pkg/transformers/sub.go deleted file mode 100644 index 1c96b45fc..000000000 --- a/pkg/transformers/sub.go +++ /dev/null @@ -1,157 +0,0 @@ -package transformers - -import ( - "container/list" - "fmt" - "os" - "strings" - - "github.com/johnkerl/miller/pkg/bifs" - "github.com/johnkerl/miller/pkg/cli" - "github.com/johnkerl/miller/pkg/mlrval" - "github.com/johnkerl/miller/pkg/types" -) - -// ---------------------------------------------------------------- -const verbNameSub = "sub" - -var SubSetup = TransformerSetup{ - Verb: verbNameSub, - UsageFunc: transformerSubUsage, - ParseCLIFunc: transformerSubParseCLI, - IgnoresInput: false, -} - -func transformerSubUsage( - o *os.File, -) { - fmt.Fprintf(o, "Usage: %s %s 
[options]\n", "mlr", verbNameSub) - fmt.Fprintf(o, "Replaces old string with new string in specified field(s), with regex support\n") - fmt.Fprintf(o, "for the old string and not handling multiple matches, like the `sub` DSL function.\n") - fmt.Fprintf(o, "See also the `gsub` and `ssub` verbs.\n") - fmt.Fprintf(o, "Options:\n") - fmt.Fprintf(o, "-f {a,b,c} Field names to convert.\n") - fmt.Fprintf(o, "-h|--help Show this message.\n") -} - -func transformerSubParseCLI( - pargi *int, - argc int, - args []string, - _ *cli.TOptions, - doConstruct bool, // false for first pass of CLI-parse, true for second pass -) IRecordTransformer { - - // Skip the verb name from the current spot in the mlr command line - argi := *pargi - verb := args[argi] - argi++ - - // Parse local flags - var fieldNames []string = nil - var oldText string - var newText string - - for argi < argc /* variable increment: 1 or 2 depending on flag */ { - opt := args[argi] - if !strings.HasPrefix(opt, "-") { - break // No more flag options to process - } - if args[argi] == "--" { - break // All transformers must do this so main-flags can follow verb-flags - } - argi++ - - if opt == "-h" || opt == "--help" { - transformerSubUsage(os.Stdout) - os.Exit(0) - - } else if opt == "-f" { - fieldNames = cli.VerbGetStringArrayArgOrDie(verb, opt, args, &argi, argc) - } else { - transformerSubUsage(os.Stderr) - os.Exit(1) - } - } - - if fieldNames == nil { - transformerSubUsage(os.Stderr) - os.Exit(1) - } - - // Get the old and new text from the command line - if (argc - argi) < 2 { - transformerSubUsage(os.Stderr) - os.Exit(1) - } - oldText = args[argi] - newText = args[argi+1] - - argi += 2 - - *pargi = argi - if !doConstruct { // All transformers must do this for main command-line parsing - return nil - } - - transformer, err := NewTransformerSub( - fieldNames, - oldText, - newText, - ) - if err != nil { - fmt.Fprintln(os.Stderr, err) - os.Exit(1) - } - - return transformer -} - -// 
---------------------------------------------------------------- -type TransformerSub struct { - fieldNames []string - oldText *mlrval.Mlrval - newText *mlrval.Mlrval -} - -// ---------------------------------------------------------------- -func NewTransformerSub( - fieldNames []string, - oldText string, - newText string, -) (*TransformerSub, error) { - tr := &TransformerSub{ - fieldNames: fieldNames, - oldText: mlrval.FromString(oldText), - newText: mlrval.FromString(newText), - } - return tr, nil -} - -func (tr *TransformerSub) Transform( - inrecAndContext *types.RecordAndContext, - outputRecordsAndContexts *list.List, // list of *types.RecordAndContext - inputDownstreamDoneChannel <-chan bool, - outputDownstreamDoneChannel chan<- bool, -) { - HandleDefaultDownstreamDone(inputDownstreamDoneChannel, outputDownstreamDoneChannel) - - if !inrecAndContext.EndOfStream { - inrec := inrecAndContext.Record - - for _, fieldName := range tr.fieldNames { - oldValue := inrec.Get(fieldName) - if oldValue == nil { - continue - } - - newValue := bifs.BIF_sub(oldValue, tr.oldText, tr.newText) - - inrec.PutReference(fieldName, newValue) - } - - outputRecordsAndContexts.PushBack(inrecAndContext) - } else { - outputRecordsAndContexts.PushBack(inrecAndContext) // emit end-of-stream marker - } -} diff --git a/pkg/transformers/subs.go b/pkg/transformers/subs.go new file mode 100644 index 000000000..b5530bb17 --- /dev/null +++ b/pkg/transformers/subs.go @@ -0,0 +1,361 @@ +package transformers + +import ( + "container/list" + "fmt" + "os" + "regexp" + "strings" + + "github.com/johnkerl/miller/v6/pkg/bifs" + "github.com/johnkerl/miller/v6/pkg/cli" + "github.com/johnkerl/miller/v6/pkg/lib" + "github.com/johnkerl/miller/v6/pkg/mlrval" + "github.com/johnkerl/miller/v6/pkg/types" +) + +// ---------------------------------------------------------------- +const verbNameSub = "sub" +const verbNameGsub = "gsub" +const verbNameSsub = "ssub" + +var SubSetup = TransformerSetup{ + Verb: verbNameSub, 
+ UsageFunc: transformerSubUsage, + ParseCLIFunc: transformerSubParseCLI, + IgnoresInput: false, +} + +var GsubSetup = TransformerSetup{ + Verb: verbNameGsub, + UsageFunc: transformerGsubUsage, + ParseCLIFunc: transformerGsubParseCLI, + IgnoresInput: false, +} + +var SsubSetup = TransformerSetup{ + Verb: verbNameSsub, + UsageFunc: transformerSsubUsage, + ParseCLIFunc: transformerSsubParseCLI, + IgnoresInput: false, +} + +func transformerSubUsage( + o *os.File, +) { + fmt.Fprintf(o, "Usage: %s %s [options]\n", "mlr", verbNameSub) + fmt.Fprintf(o, "Replaces old string with new string in specified field(s), with regex support\n") + fmt.Fprintf(o, "for the old string and not handling multiple matches, like the `sub` DSL function.\n") + fmt.Fprintf(o, "See also the `gsub` and `ssub` verbs.\n") + fmt.Fprintf(o, "Options:\n") + fmt.Fprintf(o, "-f {a,b,c} Field names to convert.\n") + fmt.Fprintf(o, "-r {regex} Regular expression for field names to convert.\n") + fmt.Fprintf(o, "-a Convert all fields.\n") + fmt.Fprintf(o, "-h|--help Show this message.\n") +} + +func transformerGsubUsage( + o *os.File, +) { + fmt.Fprintf(o, "Usage: %s %s [options]\n", "mlr", verbNameGsub) + fmt.Fprintf(o, "Replaces old string with new string in specified field(s), with regex support\n") + fmt.Fprintf(o, "for the old string and handling multiple matches, like the `gsub` DSL function.\n") + fmt.Fprintf(o, "See also the `sub` and `ssub` verbs.\n") + fmt.Fprintf(o, "Options:\n") + fmt.Fprintf(o, "-f {a,b,c} Field names to convert.\n") + fmt.Fprintf(o, "-r {regex} Regular expression for field names to convert.\n") + fmt.Fprintf(o, "-a Convert all fields.\n") + fmt.Fprintf(o, "-h|--help Show this message.\n") +} + +func transformerSsubUsage( + o *os.File, +) { + fmt.Fprintf(o, "Usage: %s %s [options]\n", "mlr", verbNameSsub) + fmt.Fprintf(o, "Replaces old string with new string in specified field(s), without regex support for\n") + fmt.Fprintf(o, "the old string, like the `ssub` DSL function. 
See also the `gsub` and `sub` verbs.\n") + fmt.Fprintf(o, "Options:\n") + fmt.Fprintf(o, "-f {a,b,c} Field names to convert.\n") + fmt.Fprintf(o, "-r {regex} Regular expression for field names to convert.\n") + fmt.Fprintf(o, "-a Convert all fields.\n") + fmt.Fprintf(o, "-h|--help Show this message.\n") +} + +type subConstructorFunc func( + fieldNames []string, + doAllFieldNames bool, + doRegexes bool, + oldText string, + newText string, +) (IRecordTransformer, error) + +type fieldAcceptorFunc func( + fieldName string, +) bool + +func transformerSubParseCLI( + pargi *int, + argc int, + args []string, + opts *cli.TOptions, + doConstruct bool, // false for first pass of CLI-parse, true for second pass +) IRecordTransformer { + return transformerSubsParseCLI(pargi, argc, args, opts, doConstruct, transformerSubUsage, NewTransformerSub) +} + +func transformerGsubParseCLI( + pargi *int, + argc int, + args []string, + opts *cli.TOptions, + doConstruct bool, // false for first pass of CLI-parse, true for second pass +) IRecordTransformer { + return transformerSubsParseCLI(pargi, argc, args, opts, doConstruct, transformerGsubUsage, NewTransformerGsub) +} + +func transformerSsubParseCLI( + pargi *int, + argc int, + args []string, + opts *cli.TOptions, + doConstruct bool, // false for first pass of CLI-parse, true for second pass +) IRecordTransformer { + return transformerSubsParseCLI(pargi, argc, args, opts, doConstruct, transformerSsubUsage, NewTransformerSsub) +} + +// transformerSubsParseCLI is a shared CLI-parser for the sub, gsub, and ssub verbs. 
+func transformerSubsParseCLI( + pargi *int, + argc int, + args []string, + _ *cli.TOptions, + doConstruct bool, // false for first pass of CLI-parse, true for second pass + usageFunc TransformerUsageFunc, + constructorFunc subConstructorFunc, +) IRecordTransformer { + + // Skip the verb name from the current spot in the mlr command line + argi := *pargi + verb := args[argi] + argi++ + + // Parse local flags + var fieldNames []string = nil + doAllFieldNames := false + doRegexes := false + var oldText string + var newText string + + for argi < argc /* variable increment: 1 or 2 depending on flag */ { + opt := args[argi] + if !strings.HasPrefix(opt, "-") { + break // No more flag options to process + } + if args[argi] == "--" { + break // All transformers must do this so main-flags can follow verb-flags + } + argi++ + + if opt == "-h" || opt == "--help" { + usageFunc(os.Stdout) + os.Exit(0) + + } else if opt == "-a" { + doAllFieldNames = true + doRegexes = false + fieldNames = nil + + } else if opt == "-r" { + doRegexes = true + + } else if opt == "-f" { + fieldNames = cli.VerbGetStringArrayArgOrDie(verb, opt, args, &argi, argc) + doAllFieldNames = false + } else { + usageFunc(os.Stderr) + os.Exit(1) + } + } + + if fieldNames == nil && !doAllFieldNames { + usageFunc(os.Stderr) + os.Exit(1) + } + + // Get the old and new text from the command line + if (argc - argi) < 2 { + usageFunc(os.Stderr) + os.Exit(1) + } + oldText = args[argi] + newText = args[argi+1] + + argi += 2 + + *pargi = argi + if !doConstruct { // All transformers must do this for main command-line parsing + return nil + } + + transformer, err := constructorFunc( + fieldNames, + doAllFieldNames, + doRegexes, + oldText, + newText, + ) + if err != nil { + fmt.Fprintln(os.Stderr, err) + os.Exit(1) + } + + return transformer +} + +type TransformerSubs struct { + fieldNamesSet map[string]bool // for -f + regexes []*regexp.Regexp // for -r + oldText *mlrval.Mlrval + newText *mlrval.Mlrval + fieldAcceptor 
fieldAcceptorFunc // for -f, -r, -a + subber bifs.TernaryFunc // for sub, gsub, ssub +} + +func NewTransformerSub( + fieldNames []string, + doAllFieldNames bool, + doRegexes bool, + oldText string, + newText string, +) (IRecordTransformer, error) { + return NewTransformerSubs(fieldNames, doAllFieldNames, doRegexes, oldText, newText, safe_sub) +} + +func NewTransformerGsub( + fieldNames []string, + doAllFieldNames bool, + doRegexes bool, + oldText string, + newText string, +) (IRecordTransformer, error) { + return NewTransformerSubs(fieldNames, doAllFieldNames, doRegexes, oldText, newText, safe_gsub) +} + +func NewTransformerSsub( + fieldNames []string, + doAllFieldNames bool, + doRegexes bool, + oldText string, + newText string, +) (IRecordTransformer, error) { + return NewTransformerSubs(fieldNames, doAllFieldNames, doRegexes, oldText, newText, safe_ssub) +} + +func NewTransformerSubs( + fieldNames []string, + doAllFieldNames bool, + doRegexes bool, + oldText string, + newText string, + subber bifs.TernaryFunc, +) (IRecordTransformer, error) { + tr := &TransformerSubs{ + fieldNamesSet: lib.StringListToSet(fieldNames), + oldText: mlrval.FromString(oldText), + newText: mlrval.FromString(newText), + subber: subber, + } + if doAllFieldNames { + tr.fieldAcceptor = tr.fieldAcceptorAll + } else if doRegexes { + tr.fieldAcceptor = tr.fieldAcceptorByRegexes + + tr.regexes = make([]*regexp.Regexp, len(fieldNames)) + for i, regexString := range fieldNames { + // Handles "a.*b"i Miller case-insensitive-regex specification + regex, err := lib.CompileMillerRegex(regexString) + if err != nil { + fmt.Fprintf(os.Stderr, "%s %s: cannot compile regex [%s]\n", "mlr", verbNameCut, regexString) + os.Exit(1) + } + tr.regexes[i] = regex + } + } else { + tr.fieldAcceptor = tr.fieldAcceptorByNames + } + return tr, nil +} + +func (tr *TransformerSubs) Transform( + inrecAndContext *types.RecordAndContext, + outputRecordsAndContexts *list.List, // list of *types.RecordAndContext + 
inputDownstreamDoneChannel <-chan bool, + outputDownstreamDoneChannel chan<- bool, +) { + HandleDefaultDownstreamDone(inputDownstreamDoneChannel, outputDownstreamDoneChannel) + + if !inrecAndContext.EndOfStream { + inrec := inrecAndContext.Record + // Run sub, gsub, or ssub on the user-specified field names + for pe := inrec.Head; pe != nil; pe = pe.Next { + if tr.fieldAcceptor(pe.Key) { + pe.Value = tr.subber(pe.Value, tr.oldText, tr.newText) + } + } + } + // Including emit of end-of-stream marker + outputRecordsAndContexts.PushBack(inrecAndContext) +} + +// fieldAcceptorByNames implements -f +func (tr *TransformerSubs) fieldAcceptorByNames( + fieldName string, +) bool { + return tr.fieldNamesSet[fieldName] +} + +// fieldAcceptorByRegexes implements -r +func (tr *TransformerSubs) fieldAcceptorByRegexes( + fieldName string, +) bool { + for _, regex := range tr.regexes { + if regex.MatchString(fieldName) { + return true + } + } + return false +} + +// fieldAcceptorAll implements -a +func (tr *TransformerSubs) fieldAcceptorAll( + fieldName string, +) bool { + return true +} + +// safe_sub implements sub, but doesn't produce error-type on non-string input. +func safe_sub(input1, input2, input3 *mlrval.Mlrval) *mlrval.Mlrval { + if input1.IsString() { + return bifs.BIF_sub(input1, input2, input3) + } else { + return input1 + } +} + +// safe_gsub implements gsub, but doesn't produce error-type on non-string input. +func safe_gsub(input1, input2, input3 *mlrval.Mlrval) *mlrval.Mlrval { + if input1.IsString() { + return bifs.BIF_gsub(input1, input2, input3) + } else { + return input1 + } +} + +// safe_ssub implements ssub, but doesn't produce error-type on non-string input. 
+func safe_ssub(input1, input2, input3 *mlrval.Mlrval) *mlrval.Mlrval { + if input1.IsString() { + return bifs.BIF_ssub(input1, input2, input3) + } else { + return input1 + } +} diff --git a/pkg/transformers/summary.go b/pkg/transformers/summary.go index 1b1db3df4..6f386ab62 100644 --- a/pkg/transformers/summary.go +++ b/pkg/transformers/summary.go @@ -6,11 +6,11 @@ import ( "os" "strings" - "github.com/johnkerl/miller/pkg/cli" - "github.com/johnkerl/miller/pkg/lib" - "github.com/johnkerl/miller/pkg/mlrval" - "github.com/johnkerl/miller/pkg/transformers/utils" - "github.com/johnkerl/miller/pkg/types" + "github.com/johnkerl/miller/v6/pkg/cli" + "github.com/johnkerl/miller/v6/pkg/lib" + "github.com/johnkerl/miller/v6/pkg/mlrval" + "github.com/johnkerl/miller/v6/pkg/transformers/utils" + "github.com/johnkerl/miller/v6/pkg/types" ) // ---------------------------------------------------------------- @@ -106,6 +106,7 @@ func transformerSummaryUsage( fmt.Fprintf(o, "-a {mean,sum,etc.} Use only the specified summarizers.\n") fmt.Fprintf(o, "-x {mean,sum,etc.} Use all summarizers, except the specified ones.\n") fmt.Fprintf(o, "--all Use all available summarizers.\n") + fmt.Fprintf(o, "--transpose Show output with field names as column names..\n") fmt.Fprintf(o, "-h|--help Show this message.\n") } @@ -427,7 +428,7 @@ func (tr *TransformerSummary) emitTransposed( // ---------------------------------------------------------------- -// maybeEmitPercentileNameTransposed is a helper method for emitTransposed, +// maybeEmitAccumulatorTransposed is a helper method for emitTransposed, // for "count", "sum", "mean", etc. 
func (tr *TransformerSummary) maybeEmitAccumulatorTransposed( oracs *list.List, // list of *types.RecordAndContext diff --git a/pkg/transformers/surv.go b/pkg/transformers/surv.go new file mode 100644 index 000000000..6d4b38f09 --- /dev/null +++ b/pkg/transformers/surv.go @@ -0,0 +1,173 @@ +package transformers + +import ( + "container/list" + "fmt" + "os" + "strings" + + "github.com/johnkerl/miller/v6/pkg/cli" + "github.com/johnkerl/miller/v6/pkg/mlrval" + "github.com/johnkerl/miller/v6/pkg/types" + "github.com/kshedden/statmodel/duration" + "github.com/kshedden/statmodel/statmodel" +) + +// ---------------------------------------------------------------- +const verbNameSurv = "surv" + +// SurvSetup defines the surv verb: Kaplan-Meier survival curve. +var SurvSetup = TransformerSetup{ + Verb: verbNameSurv, + UsageFunc: transformerSurvUsage, + ParseCLIFunc: transformerSurvParseCLI, + IgnoresInput: false, +} + +func transformerSurvUsage(o *os.File) { + fmt.Fprintf(o, "Usage: %s %s -d {duration-field} -s {status-field}\n", "mlr", verbNameSurv) + fmt.Fprint(o, ` +Estimate Kaplan-Meier survival curve (right-censored). +Options: + -d {field} Name of duration field (time-to-event or censoring). + -s {field} Name of status field (0=censored, 1=event). + -h, --help Show this message. 
+`) +} + +func transformerSurvParseCLI( + pargi *int, + argc int, + args []string, + _ *cli.TOptions, + doConstruct bool, +) IRecordTransformer { + argi := *pargi + verb := args[argi] + argi++ + + var durationField, statusField string + + for argi < argc { + opt := args[argi] + if !strings.HasPrefix(opt, "-") { + break + } + if opt == "-h" || opt == "--help" { + transformerSurvUsage(os.Stdout) + os.Exit(0) + } else if opt == "-d" { + if argi+1 >= argc { + fmt.Fprintf(os.Stderr, "mlr %s: %s requires an argument\n", verb, opt) + os.Exit(1) + } + argi++ + durationField = args[argi] + argi++ + } else if opt == "-s" { + if argi+1 >= argc { + fmt.Fprintf(os.Stderr, "mlr %s: %s requires an argument\n", verb, opt) + os.Exit(1) + } + argi++ + statusField = args[argi] + argi++ + } else { + break + } + } + *pargi = argi + if !doConstruct { + return nil + } + if durationField == "" { + fmt.Fprintf(os.Stderr, "mlr %s: -d option is required.\n", verbNameSurv) + fmt.Fprintf(os.Stderr, "Please see 'mlr %s --help' for more information.\n", verbNameSurv) + os.Exit(1) + } + if statusField == "" { + fmt.Fprintf(os.Stderr, "mlr %s: -s option is required.\n", verbNameSurv) + fmt.Fprintf(os.Stderr, "Please see 'mlr %s --help' for more information.\n", verbNameSurv) + os.Exit(1) + } + return NewTransformerSurv(durationField, statusField) +} + +// TransformerSurv holds fields for surv verb. +type TransformerSurv struct { + durationField string + statusField string + times []float64 + events []bool +} + +// NewTransformerSurv constructs a new surv transformer. +func NewTransformerSurv(durationField, statusField string) IRecordTransformer { + return &TransformerSurv{ + durationField: durationField, + statusField: statusField, + times: make([]float64, 0), + events: make([]bool, 0), + } +} + +// Transform processes each record or emits results at end-of-stream. 
+func (tr *TransformerSurv) Transform( + inrecAndContext *types.RecordAndContext, + outputRecordsAndContexts *list.List, + inputDownstreamDoneChannel <-chan bool, + outputDownstreamDoneChannel chan<- bool, +) { + HandleDefaultDownstreamDone(inputDownstreamDoneChannel, outputDownstreamDoneChannel) + if !inrecAndContext.EndOfStream { + rec := inrecAndContext.Record + mvDur := rec.Get(tr.durationField) + if mvDur == nil { + fmt.Fprintf(os.Stderr, "mlr surv: duration field '%s' not found\n", tr.durationField) + os.Exit(1) + } + duration := mvDur.GetNumericToFloatValueOrDie() + mvStat := rec.Get(tr.statusField) + if mvStat == nil { + fmt.Fprintf(os.Stderr, "mlr surv: status field '%s' not found\n", tr.statusField) + os.Exit(1) + } + status := mvStat.GetNumericToFloatValueOrDie() != 0 + tr.times = append(tr.times, duration) + tr.events = append(tr.events, status) + } else { + // Compute survival using kshedden/statmodel + n := len(tr.times) + if n == 0 { + outputRecordsAndContexts.PushBack(inrecAndContext) + return + } + durations := tr.times + statuses := make([]float64, n) + for i, ev := range tr.events { + if ev { + statuses[i] = 1.0 + } else { + statuses[i] = 0.0 + } + } + dataCols := [][]float64{durations, statuses} + names := []string{tr.durationField, tr.statusField} + ds := statmodel.NewDataset(dataCols, names) + sf, err := duration.NewSurvfuncRight(ds, tr.durationField, tr.statusField, &duration.SurvfuncRightConfig{}) + if err != nil { + fmt.Fprintf(os.Stderr, "mlr surv: %v\n", err) + os.Exit(1) + } + sf.Fit() + times := sf.Time() + survProbs := sf.SurvProb() + for i, t := range times { + newrec := mlrval.NewMlrmapAsRecord() + newrec.PutCopy("time", mlrval.FromFloat(t)) + newrec.PutCopy("survival", mlrval.FromFloat(survProbs[i])) + outputRecordsAndContexts.PushBack(types.NewRecordAndContext(newrec, &inrecAndContext.Context)) + } + outputRecordsAndContexts.PushBack(inrecAndContext) + } +} diff --git a/pkg/transformers/tac.go b/pkg/transformers/tac.go index 
83ccd6876..ba25195cb 100644 --- a/pkg/transformers/tac.go +++ b/pkg/transformers/tac.go @@ -6,8 +6,8 @@ import ( "os" "strings" - "github.com/johnkerl/miller/pkg/cli" - "github.com/johnkerl/miller/pkg/types" + "github.com/johnkerl/miller/v6/pkg/cli" + "github.com/johnkerl/miller/v6/pkg/types" ) // ---------------------------------------------------------------- diff --git a/pkg/transformers/tail.go b/pkg/transformers/tail.go index dcba44b67..345d09d53 100644 --- a/pkg/transformers/tail.go +++ b/pkg/transformers/tail.go @@ -6,9 +6,9 @@ import ( "os" "strings" - "github.com/johnkerl/miller/pkg/cli" - "github.com/johnkerl/miller/pkg/lib" - "github.com/johnkerl/miller/pkg/types" + "github.com/johnkerl/miller/v6/pkg/cli" + "github.com/johnkerl/miller/v6/pkg/lib" + "github.com/johnkerl/miller/v6/pkg/types" ) // ---------------------------------------------------------------- diff --git a/pkg/transformers/tee.go b/pkg/transformers/tee.go index 9e944df17..e5f5413ca 100644 --- a/pkg/transformers/tee.go +++ b/pkg/transformers/tee.go @@ -6,9 +6,9 @@ import ( "os" "strings" - "github.com/johnkerl/miller/pkg/cli" - "github.com/johnkerl/miller/pkg/output" - "github.com/johnkerl/miller/pkg/types" + "github.com/johnkerl/miller/v6/pkg/cli" + "github.com/johnkerl/miller/v6/pkg/output" + "github.com/johnkerl/miller/v6/pkg/types" ) // ---------------------------------------------------------------- @@ -183,7 +183,7 @@ func (tr *TransformerTee) Transform( // But 'mlr cut -f foo then tee bar.txt then head -n 10' -- one does expect // bar.txt to have all the output from cut. 
select { - case _ = <-inputDownstreamDoneChannel: + case <-inputDownstreamDoneChannel: // Do not write this to the coutputDownstreamDoneChannel, as other transformers do break default: diff --git a/pkg/transformers/template.go b/pkg/transformers/template.go index 392f96377..a30bf7dae 100644 --- a/pkg/transformers/template.go +++ b/pkg/transformers/template.go @@ -6,10 +6,10 @@ import ( "os" "strings" - "github.com/johnkerl/miller/pkg/cli" - "github.com/johnkerl/miller/pkg/lib" - "github.com/johnkerl/miller/pkg/mlrval" - "github.com/johnkerl/miller/pkg/types" + "github.com/johnkerl/miller/v6/pkg/cli" + "github.com/johnkerl/miller/v6/pkg/lib" + "github.com/johnkerl/miller/v6/pkg/mlrval" + "github.com/johnkerl/miller/v6/pkg/types" ) // ---------------------------------------------------------------- diff --git a/pkg/transformers/top.go b/pkg/transformers/top.go index 70119731b..9bc62fe69 100644 --- a/pkg/transformers/top.go +++ b/pkg/transformers/top.go @@ -6,11 +6,11 @@ import ( "os" "strings" - "github.com/johnkerl/miller/pkg/cli" - "github.com/johnkerl/miller/pkg/lib" - "github.com/johnkerl/miller/pkg/mlrval" - "github.com/johnkerl/miller/pkg/transformers/utils" - "github.com/johnkerl/miller/pkg/types" + "github.com/johnkerl/miller/v6/pkg/cli" + "github.com/johnkerl/miller/v6/pkg/lib" + "github.com/johnkerl/miller/v6/pkg/mlrval" + "github.com/johnkerl/miller/v6/pkg/transformers/utils" + "github.com/johnkerl/miller/v6/pkg/types" ) // ---------------------------------------------------------------- diff --git a/pkg/transformers/unflatten.go b/pkg/transformers/unflatten.go index d1e02a52c..dcf8014af 100644 --- a/pkg/transformers/unflatten.go +++ b/pkg/transformers/unflatten.go @@ -6,9 +6,9 @@ import ( "os" "strings" - "github.com/johnkerl/miller/pkg/cli" - "github.com/johnkerl/miller/pkg/lib" - "github.com/johnkerl/miller/pkg/types" + "github.com/johnkerl/miller/v6/pkg/cli" + "github.com/johnkerl/miller/v6/pkg/lib" + "github.com/johnkerl/miller/v6/pkg/types" ) // 
---------------------------------------------------------------- diff --git a/pkg/transformers/uniq.go b/pkg/transformers/uniq.go index f28e6c854..ecd89a1c6 100644 --- a/pkg/transformers/uniq.go +++ b/pkg/transformers/uniq.go @@ -6,10 +6,10 @@ import ( "os" "strings" - "github.com/johnkerl/miller/pkg/cli" - "github.com/johnkerl/miller/pkg/lib" - "github.com/johnkerl/miller/pkg/mlrval" - "github.com/johnkerl/miller/pkg/types" + "github.com/johnkerl/miller/v6/pkg/cli" + "github.com/johnkerl/miller/v6/pkg/lib" + "github.com/johnkerl/miller/v6/pkg/mlrval" + "github.com/johnkerl/miller/v6/pkg/types" ) // ---------------------------------------------------------------- @@ -43,6 +43,7 @@ func transformerCountDistinctUsage( fmt.Fprintf(o, "\n") fmt.Fprintf(o, "Options:\n") fmt.Fprintf(o, "-f {a,b,c} Field names for distinct count.\n") + fmt.Fprintf(o, "-x {a,b,c} Field names to exclude for distinct count: use each record's others instead.\n") fmt.Fprintf(o, "-n Show only the number of distinct values. Not compatible with -u.\n") fmt.Fprintf(o, "-o {name} Field name for output count. 
Default \"%s\".\n", uniqDefaultOutputFieldName) fmt.Fprintf(o, " Ignored with -u.\n") @@ -68,6 +69,7 @@ func transformerCountDistinctParseCLI( // Parse local flags var fieldNames []string = nil + invertFieldNames := false showNumDistinctOnly := false outputFieldName := uniqDefaultOutputFieldName doLashed := true @@ -89,6 +91,10 @@ func transformerCountDistinctParseCLI( } else if opt == "-g" || opt == "-f" { fieldNames = cli.VerbGetStringArrayArgOrDie(verb, opt, args, &argi, argc) + } else if opt == "-x" { + fieldNames = cli.VerbGetStringArrayArgOrDie(verb, opt, args, &argi, argc) + invertFieldNames = true + } else if opt == "-n" { showNumDistinctOnly = true @@ -123,6 +129,7 @@ func transformerCountDistinctParseCLI( transformer, err := NewTransformerUniq( fieldNames, + invertFieldNames, showCounts, showNumDistinctOnly, outputFieldName, @@ -149,6 +156,7 @@ func transformerUniqUsage( fmt.Fprintf(o, "\n") fmt.Fprintf(o, "Options:\n") fmt.Fprintf(o, "-g {d,e,f} Group-by-field names for uniq counts.\n") + fmt.Fprintf(o, "-x {a,b,c} Field names to exclude for uniq: use each record's others instead.\n") fmt.Fprintf(o, "-c Show repeat counts in addition to unique values.\n") fmt.Fprintf(o, "-n Show only the number of distinct values.\n") fmt.Fprintf(o, "-o {name} Field name for output count. 
Default \"%s\".\n", uniqDefaultOutputFieldName) @@ -173,6 +181,7 @@ func transformerUniqParseCLI( // Parse local flags var fieldNames []string = nil + invertFieldNames := false showCounts := false showNumDistinctOnly := false outputFieldName := uniqDefaultOutputFieldName @@ -195,6 +204,10 @@ func transformerUniqParseCLI( } else if opt == "-g" || opt == "-f" { fieldNames = cli.VerbGetStringArrayArgOrDie(verb, opt, args, &argi, argc) + } else if opt == "-x" { + fieldNames = cli.VerbGetStringArrayArgOrDie(verb, opt, args, &argi, argc) + invertFieldNames = true + } else if opt == "-c" { showCounts = true @@ -238,6 +251,7 @@ func transformerUniqParseCLI( transformer, _ := NewTransformerUniq( fieldNames, + invertFieldNames, showCounts, showNumDistinctOnly, outputFieldName, @@ -250,9 +264,11 @@ func transformerUniqParseCLI( // ---------------------------------------------------------------- type TransformerUniq struct { - fieldNames []string - showCounts bool - outputFieldName string + fieldNames []string + fieldNamesSet map[string]bool + invertFieldNames bool + showCounts bool + outputFieldName string // Example: // Input is: @@ -280,6 +296,7 @@ type TransformerUniq struct { // "a" => "4" => 4 uniqifiedRecordCounts *lib.OrderedMap // record-as-string -> counts uniqifiedRecords *lib.OrderedMap // record-as-string -> records + keysByGroup *lib.OrderedMap // XXX COMMENT ME countsByGroup *lib.OrderedMap // grouping key -> count valuesByGroup *lib.OrderedMap // grouping key -> array of values unlashedCounts *lib.OrderedMap // field name -> string field value -> count @@ -291,6 +308,7 @@ type TransformerUniq struct { // ---------------------------------------------------------------- func NewTransformerUniq( fieldNames []string, + invertFieldNames bool, showCounts bool, showNumDistinctOnly bool, outputFieldName string, @@ -299,12 +317,15 @@ func NewTransformerUniq( ) (*TransformerUniq, error) { tr := &TransformerUniq{ - fieldNames: fieldNames, - showCounts: showCounts, - 
outputFieldName: outputFieldName, + fieldNames: fieldNames, + fieldNamesSet: lib.StringListToSet(fieldNames), + invertFieldNames: invertFieldNames, + showCounts: showCounts, + outputFieldName: outputFieldName, uniqifiedRecordCounts: lib.NewOrderedMap(), uniqifiedRecords: lib.NewOrderedMap(), + keysByGroup: lib.NewOrderedMap(), countsByGroup: lib.NewOrderedMap(), valuesByGroup: lib.NewOrderedMap(), unlashedCounts: lib.NewOrderedMap(), @@ -334,6 +355,16 @@ func NewTransformerUniq( // ---------------------------------------------------------------- +func (tr *TransformerUniq) getFieldNamesForGrouping( + inrec *mlrval.Mlrmap, +) []string { + if tr.invertFieldNames { + return inrec.GetKeysExcept(tr.fieldNamesSet) + } else { + return tr.fieldNames + } +} + func (tr *TransformerUniq) Transform( inrecAndContext *types.RecordAndContext, outputRecordsAndContexts *list.List, // list of *types.RecordAndContext @@ -441,7 +472,7 @@ func (tr *TransformerUniq) transformUnlashed( if !inrecAndContext.EndOfStream { inrec := inrecAndContext.Record - for _, fieldName := range tr.fieldNames { + for _, fieldName := range tr.getFieldNamesForGrouping(inrec) { var countsForFieldName *lib.OrderedMap = nil iCountsForFieldName, present := tr.unlashedCounts.GetWithCheck(fieldName) if !present { @@ -496,7 +527,7 @@ func (tr *TransformerUniq) transformNumDistinctOnly( if !inrecAndContext.EndOfStream { inrec := inrecAndContext.Record - groupingKey, ok := inrec.GetSelectedValuesJoined(tr.fieldNames) + groupingKey, ok := inrec.GetSelectedValuesJoined(tr.getFieldNamesForGrouping(inrec)) if ok { iCount, present := tr.countsByGroup.GetWithCheck(groupingKey) if !present { @@ -528,28 +559,33 @@ func (tr *TransformerUniq) transformWithCounts( if !inrecAndContext.EndOfStream { inrec := inrecAndContext.Record - groupingKey, selectedValues, ok := inrec.GetSelectedValuesAndJoined(tr.fieldNames) + fieldNamesForGrouping := tr.getFieldNamesForGrouping(inrec) + + groupingKey, selectedValues, ok := 
inrec.GetSelectedValuesAndJoined(fieldNamesForGrouping) if ok { iCount, present := tr.countsByGroup.GetWithCheck(groupingKey) if !present { tr.countsByGroup.Put(groupingKey, int64(1)) tr.valuesByGroup.Put(groupingKey, selectedValues) + tr.keysByGroup.Put(groupingKey, fieldNamesForGrouping) } else { tr.countsByGroup.Put(groupingKey, iCount.(int64)+1) } } } else { // end of record stream - for pa := tr.countsByGroup.Head; pa != nil; pa = pa.Next { outrec := mlrval.NewMlrmapAsRecord() valuesForGroup := tr.valuesByGroup.Get(pa.Key).([]*mlrval.Mlrval) - for i, fieldName := range tr.fieldNames { + keysForGroup := tr.keysByGroup.Get(pa.Key).([]string) + + for i, fieldNameForGrouping := range keysForGroup { outrec.PutCopy( - fieldName, + fieldNameForGrouping, valuesForGroup[i], ) } + if tr.showCounts { outrec.PutReference( tr.outputFieldName, @@ -573,7 +609,7 @@ func (tr *TransformerUniq) transformWithoutCounts( if !inrecAndContext.EndOfStream { inrec := inrecAndContext.Record - groupingKey, selectedValues, ok := inrec.GetSelectedValuesAndJoined(tr.fieldNames) + groupingKey, selectedValues, ok := inrec.GetSelectedValuesAndJoined(tr.getFieldNamesForGrouping(inrec)) if !ok { return } @@ -584,9 +620,9 @@ func (tr *TransformerUniq) transformWithoutCounts( tr.valuesByGroup.Put(groupingKey, selectedValues) outrec := mlrval.NewMlrmapAsRecord() - for i, fieldName := range tr.fieldNames { + for i, fieldNameForGrouping := range tr.getFieldNamesForGrouping(inrec) { outrec.PutCopy( - fieldName, + fieldNameForGrouping, selectedValues[i], ) } diff --git a/pkg/transformers/unspace.go b/pkg/transformers/unspace.go index eb6253025..274f28fbd 100644 --- a/pkg/transformers/unspace.go +++ b/pkg/transformers/unspace.go @@ -6,9 +6,9 @@ import ( "os" "strings" - "github.com/johnkerl/miller/pkg/cli" - "github.com/johnkerl/miller/pkg/mlrval" - "github.com/johnkerl/miller/pkg/types" + "github.com/johnkerl/miller/v6/pkg/cli" + "github.com/johnkerl/miller/v6/pkg/mlrval" + 
"github.com/johnkerl/miller/v6/pkg/types" ) // ---------------------------------------------------------------- diff --git a/pkg/transformers/unsparsify.go b/pkg/transformers/unsparsify.go index 467b83dac..30ac1c3ee 100644 --- a/pkg/transformers/unsparsify.go +++ b/pkg/transformers/unsparsify.go @@ -6,10 +6,10 @@ import ( "os" "strings" - "github.com/johnkerl/miller/pkg/cli" - "github.com/johnkerl/miller/pkg/lib" - "github.com/johnkerl/miller/pkg/mlrval" - "github.com/johnkerl/miller/pkg/types" + "github.com/johnkerl/miller/v6/pkg/cli" + "github.com/johnkerl/miller/v6/pkg/lib" + "github.com/johnkerl/miller/v6/pkg/mlrval" + "github.com/johnkerl/miller/v6/pkg/types" ) // ---------------------------------------------------------------- diff --git a/pkg/transformers/utf8_to_latin1.go b/pkg/transformers/utf8_to_latin1.go index fb658562a..bc744c8fa 100644 --- a/pkg/transformers/utf8_to_latin1.go +++ b/pkg/transformers/utf8_to_latin1.go @@ -6,10 +6,10 @@ import ( "os" "strings" - "github.com/johnkerl/miller/pkg/cli" - "github.com/johnkerl/miller/pkg/lib" - "github.com/johnkerl/miller/pkg/mlrval" - "github.com/johnkerl/miller/pkg/types" + "github.com/johnkerl/miller/v6/pkg/cli" + "github.com/johnkerl/miller/v6/pkg/lib" + "github.com/johnkerl/miller/v6/pkg/mlrval" + "github.com/johnkerl/miller/v6/pkg/types" ) // ---------------------------------------------------------------- diff --git a/pkg/transformers/utils/join_bucket.go b/pkg/transformers/utils/join_bucket.go index 4e26e2538..f4390906f 100644 --- a/pkg/transformers/utils/join_bucket.go +++ b/pkg/transformers/utils/join_bucket.go @@ -7,7 +7,7 @@ package utils import ( "container/list" - "github.com/johnkerl/miller/pkg/mlrval" + "github.com/johnkerl/miller/v6/pkg/mlrval" ) // ---------------------------------------------------------------- diff --git a/pkg/transformers/utils/join_bucket_keeper.go b/pkg/transformers/utils/join_bucket_keeper.go index df218cc3d..237f2648b 100644 --- 
a/pkg/transformers/utils/join_bucket_keeper.go +++ b/pkg/transformers/utils/join_bucket_keeper.go @@ -113,11 +113,11 @@ import ( "os" "strings" - "github.com/johnkerl/miller/pkg/cli" - "github.com/johnkerl/miller/pkg/input" - "github.com/johnkerl/miller/pkg/lib" - "github.com/johnkerl/miller/pkg/mlrval" - "github.com/johnkerl/miller/pkg/types" + "github.com/johnkerl/miller/v6/pkg/cli" + "github.com/johnkerl/miller/v6/pkg/input" + "github.com/johnkerl/miller/v6/pkg/lib" + "github.com/johnkerl/miller/v6/pkg/mlrval" + "github.com/johnkerl/miller/v6/pkg/types" ) // ---------------------------------------------------------------- diff --git a/pkg/transformers/utils/percentile_keeper.go b/pkg/transformers/utils/percentile_keeper.go index 0aebdc709..30701a7c3 100644 --- a/pkg/transformers/utils/percentile_keeper.go +++ b/pkg/transformers/utils/percentile_keeper.go @@ -8,8 +8,8 @@ import ( "fmt" "sort" - "github.com/johnkerl/miller/pkg/bifs" - "github.com/johnkerl/miller/pkg/mlrval" + "github.com/johnkerl/miller/v6/pkg/bifs" + "github.com/johnkerl/miller/v6/pkg/mlrval" ) type PercentileKeeper struct { diff --git a/pkg/transformers/utils/stats1_accumulators.go b/pkg/transformers/utils/stats1_accumulators.go index 02756a9a3..1e5267a8a 100644 --- a/pkg/transformers/utils/stats1_accumulators.go +++ b/pkg/transformers/utils/stats1_accumulators.go @@ -9,9 +9,9 @@ import ( "os" "strings" - "github.com/johnkerl/miller/pkg/bifs" - "github.com/johnkerl/miller/pkg/lib" - "github.com/johnkerl/miller/pkg/mlrval" + "github.com/johnkerl/miller/v6/pkg/bifs" + "github.com/johnkerl/miller/v6/pkg/lib" + "github.com/johnkerl/miller/v6/pkg/mlrval" ) // ---------------------------------------------------------------- @@ -72,6 +72,11 @@ var stats1AccumulatorInfos []stats1AccumulatorInfo = []stats1AccumulatorInfo{ "Compute averages (sample means) of specified fields", NewStats1MeanAccumulator, }, + { + "mad", + "Compute mean absolute deviation", + NewStats1MeanAbsDevAccumulator, + }, { "var", @@ 
-504,6 +509,47 @@ func (acc *Stats1MeanAccumulator) Reset() { acc.count = 0 } +// ---------------------------------------------------------------- +type Stats1MeanAbsDevAccumulator struct { + samples []*mlrval.Mlrval +} + +func NewStats1MeanAbsDevAccumulator() IStats1Accumulator { + return &Stats1MeanAbsDevAccumulator{ + samples: make([]*mlrval.Mlrval, 0, 1000), + } +} +func (acc *Stats1MeanAbsDevAccumulator) Ingest(value *mlrval.Mlrval) { + if value.IsNumeric() { + acc.samples = append(acc.samples, value) + } +} +func (acc *Stats1MeanAbsDevAccumulator) Emit() *mlrval.Mlrval { + n := len(acc.samples) + if n == 0 { + return mlrval.VOID + } + mn := mlrval.FromInt(int64(n)) + + mean := mlrval.FromInt(0) + for i := 0; i < n; i++ { + mean = bifs.BIF_plus_binary(mean, acc.samples[i]) + } + mean = bifs.BIF_divide(mean, mn) + + meanAbsDev := mlrval.FromInt(0) + for i := 0; i < n; i++ { + diff := bifs.BIF_minus_binary(mean, acc.samples[i]) + meanAbsDev = bifs.BIF_plus_binary(meanAbsDev, bifs.BIF_abs(diff)) + } + meanAbsDev = bifs.BIF_divide(meanAbsDev, mn) + + return meanAbsDev +} +func (acc *Stats1MeanAbsDevAccumulator) Reset() { + acc.samples = make([]*mlrval.Mlrval, 0, 1000) +} + // ---------------------------------------------------------------- type Stats1MinAccumulator struct { min *mlrval.Mlrval diff --git a/pkg/transformers/utils/stats2_accumulators.go b/pkg/transformers/utils/stats2_accumulators.go index 84fcdc030..e825af0aa 100644 --- a/pkg/transformers/utils/stats2_accumulators.go +++ b/pkg/transformers/utils/stats2_accumulators.go @@ -9,8 +9,8 @@ import ( "math" "os" - "github.com/johnkerl/miller/pkg/lib" - "github.com/johnkerl/miller/pkg/mlrval" + "github.com/johnkerl/miller/v6/pkg/lib" + "github.com/johnkerl/miller/v6/pkg/mlrval" ) // ---------------------------------------------------------------- diff --git a/pkg/transformers/utils/top_keeper.go b/pkg/transformers/utils/top_keeper.go index 3a3ce9e4e..4e59783d0 100644 --- 
a/pkg/transformers/utils/top_keeper.go +++ b/pkg/transformers/utils/top_keeper.go @@ -5,8 +5,8 @@ package utils import ( - "github.com/johnkerl/miller/pkg/mlrval" - "github.com/johnkerl/miller/pkg/types" + "github.com/johnkerl/miller/v6/pkg/mlrval" + "github.com/johnkerl/miller/v6/pkg/types" ) // ---------------------------------------------------------------- diff --git a/pkg/transformers/utils/window_keeper.go b/pkg/transformers/utils/window_keeper.go index 2de875020..c19b33a2a 100644 --- a/pkg/transformers/utils/window_keeper.go +++ b/pkg/transformers/utils/window_keeper.go @@ -1,7 +1,7 @@ package utils import ( - "github.com/johnkerl/miller/pkg/lib" + "github.com/johnkerl/miller/v6/pkg/lib" ) // WindowKeeper is a sliding-window container, nominally for use by mlr step, diff --git a/pkg/types/context.go b/pkg/types/context.go index 08ba3cbb6..6f82bc527 100644 --- a/pkg/types/context.go +++ b/pkg/types/context.go @@ -5,7 +5,7 @@ import ( "container/list" "strconv" - "github.com/johnkerl/miller/pkg/mlrval" + "github.com/johnkerl/miller/v6/pkg/mlrval" ) // Since Go is concurrent, the context struct (AWK-like variables such as @@ -99,6 +99,9 @@ type Context struct { // NF int NR int64 FNR int64 + + // XXX 1513 + JSONHadBrackets bool } // TODO: comment: Remember command-line values to pass along to CST evaluators. 
diff --git a/pkg/types/mlrval_typing.go b/pkg/types/mlrval_typing.go index e3c68b5f8..4eeb60269 100644 --- a/pkg/types/mlrval_typing.go +++ b/pkg/types/mlrval_typing.go @@ -8,7 +8,7 @@ package types import ( "fmt" - "github.com/johnkerl/miller/pkg/mlrval" + "github.com/johnkerl/miller/v6/pkg/mlrval" ) // ---------------------------------------------------------------- @@ -24,7 +24,7 @@ func NewTypeGatedMlrvalName( ) (*TypeGatedMlrvalName, error) { typeMask, ok := mlrval.TypeNameToMask(typeName) if !ok { - return nil, fmt.Errorf("mlr: couldn't resolve type name \"%s\".", typeName) + return nil, fmt.Errorf(`mlr: couldn't resolve type name "%s"`, typeName) } return &TypeGatedMlrvalName{ Name: name, @@ -39,7 +39,7 @@ func (tname *TypeGatedMlrvalName) Check(value *mlrval.Mlrval) error { return nil } else { return fmt.Errorf( - "mlr: couldn't assign variable %s %s from value %s %s\n", + "mlr: couldn't assign variable %s %s from value %s %s", tname.TypeName, tname.Name, value.GetTypeName(), value.String(), ) } diff --git a/pkg/version/version.go b/pkg/version/version.go index 1d4cd9cea..ec9c7208a 100644 --- a/pkg/version/version.go +++ b/pkg/version/version.go @@ -4,4 +4,4 @@ package version // Nominally things like "6.0.0" for a release, then "6.0.0-dev" in between. // This makes it clear that a given build is on the main dev branch, not a // particular snapshot tag. 
-var STRING string = "6.10.0" +var STRING string = "6.16.0" diff --git a/python/make-tsv.py b/python/make-tsv.py new file mode 100755 index 000000000..bb55d0ba0 --- /dev/null +++ b/python/make-tsv.py @@ -0,0 +1,21 @@ +#!/usr/bin/env python + +import sys + +nrow = 2 +ncol = 100 +if len(sys.argv) == 2: + ncol = int(sys.argv[1]) +if len(sys.argv) == 3: + nrow = int(sys.argv[1]) + ncol = int(sys.argv[2]) + +prefix = "k" +for i in range(nrow): + for j in range(ncol): + if j == 0: + sys.stdout.write("%s%07d" % (prefix, j)) + else: + sys.stdout.write("\t%s%07d" % (prefix, j)) + sys.stdout.write("\n") + prefix = "v" diff --git a/regression_test.go b/regression_test.go index 5657d17bc..0ccaa3a15 100644 --- a/regression_test.go +++ b/regression_test.go @@ -5,7 +5,7 @@ import ( "os" "testing" - "github.com/johnkerl/miller/pkg/terminals/regtest" + "github.com/johnkerl/miller/v6/pkg/terminals/regtest" ) // TestRegression is a familiar entry point for regression testing. Miller diff --git a/scripts/compiler-versions-build b/scripts/compiler-versions-build index 49e2c2ad2..e575959e7 100755 --- a/scripts/compiler-versions-build +++ b/scripts/compiler-versions-build @@ -1,7 +1,7 @@ #!/bin/sh for go in go1.15.15 go1.16.12 go1.17.5 go1.18beta1; do - $go clean github.com/johnkerl/miller/cmd/mlr - $go build github.com/johnkerl/miller/cmd/mlr + $go clean github.com/johnkerl/miller/v6/cmd/mlr + $go build github.com/johnkerl/miller/v6/cmd/mlr mv mlr mlr-$go done diff --git a/snap/README.md b/snap/README.md new file mode 100644 index 000000000..2af316410 --- /dev/null +++ b/snap/README.md @@ -0,0 +1,150 @@ +# Failed attempts to create a snap interactively + +2026-01-02 I used an Ubuntu 24.04 EC2 instance. I followed https://documentation.ubuntu.com/snapcraft/stable/. Error messages said things like + +``` +A network related operation failed in a context of no network access. 
+Recommended resolution: Verify that the environment has internet connectivity; see https://canonical-craft-providers.readthedocs-hosted.com/en/latest/explanation/ for further reference. +Full execution log: '/home/ubuntu/.local/state/snapcraft/log/snapcraft-20260102-170252.488632.log' +``` + +when there was in fact no network problem. I remained confused. + +``` +$ sudo snapcraft pack + +$ lxc list + +$ snapcraft pack --destructive-mode + +$ snapcraft pack --use-multipass + +$ sudo snap install multipass + +$ snapcraft pack --use-multipass + +$ sudo lxd init --auto + +$ lxc network list + +$ sudo snapcraft pack + +$ sudo snap set snapcraft provider=multipass + +$ sudo snapcraft pack --destructive-mode + +[This created miller_6.15.0_arm64.snap] + +$ snapcraft upload --release=stable *.snap +No keyring found to store or retrieve credentials from. +Recommended resolution: Ensure the keyring is working or SNAPCRAFT_STORE_CREDENTIALS is correctly exported into the environment +For more information, check out: https://documentation.ubuntu.com/snapcraft/stable/how-to/publishing/authenticate +Full execution log: '/home/ubuntu/.local/state/snapcraft/log/snapcraft-20260102-172357.599171.log' + +$ ll *.snap +-rw-r--r-- 1 root root 8994816 Jan 2 17:22 miller_6.15.0_arm64.snap + +$ snap install *.snap +error: access denied (try with sudo) + +$ sudo snap install *.snap +error: cannot find signatures with metadata for snap/component "miller_6.15.0_arm64.snap" +``` + +Conclusion: + +* I got cryptic error messages with various permutations. +* Through trial and error I got a `.snap` file with `sudo` and `multipass` and `--destructive-mode`. +* Even then, I got a `.snap` file only for the current machine's arch, and the resulting `.snap` file was not locally installable. +* This led me to try a GitHub Action. + +# Info from Claude about auto-releasing + +Here's how to set up automatic Snap publishing from GitHub releases: + +## 1. 
Create snapcraft.yaml + +First, ensure you have a proper `snapcraft.yaml` in your repo root (or in a `snap/` directory): + +```yaml +name: your-app-name +base: core22 # or core24 for Ubuntu 24.04 +version: git # automatically uses git tags +summary: Single-line summary +description: | + Longer description of your application + +grade: stable # or devel +confinement: strict # or classic, devmode + +apps: + your-app-name: + command: bin/your-binary + plugs: + - home + - network + +parts: + your-app: + plugin: nil # change based on your build system (go, python, etc.) + source: . + # Add build steps as needed +``` + +## 2. Get Snapcraft credentials + +Export your Snapcraft login credentials: + +```bash +snapcraft export-login --snaps=miller --channels=stable,candidate,beta,edge snapcraft-token.txt +``` + +This creates a token file with limited permissions for just your snap. + +## 3. Add token to GitHub Secrets + +1. Go to your GitHub repo → Settings → Secrets and variables → Actions +2. Click "New repository secret" +3. Name: `SNAPCRAFT_TOKEN` +4. Value: Paste the entire contents of `snapcraft-token.txt` + +## 4. 
Create GitHub Action workflow + +Create `.github/workflows/release.yml`: + +```yaml +name: Release to Snap Store + +on: + release: + types: [published] + +jobs: + snap: + runs-on: ubuntu-latest + steps: + - name: Checkout code + uses: actions/checkout@v4 + + - name: Build snap + uses: snapcore/action-build@v1 + id: build + + - name: Publish to Snap Store + uses: snapcore/action-publish@v1 + env: + SNAPCRAFT_STORE_CREDENTIALS: ${{ secrets.SNAPCRAFT_TOKEN }} + with: + snap: ${{ steps.build.outputs.snap }} + # release: stable # or edge, beta, candidate + release: edge +``` + +## Tips + +- **Version handling**: Using `version: git` in snapcraft.yaml automatically uses your git tag as the version +- **Channels**: Start with `edge` channel for testing, then promote to `stable` once confident +- **Multiple architectures**: Add a build matrix if you need to support arm64, etc. +- **Testing before stable**: Consider publishing to `candidate` or `beta` first, then manually promote to `stable` after testing + +Now when you create a GitHub release with a tag (e.g., `v1.0.0`), the workflow will automatically build and publish your snap! diff --git a/snap/snapcraft.yaml b/snap/snapcraft.yaml new file mode 100644 index 000000000..d374a84b0 --- /dev/null +++ b/snap/snapcraft.yaml @@ -0,0 +1,54 @@ +name: miller +base: core24 +version: git +summary: Miller is like awk, sed, cut, join and sort +description: | + Miller is like awk, sed, cut, join, and sort for data formats such as CSV, TSV, JSON, JSON Lines, and positionally-indexed. 
+ +grade: stable +confinement: strict + +adopt-info: miller + +website: https://miller.readthedocs.io +contact: https://github.com/johnkerl/miller/issues +issues: https://github.com/johnkerl/miller/issues +source-code: https://github.com/johnkerl/miller + +license: BSD-2-Clause +compression: lzo + +platforms: + amd64: + build-on: [amd64] + build-for: [amd64] + arm64: + build-on: [arm64] + build-for: [arm64] + armhf: + build-on: [armhf] + build-for: [armhf] + s390x: + build-on: [s390x] + build-for: [s390x] + ppc64el: + build-on: [ppc64el] + build-for: [ppc64el] + +apps: + miller: + command: usr/local/bin/mlr + plugs: + - home + +parts: + miller: + source: https://github.com/johnkerl/miller + source-type: git + plugin: make + build-snaps: + - go + + override-pull: | + craftctl default + craftctl set version="$(git describe --tags | sed 's/^v//' | cut -d "-" -f1)" diff --git a/test/cases/cli-help/0001/expout b/test/cases/cli-help/0001/expout index bdb23ad6c..19a201c62 100644 --- a/test/cases/cli-help/0001/expout +++ b/test/cases/cli-help/0001/expout @@ -96,6 +96,7 @@ Same as uniq -c. Options: -f {a,b,c} Field names for distinct count. +-x {a,b,c} Field names to exclude for distinct count: use each record's others instead. -n Show only the number of distinct values. Not compatible with -u. -o {name} Field name for output count. Default "count". Ignored with -u. @@ -137,7 +138,7 @@ Options: -r Treat field names as regular expressions. "ab", "a.*b" will match any field name containing the substring "ab" or matching "a.*b", respectively; anchors of the form "^ab$", "^a.*b$" may - be used. The -o flag is ignored when -r is present. + be used. -h|--help Show this message. Examples: mlr cut -f hostname,status @@ -185,6 +186,10 @@ Options: ================================================================ filter Usage: mlr filter [options] {DSL expression} +Lets you use a domain-specific language to programmatically filter which +stream records will be output. 
+See also: https://miller.readthedocs.io/en/latest/reference-verbs + Options: -f {file name} File containing a DSL expression (see examples below). If the filename is a directory, all *.mlr files in that directory are loaded. @@ -197,7 +202,7 @@ Options: Since the expression pieces are simply concatenated, please be sure to use intervening semicolons to separate expressions.) --s name=value: Predefines out-of-stream variable @name to have +-s name=value: Predefines out-of-stream variable @name to have Thus mlr put -s foo=97 '$column += @foo' is like mlr put 'begin {@foo = 97} $column += @foo'. The value part is subject to type-inferencing. @@ -387,6 +392,8 @@ for the old string and handling multiple matches, like the `gsub` DSL function. See also the `sub` and `ssub` verbs. Options: -f {a,b,c} Field names to convert. +-r {regex} Regular expression for field names to convert. +-a Convert all fields. -h|--help Show this message. ================================================================ @@ -466,6 +473,8 @@ Options: --lk|--left-keep-field-names {a,b,c} If supplied, this means keep only the specified field names from the left file. Automatically includes the join-field name(s). Helpful for when you only want a limited subset of information from the left file. + Tip: you can use --lk "": this means the left file becomes solely a row-selector + for the input files. --lp {text} Additional prefix for non-join output field names from the left file --rp {text} Additional prefix for non-join output field names from @@ -500,7 +509,7 @@ be specified CSV as well unless you override with 'mlr --csv ... join --ijson -l Likewise, if you have 'mlr --csv --implicit-csv-header ...' then the join-in file will be expected to be headerless as well unless you put '--no-implicit-csv-header' after 'join'. Please use "mlr --usage-separator-options" for information on specifying separators. 
-Please see https://miller.readthedocs.io/en/latest/reference-verbs.html#join for more information +Please see https://miller.readthedocs.io/en/latest/reference-verbs#join for more information including examples. ================================================================ @@ -548,6 +557,7 @@ Options: antimode Find least-frequently-occurring values for fields; first-found wins tie sum Compute sums of specified fields mean Compute averages (sample means) of specified fields + mad Compute mean absolute deviation var Compute sample variance of specified fields stddev Compute sample standard deviation of specified fields meaneb Estimate error bars for averages (assuming no sample autocorrelation) @@ -658,6 +668,9 @@ Options: ================================================================ put Usage: mlr put [options] {DSL expression} +Lets you use a domain-specific language to programmatically alter stream records. +See also: https://miller.readthedocs.io/en/latest/reference-verbs + Options: -f {file name} File containing a DSL expression (see examples below). If the filename is a directory, all *.mlr files in that directory are loaded. @@ -670,7 +683,7 @@ Options: Since the expression pieces are simply concatenated, please be sure to use intervening semicolons to separate expressions.) --s name=value: Predefines out-of-stream variable @name to have +-s name=value: Predefines out-of-stream variable @name to have Thus mlr put -s foo=97 '$column += @foo' is like mlr put 'begin {@foo = 97} $column += @foo'. The value part is subject to type-inferencing. @@ -767,9 +780,9 @@ Options: first-match replacement. -h|--help Show this message. 
Examples: -mlr rename old_name,new_name' -mlr rename old_name_1,new_name_1,old_name_2,new_name_2' -mlr rename -r 'Date_[0-9]+,Date,' Rename all such fields to be "Date" +mlr rename old_name,new_name +mlr rename old_name_1,new_name_1,old_name_2,new_name_2 +mlr rename -r 'Date_[0-9]+,Date' Rename all such fields to be "Date" mlr rename -r '"Date_[0-9]+",Date' Same mlr rename -r 'Date_([0-9]+).*,\1' Rename all such fields to be of the form 20151015 mlr rename -r '"name"i,Name' Rename "name", "Name", "NAME", etc. to "Name" @@ -969,6 +982,7 @@ Options: -nf {comma-separated field names} Same as -n -nr {comma-separated field names} Numerical descending; nulls sort first -t {comma-separated field names} Natural ascending +-b Move sort fields to start of record, as in reorder -b -tr|-rt {comma-separated field names} Natural descending -h|--help Show this message. @@ -985,6 +999,18 @@ Options: -r Recursively sort subobjects/submaps, e.g. for JSON input. -h|--help Show this message. +================================================================ +sparsify +Usage: mlr sparsify [options] +Unsets fields for which the key is the empty string (or, optionally, another +specified value). Only makes sense with output format not being CSV or TSV. +Options: +-s {filler string} What values to remove. Defaults to the empty string. +-f {a,b,c} Specify field names to be operated on; any other fields won't be + modified. The default is to modify all fields. +-h|--help Show this message. +Example: if input is a=1,b=,c=3 then output is a=1,c=3. + ================================================================ split Usage: mlr split [options] {filename} @@ -1035,6 +1061,8 @@ Replaces old string with new string in specified field(s), without regex support the old string, like the `ssub` DSL function. See also the `gsub` and `sub` verbs. Options: -f {a,b,c} Field names to convert. +-r {regex} Regular expression for field names to convert. +-a Convert all fields. -h|--help Show this message. 
================================================================ @@ -1053,6 +1081,7 @@ Options: antimode Find least-frequently-occurring values for fields; first-found wins tie sum Compute sums of specified fields mean Compute averages (sample means) of specified fields + mad Compute mean absolute deviation var Compute sample variance of specified fields stddev Compute sample standard deviation of specified fields meaneb Estimate error bars for averages (assuming no sample autocorrelation) @@ -1087,7 +1116,7 @@ Options: Example: mlr stats1 -a min,p10,p50,p90,max -f value -g size,shape Example: mlr stats1 -a count,mode -f size Example: mlr stats1 -a count,mode -f size -g shape -Example: mlr stats1 -a count,mode --fr '^[a-h].*$' -gr '^k.*$' +Example: mlr stats1 -a count,mode --fr '^[a-h].*$' --gr '^k.*$' This computes count and mode statistics on all field names beginning with a through h, grouped by all field names starting with k. @@ -1185,6 +1214,8 @@ for the old string and not handling multiple matches, like the `sub` DSL functio See also the `gsub` and `ssub` verbs. Options: -f {a,b,c} Field names to convert. +-r {regex} Regular expression for field names to convert. +-a Convert all fields. -h|--help Show this message. ================================================================ @@ -1228,8 +1259,19 @@ Options: -a {mean,sum,etc.} Use only the specified summarizers. -x {mean,sum,etc.} Use all summarizers, except the specified ones. --all Use all available summarizers. +--transpose Show output with field names as column names.. -h|--help Show this message. +================================================================ +surv +Usage: mlr surv -d {duration-field} -s {status-field} + +Estimate Kaplan-Meier survival curve (right-censored). +Options: + -d {field} Name of duration field (time-to-event or censoring). + -s {field} Name of status field (0=censored, 1=event). + -h, --help Show this message. 
+ ================================================================ tac Usage: mlr tac [options] @@ -1320,6 +1362,7 @@ count-distinct. For uniq, -f is a synonym for -g. Options: -g {d,e,f} Group-by-field names for uniq counts. +-x {a,b,c} Field names to exclude for uniq: use each record's others instead. -c Show repeat counts in addition to unique values. -n Show only the number of distinct values. -o {name} Field name for output count. Default "count". diff --git a/test/cases/cli-norc/0001/cmd b/test/cases/cli-norc/0001/cmd new file mode 100644 index 000000000..57174b0ec --- /dev/null +++ b/test/cases/cli-norc/0001/cmd @@ -0,0 +1 @@ +mlr --norc -n cat diff --git a/test/cases/io-format-conversion-keystroke-savers/0001/experr b/test/cases/cli-norc/0001/experr similarity index 100% rename from test/cases/io-format-conversion-keystroke-savers/0001/experr rename to test/cases/cli-norc/0001/experr diff --git a/test/cases/io-format-conversion-keystroke-savers/0002/experr b/test/cases/cli-norc/0001/expout similarity index 100% rename from test/cases/io-format-conversion-keystroke-savers/0002/experr rename to test/cases/cli-norc/0001/expout diff --git a/test/cases/dsl-argpass-typedecl/0002/experr b/test/cases/dsl-argpass-typedecl/0002/experr index 49cdce4a4..49d1b3f4b 100644 --- a/test/cases/dsl-argpass-typedecl/0002/experr +++ b/test/cases/dsl-argpass-typedecl/0002/experr @@ -1 +1 @@ -mlr: couldn't assign variable int i from value float 0.34679014 +mlr: couldn't assign variable int i from value float 0.34679014 \ No newline at end of file diff --git a/test/cases/dsl-argpass-typedecl/0003/experr b/test/cases/dsl-argpass-typedecl/0003/experr index 93b3d02d8..88075a591 100644 --- a/test/cases/dsl-argpass-typedecl/0003/experr +++ b/test/cases/dsl-argpass-typedecl/0003/experr @@ -1 +1 @@ -mlr: couldn't assign variable int function return value from value float 3.79679014 +mlr: couldn't assign variable int function return value from value float 3.79679014 \ No newline at end of 
file diff --git a/test/cases/dsl-argpass-typedecl/0004/experr b/test/cases/dsl-argpass-typedecl/0004/experr index 8289c8c80..e3de7b5d2 100644 --- a/test/cases/dsl-argpass-typedecl/0004/experr +++ b/test/cases/dsl-argpass-typedecl/0004/experr @@ -1 +1 @@ -mlr: couldn't assign variable int function return value from value float 4.45000000 +mlr: couldn't assign variable int function return value from value float 4.45000000 \ No newline at end of file diff --git a/test/cases/dsl-argpass-typedecl/0005/experr b/test/cases/dsl-argpass-typedecl/0005/experr index 323a86f3f..22ff28ed1 100644 --- a/test/cases/dsl-argpass-typedecl/0005/experr +++ b/test/cases/dsl-argpass-typedecl/0005/experr @@ -1 +1 @@ -mlr: couldn't assign variable int function return value from value error (error) +mlr: couldn't assign variable int function return value from value error (error) \ No newline at end of file diff --git a/test/cases/dsl-argpass-typedecl/0007/experr b/test/cases/dsl-argpass-typedecl/0007/experr index 85b8be44b..49cdce4a4 100644 --- a/test/cases/dsl-argpass-typedecl/0007/experr +++ b/test/cases/dsl-argpass-typedecl/0007/experr @@ -1,2 +1 @@ mlr: couldn't assign variable int i from value float 0.34679014 - diff --git a/test/cases/dsl-argpass-typedecl/0008/experr b/test/cases/dsl-argpass-typedecl/0008/experr index 1dc1c87a0..4b9795ee1 100644 --- a/test/cases/dsl-argpass-typedecl/0008/experr +++ b/test/cases/dsl-argpass-typedecl/0008/experr @@ -1,2 +1 @@ mlr: couldn't assign variable num i from value string a - diff --git a/test/cases/dsl-array-map-indexing/0005/experr b/test/cases/dsl-array-map-indexing/0005/experr index 3f242c2b7..d2d964201 100644 --- a/test/cases/dsl-array-map-indexing/0005/experr +++ b/test/cases/dsl-array-map-indexing/0005/experr @@ -1 +1 @@ -mlr: '[[...]]' is allowed on assignment left-hand sides only when immediately preceded by '$'. 
+mlr: '[[...]]' is allowed on assignment left-hand sides only when immediately preceded by '$' diff --git a/test/cases/dsl-array-map-indexing/0006/experr b/test/cases/dsl-array-map-indexing/0006/experr index 40195c5a1..f6f1ef0d1 100644 --- a/test/cases/dsl-array-map-indexing/0006/experr +++ b/test/cases/dsl-array-map-indexing/0006/experr @@ -1 +1 @@ -mlr: '[[[...]]]' is allowed on assignment left-hand sides only when immediately preceded by '$'. +mlr: '[[[...]]]' is allowed on assignment left-hand sides only when immediately preceded by '$' diff --git a/test/cases/dsl-array-map-indexing/0007/experr b/test/cases/dsl-array-map-indexing/0007/experr index 3f242c2b7..d2d964201 100644 --- a/test/cases/dsl-array-map-indexing/0007/experr +++ b/test/cases/dsl-array-map-indexing/0007/experr @@ -1 +1 @@ -mlr: '[[...]]' is allowed on assignment left-hand sides only when immediately preceded by '$'. +mlr: '[[...]]' is allowed on assignment left-hand sides only when immediately preceded by '$' diff --git a/test/cases/dsl-array-map-indexing/0035/experr b/test/cases/dsl-array-map-indexing/0035/experr index 3f242c2b7..d2d964201 100644 --- a/test/cases/dsl-array-map-indexing/0035/experr +++ b/test/cases/dsl-array-map-indexing/0035/experr @@ -1 +1 @@ -mlr: '[[...]]' is allowed on assignment left-hand sides only when immediately preceded by '$'. +mlr: '[[...]]' is allowed on assignment left-hand sides only when immediately preceded by '$' diff --git a/test/cases/dsl-array-map-indexing/0036/experr b/test/cases/dsl-array-map-indexing/0036/experr index 3f242c2b7..d2d964201 100644 --- a/test/cases/dsl-array-map-indexing/0036/experr +++ b/test/cases/dsl-array-map-indexing/0036/experr @@ -1 +1 @@ -mlr: '[[...]]' is allowed on assignment left-hand sides only when immediately preceded by '$'. 
+mlr: '[[...]]' is allowed on assignment left-hand sides only when immediately preceded by '$' diff --git a/test/cases/dsl-array-map-indexing/0068/experr b/test/cases/dsl-array-map-indexing/0068/experr index 3f242c2b7..d2d964201 100644 --- a/test/cases/dsl-array-map-indexing/0068/experr +++ b/test/cases/dsl-array-map-indexing/0068/experr @@ -1 +1 @@ -mlr: '[[...]]' is allowed on assignment left-hand sides only when immediately preceded by '$'. +mlr: '[[...]]' is allowed on assignment left-hand sides only when immediately preceded by '$' diff --git a/test/cases/dsl-array-map-indexing/0069/experr b/test/cases/dsl-array-map-indexing/0069/experr index 40195c5a1..f6f1ef0d1 100644 --- a/test/cases/dsl-array-map-indexing/0069/experr +++ b/test/cases/dsl-array-map-indexing/0069/experr @@ -1 +1 @@ -mlr: '[[[...]]]' is allowed on assignment left-hand sides only when immediately preceded by '$'. +mlr: '[[[...]]]' is allowed on assignment left-hand sides only when immediately preceded by '$' diff --git a/test/cases/dsl-begin-end/0010/cmd b/test/cases/dsl-begin-end/0010/cmd new file mode 100644 index 000000000..11bc26154 --- /dev/null +++ b/test/cases/dsl-begin-end/0010/cmd @@ -0,0 +1 @@ +mlr --from test/input/s.dkvp put -q 'begin{print 8}; end{print 9}' diff --git a/test/cases/io-format-conversion-keystroke-savers/0003/experr b/test/cases/dsl-begin-end/0010/experr similarity index 100% rename from test/cases/io-format-conversion-keystroke-savers/0003/experr rename to test/cases/dsl-begin-end/0010/experr diff --git a/test/cases/dsl-begin-end/0010/expout b/test/cases/dsl-begin-end/0010/expout new file mode 100644 index 000000000..512858e60 --- /dev/null +++ b/test/cases/dsl-begin-end/0010/expout @@ -0,0 +1,2 @@ +8 +9 diff --git a/test/cases/dsl-clean-whitespace/0010/cmd b/test/cases/dsl-clean-whitespace/0010/cmd new file mode 100644 index 000000000..2fd915d02 --- /dev/null +++ b/test/cases/dsl-clean-whitespace/0010/cmd @@ -0,0 +1 @@ +mlr --icsv --ojson clean-whitespace then put 
-f ${CASEDIR}/mlr ${CASEDIR}/input.csv diff --git a/test/cases/io-format-conversion-keystroke-savers/0004/experr b/test/cases/dsl-clean-whitespace/0010/experr similarity index 100% rename from test/cases/io-format-conversion-keystroke-savers/0004/experr rename to test/cases/dsl-clean-whitespace/0010/experr diff --git a/test/cases/dsl-clean-whitespace/0010/expout b/test/cases/dsl-clean-whitespace/0010/expout new file mode 100644 index 000000000..db3fe878d --- /dev/null +++ b/test/cases/dsl-clean-whitespace/0010/expout @@ -0,0 +1,18 @@ +[ +{ + "a": 1, + "b": 2, + "c": 3, + "d": 4, + "e": 9, + "t": "int" +}, +{ + "a": 5, + "b": 6, + "c": 7, + "d": 8, + "e": 13, + "t": "int" +} +] diff --git a/test/cases/dsl-clean-whitespace/0010/input.csv b/test/cases/dsl-clean-whitespace/0010/input.csv new file mode 100644 index 000000000..432037239 --- /dev/null +++ b/test/cases/dsl-clean-whitespace/0010/input.csv @@ -0,0 +1,3 @@ +a, b, c, d +1, 2, 3, 4 +5, 6, 7, 8 diff --git a/test/cases/dsl-clean-whitespace/0010/mlr b/test/cases/dsl-clean-whitespace/0010/mlr new file mode 100644 index 000000000..e51c30c8b --- /dev/null +++ b/test/cases/dsl-clean-whitespace/0010/mlr @@ -0,0 +1,2 @@ +$e = $d + 5; +$t = typeof($d) diff --git a/test/cases/dsl-context-specific-validation/0001/experr b/test/cases/dsl-context-specific-validation/0001/experr index 153ac97b2..9ccf96101 100644 --- a/test/cases/dsl-context-specific-validation/0001/experr +++ b/test/cases/dsl-context-specific-validation/0001/experr @@ -1 +1 @@ -mlr: begin blocks can only be at top level. +mlr: begin blocks can only be at top level diff --git a/test/cases/dsl-context-specific-validation/0002/experr b/test/cases/dsl-context-specific-validation/0002/experr index 153ac97b2..9ccf96101 100644 --- a/test/cases/dsl-context-specific-validation/0002/experr +++ b/test/cases/dsl-context-specific-validation/0002/experr @@ -1 +1 @@ -mlr: begin blocks can only be at top level. 
+mlr: begin blocks can only be at top level diff --git a/test/cases/dsl-context-specific-validation/0003/experr b/test/cases/dsl-context-specific-validation/0003/experr index 1bf2e1cd8..2e5c850a0 100644 --- a/test/cases/dsl-context-specific-validation/0003/experr +++ b/test/cases/dsl-context-specific-validation/0003/experr @@ -1 +1 @@ -mlr: end blocks can only be at top level. +mlr: end blocks can only be at top level diff --git a/test/cases/dsl-context-specific-validation/0004/experr b/test/cases/dsl-context-specific-validation/0004/experr index 1bf2e1cd8..2e5c850a0 100644 --- a/test/cases/dsl-context-specific-validation/0004/experr +++ b/test/cases/dsl-context-specific-validation/0004/experr @@ -1 +1 @@ -mlr: end blocks can only be at top level. +mlr: end blocks can only be at top level diff --git a/test/cases/dsl-context-specific-validation/0005/experr b/test/cases/dsl-context-specific-validation/0005/experr index e8c49427b..5bfdbce14 100644 --- a/test/cases/dsl-context-specific-validation/0005/experr +++ b/test/cases/dsl-context-specific-validation/0005/experr @@ -1 +1 @@ -mlr: begin/end blocks cannot refer to records via $x, $*, etc. +mlr: begin/end blocks cannot refer to records via $x, $*, etc diff --git a/test/cases/dsl-context-specific-validation/0006/experr b/test/cases/dsl-context-specific-validation/0006/experr index e8c49427b..5bfdbce14 100644 --- a/test/cases/dsl-context-specific-validation/0006/experr +++ b/test/cases/dsl-context-specific-validation/0006/experr @@ -1 +1 @@ -mlr: begin/end blocks cannot refer to records via $x, $*, etc. +mlr: begin/end blocks cannot refer to records via $x, $*, etc diff --git a/test/cases/dsl-context-specific-validation/0007/experr b/test/cases/dsl-context-specific-validation/0007/experr index e8c49427b..5bfdbce14 100644 --- a/test/cases/dsl-context-specific-validation/0007/experr +++ b/test/cases/dsl-context-specific-validation/0007/experr @@ -1 +1 @@ -mlr: begin/end blocks cannot refer to records via $x, $*, etc. 
+mlr: begin/end blocks cannot refer to records via $x, $*, etc diff --git a/test/cases/dsl-context-specific-validation/0008/experr b/test/cases/dsl-context-specific-validation/0008/experr index e8c49427b..5bfdbce14 100644 --- a/test/cases/dsl-context-specific-validation/0008/experr +++ b/test/cases/dsl-context-specific-validation/0008/experr @@ -1 +1 @@ -mlr: begin/end blocks cannot refer to records via $x, $*, etc. +mlr: begin/end blocks cannot refer to records via $x, $*, etc diff --git a/test/cases/dsl-context-specific-validation/0009/experr b/test/cases/dsl-context-specific-validation/0009/experr index e8c49427b..5bfdbce14 100644 --- a/test/cases/dsl-context-specific-validation/0009/experr +++ b/test/cases/dsl-context-specific-validation/0009/experr @@ -1 +1 @@ -mlr: begin/end blocks cannot refer to records via $x, $*, etc. +mlr: begin/end blocks cannot refer to records via $x, $*, etc diff --git a/test/cases/dsl-context-specific-validation/0010/experr b/test/cases/dsl-context-specific-validation/0010/experr index e8c49427b..5bfdbce14 100644 --- a/test/cases/dsl-context-specific-validation/0010/experr +++ b/test/cases/dsl-context-specific-validation/0010/experr @@ -1 +1 @@ -mlr: begin/end blocks cannot refer to records via $x, $*, etc. +mlr: begin/end blocks cannot refer to records via $x, $*, etc diff --git a/test/cases/dsl-context-specific-validation/0011/experr b/test/cases/dsl-context-specific-validation/0011/experr index e8c49427b..5bfdbce14 100644 --- a/test/cases/dsl-context-specific-validation/0011/experr +++ b/test/cases/dsl-context-specific-validation/0011/experr @@ -1 +1 @@ -mlr: begin/end blocks cannot refer to records via $x, $*, etc. 
+mlr: begin/end blocks cannot refer to records via $x, $*, etc diff --git a/test/cases/dsl-context-specific-validation/0012/experr b/test/cases/dsl-context-specific-validation/0012/experr index e8c49427b..5bfdbce14 100644 --- a/test/cases/dsl-context-specific-validation/0012/experr +++ b/test/cases/dsl-context-specific-validation/0012/experr @@ -1 +1 @@ -mlr: begin/end blocks cannot refer to records via $x, $*, etc. +mlr: begin/end blocks cannot refer to records via $x, $*, etc diff --git a/test/cases/dsl-context-specific-validation/0013/experr b/test/cases/dsl-context-specific-validation/0013/experr index e8c49427b..5bfdbce14 100644 --- a/test/cases/dsl-context-specific-validation/0013/experr +++ b/test/cases/dsl-context-specific-validation/0013/experr @@ -1 +1 @@ -mlr: begin/end blocks cannot refer to records via $x, $*, etc. +mlr: begin/end blocks cannot refer to records via $x, $*, etc diff --git a/test/cases/dsl-context-specific-validation/0014/experr b/test/cases/dsl-context-specific-validation/0014/experr index e8c49427b..5bfdbce14 100644 --- a/test/cases/dsl-context-specific-validation/0014/experr +++ b/test/cases/dsl-context-specific-validation/0014/experr @@ -1 +1 @@ -mlr: begin/end blocks cannot refer to records via $x, $*, etc. +mlr: begin/end blocks cannot refer to records via $x, $*, etc diff --git a/test/cases/dsl-context-specific-validation/0015/experr b/test/cases/dsl-context-specific-validation/0015/experr index f70d4eb66..88c27a41c 100644 --- a/test/cases/dsl-context-specific-validation/0015/experr +++ b/test/cases/dsl-context-specific-validation/0015/experr @@ -1 +1 @@ -mlr: break statements are only valid within for/do/while loops. 
+mlr: break statements are only valid within for/do/while loops diff --git a/test/cases/dsl-context-specific-validation/0016/experr b/test/cases/dsl-context-specific-validation/0016/experr index f70d4eb66..88c27a41c 100644 --- a/test/cases/dsl-context-specific-validation/0016/experr +++ b/test/cases/dsl-context-specific-validation/0016/experr @@ -1 +1 @@ -mlr: break statements are only valid within for/do/while loops. +mlr: break statements are only valid within for/do/while loops diff --git a/test/cases/dsl-context-specific-validation/0017/experr b/test/cases/dsl-context-specific-validation/0017/experr index f70d4eb66..88c27a41c 100644 --- a/test/cases/dsl-context-specific-validation/0017/experr +++ b/test/cases/dsl-context-specific-validation/0017/experr @@ -1 +1 @@ -mlr: break statements are only valid within for/do/while loops. +mlr: break statements are only valid within for/do/while loops diff --git a/test/cases/dsl-context-specific-validation/0018/experr b/test/cases/dsl-context-specific-validation/0018/experr index f70d4eb66..88c27a41c 100644 --- a/test/cases/dsl-context-specific-validation/0018/experr +++ b/test/cases/dsl-context-specific-validation/0018/experr @@ -1 +1 @@ -mlr: break statements are only valid within for/do/while loops. +mlr: break statements are only valid within for/do/while loops diff --git a/test/cases/dsl-context-specific-validation/0024/experr b/test/cases/dsl-context-specific-validation/0024/experr index 0991fd583..326c8c258 100644 --- a/test/cases/dsl-context-specific-validation/0024/experr +++ b/test/cases/dsl-context-specific-validation/0024/experr @@ -1 +1 @@ -mlr: filter expressions must not also contain the "filter" keyword. 
+mlr: filter expressions must not also contain the "filter" keyword diff --git a/test/cases/dsl-context-specific-validation/0025/experr b/test/cases/dsl-context-specific-validation/0025/experr index 0991fd583..326c8c258 100644 --- a/test/cases/dsl-context-specific-validation/0025/experr +++ b/test/cases/dsl-context-specific-validation/0025/experr @@ -1 +1 @@ -mlr: filter expressions must not also contain the "filter" keyword. +mlr: filter expressions must not also contain the "filter" keyword diff --git a/test/cases/dsl-context-validation/0001/experr b/test/cases/dsl-context-validation/0001/experr index 153ac97b2..9ccf96101 100644 --- a/test/cases/dsl-context-validation/0001/experr +++ b/test/cases/dsl-context-validation/0001/experr @@ -1 +1 @@ -mlr: begin blocks can only be at top level. +mlr: begin blocks can only be at top level diff --git a/test/cases/dsl-context-validation/0002/experr b/test/cases/dsl-context-validation/0002/experr index 1bf2e1cd8..2e5c850a0 100644 --- a/test/cases/dsl-context-validation/0002/experr +++ b/test/cases/dsl-context-validation/0002/experr @@ -1 +1 @@ -mlr: end blocks can only be at top level. +mlr: end blocks can only be at top level diff --git a/test/cases/dsl-context-validation/0003/experr b/test/cases/dsl-context-validation/0003/experr index 153ac97b2..9ccf96101 100644 --- a/test/cases/dsl-context-validation/0003/experr +++ b/test/cases/dsl-context-validation/0003/experr @@ -1 +1 @@ -mlr: begin blocks can only be at top level. +mlr: begin blocks can only be at top level diff --git a/test/cases/dsl-context-validation/0004/experr b/test/cases/dsl-context-validation/0004/experr index 1bf2e1cd8..2e5c850a0 100644 --- a/test/cases/dsl-context-validation/0004/experr +++ b/test/cases/dsl-context-validation/0004/experr @@ -1 +1 @@ -mlr: end blocks can only be at top level. 
+mlr: end blocks can only be at top level diff --git a/test/cases/dsl-context-validation/0005/experr b/test/cases/dsl-context-validation/0005/experr index 28403d108..105a572c5 100644 --- a/test/cases/dsl-context-validation/0005/experr +++ b/test/cases/dsl-context-validation/0005/experr @@ -1 +1 @@ -mlr: func blocks can only be at top level. +mlr: func blocks can only be at top level diff --git a/test/cases/dsl-context-validation/0006/experr b/test/cases/dsl-context-validation/0006/experr index aa4913898..bb100265c 100644 --- a/test/cases/dsl-context-validation/0006/experr +++ b/test/cases/dsl-context-validation/0006/experr @@ -1 +1 @@ -mlr: subr blocks can only be at top level. +mlr: subr blocks can only be at top level diff --git a/test/cases/dsl-context-validation/0007/experr b/test/cases/dsl-context-validation/0007/experr index e8c49427b..5bfdbce14 100644 --- a/test/cases/dsl-context-validation/0007/experr +++ b/test/cases/dsl-context-validation/0007/experr @@ -1 +1 @@ -mlr: begin/end blocks cannot refer to records via $x, $*, etc. +mlr: begin/end blocks cannot refer to records via $x, $*, etc diff --git a/test/cases/dsl-context-validation/0008/experr b/test/cases/dsl-context-validation/0008/experr index 9979f9d90..62e02c192 100644 --- a/test/cases/dsl-context-validation/0008/experr +++ b/test/cases/dsl-context-validation/0008/experr @@ -1 +1 @@ -mlr: return statements are only valid within func/subr blocks. +mlr: return statements are only valid within func/subr blocks diff --git a/test/cases/dsl-context-validation/0009/experr b/test/cases/dsl-context-validation/0009/experr index f70d4eb66..88c27a41c 100644 --- a/test/cases/dsl-context-validation/0009/experr +++ b/test/cases/dsl-context-validation/0009/experr @@ -1 +1 @@ -mlr: break statements are only valid within for/do/while loops. 
+mlr: break statements are only valid within for/do/while loops diff --git a/test/cases/dsl-context-validation/0010/experr b/test/cases/dsl-context-validation/0010/experr index f70d4eb66..88c27a41c 100644 --- a/test/cases/dsl-context-validation/0010/experr +++ b/test/cases/dsl-context-validation/0010/experr @@ -1 +1 @@ -mlr: break statements are only valid within for/do/while loops. +mlr: break statements are only valid within for/do/while loops diff --git a/test/cases/dsl-context-validation/0011/experr b/test/cases/dsl-context-validation/0011/experr index f70d4eb66..88c27a41c 100644 --- a/test/cases/dsl-context-validation/0011/experr +++ b/test/cases/dsl-context-validation/0011/experr @@ -1 +1 @@ -mlr: break statements are only valid within for/do/while loops. +mlr: break statements are only valid within for/do/while loops diff --git a/test/cases/dsl-context-validation/0012/experr b/test/cases/dsl-context-validation/0012/experr index f70d4eb66..88c27a41c 100644 --- a/test/cases/dsl-context-validation/0012/experr +++ b/test/cases/dsl-context-validation/0012/experr @@ -1 +1 @@ -mlr: break statements are only valid within for/do/while loops. +mlr: break statements are only valid within for/do/while loops diff --git a/test/cases/dsl-empty-statements/0002/experr b/test/cases/dsl-empty-statements/0002/experr index 7d441dec9..ade7a9c32 100644 --- a/test/cases/dsl-empty-statements/0002/experr +++ b/test/cases/dsl-empty-statements/0002/experr @@ -1 +1 @@ -mlr: filter statement must not be empty. +mlr: filter statement must not be empty diff --git a/test/cases/dsl-env/0008/experr b/test/cases/dsl-env/0008/experr index c58cbb290..624b07c03 100644 --- a/test/cases/dsl-env/0008/experr +++ b/test/cases/dsl-env/0008/experr @@ -1 +1 @@ -mlr: ENV[...] cannot be indexed. +mlr: ENV[...] 
cannot be indexed diff --git a/test/cases/dsl-first-class-functions/sort-errors-04/experr b/test/cases/dsl-first-class-functions/sort-errors-04/experr index 41f95ed46..4a9e7afda 100644 --- a/test/cases/dsl-first-class-functions/sort-errors-04/experr +++ b/test/cases/dsl-first-class-functions/sort-errors-04/experr @@ -1,2 +1 @@ -mlr: function sort takes maximum argument count 2; got 4. - +mlr: function sort takes maximum argument count 2; got 4 diff --git a/test/cases/dsl-first-class-functions/sort-errors-05/experr b/test/cases/dsl-first-class-functions/sort-errors-05/experr index 41f95ed46..4a9e7afda 100644 --- a/test/cases/dsl-first-class-functions/sort-errors-05/experr +++ b/test/cases/dsl-first-class-functions/sort-errors-05/experr @@ -1,2 +1 @@ -mlr: function sort takes maximum argument count 2; got 4. - +mlr: function sort takes maximum argument count 2; got 4 diff --git a/test/cases/dsl-for-oosvar-loops/0005/experr b/test/cases/dsl-for-oosvar-loops/0005/experr index 69c0cca99..31b1667e3 100644 --- a/test/cases/dsl-for-oosvar-loops/0005/experr +++ b/test/cases/dsl-for-oosvar-loops/0005/experr @@ -1 +1 @@ -mlr: redefinition of variable k in the same scope. +mlr: redefinition of variable k in the same scope diff --git a/test/cases/dsl-for-oosvar-loops/0006/experr b/test/cases/dsl-for-oosvar-loops/0006/experr index 69c0cca99..31b1667e3 100644 --- a/test/cases/dsl-for-oosvar-loops/0006/experr +++ b/test/cases/dsl-for-oosvar-loops/0006/experr @@ -1 +1 @@ -mlr: redefinition of variable k in the same scope. +mlr: redefinition of variable k in the same scope diff --git a/test/cases/dsl-for-oosvar-loops/0007/experr b/test/cases/dsl-for-oosvar-loops/0007/experr index 5226846af..e26fb2c94 100644 --- a/test/cases/dsl-for-oosvar-loops/0007/experr +++ b/test/cases/dsl-for-oosvar-loops/0007/experr @@ -1 +1 @@ -mlr: redefinition of variable a in the same scope. 
+mlr: redefinition of variable a in the same scope diff --git a/test/cases/dsl-for-oosvar-loops/0008/experr b/test/cases/dsl-for-oosvar-loops/0008/experr index 5226846af..e26fb2c94 100644 --- a/test/cases/dsl-for-oosvar-loops/0008/experr +++ b/test/cases/dsl-for-oosvar-loops/0008/experr @@ -1 +1 @@ -mlr: redefinition of variable a in the same scope. +mlr: redefinition of variable a in the same scope diff --git a/test/cases/dsl-for-oosvar-loops/0009/experr b/test/cases/dsl-for-oosvar-loops/0009/experr index 014bc942f..0ec7e995f 100644 --- a/test/cases/dsl-for-oosvar-loops/0009/experr +++ b/test/cases/dsl-for-oosvar-loops/0009/experr @@ -1 +1 @@ -mlr: redefinition of variable b in the same scope. +mlr: redefinition of variable b in the same scope diff --git a/test/cases/dsl-for-oosvar-loops/0010/experr b/test/cases/dsl-for-oosvar-loops/0010/experr index 5226846af..e26fb2c94 100644 --- a/test/cases/dsl-for-oosvar-loops/0010/experr +++ b/test/cases/dsl-for-oosvar-loops/0010/experr @@ -1 +1 @@ -mlr: redefinition of variable a in the same scope. +mlr: redefinition of variable a in the same scope diff --git a/test/cases/dsl-for-oosvar-loops/0011/experr b/test/cases/dsl-for-oosvar-loops/0011/experr index 5226846af..e26fb2c94 100644 --- a/test/cases/dsl-for-oosvar-loops/0011/experr +++ b/test/cases/dsl-for-oosvar-loops/0011/experr @@ -1 +1 @@ -mlr: redefinition of variable a in the same scope. +mlr: redefinition of variable a in the same scope diff --git a/test/cases/dsl-for-oosvar-loops/0012/experr b/test/cases/dsl-for-oosvar-loops/0012/experr index 5226846af..e26fb2c94 100644 --- a/test/cases/dsl-for-oosvar-loops/0012/experr +++ b/test/cases/dsl-for-oosvar-loops/0012/experr @@ -1 +1 @@ -mlr: redefinition of variable a in the same scope. 
+mlr: redefinition of variable a in the same scope diff --git a/test/cases/dsl-for-oosvar-loops/0013/experr b/test/cases/dsl-for-oosvar-loops/0013/experr index 014bc942f..0ec7e995f 100644 --- a/test/cases/dsl-for-oosvar-loops/0013/experr +++ b/test/cases/dsl-for-oosvar-loops/0013/experr @@ -1 +1 @@ -mlr: redefinition of variable b in the same scope. +mlr: redefinition of variable b in the same scope diff --git a/test/cases/dsl-for-oosvar-loops/0014/experr b/test/cases/dsl-for-oosvar-loops/0014/experr index 014bc942f..0ec7e995f 100644 --- a/test/cases/dsl-for-oosvar-loops/0014/experr +++ b/test/cases/dsl-for-oosvar-loops/0014/experr @@ -1 +1 @@ -mlr: redefinition of variable b in the same scope. +mlr: redefinition of variable b in the same scope diff --git a/test/cases/dsl-for-oosvar-loops/0015/experr b/test/cases/dsl-for-oosvar-loops/0015/experr index 91d20ead9..b5ded9d15 100644 --- a/test/cases/dsl-for-oosvar-loops/0015/experr +++ b/test/cases/dsl-for-oosvar-loops/0015/experr @@ -1 +1 @@ -mlr: redefinition of variable c in the same scope. +mlr: redefinition of variable c in the same scope diff --git a/test/cases/dsl-for-variants/0006/experr b/test/cases/dsl-for-variants/0006/experr index 148fb6f19..0c3735274 100644 --- a/test/cases/dsl-for-variants/0006/experr +++ b/test/cases/dsl-for-variants/0006/experr @@ -1 +1 @@ -mlr: the triple-for continutation statement must be a bare boolean. +mlr: the triple-for continuation statement must be a bare boolean diff --git a/test/cases/dsl-for-variants/0008/experr b/test/cases/dsl-for-variants/0008/experr index ad7b7a3ce..8e4a46c2e 100644 --- a/test/cases/dsl-for-variants/0008/experr +++ b/test/cases/dsl-for-variants/0008/experr @@ -1 +1 @@ -mlr: the final triple-for continutation statement must be a bare boolean. 
+mlr: the final triple-for continuation statement must be a bare boolean diff --git a/test/cases/dsl-for-variants/0009/experr b/test/cases/dsl-for-variants/0009/experr index 48da09e23..f6e32effe 100644 --- a/test/cases/dsl-for-variants/0009/experr +++ b/test/cases/dsl-for-variants/0009/experr @@ -1 +1 @@ -mlr: the non-final triple-for continutation statements must be assignments. +mlr: the non-final triple-for continuation statements must be assignments diff --git a/test/cases/dsl-for-variants/0010/experr b/test/cases/dsl-for-variants/0010/experr index 48da09e23..f6e32effe 100644 --- a/test/cases/dsl-for-variants/0010/experr +++ b/test/cases/dsl-for-variants/0010/experr @@ -1 +1 @@ -mlr: the non-final triple-for continutation statements must be assignments. +mlr: the non-final triple-for continuation statements must be assignments diff --git a/test/cases/dsl-forbind-typedecl/0002/experr b/test/cases/dsl-forbind-typedecl/0002/experr index 870536d9d..2d3ce42b4 100644 --- a/test/cases/dsl-forbind-typedecl/0002/experr +++ b/test/cases/dsl-forbind-typedecl/0002/experr @@ -1,2 +1 @@ mlr: couldn't assign variable float i from value int 0 - diff --git a/test/cases/dsl-forbind-typedecl/0004/experr b/test/cases/dsl-forbind-typedecl/0004/experr index f19d3e91b..350883003 100644 --- a/test/cases/dsl-forbind-typedecl/0004/experr +++ b/test/cases/dsl-forbind-typedecl/0004/experr @@ -1,2 +1 @@ mlr: couldn't assign variable int i from value float 1.50000000 - diff --git a/test/cases/dsl-forbind-typedecl/0005/experr b/test/cases/dsl-forbind-typedecl/0005/experr index 02ec7d367..bc277f19f 100644 --- a/test/cases/dsl-forbind-typedecl/0005/experr +++ b/test/cases/dsl-forbind-typedecl/0005/experr @@ -1,2 +1 @@ mlr: couldn't assign variable int i from value float 1.00000000 - diff --git a/test/cases/dsl-functional-tests/0051/expout b/test/cases/dsl-functional-tests/0051/expout index d14a2c4d2..97353ee3a 100644 --- a/test/cases/dsl-functional-tests/0051/expout +++ 
b/test/cases/dsl-functional-tests/0051/expout @@ -60,5 +60,3 @@ "zsgnt": "int" } ] -[ -] diff --git a/test/cases/dsl-lashed-emitp-singles/0075/experr b/test/cases/dsl-lashed-emitp-singles/0075/experr index abe43d98a..c90eb2309 100644 --- a/test/cases/dsl-lashed-emitp-singles/0075/experr +++ b/test/cases/dsl-lashed-emitp-singles/0075/experr @@ -1 +1 @@ -mlr: lashed-emit node types must be local variables, field names, or oosvars; got map literal. +mlr: lashed-emit node types must be local variables, field names, or oosvars; got map literal diff --git a/test/cases/dsl-line-number-column-number/cond/experr b/test/cases/dsl-line-number-column-number/cond/experr index 7d9b1ed3f..0993a5d51 100644 --- a/test/cases/dsl-line-number-column-number/cond/experr +++ b/test/cases/dsl-line-number-column-number/cond/experr @@ -1 +1 @@ -mlr: conditional expression did not evaluate to boolean at DSL expression line 5 column 3. +mlr: conditional expression did not evaluate to boolean at DSL expression line 5 column 3 diff --git a/test/cases/dsl-line-number-column-number/do-while/experr b/test/cases/dsl-line-number-column-number/do-while/experr index 2ae50c49a..4b2d5dfa8 100644 --- a/test/cases/dsl-line-number-column-number/do-while/experr +++ b/test/cases/dsl-line-number-column-number/do-while/experr @@ -1 +1 @@ -mlr: conditional expression did not evaluate to boolean at DSL expression line 6 column 12. +mlr: conditional expression did not evaluate to boolean at DSL expression line 6 column 12 diff --git a/test/cases/dsl-line-number-column-number/for/experr b/test/cases/dsl-line-number-column-number/for/experr index a99b7edd3..0c385cf26 100644 --- a/test/cases/dsl-line-number-column-number/for/experr +++ b/test/cases/dsl-line-number-column-number/for/experr @@ -1 +1 @@ -mlr: for-loop continuation did not evaluate to boolean at DSL expression line 5 column 9. 
+mlr: for-loop continuation did not evaluate to boolean at DSL expression line 5 column 9 diff --git a/test/cases/dsl-line-number-column-number/if/experr b/test/cases/dsl-line-number-column-number/if/experr index 2c953ec07..8b4f058c6 100644 --- a/test/cases/dsl-line-number-column-number/if/experr +++ b/test/cases/dsl-line-number-column-number/if/experr @@ -1 +1 @@ -mlr: conditional expression did not evaluate to boolean at DSL expression line 5 column 7. +mlr: conditional expression did not evaluate to boolean at DSL expression line 5 column 7 diff --git a/test/cases/dsl-line-number-column-number/while/experr b/test/cases/dsl-line-number-column-number/while/experr index 977d70e73..eb2268d9f 100644 --- a/test/cases/dsl-line-number-column-number/while/experr +++ b/test/cases/dsl-line-number-column-number/while/experr @@ -1 +1 @@ -mlr: conditional expression did not evaluate to boolean at DSL expression line 5 column 10. +mlr: conditional expression did not evaluate to boolean at DSL expression line 5 column 10 diff --git a/test/cases/dsl-local-map-variable-typedecl/0003/experr b/test/cases/dsl-local-map-variable-typedecl/0003/experr index 74d6d8036..35f4a78af 100644 --- a/test/cases/dsl-local-map-variable-typedecl/0003/experr +++ b/test/cases/dsl-local-map-variable-typedecl/0003/experr @@ -1,2 +1 @@ mlr: couldn't assign variable map a from value int 2 - diff --git a/test/cases/dsl-local-map-variable-typedecl/0004/experr b/test/cases/dsl-local-map-variable-typedecl/0004/experr index 74d6d8036..35f4a78af 100644 --- a/test/cases/dsl-local-map-variable-typedecl/0004/experr +++ b/test/cases/dsl-local-map-variable-typedecl/0004/experr @@ -1,2 +1 @@ mlr: couldn't assign variable map a from value int 2 - diff --git a/test/cases/dsl-localvar-typedecl/0002/experr b/test/cases/dsl-localvar-typedecl/0002/experr index 7bf2edfba..80caea3e4 100644 --- a/test/cases/dsl-localvar-typedecl/0002/experr +++ b/test/cases/dsl-localvar-typedecl/0002/experr @@ -1,2 +1 @@ mlr: couldn't assign 
variable str a from value int 1 - diff --git a/test/cases/dsl-localvar-typedecl/0003/experr b/test/cases/dsl-localvar-typedecl/0003/experr index f2baa162e..89b17ca3e 100644 --- a/test/cases/dsl-localvar-typedecl/0003/experr +++ b/test/cases/dsl-localvar-typedecl/0003/experr @@ -1,2 +1 @@ mlr: couldn't assign variable int a from value string pan - diff --git a/test/cases/dsl-map-funcs/0003/experr b/test/cases/dsl-map-funcs/0003/experr index b46352b1b..aebcb76c3 100644 --- a/test/cases/dsl-map-funcs/0003/experr +++ b/test/cases/dsl-map-funcs/0003/experr @@ -1,2 +1 @@ -mlr: function mapexcept takes minimum argument count 1; got 0. - +mlr: function mapexcept takes minimum argument count 1; got 0 diff --git a/test/cases/dsl-map-funcs/0004/experr b/test/cases/dsl-map-funcs/0004/experr index cdbfc0f29..a177b4adc 100644 --- a/test/cases/dsl-map-funcs/0004/experr +++ b/test/cases/dsl-map-funcs/0004/experr @@ -1,2 +1 @@ -mlr: function mapselect takes minimum argument count 1; got 0. - +mlr: function mapselect takes minimum argument count 1; got 0 diff --git a/test/cases/dsl-mapsum-mapdiff-mapexcept/0003/experr b/test/cases/dsl-mapsum-mapdiff-mapexcept/0003/experr index b46352b1b..aebcb76c3 100644 --- a/test/cases/dsl-mapsum-mapdiff-mapexcept/0003/experr +++ b/test/cases/dsl-mapsum-mapdiff-mapexcept/0003/experr @@ -1,2 +1 @@ -mlr: function mapexcept takes minimum argument count 1; got 0. - +mlr: function mapexcept takes minimum argument count 1; got 0 diff --git a/test/cases/dsl-mapsum-mapdiff-mapexcept/0004/experr b/test/cases/dsl-mapsum-mapdiff-mapexcept/0004/experr index cdbfc0f29..a177b4adc 100644 --- a/test/cases/dsl-mapsum-mapdiff-mapexcept/0004/experr +++ b/test/cases/dsl-mapsum-mapdiff-mapexcept/0004/experr @@ -1,2 +1 @@ -mlr: function mapselect takes minimum argument count 1; got 0. 
- +mlr: function mapselect takes minimum argument count 1; got 0 diff --git a/test/cases/dsl-mapvar-assignments/0050/experr b/test/cases/dsl-mapvar-assignments/0050/experr index 7da47746e..c99edb2b9 100644 --- a/test/cases/dsl-mapvar-assignments/0050/experr +++ b/test/cases/dsl-mapvar-assignments/0050/experr @@ -1,2 +1 @@ mlr: couldn't assign variable map o from value int 1 - diff --git a/test/cases/dsl-mapvar-assignments/0056/experr b/test/cases/dsl-mapvar-assignments/0056/experr index 7da47746e..c99edb2b9 100644 --- a/test/cases/dsl-mapvar-assignments/0056/experr +++ b/test/cases/dsl-mapvar-assignments/0056/experr @@ -1,2 +1 @@ mlr: couldn't assign variable map o from value int 1 - diff --git a/test/cases/dsl-mapvars-udfs-subroutines/0006/experr b/test/cases/dsl-mapvars-udfs-subroutines/0006/experr index d99ee6563..e82e26c86 100644 --- a/test/cases/dsl-mapvars-udfs-subroutines/0006/experr +++ b/test/cases/dsl-mapvars-udfs-subroutines/0006/experr @@ -1 +1 @@ -mlr: couldn't assign variable int x from value float 0.34679014 +mlr: couldn't assign variable int x from value float 0.34679014 \ No newline at end of file diff --git a/test/cases/dsl-mapvars-udfs-subroutines/0008/experr b/test/cases/dsl-mapvars-udfs-subroutines/0008/experr index d99ee6563..e82e26c86 100644 --- a/test/cases/dsl-mapvars-udfs-subroutines/0008/experr +++ b/test/cases/dsl-mapvars-udfs-subroutines/0008/experr @@ -1 +1 @@ -mlr: couldn't assign variable int x from value float 0.34679014 +mlr: couldn't assign variable int x from value float 0.34679014 \ No newline at end of file diff --git a/test/cases/dsl-mapvars-udfs-subroutines/0010/experr b/test/cases/dsl-mapvars-udfs-subroutines/0010/experr index d99ee6563..e82e26c86 100644 --- a/test/cases/dsl-mapvars-udfs-subroutines/0010/experr +++ b/test/cases/dsl-mapvars-udfs-subroutines/0010/experr @@ -1 +1 @@ -mlr: couldn't assign variable int x from value float 0.34679014 +mlr: couldn't assign variable int x from value float 0.34679014 \ No newline at 
end of file diff --git a/test/cases/dsl-mapvars-udfs-subroutines/0011/experr b/test/cases/dsl-mapvars-udfs-subroutines/0011/experr index 5ee09d0dc..23c61d240 100644 --- a/test/cases/dsl-mapvars-udfs-subroutines/0011/experr +++ b/test/cases/dsl-mapvars-udfs-subroutines/0011/experr @@ -1 +1 @@ -mlr: couldn't assign variable int function return value from value absent (absent) +mlr: couldn't assign variable int function return value from value absent (absent) \ No newline at end of file diff --git a/test/cases/dsl-mapvars-udfs-subroutines/0012/experr b/test/cases/dsl-mapvars-udfs-subroutines/0012/experr index 75f9941ea..b52b23036 100644 --- a/test/cases/dsl-mapvars-udfs-subroutines/0012/experr +++ b/test/cases/dsl-mapvars-udfs-subroutines/0012/experr @@ -1,2 +1 @@ mlr: couldn't assign variable var b from value error (error) - diff --git a/test/cases/dsl-match/0001/cmd b/test/cases/dsl-match/0001/cmd new file mode 100644 index 000000000..0e3ce7786 --- /dev/null +++ b/test/cases/dsl-match/0001/cmd @@ -0,0 +1 @@ +mlr --ojsonl --from ${CASEDIR}/input put -f ${CASEDIR}/mlr diff --git a/test/cases/io-format-conversion-keystroke-savers/0005/experr b/test/cases/dsl-match/0001/experr similarity index 100% rename from test/cases/io-format-conversion-keystroke-savers/0005/experr rename to test/cases/dsl-match/0001/experr diff --git a/test/cases/dsl-match/0001/expout b/test/cases/dsl-match/0001/expout new file mode 100644 index 000000000..e025c7754 --- /dev/null +++ b/test/cases/dsl-match/0001/expout @@ -0,0 +1,11 @@ +{"x": "a", "y": "b", "z": false} +{"x": "abc", "y": "ab", "z": true} +{"x": " 345 78 ", "y": "([0-9]+)", "z": true} +{"x": " 345 78 ", "y": "([0-9]+) ([0-9]+)", "z": true} +{"x": " 345 78 ", "y": "([0-9]+)(.)([0-9]+)", "z": true} +{"x": "", "y": "", "z": true} +{"x": "", "y": "b", "z": false} +{"x": "a", "y": "", "z": true} +{"x": "a", "z": (error)} +{"y": "b", "z": (error)} +{"foo": "bar", "z": (error)} diff --git a/test/cases/dsl-match/0001/input 
b/test/cases/dsl-match/0001/input new file mode 100644 index 000000000..5facdc4fb --- /dev/null +++ b/test/cases/dsl-match/0001/input @@ -0,0 +1,11 @@ +x=a,y=b +x=abc,y=ab +x= 345 78 ,y=([0-9]+) +x= 345 78 ,y=([0-9]+) ([0-9]+) +x= 345 78 ,y=([0-9]+)(.)([0-9]+) +x=,y= +x=,y=b +x=a,y= +x=a +y=b +foo=bar diff --git a/test/cases/dsl-match/0001/mlr b/test/cases/dsl-match/0001/mlr new file mode 100644 index 000000000..9b015fdb7 --- /dev/null +++ b/test/cases/dsl-match/0001/mlr @@ -0,0 +1 @@ +$z = strmatch($x, $y) diff --git a/test/cases/dsl-match/0002/cmd b/test/cases/dsl-match/0002/cmd new file mode 100644 index 000000000..1fc3ab4d5 --- /dev/null +++ b/test/cases/dsl-match/0002/cmd @@ -0,0 +1 @@ +mlr --ojson --from ${CASEDIR}/input put -f ${CASEDIR}/mlr diff --git a/test/cases/io-format-conversion-keystroke-savers/0006/experr b/test/cases/dsl-match/0002/experr similarity index 100% rename from test/cases/io-format-conversion-keystroke-savers/0006/experr rename to test/cases/dsl-match/0002/experr diff --git a/test/cases/dsl-match/0002/expout b/test/cases/dsl-match/0002/expout new file mode 100644 index 000000000..1c44eb6c3 --- /dev/null +++ b/test/cases/dsl-match/0002/expout @@ -0,0 +1,110 @@ +[ +{ + "x": "a", + "y": "b", + "z": { + "matched": false + } +}, +{ + "x": "abc", + "y": "ab", + "z": { + "matched": true, + "full_capture": "ab", + "full_start": 1, + "full_end": 2 + } +}, +{ + "x": " 345 78 ", + "y": "([0-9]+)", + "z": { + "matched": true, + "full_capture": "345", + "full_start": 3, + "full_end": 5, + "captures": ["345"], + "starts": [3], + "ends": [5] + } +}, +{ + "x": " 345 78 ", + "y": "([0-9]+) ([0-9]+)", + "z": { + "matched": true, + "full_capture": "345 78", + "full_start": 3, + "full_end": 8, + "captures": ["345", "78"], + "starts": [3, 7], + "ends": [5, 8] + } +}, +{ + "x": " 345 78 ", + "y": "([0-9]+)(.)([0-9]+)", + "z": { + "matched": true, + "full_capture": "345 78", + "full_start": 3, + "full_end": 8, + "captures": ["345", " ", "78"], + "starts": [3, 
6, 7], + "ends": [5, 6, 8] + } +}, +{ + "x": "", + "y": "", + "z": { + "matched": true, + "full_capture": "", + "full_start": 1, + "full_end": 0 + } +}, +{ + "x": "", + "y": "b", + "z": { + "matched": false + } +}, +{ + "x": "a", + "y": "", + "z": { + "matched": true, + "full_capture": "", + "full_start": 1, + "full_end": 0 + } +}, +{ + "x": "a", + "z": (error) +}, +{ + "y": "b", + "z": (error) +}, +{ + "foo": "bar", + "z": (error) +}, +{ + "x": "1234567890abcdefghij", + "y": "(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)", + "z": { + "matched": true, + "full_capture": "1234567890abcdefghij", + "full_start": 1, + "full_end": 20, + "captures": ["1", "2", "3", "4", "5", "6", "7", "8", "9", "0", "a", "b", "c", "d", "e", "f", "g", "h", "i", "j"], + "starts": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20], + "ends": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20] + } +} +] diff --git a/test/cases/dsl-match/0002/input b/test/cases/dsl-match/0002/input new file mode 100644 index 000000000..10308f01d --- /dev/null +++ b/test/cases/dsl-match/0002/input @@ -0,0 +1,12 @@ +x=a,y=b +x=abc,y=ab +x= 345 78 ,y=([0-9]+) +x= 345 78 ,y=([0-9]+) ([0-9]+) +x= 345 78 ,y=([0-9]+)(.)([0-9]+) +x=,y= +x=,y=b +x=a,y= +x=a +y=b +foo=bar +x=1234567890abcdefghij,y=(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.) diff --git a/test/cases/dsl-match/0002/mlr b/test/cases/dsl-match/0002/mlr new file mode 100644 index 000000000..184b3e286 --- /dev/null +++ b/test/cases/dsl-match/0002/mlr @@ -0,0 +1 @@ +$z = strmatchx($x, $y) diff --git a/test/cases/dsl-no-filter-in-filter/0002/experr b/test/cases/dsl-no-filter-in-filter/0002/experr index 0991fd583..326c8c258 100644 --- a/test/cases/dsl-no-filter-in-filter/0002/experr +++ b/test/cases/dsl-no-filter-in-filter/0002/experr @@ -1 +1 @@ -mlr: filter expressions must not also contain the "filter" keyword. 
+mlr: filter expressions must not also contain the "filter" keyword diff --git a/test/cases/dsl-output-redirects/0071/expout b/test/cases/dsl-output-redirects/0071/expout index 4a1435f7c..eed189aad 100644 --- a/test/cases/dsl-output-redirects/0071/expout +++ b/test/cases/dsl-output-redirects/0071/expout @@ -9,5 +9,3 @@ x 8 9 10 -[ -] diff --git a/test/cases/dsl-parse/0112/experr b/test/cases/dsl-parse/0112/experr index c83d31e2e..5e6a6bb14 100644 --- a/test/cases/dsl-parse/0112/experr +++ b/test/cases/dsl-parse/0112/experr @@ -1 +1 @@ -mlr: int literal is not valid for unset statement. +mlr: int literal is not valid for unset statement diff --git a/test/cases/dsl-regex-matching/0017/cmd b/test/cases/dsl-regex-matching/0017/cmd new file mode 100644 index 000000000..6add080d4 --- /dev/null +++ b/test/cases/dsl-regex-matching/0017/cmd @@ -0,0 +1 @@ +mlr -n put -f ${CASEDIR}/mlr diff --git a/test/cases/io-format-conversion-keystroke-savers/0007/experr b/test/cases/dsl-regex-matching/0017/experr similarity index 100% rename from test/cases/io-format-conversion-keystroke-savers/0007/experr rename to test/cases/dsl-regex-matching/0017/experr diff --git a/test/cases/dsl-regex-matching/0017/expout b/test/cases/dsl-regex-matching/0017/expout new file mode 100644 index 000000000..860e81046 --- /dev/null +++ b/test/cases/dsl-regex-matching/0017/expout @@ -0,0 +1,6 @@ +OUTER PRE: 123 abc +OUTER PRE: 123 abc +INNER: 456 defg +INNER: 456 defg +OUTER POST: 123 abc +OUTER POST: 123 abc diff --git a/test/cases/dsl-regex-matching/0017/mlr b/test/cases/dsl-regex-matching/0017/mlr new file mode 100644 index 000000000..bec25114e --- /dev/null +++ b/test/cases/dsl-regex-matching/0017/mlr @@ -0,0 +1,15 @@ +func f() { + if ("456 defg" =~ "([0-9]+) ([a-z]+)") { + print "INNER: \1 \2"; + print "INNER: \1 \2"; + } +} +end { + if ("123 abc" =~ "([0-9]+) ([a-z]+)") { + print "OUTER PRE: \1 \2"; + print "OUTER PRE: \1 \2"; + f(); + print "OUTER POST: \1 \2"; + print "OUTER POST: \1 \2"; + } +} 
diff --git a/test/cases/dsl-regex-matching/0018/cmd b/test/cases/dsl-regex-matching/0018/cmd new file mode 100644 index 000000000..6add080d4 --- /dev/null +++ b/test/cases/dsl-regex-matching/0018/cmd @@ -0,0 +1 @@ +mlr -n put -f ${CASEDIR}/mlr diff --git a/test/cases/io-format-conversion-keystroke-savers/0008/experr b/test/cases/dsl-regex-matching/0018/experr similarity index 100% rename from test/cases/io-format-conversion-keystroke-savers/0008/experr rename to test/cases/dsl-regex-matching/0018/experr diff --git a/test/cases/dsl-regex-matching/0018/expout b/test/cases/dsl-regex-matching/0018/expout new file mode 100644 index 000000000..860e81046 --- /dev/null +++ b/test/cases/dsl-regex-matching/0018/expout @@ -0,0 +1,6 @@ +OUTER PRE: 123 abc +OUTER PRE: 123 abc +INNER: 456 defg +INNER: 456 defg +OUTER POST: 123 abc +OUTER POST: 123 abc diff --git a/test/cases/dsl-regex-matching/0018/mlr b/test/cases/dsl-regex-matching/0018/mlr new file mode 100644 index 000000000..992fa1d0b --- /dev/null +++ b/test/cases/dsl-regex-matching/0018/mlr @@ -0,0 +1,15 @@ +subr s() { + if ("456 defg" =~ "([0-9]+) ([a-z]+)") { + print "INNER: \1 \2"; + print "INNER: \1 \2"; + } +} +end { + if ("123 abc" =~ "([0-9]+) ([a-z]+)") { + print "OUTER PRE: \1 \2"; + print "OUTER PRE: \1 \2"; + call s(); + print "OUTER POST: \1 \2"; + print "OUTER POST: \1 \2"; + } +} diff --git a/test/cases/dsl-regex-matching/null-reset/cmd b/test/cases/dsl-regex-matching/null-reset/cmd new file mode 100644 index 000000000..6add080d4 --- /dev/null +++ b/test/cases/dsl-regex-matching/null-reset/cmd @@ -0,0 +1 @@ +mlr -n put -f ${CASEDIR}/mlr diff --git a/test/cases/io-format-conversion-keystroke-savers/0009/experr b/test/cases/dsl-regex-matching/null-reset/experr similarity index 100% rename from test/cases/io-format-conversion-keystroke-savers/0009/experr rename to test/cases/dsl-regex-matching/null-reset/experr diff --git a/test/cases/dsl-regex-matching/null-reset/expout 
b/test/cases/dsl-regex-matching/null-reset/expout new file mode 100644 index 000000000..38eba4339 --- /dev/null +++ b/test/cases/dsl-regex-matching/null-reset/expout @@ -0,0 +1,9 @@ +[\1]:[\2] +true +[]:[] +true +[a]:[c] +false +[]:[] +null +[\1]:[\2] diff --git a/test/cases/dsl-regex-matching/null-reset/mlr b/test/cases/dsl-regex-matching/null-reset/mlr new file mode 100644 index 000000000..0caec5ae3 --- /dev/null +++ b/test/cases/dsl-regex-matching/null-reset/mlr @@ -0,0 +1,11 @@ +end { + print("[\1]:[\2]"); + print("abc" =~ "..."); + print("[\1]:[\2]"); + print("abc" =~ "(.).(.)"); + print("[\1]:[\2]"); + print("abc" =~ "(.)x(.)"); + print("[\1]:[\2]"); + print("abc" =~ null); + print("[\1]:[\2]"); +} diff --git a/test/cases/dsl-sorts/sorta-natural/expout b/test/cases/dsl-sorts/sorta-natural/expout index 01349be34..05972250a 100644 --- a/test/cases/dsl-sorts/sorta-natural/expout +++ b/test/cases/dsl-sorts/sorta-natural/expout @@ -2,5 +2,3 @@ ["X200", "X20", "X2", "X100", "X10", "X1"] ["X1", "X2", "X10", "X20", "X100", "X200"] ["X200", "X100", "X20", "X10", "X2", "X1"] -[ -] diff --git a/test/cases/dsl-sorts/sortmf-within/expout b/test/cases/dsl-sorts/sortmf-within/expout index c683738c5..acb15cce5 100644 --- a/test/cases/dsl-sorts/sortmf-within/expout +++ b/test/cases/dsl-sorts/sortmf-within/expout @@ -18,5 +18,3 @@ "b": 2, "c": 1 } -[ -] diff --git a/test/cases/dsl-split-join/0021/expout b/test/cases/dsl-split-join/0021/expout index a49c0a717..e69de29bb 100644 --- a/test/cases/dsl-split-join/0021/expout +++ b/test/cases/dsl-split-join/0021/expout @@ -1,4 +0,0 @@ -[3, 4, 5] -[3, 4] -[3] -[] diff --git a/test/cases/dsl-split-join/0021/mlr b/test/cases/dsl-split-join/0021/mlr index 86e3dd532..32232c023 100644 --- a/test/cases/dsl-split-join/0021/mlr +++ b/test/cases/dsl-split-join/0021/mlr @@ -1,6 +1,7 @@ -end { +test/cases/dsl-split-join/0021/mlrend { print splita("3,4,5", ","); print splita("3,4", ","); print splita("3", ","); + print splita(3, ","); print 
splita("", ","); } diff --git a/test/cases/dsl-split-join/0028/cmd b/test/cases/dsl-split-join/0028/cmd new file mode 100644 index 000000000..6add080d4 --- /dev/null +++ b/test/cases/dsl-split-join/0028/cmd @@ -0,0 +1 @@ +mlr -n put -f ${CASEDIR}/mlr diff --git a/test/cases/io-format-conversion-keystroke-savers/0010/experr b/test/cases/dsl-split-join/0028/experr similarity index 100% rename from test/cases/io-format-conversion-keystroke-savers/0010/experr rename to test/cases/dsl-split-join/0028/experr diff --git a/test/cases/dsl-split-join/0028/expout b/test/cases/dsl-split-join/0028/expout new file mode 100644 index 000000000..51b400812 --- /dev/null +++ b/test/cases/dsl-split-join/0028/expout @@ -0,0 +1 @@ +345 diff --git a/test/cases/dsl-split-join/0028/mlr b/test/cases/dsl-split-join/0028/mlr new file mode 100644 index 000000000..08e79d8dc --- /dev/null +++ b/test/cases/dsl-split-join/0028/mlr @@ -0,0 +1 @@ +end {print joinv([3,4,5], "")} diff --git a/test/cases/dsl-stat/0001/cmd b/test/cases/dsl-stat/0001/cmd new file mode 100644 index 000000000..94b141d0d --- /dev/null +++ b/test/cases/dsl-stat/0001/cmd @@ -0,0 +1 @@ +mlr --icsv --ojson put -f ${CASEDIR}/mlr ${CASEDIR}/input.csv diff --git a/test/cases/io-format-conversion-keystroke-savers/0011/experr b/test/cases/dsl-stat/0001/experr similarity index 100% rename from test/cases/io-format-conversion-keystroke-savers/0011/experr rename to test/cases/dsl-stat/0001/experr diff --git a/test/cases/dsl-stat/0001/expout b/test/cases/dsl-stat/0001/expout new file mode 100644 index 000000000..dcb25f94b --- /dev/null +++ b/test/cases/dsl-stat/0001/expout @@ -0,0 +1,12 @@ +[ +{ + "path": "test/cases/dsl-stat/0001/input.csv", + "name": "input.csv", + "isdir": false +}, +{ + "path": "test/cases/dsl-stat/0001/", + "name": "0001", + "isdir": true +} +] diff --git a/test/cases/dsl-stat/0001/input.csv b/test/cases/dsl-stat/0001/input.csv new file mode 100644 index 000000000..d2731b359 --- /dev/null +++ 
b/test/cases/dsl-stat/0001/input.csv @@ -0,0 +1,3 @@ +path +test/cases/dsl-stat/0001/input.csv +test/cases/dsl-stat/0001/ diff --git a/test/cases/dsl-stat/0001/mlr b/test/cases/dsl-stat/0001/mlr new file mode 100644 index 000000000..195f1c6a7 --- /dev/null +++ b/test/cases/dsl-stat/0001/mlr @@ -0,0 +1,3 @@ +s = stat($path); +$name = s["name"]; +$isdir = s["isdir"]; diff --git a/test/cases/dsl-subr/0007/experr b/test/cases/dsl-subr/0007/experr index 57f65d06e..54423defa 100644 --- a/test/cases/dsl-subr/0007/experr +++ b/test/cases/dsl-subr/0007/experr @@ -1 +1 @@ -mlr: return statements in subr blocks must not return a value. +mlr: return statements in subr blocks must not return a value diff --git a/test/cases/dsl-subr/0015/experr b/test/cases/dsl-subr/0015/experr index 96372b783..374f3bf05 100644 --- a/test/cases/dsl-subr/0015/experr +++ b/test/cases/dsl-subr/0015/experr @@ -1 +1 @@ -mlr: subroutine named "s" has already been defined. +mlr: subroutine named "s" has already been defined diff --git a/test/cases/dsl-triple-for-loops/0015/experr b/test/cases/dsl-triple-for-loops/0015/experr index 148fb6f19..0c3735274 100644 --- a/test/cases/dsl-triple-for-loops/0015/experr +++ b/test/cases/dsl-triple-for-loops/0015/experr @@ -1 +1 @@ -mlr: the triple-for continutation statement must be a bare boolean. +mlr: the triple-for continuation statement must be a bare boolean diff --git a/test/cases/dsl-triple-for-loops/0016/experr b/test/cases/dsl-triple-for-loops/0016/experr index 48da09e23..f6e32effe 100644 --- a/test/cases/dsl-triple-for-loops/0016/experr +++ b/test/cases/dsl-triple-for-loops/0016/experr @@ -1 +1 @@ -mlr: the non-final triple-for continutation statements must be assignments. 
+mlr: the non-final triple-for continuation statements must be assignments diff --git a/test/cases/dsl-typedecl/0004/experr b/test/cases/dsl-typedecl/0004/experr index 698f98f19..d773f96b4 100644 --- a/test/cases/dsl-typedecl/0004/experr +++ b/test/cases/dsl-typedecl/0004/experr @@ -1,2 +1 @@ mlr: couldn't assign variable str x from value int 3 - diff --git a/test/cases/dsl-typedecl/0005/experr b/test/cases/dsl-typedecl/0005/experr index d1a782622..a9603769f 100644 --- a/test/cases/dsl-typedecl/0005/experr +++ b/test/cases/dsl-typedecl/0005/experr @@ -1,2 +1 @@ mlr: couldn't assign variable arr x from value int 3 - diff --git a/test/cases/dsl-typedecl/0009/experr b/test/cases/dsl-typedecl/0009/experr index d773f96b4..4294ddc2e 100644 --- a/test/cases/dsl-typedecl/0009/experr +++ b/test/cases/dsl-typedecl/0009/experr @@ -1 +1 @@ -mlr: couldn't assign variable str x from value int 3 +mlr: couldn't assign variable str x from value int 3 \ No newline at end of file diff --git a/test/cases/dsl-typedecl/0010/experr b/test/cases/dsl-typedecl/0010/experr index a9603769f..55d3b8e4d 100644 --- a/test/cases/dsl-typedecl/0010/experr +++ b/test/cases/dsl-typedecl/0010/experr @@ -1 +1 @@ -mlr: couldn't assign variable arr x from value int 3 +mlr: couldn't assign variable arr x from value int 3 \ No newline at end of file diff --git a/test/cases/dsl-typedecl/0014/experr b/test/cases/dsl-typedecl/0014/experr index 83d5d73ae..84cc0ab08 100644 --- a/test/cases/dsl-typedecl/0014/experr +++ b/test/cases/dsl-typedecl/0014/experr @@ -1 +1 @@ -mlr: couldn't assign variable str function return value from value int 6 +mlr: couldn't assign variable str function return value from value int 6 \ No newline at end of file diff --git a/test/cases/dsl-typedecl/0015/experr b/test/cases/dsl-typedecl/0015/experr index 16aa9509e..2e3bd9694 100644 --- a/test/cases/dsl-typedecl/0015/experr +++ b/test/cases/dsl-typedecl/0015/experr @@ -1 +1 @@ -mlr: couldn't assign variable arr function return value 
from value int 6 +mlr: couldn't assign variable arr function return value from value int 6 \ No newline at end of file diff --git a/test/cases/dsl-user-defined-functions-and-subroutines/0004/experr b/test/cases/dsl-user-defined-functions-and-subroutines/0004/experr index 57f65d06e..54423defa 100644 --- a/test/cases/dsl-user-defined-functions-and-subroutines/0004/experr +++ b/test/cases/dsl-user-defined-functions-and-subroutines/0004/experr @@ -1 +1 @@ -mlr: return statements in subr blocks must not return a value. +mlr: return statements in subr blocks must not return a value diff --git a/test/cases/dsl-user-defined-functions-and-subroutines/0005/experr b/test/cases/dsl-user-defined-functions-and-subroutines/0005/experr index 6ffaf460f..8a639f9e1 100644 --- a/test/cases/dsl-user-defined-functions-and-subroutines/0005/experr +++ b/test/cases/dsl-user-defined-functions-and-subroutines/0005/experr @@ -1 +1 @@ -mlr: return statements in func blocks must return a value. +mlr: return statements in func blocks must return a value diff --git a/test/cases/dsl-user-defined-functions-and-subroutines/0008/experr b/test/cases/dsl-user-defined-functions-and-subroutines/0008/experr index 7c7da7e52..17bba472c 100644 --- a/test/cases/dsl-user-defined-functions-and-subroutines/0008/experr +++ b/test/cases/dsl-user-defined-functions-and-subroutines/0008/experr @@ -1 +1 @@ -mlr: function named "log" must not override a built-in function of the same name. +mlr: function named "log" must not override a built-in function of the same name diff --git a/test/cases/dsl-user-defined-functions-and-subroutines/0010/experr b/test/cases/dsl-user-defined-functions-and-subroutines/0010/experr index 153ac97b2..9ccf96101 100644 --- a/test/cases/dsl-user-defined-functions-and-subroutines/0010/experr +++ b/test/cases/dsl-user-defined-functions-and-subroutines/0010/experr @@ -1 +1 @@ -mlr: begin blocks can only be at top level. 
+mlr: begin blocks can only be at top level diff --git a/test/cases/dsl-user-defined-functions-and-subroutines/0011/experr b/test/cases/dsl-user-defined-functions-and-subroutines/0011/experr index 1bf2e1cd8..2e5c850a0 100644 --- a/test/cases/dsl-user-defined-functions-and-subroutines/0011/experr +++ b/test/cases/dsl-user-defined-functions-and-subroutines/0011/experr @@ -1 +1 @@ -mlr: end blocks can only be at top level. +mlr: end blocks can only be at top level diff --git a/test/cases/dsl-user-defined-functions-and-subroutines/0012/experr b/test/cases/dsl-user-defined-functions-and-subroutines/0012/experr index 153ac97b2..9ccf96101 100644 --- a/test/cases/dsl-user-defined-functions-and-subroutines/0012/experr +++ b/test/cases/dsl-user-defined-functions-and-subroutines/0012/experr @@ -1 +1 @@ -mlr: begin blocks can only be at top level. +mlr: begin blocks can only be at top level diff --git a/test/cases/dsl-user-defined-functions-and-subroutines/0013/experr b/test/cases/dsl-user-defined-functions-and-subroutines/0013/experr index 1bf2e1cd8..2e5c850a0 100644 --- a/test/cases/dsl-user-defined-functions-and-subroutines/0013/experr +++ b/test/cases/dsl-user-defined-functions-and-subroutines/0013/experr @@ -1 +1 @@ -mlr: end blocks can only be at top level. +mlr: end blocks can only be at top level diff --git a/test/cases/dsl-user-defined-functions-and-subroutines/0014/experr b/test/cases/dsl-user-defined-functions-and-subroutines/0014/experr index 28403d108..105a572c5 100644 --- a/test/cases/dsl-user-defined-functions-and-subroutines/0014/experr +++ b/test/cases/dsl-user-defined-functions-and-subroutines/0014/experr @@ -1 +1 @@ -mlr: func blocks can only be at top level. 
+mlr: func blocks can only be at top level diff --git a/test/cases/dsl-user-defined-functions-and-subroutines/0015/experr b/test/cases/dsl-user-defined-functions-and-subroutines/0015/experr index aa4913898..bb100265c 100644 --- a/test/cases/dsl-user-defined-functions-and-subroutines/0015/experr +++ b/test/cases/dsl-user-defined-functions-and-subroutines/0015/experr @@ -1 +1 @@ -mlr: subr blocks can only be at top level. +mlr: subr blocks can only be at top level diff --git a/test/cases/dsl-user-defined-functions-and-subroutines/0016/experr b/test/cases/dsl-user-defined-functions-and-subroutines/0016/experr index 28403d108..105a572c5 100644 --- a/test/cases/dsl-user-defined-functions-and-subroutines/0016/experr +++ b/test/cases/dsl-user-defined-functions-and-subroutines/0016/experr @@ -1 +1 @@ -mlr: func blocks can only be at top level. +mlr: func blocks can only be at top level diff --git a/test/cases/dsl-user-defined-functions-and-subroutines/0017/experr b/test/cases/dsl-user-defined-functions-and-subroutines/0017/experr index aa4913898..bb100265c 100644 --- a/test/cases/dsl-user-defined-functions-and-subroutines/0017/experr +++ b/test/cases/dsl-user-defined-functions-and-subroutines/0017/experr @@ -1 +1 @@ -mlr: subr blocks can only be at top level. +mlr: subr blocks can only be at top level diff --git a/test/cases/dsl-user-defined-functions-and-subroutines/0018/experr b/test/cases/dsl-user-defined-functions-and-subroutines/0018/experr index 153ac97b2..9ccf96101 100644 --- a/test/cases/dsl-user-defined-functions-and-subroutines/0018/experr +++ b/test/cases/dsl-user-defined-functions-and-subroutines/0018/experr @@ -1 +1 @@ -mlr: begin blocks can only be at top level. 
+mlr: begin blocks can only be at top level diff --git a/test/cases/dsl-user-defined-functions-and-subroutines/0019/experr b/test/cases/dsl-user-defined-functions-and-subroutines/0019/experr index 1bf2e1cd8..2e5c850a0 100644 --- a/test/cases/dsl-user-defined-functions-and-subroutines/0019/experr +++ b/test/cases/dsl-user-defined-functions-and-subroutines/0019/experr @@ -1 +1 @@ -mlr: end blocks can only be at top level. +mlr: end blocks can only be at top level diff --git a/test/cases/dsl-user-defined-functions-and-subroutines/0020/experr b/test/cases/dsl-user-defined-functions-and-subroutines/0020/experr index 153ac97b2..9ccf96101 100644 --- a/test/cases/dsl-user-defined-functions-and-subroutines/0020/experr +++ b/test/cases/dsl-user-defined-functions-and-subroutines/0020/experr @@ -1 +1 @@ -mlr: begin blocks can only be at top level. +mlr: begin blocks can only be at top level diff --git a/test/cases/dsl-user-defined-functions-and-subroutines/0021/experr b/test/cases/dsl-user-defined-functions-and-subroutines/0021/experr index 1bf2e1cd8..2e5c850a0 100644 --- a/test/cases/dsl-user-defined-functions-and-subroutines/0021/experr +++ b/test/cases/dsl-user-defined-functions-and-subroutines/0021/experr @@ -1 +1 @@ -mlr: end blocks can only be at top level. +mlr: end blocks can only be at top level diff --git a/test/cases/dsl-user-defined-functions-and-subroutines/0022/experr b/test/cases/dsl-user-defined-functions-and-subroutines/0022/experr index 28403d108..105a572c5 100644 --- a/test/cases/dsl-user-defined-functions-and-subroutines/0022/experr +++ b/test/cases/dsl-user-defined-functions-and-subroutines/0022/experr @@ -1 +1 @@ -mlr: func blocks can only be at top level. 
+mlr: func blocks can only be at top level diff --git a/test/cases/dsl-user-defined-functions-and-subroutines/0023/experr b/test/cases/dsl-user-defined-functions-and-subroutines/0023/experr index aa4913898..bb100265c 100644 --- a/test/cases/dsl-user-defined-functions-and-subroutines/0023/experr +++ b/test/cases/dsl-user-defined-functions-and-subroutines/0023/experr @@ -1 +1 @@ -mlr: subr blocks can only be at top level. +mlr: subr blocks can only be at top level diff --git a/test/cases/dsl-user-defined-functions-and-subroutines/0024/experr b/test/cases/dsl-user-defined-functions-and-subroutines/0024/experr index 28403d108..105a572c5 100644 --- a/test/cases/dsl-user-defined-functions-and-subroutines/0024/experr +++ b/test/cases/dsl-user-defined-functions-and-subroutines/0024/experr @@ -1 +1 @@ -mlr: func blocks can only be at top level. +mlr: func blocks can only be at top level diff --git a/test/cases/dsl-user-defined-functions-and-subroutines/0025/experr b/test/cases/dsl-user-defined-functions-and-subroutines/0025/experr index aa4913898..bb100265c 100644 --- a/test/cases/dsl-user-defined-functions-and-subroutines/0025/experr +++ b/test/cases/dsl-user-defined-functions-and-subroutines/0025/experr @@ -1 +1 @@ -mlr: subr blocks can only be at top level. +mlr: subr blocks can only be at top level diff --git a/test/cases/dsl-user-defined-functions-and-subroutines/0026/experr b/test/cases/dsl-user-defined-functions-and-subroutines/0026/experr index 7c7da7e52..17bba472c 100644 --- a/test/cases/dsl-user-defined-functions-and-subroutines/0026/experr +++ b/test/cases/dsl-user-defined-functions-and-subroutines/0026/experr @@ -1 +1 @@ -mlr: function named "log" must not override a built-in function of the same name. 
+mlr: function named "log" must not override a built-in function of the same name diff --git a/test/cases/dsl-user-defined-functions-and-subroutines/0027/experr b/test/cases/dsl-user-defined-functions-and-subroutines/0027/experr index c1c50ef4d..e7f7b7522 100644 --- a/test/cases/dsl-user-defined-functions-and-subroutines/0027/experr +++ b/test/cases/dsl-user-defined-functions-and-subroutines/0027/experr @@ -1 +1 @@ -mlr: function named "f" has already been defined. +mlr: function named "f" has already been defined diff --git a/test/cases/dsl-user-defined-functions-and-subroutines/0028/experr b/test/cases/dsl-user-defined-functions-and-subroutines/0028/experr index 96372b783..374f3bf05 100644 --- a/test/cases/dsl-user-defined-functions-and-subroutines/0028/experr +++ b/test/cases/dsl-user-defined-functions-and-subroutines/0028/experr @@ -1 +1 @@ -mlr: subroutine named "s" has already been defined. +mlr: subroutine named "s" has already been defined diff --git a/test/cases/dsl-user-defined-functions-and-subroutines/0029/experr b/test/cases/dsl-user-defined-functions-and-subroutines/0029/experr index 96372b783..374f3bf05 100644 --- a/test/cases/dsl-user-defined-functions-and-subroutines/0029/experr +++ b/test/cases/dsl-user-defined-functions-and-subroutines/0029/experr @@ -1 +1 @@ -mlr: subroutine named "s" has already been defined. 
+mlr: subroutine named "s" has already been defined diff --git a/test/cases/io-barred-pprint/barred-input-headerless/cmd b/test/cases/io-barred-pprint/barred-input-headerless/cmd new file mode 100644 index 000000000..de4ebd0d8 --- /dev/null +++ b/test/cases/io-barred-pprint/barred-input-headerless/cmd @@ -0,0 +1 @@ +mlr --hi -i pprint --barred-input -o json cat test/input/abixy.tbl diff --git a/test/cases/io-format-conversion-keystroke-savers/0012/experr b/test/cases/io-barred-pprint/barred-input-headerless/experr similarity index 100% rename from test/cases/io-format-conversion-keystroke-savers/0012/experr rename to test/cases/io-barred-pprint/barred-input-headerless/experr diff --git a/test/cases/io-barred-pprint/barred-input-headerless/expout b/test/cases/io-barred-pprint/barred-input-headerless/expout new file mode 100644 index 000000000..e08b56a90 --- /dev/null +++ b/test/cases/io-barred-pprint/barred-input-headerless/expout @@ -0,0 +1,79 @@ +[ +{ + "1": "a", + "2": "b", + "3": "i", + "4": "x", + "5": "y" +}, +{ + "1": "pan", + "2": "pan", + "3": 1, + "4": 0.34679014, + "5": 0.72680286 +}, +{ + "1": "eks", + "2": "pan", + "3": 2, + "4": 0.75867996, + "5": 0.52215111 +}, +{ + "1": "wye", + "2": "wye", + "3": 3, + "4": 0.20460331, + "5": 0.33831853 +}, +{ + "1": "eks", + "2": "wye", + "3": 4, + "4": 0.38139939, + "5": 0.13418874 +}, +{ + "1": "wye", + "2": "pan", + "3": 5, + "4": 0.57328892, + "5": 0.86362447 +}, +{ + "1": "zee", + "2": "pan", + "3": 6, + "4": 0.52712616, + "5": 0.49322129 +}, +{ + "1": "eks", + "2": "zee", + "3": 7, + "4": 0.61178406, + "5": 0.18788492 +}, +{ + "1": "zee", + "2": "wye", + "3": 8, + "4": 0.59855401, + "5": 0.97618139 +}, +{ + "1": "hat", + "2": "wye", + "3": 9, + "4": 0.03144188, + "5": 0.74955076 +}, +{ + "1": "pan", + "2": "wye", + "3": 10, + "4": 0.50262601, + "5": 0.95261836 +} +] diff --git a/test/cases/io-barred-pprint/barred-input/cmd b/test/cases/io-barred-pprint/barred-input/cmd new file mode 100644 index 
000000000..4c6742df6 --- /dev/null +++ b/test/cases/io-barred-pprint/barred-input/cmd @@ -0,0 +1 @@ +mlr -i pprint --barred-input -o json cat test/input/abixy.tbl diff --git a/test/cases/io-format-conversion-keystroke-savers/0013/experr b/test/cases/io-barred-pprint/barred-input/experr similarity index 100% rename from test/cases/io-format-conversion-keystroke-savers/0013/experr rename to test/cases/io-barred-pprint/barred-input/experr diff --git a/test/cases/io-format-conversion-keystroke-savers/0003/expout b/test/cases/io-barred-pprint/barred-input/expout similarity index 100% rename from test/cases/io-format-conversion-keystroke-savers/0003/expout rename to test/cases/io-barred-pprint/barred-input/expout diff --git a/test/cases/io-csv-auto-unsparsify/at/cmd b/test/cases/io-csv-auto-unsparsify/at/cmd new file mode 100644 index 000000000..64a5e8c77 --- /dev/null +++ b/test/cases/io-csv-auto-unsparsify/at/cmd @@ -0,0 +1 @@ +mlr -i json -o csv cat ${CASEDIR}/input.json diff --git a/test/cases/io-format-conversion-keystroke-savers/0014/experr b/test/cases/io-csv-auto-unsparsify/at/experr similarity index 100% rename from test/cases/io-format-conversion-keystroke-savers/0014/experr rename to test/cases/io-csv-auto-unsparsify/at/experr diff --git a/test/cases/io-csv-auto-unsparsify/at/expout b/test/cases/io-csv-auto-unsparsify/at/expout new file mode 100644 index 000000000..29e4b3171 --- /dev/null +++ b/test/cases/io-csv-auto-unsparsify/at/expout @@ -0,0 +1,4 @@ +a,b,c +1,2,3 +4,5,6 +7,8,9 diff --git a/test/cases/io-csv-auto-unsparsify/at/input.json b/test/cases/io-csv-auto-unsparsify/at/input.json new file mode 100644 index 000000000..832be9c9e --- /dev/null +++ b/test/cases/io-csv-auto-unsparsify/at/input.json @@ -0,0 +1,17 @@ +[ +{ + "a": 1, + "b": 2, + "c": 3 +}, +{ + "a": 4, + "b": 5, + "c": 6 +}, +{ + "a": 7, + "b": 8, + "c": 9 +} +] diff --git a/test/cases/io-csv-auto-unsparsify/key-change/cmd b/test/cases/io-csv-auto-unsparsify/key-change/cmd new file mode 
100644 index 000000000..64a5e8c77 --- /dev/null +++ b/test/cases/io-csv-auto-unsparsify/key-change/cmd @@ -0,0 +1 @@ +mlr -i json -o csv cat ${CASEDIR}/input.json diff --git a/test/cases/io-csv-auto-unsparsify/key-change/experr b/test/cases/io-csv-auto-unsparsify/key-change/experr new file mode 100644 index 000000000..699fbb70f --- /dev/null +++ b/test/cases/io-csv-auto-unsparsify/key-change/experr @@ -0,0 +1,2 @@ +mlr: CSV schema change: first keys "a,b,c"; current keys "a,X,c" +mlr: exiting due to data error. diff --git a/test/cases/io-csv-auto-unsparsify/key-change/expout b/test/cases/io-csv-auto-unsparsify/key-change/expout new file mode 100644 index 000000000..88700c714 --- /dev/null +++ b/test/cases/io-csv-auto-unsparsify/key-change/expout @@ -0,0 +1,3 @@ +a,b,c +1,2,3 +4,5,6 diff --git a/test/cases/io-csv-auto-unsparsify/key-change/input.json b/test/cases/io-csv-auto-unsparsify/key-change/input.json new file mode 100644 index 000000000..841abab57 --- /dev/null +++ b/test/cases/io-csv-auto-unsparsify/key-change/input.json @@ -0,0 +1,17 @@ +[ +{ + "a": 1, + "b": 2, + "c": 3 +}, +{ + "a": 4, + "b": 5, + "c": 6 +}, +{ + "a": 7, + "X": 8, + "c": 9 +} +] diff --git a/test/cases/io-format-conversion-keystroke-savers/0015/experr b/test/cases/io-csv-auto-unsparsify/key-change/should-fail similarity index 100% rename from test/cases/io-format-conversion-keystroke-savers/0015/experr rename to test/cases/io-csv-auto-unsparsify/key-change/should-fail diff --git a/test/cases/io-csv-auto-unsparsify/over/cmd b/test/cases/io-csv-auto-unsparsify/over/cmd new file mode 100644 index 000000000..64a5e8c77 --- /dev/null +++ b/test/cases/io-csv-auto-unsparsify/over/cmd @@ -0,0 +1 @@ +mlr -i json -o csv cat ${CASEDIR}/input.json diff --git a/test/cases/io-format-conversion-keystroke-savers/0016/experr b/test/cases/io-csv-auto-unsparsify/over/experr similarity index 100% rename from test/cases/io-format-conversion-keystroke-savers/0016/experr rename to 
test/cases/io-csv-auto-unsparsify/over/experr diff --git a/test/cases/io-csv-auto-unsparsify/over/expout b/test/cases/io-csv-auto-unsparsify/over/expout new file mode 100644 index 000000000..44ad0219a --- /dev/null +++ b/test/cases/io-csv-auto-unsparsify/over/expout @@ -0,0 +1,4 @@ +a,b,c +1,2,3 +4,5,6,7 +7,8,9 diff --git a/test/cases/io-csv-auto-unsparsify/over/input.json b/test/cases/io-csv-auto-unsparsify/over/input.json new file mode 100644 index 000000000..38b47c2f0 --- /dev/null +++ b/test/cases/io-csv-auto-unsparsify/over/input.json @@ -0,0 +1,18 @@ +[ +{ + "a": 1, + "b": 2, + "c": 3 +}, +{ + "a": 4, + "b": 5, + "c": 6, + "d": 7 +}, +{ + "a": 7, + "b": 8, + "c": 9 +} +] diff --git a/test/cases/io-csv-auto-unsparsify/under/cmd b/test/cases/io-csv-auto-unsparsify/under/cmd new file mode 100644 index 000000000..64a5e8c77 --- /dev/null +++ b/test/cases/io-csv-auto-unsparsify/under/cmd @@ -0,0 +1 @@ +mlr -i json -o csv cat ${CASEDIR}/input.json diff --git a/test/cases/io-format-conversion-keystroke-savers/0017/experr b/test/cases/io-csv-auto-unsparsify/under/experr similarity index 100% rename from test/cases/io-format-conversion-keystroke-savers/0017/experr rename to test/cases/io-csv-auto-unsparsify/under/experr diff --git a/test/cases/io-csv-auto-unsparsify/under/expout b/test/cases/io-csv-auto-unsparsify/under/expout new file mode 100644 index 000000000..48f0b0017 --- /dev/null +++ b/test/cases/io-csv-auto-unsparsify/under/expout @@ -0,0 +1,4 @@ +a,b,c +1,2,3 +4,5, +7,8,9 diff --git a/test/cases/io-csv-auto-unsparsify/under/input.json b/test/cases/io-csv-auto-unsparsify/under/input.json new file mode 100644 index 000000000..e90f7439a --- /dev/null +++ b/test/cases/io-csv-auto-unsparsify/under/input.json @@ -0,0 +1,16 @@ +[ +{ + "a": 1, + "b": 2, + "c": 3 +}, +{ + "a": 4, + "b": 5 +}, +{ + "a": 7, + "b": 8, + "c": 9 +} +] diff --git a/test/cases/io-format-conversion-keystroke-savers/0049/cmd b/test/cases/io-format-conversion-keystroke-savers/c/cmd similarity 
index 100% rename from test/cases/io-format-conversion-keystroke-savers/0049/cmd rename to test/cases/io-format-conversion-keystroke-savers/c/cmd diff --git a/test/cases/io-format-conversion-keystroke-savers/0018/experr b/test/cases/io-format-conversion-keystroke-savers/c/experr similarity index 100% rename from test/cases/io-format-conversion-keystroke-savers/0018/experr rename to test/cases/io-format-conversion-keystroke-savers/c/experr diff --git a/test/cases/io-format-conversion-keystroke-savers/0014/expout b/test/cases/io-format-conversion-keystroke-savers/c/expout similarity index 100% rename from test/cases/io-format-conversion-keystroke-savers/0014/expout rename to test/cases/io-format-conversion-keystroke-savers/c/expout diff --git a/test/cases/io-format-conversion-keystroke-savers/0008/cmd b/test/cases/io-format-conversion-keystroke-savers/c2d/cmd similarity index 100% rename from test/cases/io-format-conversion-keystroke-savers/0008/cmd rename to test/cases/io-format-conversion-keystroke-savers/c2d/cmd diff --git a/test/cases/io-format-conversion-keystroke-savers/0019/experr b/test/cases/io-format-conversion-keystroke-savers/c2d/experr similarity index 100% rename from test/cases/io-format-conversion-keystroke-savers/0019/experr rename to test/cases/io-format-conversion-keystroke-savers/c2d/experr diff --git a/test/cases/io-format-conversion-keystroke-savers/0008/expout b/test/cases/io-format-conversion-keystroke-savers/c2d/expout similarity index 100% rename from test/cases/io-format-conversion-keystroke-savers/0008/expout rename to test/cases/io-format-conversion-keystroke-savers/c2d/expout diff --git a/test/cases/io-format-conversion-keystroke-savers/0010/cmd b/test/cases/io-format-conversion-keystroke-savers/c2j/cmd similarity index 100% rename from test/cases/io-format-conversion-keystroke-savers/0010/cmd rename to test/cases/io-format-conversion-keystroke-savers/c2j/cmd diff --git a/test/cases/io-format-conversion-keystroke-savers/0020/experr 
b/test/cases/io-format-conversion-keystroke-savers/c2j/experr similarity index 100% rename from test/cases/io-format-conversion-keystroke-savers/0020/experr rename to test/cases/io-format-conversion-keystroke-savers/c2j/experr diff --git a/test/cases/io-format-conversion-keystroke-savers/0010/expout b/test/cases/io-format-conversion-keystroke-savers/c2j/expout similarity index 100% rename from test/cases/io-format-conversion-keystroke-savers/0010/expout rename to test/cases/io-format-conversion-keystroke-savers/c2j/expout diff --git a/test/cases/io-format-conversion-keystroke-savers/0011/cmd b/test/cases/io-format-conversion-keystroke-savers/c2m/c2p/cmd similarity index 100% rename from test/cases/io-format-conversion-keystroke-savers/0011/cmd rename to test/cases/io-format-conversion-keystroke-savers/c2m/c2p/cmd diff --git a/test/cases/io-format-conversion-keystroke-savers/0021/experr b/test/cases/io-format-conversion-keystroke-savers/c2m/c2p/experr similarity index 100% rename from test/cases/io-format-conversion-keystroke-savers/0021/experr rename to test/cases/io-format-conversion-keystroke-savers/c2m/c2p/experr diff --git a/test/cases/io-format-conversion-keystroke-savers/0004/expout b/test/cases/io-format-conversion-keystroke-savers/c2m/c2p/expout similarity index 100% rename from test/cases/io-format-conversion-keystroke-savers/0004/expout rename to test/cases/io-format-conversion-keystroke-savers/c2m/c2p/expout diff --git a/test/cases/io-format-conversion-keystroke-savers/0013/cmd b/test/cases/io-format-conversion-keystroke-savers/c2m/cmd similarity index 100% rename from test/cases/io-format-conversion-keystroke-savers/0013/cmd rename to test/cases/io-format-conversion-keystroke-savers/c2m/cmd diff --git a/test/cases/io-format-conversion-keystroke-savers/0022/experr b/test/cases/io-format-conversion-keystroke-savers/c2m/experr similarity index 100% rename from test/cases/io-format-conversion-keystroke-savers/0022/experr rename to 
test/cases/io-format-conversion-keystroke-savers/c2m/experr diff --git a/test/cases/io-format-conversion-keystroke-savers/0006/expout b/test/cases/io-format-conversion-keystroke-savers/c2m/expout similarity index 100% rename from test/cases/io-format-conversion-keystroke-savers/0006/expout rename to test/cases/io-format-conversion-keystroke-savers/c2m/expout diff --git a/test/cases/io-format-conversion-keystroke-savers/0009/cmd b/test/cases/io-format-conversion-keystroke-savers/c2n/cmd similarity index 100% rename from test/cases/io-format-conversion-keystroke-savers/0009/cmd rename to test/cases/io-format-conversion-keystroke-savers/c2n/cmd diff --git a/test/cases/io-format-conversion-keystroke-savers/0023/experr b/test/cases/io-format-conversion-keystroke-savers/c2n/experr similarity index 100% rename from test/cases/io-format-conversion-keystroke-savers/0023/experr rename to test/cases/io-format-conversion-keystroke-savers/c2n/experr diff --git a/test/cases/io-format-conversion-keystroke-savers/0002/expout b/test/cases/io-format-conversion-keystroke-savers/c2n/expout similarity index 100% rename from test/cases/io-format-conversion-keystroke-savers/0002/expout rename to test/cases/io-format-conversion-keystroke-savers/c2n/expout diff --git a/test/cases/io-format-conversion-keystroke-savers/c2p/cmd b/test/cases/io-format-conversion-keystroke-savers/c2p/cmd new file mode 100644 index 000000000..8779b448d --- /dev/null +++ b/test/cases/io-format-conversion-keystroke-savers/c2p/cmd @@ -0,0 +1 @@ +mlr --c2p cat test/input/abixy.csv diff --git a/test/cases/io-format-conversion-keystroke-savers/0024/experr b/test/cases/io-format-conversion-keystroke-savers/c2p/experr similarity index 100% rename from test/cases/io-format-conversion-keystroke-savers/0024/experr rename to test/cases/io-format-conversion-keystroke-savers/c2p/experr diff --git a/test/cases/io-format-conversion-keystroke-savers/0011/expout b/test/cases/io-format-conversion-keystroke-savers/c2p/expout 
similarity index 100% rename from test/cases/io-format-conversion-keystroke-savers/0011/expout rename to test/cases/io-format-conversion-keystroke-savers/c2p/expout diff --git a/test/cases/io-format-conversion-keystroke-savers/0007/cmd b/test/cases/io-format-conversion-keystroke-savers/c2t/cmd similarity index 100% rename from test/cases/io-format-conversion-keystroke-savers/0007/cmd rename to test/cases/io-format-conversion-keystroke-savers/c2t/cmd diff --git a/test/cases/io-format-conversion-keystroke-savers/0025/experr b/test/cases/io-format-conversion-keystroke-savers/c2t/experr similarity index 100% rename from test/cases/io-format-conversion-keystroke-savers/0025/experr rename to test/cases/io-format-conversion-keystroke-savers/c2t/experr diff --git a/test/cases/io-format-conversion-keystroke-savers/0001/expout b/test/cases/io-format-conversion-keystroke-savers/c2t/expout similarity index 100% rename from test/cases/io-format-conversion-keystroke-savers/0001/expout rename to test/cases/io-format-conversion-keystroke-savers/c2t/expout diff --git a/test/cases/io-format-conversion-keystroke-savers/0012/cmd b/test/cases/io-format-conversion-keystroke-savers/c2x/cmd similarity index 100% rename from test/cases/io-format-conversion-keystroke-savers/0012/cmd rename to test/cases/io-format-conversion-keystroke-savers/c2x/cmd diff --git a/test/cases/io-format-conversion-keystroke-savers/0026/experr b/test/cases/io-format-conversion-keystroke-savers/c2x/experr similarity index 100% rename from test/cases/io-format-conversion-keystroke-savers/0026/experr rename to test/cases/io-format-conversion-keystroke-savers/c2x/experr diff --git a/test/cases/io-format-conversion-keystroke-savers/0005/expout b/test/cases/io-format-conversion-keystroke-savers/c2x/expout similarity index 100% rename from test/cases/io-format-conversion-keystroke-savers/0005/expout rename to test/cases/io-format-conversion-keystroke-savers/c2x/expout diff --git 
a/test/cases/io-format-conversion-keystroke-savers/0003/cmd b/test/cases/io-format-conversion-keystroke-savers/d2j/cmd similarity index 100% rename from test/cases/io-format-conversion-keystroke-savers/0003/cmd rename to test/cases/io-format-conversion-keystroke-savers/d2j/cmd diff --git a/test/cases/io-format-conversion-keystroke-savers/0027/experr b/test/cases/io-format-conversion-keystroke-savers/d2j/experr similarity index 100% rename from test/cases/io-format-conversion-keystroke-savers/0027/experr rename to test/cases/io-format-conversion-keystroke-savers/d2j/experr diff --git a/test/cases/io-format-conversion-keystroke-savers/0017/expout b/test/cases/io-format-conversion-keystroke-savers/d2j/expout similarity index 100% rename from test/cases/io-format-conversion-keystroke-savers/0017/expout rename to test/cases/io-format-conversion-keystroke-savers/d2j/expout diff --git a/test/cases/io-format-conversion-keystroke-savers/0006/cmd b/test/cases/io-format-conversion-keystroke-savers/d2m/cmd similarity index 100% rename from test/cases/io-format-conversion-keystroke-savers/0006/cmd rename to test/cases/io-format-conversion-keystroke-savers/d2m/cmd diff --git a/test/cases/io-format-conversion-keystroke-savers/0004/cmd b/test/cases/io-format-conversion-keystroke-savers/d2m/d2p/cmd similarity index 100% rename from test/cases/io-format-conversion-keystroke-savers/0004/cmd rename to test/cases/io-format-conversion-keystroke-savers/d2m/d2p/cmd diff --git a/test/cases/io-format-conversion-keystroke-savers/0028/experr b/test/cases/io-format-conversion-keystroke-savers/d2m/d2p/experr similarity index 100% rename from test/cases/io-format-conversion-keystroke-savers/0028/experr rename to test/cases/io-format-conversion-keystroke-savers/d2m/d2p/experr diff --git a/test/cases/io-format-conversion-keystroke-savers/0018/expout b/test/cases/io-format-conversion-keystroke-savers/d2m/d2p/expout similarity index 100% rename from 
test/cases/io-format-conversion-keystroke-savers/0018/expout rename to test/cases/io-format-conversion-keystroke-savers/d2m/d2p/expout diff --git a/test/cases/io-format-conversion-keystroke-savers/0029/experr b/test/cases/io-format-conversion-keystroke-savers/d2m/experr similarity index 100% rename from test/cases/io-format-conversion-keystroke-savers/0029/experr rename to test/cases/io-format-conversion-keystroke-savers/d2m/experr diff --git a/test/cases/io-format-conversion-keystroke-savers/0013/expout b/test/cases/io-format-conversion-keystroke-savers/d2m/expout similarity index 100% rename from test/cases/io-format-conversion-keystroke-savers/0013/expout rename to test/cases/io-format-conversion-keystroke-savers/d2m/expout diff --git a/test/cases/io-format-conversion-keystroke-savers/0002/cmd b/test/cases/io-format-conversion-keystroke-savers/d2n/cmd similarity index 100% rename from test/cases/io-format-conversion-keystroke-savers/0002/cmd rename to test/cases/io-format-conversion-keystroke-savers/d2n/cmd diff --git a/test/cases/io-format-conversion-keystroke-savers/0030/experr b/test/cases/io-format-conversion-keystroke-savers/d2n/experr similarity index 100% rename from test/cases/io-format-conversion-keystroke-savers/0030/experr rename to test/cases/io-format-conversion-keystroke-savers/d2n/experr diff --git a/test/cases/io-format-conversion-keystroke-savers/0009/expout b/test/cases/io-format-conversion-keystroke-savers/d2n/expout similarity index 100% rename from test/cases/io-format-conversion-keystroke-savers/0009/expout rename to test/cases/io-format-conversion-keystroke-savers/d2n/expout diff --git a/test/cases/io-format-conversion-keystroke-savers/d2p/cmd b/test/cases/io-format-conversion-keystroke-savers/d2p/cmd new file mode 100644 index 000000000..7ee7ebd21 --- /dev/null +++ b/test/cases/io-format-conversion-keystroke-savers/d2p/cmd @@ -0,0 +1 @@ +mlr --d2p cat test/input/abixy.dkvp diff --git 
a/test/cases/io-format-conversion-keystroke-savers/0031/experr b/test/cases/io-format-conversion-keystroke-savers/d2p/experr similarity index 100% rename from test/cases/io-format-conversion-keystroke-savers/0031/experr rename to test/cases/io-format-conversion-keystroke-savers/d2p/experr diff --git a/test/cases/io-format-conversion-keystroke-savers/0032/expout b/test/cases/io-format-conversion-keystroke-savers/d2p/expout similarity index 100% rename from test/cases/io-format-conversion-keystroke-savers/0032/expout rename to test/cases/io-format-conversion-keystroke-savers/d2p/expout diff --git a/test/cases/io-format-conversion-keystroke-savers/0001/cmd b/test/cases/io-format-conversion-keystroke-savers/d2t/cmd similarity index 100% rename from test/cases/io-format-conversion-keystroke-savers/0001/cmd rename to test/cases/io-format-conversion-keystroke-savers/d2t/cmd diff --git a/test/cases/io-format-conversion-keystroke-savers/0032/experr b/test/cases/io-format-conversion-keystroke-savers/d2t/experr similarity index 100% rename from test/cases/io-format-conversion-keystroke-savers/0032/experr rename to test/cases/io-format-conversion-keystroke-savers/d2t/experr diff --git a/test/cases/io-format-conversion-keystroke-savers/0007/expout b/test/cases/io-format-conversion-keystroke-savers/d2t/expout similarity index 100% rename from test/cases/io-format-conversion-keystroke-savers/0007/expout rename to test/cases/io-format-conversion-keystroke-savers/d2t/expout diff --git a/test/cases/io-format-conversion-keystroke-savers/0005/cmd b/test/cases/io-format-conversion-keystroke-savers/d2x/cmd similarity index 100% rename from test/cases/io-format-conversion-keystroke-savers/0005/cmd rename to test/cases/io-format-conversion-keystroke-savers/d2x/cmd diff --git a/test/cases/io-format-conversion-keystroke-savers/0033/experr b/test/cases/io-format-conversion-keystroke-savers/d2x/experr similarity index 100% rename from 
test/cases/io-format-conversion-keystroke-savers/0033/experr rename to test/cases/io-format-conversion-keystroke-savers/d2x/experr diff --git a/test/cases/io-format-conversion-keystroke-savers/0012/expout b/test/cases/io-format-conversion-keystroke-savers/d2x/expout similarity index 100% rename from test/cases/io-format-conversion-keystroke-savers/0012/expout rename to test/cases/io-format-conversion-keystroke-savers/d2x/expout diff --git a/test/cases/io-format-conversion-keystroke-savers/0053/cmd b/test/cases/io-format-conversion-keystroke-savers/itsv-odkvp/cmd similarity index 100% rename from test/cases/io-format-conversion-keystroke-savers/0053/cmd rename to test/cases/io-format-conversion-keystroke-savers/itsv-odkvp/cmd diff --git a/test/cases/io-format-conversion-keystroke-savers/0034/experr b/test/cases/io-format-conversion-keystroke-savers/itsv-odkvp/experr similarity index 100% rename from test/cases/io-format-conversion-keystroke-savers/0034/experr rename to test/cases/io-format-conversion-keystroke-savers/itsv-odkvp/experr diff --git a/test/cases/io-format-conversion-keystroke-savers/0053/expout b/test/cases/io-format-conversion-keystroke-savers/itsv-odkvp/expout similarity index 100% rename from test/cases/io-format-conversion-keystroke-savers/0053/expout rename to test/cases/io-format-conversion-keystroke-savers/itsv-odkvp/expout diff --git a/test/cases/io-format-conversion-keystroke-savers/0052/input.tsv b/test/cases/io-format-conversion-keystroke-savers/itsv-odkvp/input.tsv similarity index 100% rename from test/cases/io-format-conversion-keystroke-savers/0052/input.tsv rename to test/cases/io-format-conversion-keystroke-savers/itsv-odkvp/input.tsv diff --git a/test/cases/io-format-conversion-keystroke-savers/0052/cmd b/test/cases/io-format-conversion-keystroke-savers/itsvlite-odkvp/cmd similarity index 100% rename from test/cases/io-format-conversion-keystroke-savers/0052/cmd rename to 
test/cases/io-format-conversion-keystroke-savers/itsvlite-odkvp/cmd diff --git a/test/cases/io-format-conversion-keystroke-savers/0035/experr b/test/cases/io-format-conversion-keystroke-savers/itsvlite-odkvp/experr similarity index 100% rename from test/cases/io-format-conversion-keystroke-savers/0035/experr rename to test/cases/io-format-conversion-keystroke-savers/itsvlite-odkvp/experr diff --git a/test/cases/io-format-conversion-keystroke-savers/0052/expout b/test/cases/io-format-conversion-keystroke-savers/itsvlite-odkvp/expout similarity index 100% rename from test/cases/io-format-conversion-keystroke-savers/0052/expout rename to test/cases/io-format-conversion-keystroke-savers/itsvlite-odkvp/expout diff --git a/test/cases/io-format-conversion-keystroke-savers/0053/input.tsv b/test/cases/io-format-conversion-keystroke-savers/itsvlite-odkvp/input.tsv similarity index 100% rename from test/cases/io-format-conversion-keystroke-savers/0053/input.tsv rename to test/cases/io-format-conversion-keystroke-savers/itsvlite-odkvp/input.tsv diff --git a/test/cases/io-format-conversion-keystroke-savers/0051/cmd b/test/cases/io-format-conversion-keystroke-savers/j/cmd similarity index 100% rename from test/cases/io-format-conversion-keystroke-savers/0051/cmd rename to test/cases/io-format-conversion-keystroke-savers/j/cmd diff --git a/test/cases/io-format-conversion-keystroke-savers/0036/experr b/test/cases/io-format-conversion-keystroke-savers/j/experr similarity index 100% rename from test/cases/io-format-conversion-keystroke-savers/0036/experr rename to test/cases/io-format-conversion-keystroke-savers/j/experr diff --git a/test/cases/io-format-conversion-keystroke-savers/0039/expout b/test/cases/io-format-conversion-keystroke-savers/j/expout similarity index 100% rename from test/cases/io-format-conversion-keystroke-savers/0039/expout rename to test/cases/io-format-conversion-keystroke-savers/j/expout diff --git a/test/cases/io-format-conversion-keystroke-savers/0028/cmd 
b/test/cases/io-format-conversion-keystroke-savers/j2c/cmd similarity index 100% rename from test/cases/io-format-conversion-keystroke-savers/0028/cmd rename to test/cases/io-format-conversion-keystroke-savers/j2c/cmd diff --git a/test/cases/io-format-conversion-keystroke-savers/0037/experr b/test/cases/io-format-conversion-keystroke-savers/j2c/experr similarity index 100% rename from test/cases/io-format-conversion-keystroke-savers/0037/experr rename to test/cases/io-format-conversion-keystroke-savers/j2c/experr diff --git a/test/cases/io-format-conversion-keystroke-savers/0028/expout b/test/cases/io-format-conversion-keystroke-savers/j2c/expout similarity index 100% rename from test/cases/io-format-conversion-keystroke-savers/0028/expout rename to test/cases/io-format-conversion-keystroke-savers/j2c/expout diff --git a/test/cases/io-format-conversion-keystroke-savers/0030/cmd b/test/cases/io-format-conversion-keystroke-savers/j2d/cmd similarity index 100% rename from test/cases/io-format-conversion-keystroke-savers/0030/cmd rename to test/cases/io-format-conversion-keystroke-savers/j2d/cmd diff --git a/test/cases/io-format-conversion-keystroke-savers/0038/experr b/test/cases/io-format-conversion-keystroke-savers/j2d/experr similarity index 100% rename from test/cases/io-format-conversion-keystroke-savers/0038/experr rename to test/cases/io-format-conversion-keystroke-savers/j2d/experr diff --git a/test/cases/io-format-conversion-keystroke-savers/0015/expout b/test/cases/io-format-conversion-keystroke-savers/j2d/expout similarity index 100% rename from test/cases/io-format-conversion-keystroke-savers/0015/expout rename to test/cases/io-format-conversion-keystroke-savers/j2d/expout diff --git a/test/cases/io-format-conversion-keystroke-savers/0034/cmd b/test/cases/io-format-conversion-keystroke-savers/j2m/cmd similarity index 100% rename from test/cases/io-format-conversion-keystroke-savers/0034/cmd rename to test/cases/io-format-conversion-keystroke-savers/j2m/cmd 
diff --git a/test/cases/io-format-conversion-keystroke-savers/0039/experr b/test/cases/io-format-conversion-keystroke-savers/j2m/experr similarity index 100% rename from test/cases/io-format-conversion-keystroke-savers/0039/experr rename to test/cases/io-format-conversion-keystroke-savers/j2m/experr diff --git a/test/cases/io-format-conversion-keystroke-savers/0020/expout b/test/cases/io-format-conversion-keystroke-savers/j2m/expout similarity index 100% rename from test/cases/io-format-conversion-keystroke-savers/0020/expout rename to test/cases/io-format-conversion-keystroke-savers/j2m/expout diff --git a/test/cases/io-format-conversion-keystroke-savers/0032/cmd b/test/cases/io-format-conversion-keystroke-savers/j2m/j2p/cmd similarity index 100% rename from test/cases/io-format-conversion-keystroke-savers/0032/cmd rename to test/cases/io-format-conversion-keystroke-savers/j2m/j2p/cmd diff --git a/test/cases/io-format-conversion-keystroke-savers/0040/experr b/test/cases/io-format-conversion-keystroke-savers/j2m/j2p/experr similarity index 100% rename from test/cases/io-format-conversion-keystroke-savers/0040/experr rename to test/cases/io-format-conversion-keystroke-savers/j2m/j2p/experr diff --git a/test/cases/io-format-conversion-keystroke-savers/0047/expout b/test/cases/io-format-conversion-keystroke-savers/j2m/j2p/expout similarity index 100% rename from test/cases/io-format-conversion-keystroke-savers/0047/expout rename to test/cases/io-format-conversion-keystroke-savers/j2m/j2p/expout diff --git a/test/cases/io-format-conversion-keystroke-savers/0031/cmd b/test/cases/io-format-conversion-keystroke-savers/j2n/cmd similarity index 100% rename from test/cases/io-format-conversion-keystroke-savers/0031/cmd rename to test/cases/io-format-conversion-keystroke-savers/j2n/cmd diff --git a/test/cases/io-format-conversion-keystroke-savers/0041/experr b/test/cases/io-format-conversion-keystroke-savers/j2n/experr similarity index 100% rename from 
test/cases/io-format-conversion-keystroke-savers/0041/experr rename to test/cases/io-format-conversion-keystroke-savers/j2n/experr diff --git a/test/cases/io-format-conversion-keystroke-savers/0016/expout b/test/cases/io-format-conversion-keystroke-savers/j2n/expout similarity index 100% rename from test/cases/io-format-conversion-keystroke-savers/0016/expout rename to test/cases/io-format-conversion-keystroke-savers/j2n/expout diff --git a/test/cases/io-format-conversion-keystroke-savers/j2p/cmd b/test/cases/io-format-conversion-keystroke-savers/j2p/cmd new file mode 100644 index 000000000..1c11e7e36 --- /dev/null +++ b/test/cases/io-format-conversion-keystroke-savers/j2p/cmd @@ -0,0 +1 @@ +mlr --j2p cat test/input/abixy.json diff --git a/test/cases/io-format-conversion-keystroke-savers/0042/experr b/test/cases/io-format-conversion-keystroke-savers/j2p/experr similarity index 100% rename from test/cases/io-format-conversion-keystroke-savers/0042/experr rename to test/cases/io-format-conversion-keystroke-savers/j2p/experr diff --git a/test/cases/io-format-conversion-keystroke-savers/j2p/expout b/test/cases/io-format-conversion-keystroke-savers/j2p/expout new file mode 100644 index 000000000..b8ac13481 --- /dev/null +++ b/test/cases/io-format-conversion-keystroke-savers/j2p/expout @@ -0,0 +1,11 @@ +a b i x y +pan pan 1 0.34679014 0.72680286 +eks pan 2 0.75867996 0.52215111 +wye wye 3 0.20460331 0.33831853 +eks wye 4 0.38139939 0.13418874 +wye pan 5 0.57328892 0.86362447 +zee pan 6 0.52712616 0.49322129 +eks zee 7 0.61178406 0.18788492 +zee wye 8 0.59855401 0.97618139 +hat wye 9 0.03144188 0.74955076 +pan wye 10 0.50262601 0.95261836 diff --git a/test/cases/io-format-conversion-keystroke-savers/0029/cmd b/test/cases/io-format-conversion-keystroke-savers/j2t/cmd similarity index 100% rename from test/cases/io-format-conversion-keystroke-savers/0029/cmd rename to test/cases/io-format-conversion-keystroke-savers/j2t/cmd diff --git 
a/test/cases/io-format-conversion-keystroke-savers/0043/experr b/test/cases/io-format-conversion-keystroke-savers/j2t/experr similarity index 100% rename from test/cases/io-format-conversion-keystroke-savers/0043/experr rename to test/cases/io-format-conversion-keystroke-savers/j2t/experr diff --git a/test/cases/io-format-conversion-keystroke-savers/0029/expout b/test/cases/io-format-conversion-keystroke-savers/j2t/expout similarity index 100% rename from test/cases/io-format-conversion-keystroke-savers/0029/expout rename to test/cases/io-format-conversion-keystroke-savers/j2t/expout diff --git a/test/cases/io-format-conversion-keystroke-savers/0033/cmd b/test/cases/io-format-conversion-keystroke-savers/j2x/cmd similarity index 100% rename from test/cases/io-format-conversion-keystroke-savers/0033/cmd rename to test/cases/io-format-conversion-keystroke-savers/j2x/cmd diff --git a/test/cases/io-format-conversion-keystroke-savers/0044/experr b/test/cases/io-format-conversion-keystroke-savers/j2x/experr similarity index 100% rename from test/cases/io-format-conversion-keystroke-savers/0044/experr rename to test/cases/io-format-conversion-keystroke-savers/j2x/experr diff --git a/test/cases/io-format-conversion-keystroke-savers/0019/expout b/test/cases/io-format-conversion-keystroke-savers/j2x/expout similarity index 100% rename from test/cases/io-format-conversion-keystroke-savers/0019/expout rename to test/cases/io-format-conversion-keystroke-savers/j2x/expout diff --git a/test/cases/io-format-conversion-keystroke-savers/l2m/cmd b/test/cases/io-format-conversion-keystroke-savers/l2m/cmd new file mode 100644 index 000000000..462e4a9a2 --- /dev/null +++ b/test/cases/io-format-conversion-keystroke-savers/l2m/cmd @@ -0,0 +1 @@ +mlr --l2m cat test/input/abixy.json diff --git a/test/cases/io-format-conversion-keystroke-savers/0045/experr b/test/cases/io-format-conversion-keystroke-savers/l2m/experr similarity index 100% rename from 
test/cases/io-format-conversion-keystroke-savers/0045/experr rename to test/cases/io-format-conversion-keystroke-savers/l2m/experr diff --git a/test/cases/io-format-conversion-keystroke-savers/0034/expout b/test/cases/io-format-conversion-keystroke-savers/l2m/expout similarity index 100% rename from test/cases/io-format-conversion-keystroke-savers/0034/expout rename to test/cases/io-format-conversion-keystroke-savers/l2m/expout diff --git a/test/cases/io-format-conversion-keystroke-savers/m2c/cmd b/test/cases/io-format-conversion-keystroke-savers/m2c/cmd new file mode 100644 index 000000000..029dc93d3 --- /dev/null +++ b/test/cases/io-format-conversion-keystroke-savers/m2c/cmd @@ -0,0 +1 @@ +mlr --m2c cat test/input/abixy.md diff --git a/test/cases/io-format-conversion-keystroke-savers/0046/experr b/test/cases/io-format-conversion-keystroke-savers/m2c/experr similarity index 100% rename from test/cases/io-format-conversion-keystroke-savers/0046/experr rename to test/cases/io-format-conversion-keystroke-savers/m2c/experr diff --git a/test/cases/io-format-conversion-keystroke-savers/0035/expout b/test/cases/io-format-conversion-keystroke-savers/m2c/expout similarity index 100% rename from test/cases/io-format-conversion-keystroke-savers/0035/expout rename to test/cases/io-format-conversion-keystroke-savers/m2c/expout diff --git a/test/cases/io-format-conversion-keystroke-savers/m2d/cmd b/test/cases/io-format-conversion-keystroke-savers/m2d/cmd new file mode 100644 index 000000000..9619267ee --- /dev/null +++ b/test/cases/io-format-conversion-keystroke-savers/m2d/cmd @@ -0,0 +1 @@ +mlr --m2d cat test/input/abixy.md diff --git a/test/cases/io-format-conversion-keystroke-savers/0047/experr b/test/cases/io-format-conversion-keystroke-savers/m2d/experr similarity index 100% rename from test/cases/io-format-conversion-keystroke-savers/0047/experr rename to test/cases/io-format-conversion-keystroke-savers/m2d/experr diff --git 
a/test/cases/io-format-conversion-keystroke-savers/0030/expout b/test/cases/io-format-conversion-keystroke-savers/m2d/expout similarity index 100% rename from test/cases/io-format-conversion-keystroke-savers/0030/expout rename to test/cases/io-format-conversion-keystroke-savers/m2d/expout diff --git a/test/cases/io-format-conversion-keystroke-savers/m2j/cmd b/test/cases/io-format-conversion-keystroke-savers/m2j/cmd new file mode 100644 index 000000000..ae6a63c33 --- /dev/null +++ b/test/cases/io-format-conversion-keystroke-savers/m2j/cmd @@ -0,0 +1 @@ +mlr --m2j cat test/input/abixy.md diff --git a/test/cases/io-format-conversion-keystroke-savers/0048/experr b/test/cases/io-format-conversion-keystroke-savers/m2j/experr similarity index 100% rename from test/cases/io-format-conversion-keystroke-savers/0048/experr rename to test/cases/io-format-conversion-keystroke-savers/m2j/experr diff --git a/test/cases/io-format-conversion-keystroke-savers/0046/expout b/test/cases/io-format-conversion-keystroke-savers/m2j/expout similarity index 100% rename from test/cases/io-format-conversion-keystroke-savers/0046/expout rename to test/cases/io-format-conversion-keystroke-savers/m2j/expout diff --git a/test/cases/io-format-conversion-keystroke-savers/m2l/cmd b/test/cases/io-format-conversion-keystroke-savers/m2l/cmd new file mode 100644 index 000000000..bf8cea46c --- /dev/null +++ b/test/cases/io-format-conversion-keystroke-savers/m2l/cmd @@ -0,0 +1 @@ +mlr --m2l cat test/input/abixy.md diff --git a/test/cases/io-format-conversion-keystroke-savers/0049/experr b/test/cases/io-format-conversion-keystroke-savers/m2l/experr similarity index 100% rename from test/cases/io-format-conversion-keystroke-savers/0049/experr rename to test/cases/io-format-conversion-keystroke-savers/m2l/experr diff --git a/test/cases/io-format-conversion-keystroke-savers/m2l/expout b/test/cases/io-format-conversion-keystroke-savers/m2l/expout new file mode 100644 index 000000000..5a5818e0f --- /dev/null +++ 
b/test/cases/io-format-conversion-keystroke-savers/m2l/expout @@ -0,0 +1,10 @@ +{"a": "pan", "b": "pan", "i": 1, "x": 0.34679014, "y": 0.72680286} +{"a": "eks", "b": "pan", "i": 2, "x": 0.75867996, "y": 0.52215111} +{"a": "wye", "b": "wye", "i": 3, "x": 0.20460331, "y": 0.33831853} +{"a": "eks", "b": "wye", "i": 4, "x": 0.38139939, "y": 0.13418874} +{"a": "wye", "b": "pan", "i": 5, "x": 0.57328892, "y": 0.86362447} +{"a": "zee", "b": "pan", "i": 6, "x": 0.52712616, "y": 0.49322129} +{"a": "eks", "b": "zee", "i": 7, "x": 0.61178406, "y": 0.18788492} +{"a": "zee", "b": "wye", "i": 8, "x": 0.59855401, "y": 0.97618139} +{"a": "hat", "b": "wye", "i": 9, "x": 0.03144188, "y": 0.74955076} +{"a": "pan", "b": "wye", "i": 10, "x": 0.50262601, "y": 0.95261836} diff --git a/test/cases/io-format-conversion-keystroke-savers/m2n/cmd b/test/cases/io-format-conversion-keystroke-savers/m2n/cmd new file mode 100644 index 000000000..ba7179b2a --- /dev/null +++ b/test/cases/io-format-conversion-keystroke-savers/m2n/cmd @@ -0,0 +1 @@ +mlr --m2n cat test/input/abixy.md diff --git a/test/cases/io-format-conversion-keystroke-savers/0050/experr b/test/cases/io-format-conversion-keystroke-savers/m2n/experr similarity index 100% rename from test/cases/io-format-conversion-keystroke-savers/0050/experr rename to test/cases/io-format-conversion-keystroke-savers/m2n/experr diff --git a/test/cases/io-format-conversion-keystroke-savers/0031/expout b/test/cases/io-format-conversion-keystroke-savers/m2n/expout similarity index 100% rename from test/cases/io-format-conversion-keystroke-savers/0031/expout rename to test/cases/io-format-conversion-keystroke-savers/m2n/expout diff --git a/test/cases/io-format-conversion-keystroke-savers/m2p/cmd b/test/cases/io-format-conversion-keystroke-savers/m2p/cmd new file mode 100644 index 000000000..5dfd5e425 --- /dev/null +++ b/test/cases/io-format-conversion-keystroke-savers/m2p/cmd @@ -0,0 +1 @@ +mlr --m2p cat test/input/abixy.md diff --git 
a/test/cases/io-format-conversion-keystroke-savers/0051/experr b/test/cases/io-format-conversion-keystroke-savers/m2p/experr similarity index 100% rename from test/cases/io-format-conversion-keystroke-savers/0051/experr rename to test/cases/io-format-conversion-keystroke-savers/m2p/experr diff --git a/test/cases/io-format-conversion-keystroke-savers/m2p/expout b/test/cases/io-format-conversion-keystroke-savers/m2p/expout new file mode 100644 index 000000000..b8ac13481 --- /dev/null +++ b/test/cases/io-format-conversion-keystroke-savers/m2p/expout @@ -0,0 +1,11 @@ +a b i x y +pan pan 1 0.34679014 0.72680286 +eks pan 2 0.75867996 0.52215111 +wye wye 3 0.20460331 0.33831853 +eks wye 4 0.38139939 0.13418874 +wye pan 5 0.57328892 0.86362447 +zee pan 6 0.52712616 0.49322129 +eks zee 7 0.61178406 0.18788492 +zee wye 8 0.59855401 0.97618139 +hat wye 9 0.03144188 0.74955076 +pan wye 10 0.50262601 0.95261836 diff --git a/test/cases/io-format-conversion-keystroke-savers/m2t/cmd b/test/cases/io-format-conversion-keystroke-savers/m2t/cmd new file mode 100644 index 000000000..b24a15801 --- /dev/null +++ b/test/cases/io-format-conversion-keystroke-savers/m2t/cmd @@ -0,0 +1 @@ +mlr --m2t cat test/input/abixy.md diff --git a/test/cases/io-format-conversion-keystroke-savers/0052/experr b/test/cases/io-format-conversion-keystroke-savers/m2t/experr similarity index 100% rename from test/cases/io-format-conversion-keystroke-savers/0052/experr rename to test/cases/io-format-conversion-keystroke-savers/m2t/experr diff --git a/test/cases/io-format-conversion-keystroke-savers/0036/expout b/test/cases/io-format-conversion-keystroke-savers/m2t/expout similarity index 100% rename from test/cases/io-format-conversion-keystroke-savers/0036/expout rename to test/cases/io-format-conversion-keystroke-savers/m2t/expout diff --git a/test/cases/io-format-conversion-keystroke-savers/m2x/cmd b/test/cases/io-format-conversion-keystroke-savers/m2x/cmd new file mode 100644 index 000000000..dfff2ec6b --- 
/dev/null +++ b/test/cases/io-format-conversion-keystroke-savers/m2x/cmd @@ -0,0 +1 @@ +mlr --m2x cat test/input/abixy.md diff --git a/test/cases/io-format-conversion-keystroke-savers/0053/experr b/test/cases/io-format-conversion-keystroke-savers/m2x/experr similarity index 100% rename from test/cases/io-format-conversion-keystroke-savers/0053/experr rename to test/cases/io-format-conversion-keystroke-savers/m2x/experr diff --git a/test/cases/io-format-conversion-keystroke-savers/0033/expout b/test/cases/io-format-conversion-keystroke-savers/m2x/expout similarity index 100% rename from test/cases/io-format-conversion-keystroke-savers/0033/expout rename to test/cases/io-format-conversion-keystroke-savers/m2x/expout diff --git a/test/cases/io-format-conversion-keystroke-savers/0021/cmd b/test/cases/io-format-conversion-keystroke-savers/n2c/cmd similarity index 100% rename from test/cases/io-format-conversion-keystroke-savers/0021/cmd rename to test/cases/io-format-conversion-keystroke-savers/n2c/cmd diff --git a/test/cases/io-format-conversion-keystroke-savers/0054/experr b/test/cases/io-format-conversion-keystroke-savers/n2c/experr similarity index 100% rename from test/cases/io-format-conversion-keystroke-savers/0054/experr rename to test/cases/io-format-conversion-keystroke-savers/n2c/experr diff --git a/test/cases/io-format-conversion-keystroke-savers/0021/expout b/test/cases/io-format-conversion-keystroke-savers/n2c/expout similarity index 100% rename from test/cases/io-format-conversion-keystroke-savers/0021/expout rename to test/cases/io-format-conversion-keystroke-savers/n2c/expout diff --git a/test/cases/io-format-conversion-keystroke-savers/0023/cmd b/test/cases/io-format-conversion-keystroke-savers/n2d/cmd similarity index 100% rename from test/cases/io-format-conversion-keystroke-savers/0023/cmd rename to test/cases/io-format-conversion-keystroke-savers/n2d/cmd diff --git a/test/cases/io-format-conversion-keystroke-savers/0055/experr 
b/test/cases/io-format-conversion-keystroke-savers/n2d/experr similarity index 100% rename from test/cases/io-format-conversion-keystroke-savers/0055/experr rename to test/cases/io-format-conversion-keystroke-savers/n2d/experr diff --git a/test/cases/io-format-conversion-keystroke-savers/0023/expout b/test/cases/io-format-conversion-keystroke-savers/n2d/expout similarity index 100% rename from test/cases/io-format-conversion-keystroke-savers/0023/expout rename to test/cases/io-format-conversion-keystroke-savers/n2d/expout diff --git a/test/cases/io-format-conversion-keystroke-savers/0024/cmd b/test/cases/io-format-conversion-keystroke-savers/n2j/cmd similarity index 100% rename from test/cases/io-format-conversion-keystroke-savers/0024/cmd rename to test/cases/io-format-conversion-keystroke-savers/n2j/cmd diff --git a/test/cases/io-markdown-output/0001/experr b/test/cases/io-format-conversion-keystroke-savers/n2j/experr similarity index 100% rename from test/cases/io-markdown-output/0001/experr rename to test/cases/io-format-conversion-keystroke-savers/n2j/experr diff --git a/test/cases/io-format-conversion-keystroke-savers/0024/expout b/test/cases/io-format-conversion-keystroke-savers/n2j/expout similarity index 100% rename from test/cases/io-format-conversion-keystroke-savers/0024/expout rename to test/cases/io-format-conversion-keystroke-savers/n2j/expout diff --git a/test/cases/io-format-conversion-keystroke-savers/0027/cmd b/test/cases/io-format-conversion-keystroke-savers/n2m/cmd similarity index 100% rename from test/cases/io-format-conversion-keystroke-savers/0027/cmd rename to test/cases/io-format-conversion-keystroke-savers/n2m/cmd diff --git a/test/cases/io-markdown-output/0002/experr b/test/cases/io-format-conversion-keystroke-savers/n2m/experr similarity index 100% rename from test/cases/io-markdown-output/0002/experr rename to test/cases/io-format-conversion-keystroke-savers/n2m/experr diff --git 
a/test/cases/io-format-conversion-keystroke-savers/0027/expout b/test/cases/io-format-conversion-keystroke-savers/n2m/expout similarity index 100% rename from test/cases/io-format-conversion-keystroke-savers/0027/expout rename to test/cases/io-format-conversion-keystroke-savers/n2m/expout diff --git a/test/cases/io-format-conversion-keystroke-savers/0025/cmd b/test/cases/io-format-conversion-keystroke-savers/n2m/n2p/cmd similarity index 100% rename from test/cases/io-format-conversion-keystroke-savers/0025/cmd rename to test/cases/io-format-conversion-keystroke-savers/n2m/n2p/cmd diff --git a/test/cases/verb-sub-gsub-ssub/0001/experr b/test/cases/io-format-conversion-keystroke-savers/n2m/n2p/experr similarity index 100% rename from test/cases/verb-sub-gsub-ssub/0001/experr rename to test/cases/io-format-conversion-keystroke-savers/n2m/n2p/experr diff --git a/test/cases/io-format-conversion-keystroke-savers/0025/expout b/test/cases/io-format-conversion-keystroke-savers/n2m/n2p/expout similarity index 100% rename from test/cases/io-format-conversion-keystroke-savers/0025/expout rename to test/cases/io-format-conversion-keystroke-savers/n2m/n2p/expout diff --git a/test/cases/io-format-conversion-keystroke-savers/n2p/cmd b/test/cases/io-format-conversion-keystroke-savers/n2p/cmd new file mode 100644 index 000000000..d1e5973fb --- /dev/null +++ b/test/cases/io-format-conversion-keystroke-savers/n2p/cmd @@ -0,0 +1 @@ +mlr --n2p cat test/input/abixy.nidx diff --git a/test/cases/verb-sub-gsub-ssub/0002/experr b/test/cases/io-format-conversion-keystroke-savers/n2p/experr similarity index 100% rename from test/cases/verb-sub-gsub-ssub/0002/experr rename to test/cases/io-format-conversion-keystroke-savers/n2p/experr diff --git a/test/cases/io-format-conversion-keystroke-savers/n2p/expout b/test/cases/io-format-conversion-keystroke-savers/n2p/expout new file mode 100644 index 000000000..4deb25a0b --- /dev/null +++ b/test/cases/io-format-conversion-keystroke-savers/n2p/expout 
@@ -0,0 +1,11 @@ +1 2 3 4 5 +pan pan 1 0.34679014 0.72680286 +eks pan 2 0.75867996 0.52215111 +wye wye 3 0.20460331 0.33831853 +eks wye 4 0.38139939 0.13418874 +wye pan 5 0.57328892 0.86362447 +zee pan 6 0.52712616 0.49322129 +eks zee 7 0.61178406 0.18788492 +zee wye 8 0.59855401 0.97618139 +hat wye 9 0.03144188 0.74955076 +pan wye 10 0.50262601 0.95261836 diff --git a/test/cases/io-format-conversion-keystroke-savers/0022/cmd b/test/cases/io-format-conversion-keystroke-savers/n2t/cmd similarity index 100% rename from test/cases/io-format-conversion-keystroke-savers/0022/cmd rename to test/cases/io-format-conversion-keystroke-savers/n2t/cmd diff --git a/test/cases/verb-sub-gsub-ssub/0003/experr b/test/cases/io-format-conversion-keystroke-savers/n2t/experr similarity index 100% rename from test/cases/verb-sub-gsub-ssub/0003/experr rename to test/cases/io-format-conversion-keystroke-savers/n2t/experr diff --git a/test/cases/io-format-conversion-keystroke-savers/0022/expout b/test/cases/io-format-conversion-keystroke-savers/n2t/expout similarity index 100% rename from test/cases/io-format-conversion-keystroke-savers/0022/expout rename to test/cases/io-format-conversion-keystroke-savers/n2t/expout diff --git a/test/cases/io-format-conversion-keystroke-savers/0026/cmd b/test/cases/io-format-conversion-keystroke-savers/n2x/cmd similarity index 100% rename from test/cases/io-format-conversion-keystroke-savers/0026/cmd rename to test/cases/io-format-conversion-keystroke-savers/n2x/cmd diff --git a/test/cases/verb-sub-gsub-ssub/0004/experr b/test/cases/io-format-conversion-keystroke-savers/n2x/experr similarity index 100% rename from test/cases/verb-sub-gsub-ssub/0004/experr rename to test/cases/io-format-conversion-keystroke-savers/n2x/experr diff --git a/test/cases/io-format-conversion-keystroke-savers/0026/expout b/test/cases/io-format-conversion-keystroke-savers/n2x/expout similarity index 100% rename from test/cases/io-format-conversion-keystroke-savers/0026/expout 
rename to test/cases/io-format-conversion-keystroke-savers/n2x/expout diff --git a/test/cases/io-format-conversion-keystroke-savers/0035/cmd b/test/cases/io-format-conversion-keystroke-savers/p2c/cmd similarity index 100% rename from test/cases/io-format-conversion-keystroke-savers/0035/cmd rename to test/cases/io-format-conversion-keystroke-savers/p2c/cmd diff --git a/test/cases/io-format-conversion-keystroke-savers/p2c/experr b/test/cases/io-format-conversion-keystroke-savers/p2c/experr new file mode 100644 index 000000000..e69de29bb diff --git a/test/cases/io-format-conversion-keystroke-savers/0042/expout b/test/cases/io-format-conversion-keystroke-savers/p2c/expout similarity index 100% rename from test/cases/io-format-conversion-keystroke-savers/0042/expout rename to test/cases/io-format-conversion-keystroke-savers/p2c/expout diff --git a/test/cases/io-format-conversion-keystroke-savers/0037/cmd b/test/cases/io-format-conversion-keystroke-savers/p2d/cmd similarity index 100% rename from test/cases/io-format-conversion-keystroke-savers/0037/cmd rename to test/cases/io-format-conversion-keystroke-savers/p2d/cmd diff --git a/test/cases/io-format-conversion-keystroke-savers/p2d/experr b/test/cases/io-format-conversion-keystroke-savers/p2d/experr new file mode 100644 index 000000000..e69de29bb diff --git a/test/cases/io-format-conversion-keystroke-savers/0037/expout b/test/cases/io-format-conversion-keystroke-savers/p2d/expout similarity index 100% rename from test/cases/io-format-conversion-keystroke-savers/0037/expout rename to test/cases/io-format-conversion-keystroke-savers/p2d/expout diff --git a/test/cases/io-format-conversion-keystroke-savers/0039/cmd b/test/cases/io-format-conversion-keystroke-savers/p2j/cmd similarity index 100% rename from test/cases/io-format-conversion-keystroke-savers/0039/cmd rename to test/cases/io-format-conversion-keystroke-savers/p2j/cmd diff --git a/test/cases/io-format-conversion-keystroke-savers/p2j/experr 
b/test/cases/io-format-conversion-keystroke-savers/p2j/experr new file mode 100644 index 000000000..e69de29bb diff --git a/test/cases/io-format-conversion-keystroke-savers/0051/expout b/test/cases/io-format-conversion-keystroke-savers/p2j/expout similarity index 100% rename from test/cases/io-format-conversion-keystroke-savers/0051/expout rename to test/cases/io-format-conversion-keystroke-savers/p2j/expout diff --git a/test/cases/io-format-conversion-keystroke-savers/0041/cmd b/test/cases/io-format-conversion-keystroke-savers/p2m/cmd similarity index 100% rename from test/cases/io-format-conversion-keystroke-savers/0041/cmd rename to test/cases/io-format-conversion-keystroke-savers/p2m/cmd diff --git a/test/cases/io-format-conversion-keystroke-savers/p2m/experr b/test/cases/io-format-conversion-keystroke-savers/p2m/experr new file mode 100644 index 000000000..e69de29bb diff --git a/test/cases/io-format-conversion-keystroke-savers/0041/expout b/test/cases/io-format-conversion-keystroke-savers/p2m/expout similarity index 100% rename from test/cases/io-format-conversion-keystroke-savers/0041/expout rename to test/cases/io-format-conversion-keystroke-savers/p2m/expout diff --git a/test/cases/io-format-conversion-keystroke-savers/0047/cmd b/test/cases/io-format-conversion-keystroke-savers/p2m/x2p/cmd similarity index 100% rename from test/cases/io-format-conversion-keystroke-savers/0047/cmd rename to test/cases/io-format-conversion-keystroke-savers/p2m/x2p/cmd diff --git a/test/cases/io-format-conversion-keystroke-savers/p2m/x2p/experr b/test/cases/io-format-conversion-keystroke-savers/p2m/x2p/experr new file mode 100644 index 000000000..e69de29bb diff --git a/test/cases/io-format-conversion-keystroke-savers/p2m/x2p/expout b/test/cases/io-format-conversion-keystroke-savers/p2m/x2p/expout new file mode 100644 index 000000000..b8ac13481 --- /dev/null +++ b/test/cases/io-format-conversion-keystroke-savers/p2m/x2p/expout @@ -0,0 +1,11 @@ +a b i x y +pan pan 1 0.34679014 
0.72680286 +eks pan 2 0.75867996 0.52215111 +wye wye 3 0.20460331 0.33831853 +eks wye 4 0.38139939 0.13418874 +wye pan 5 0.57328892 0.86362447 +zee pan 6 0.52712616 0.49322129 +eks zee 7 0.61178406 0.18788492 +zee wye 8 0.59855401 0.97618139 +hat wye 9 0.03144188 0.74955076 +pan wye 10 0.50262601 0.95261836 diff --git a/test/cases/io-format-conversion-keystroke-savers/0038/cmd b/test/cases/io-format-conversion-keystroke-savers/p2n/cmd similarity index 100% rename from test/cases/io-format-conversion-keystroke-savers/0038/cmd rename to test/cases/io-format-conversion-keystroke-savers/p2n/cmd diff --git a/test/cases/io-format-conversion-keystroke-savers/p2n/experr b/test/cases/io-format-conversion-keystroke-savers/p2n/experr new file mode 100644 index 000000000..e69de29bb diff --git a/test/cases/io-format-conversion-keystroke-savers/0038/expout b/test/cases/io-format-conversion-keystroke-savers/p2n/expout similarity index 100% rename from test/cases/io-format-conversion-keystroke-savers/0038/expout rename to test/cases/io-format-conversion-keystroke-savers/p2n/expout diff --git a/test/cases/io-format-conversion-keystroke-savers/0036/cmd b/test/cases/io-format-conversion-keystroke-savers/p2t/cmd similarity index 100% rename from test/cases/io-format-conversion-keystroke-savers/0036/cmd rename to test/cases/io-format-conversion-keystroke-savers/p2t/cmd diff --git a/test/cases/io-format-conversion-keystroke-savers/p2t/experr b/test/cases/io-format-conversion-keystroke-savers/p2t/experr new file mode 100644 index 000000000..e69de29bb diff --git a/test/cases/io-format-conversion-keystroke-savers/0043/expout b/test/cases/io-format-conversion-keystroke-savers/p2t/expout similarity index 100% rename from test/cases/io-format-conversion-keystroke-savers/0043/expout rename to test/cases/io-format-conversion-keystroke-savers/p2t/expout diff --git a/test/cases/io-format-conversion-keystroke-savers/0040/cmd b/test/cases/io-format-conversion-keystroke-savers/p2x/cmd similarity 
index 100% rename from test/cases/io-format-conversion-keystroke-savers/0040/cmd rename to test/cases/io-format-conversion-keystroke-savers/p2x/cmd diff --git a/test/cases/io-format-conversion-keystroke-savers/p2x/experr b/test/cases/io-format-conversion-keystroke-savers/p2x/experr new file mode 100644 index 000000000..e69de29bb diff --git a/test/cases/io-format-conversion-keystroke-savers/0040/expout b/test/cases/io-format-conversion-keystroke-savers/p2x/expout similarity index 100% rename from test/cases/io-format-conversion-keystroke-savers/0040/expout rename to test/cases/io-format-conversion-keystroke-savers/p2x/expout diff --git a/test/cases/io-format-conversion-keystroke-savers/0050/cmd b/test/cases/io-format-conversion-keystroke-savers/t/cmd similarity index 100% rename from test/cases/io-format-conversion-keystroke-savers/0050/cmd rename to test/cases/io-format-conversion-keystroke-savers/t/cmd diff --git a/test/cases/io-format-conversion-keystroke-savers/t/experr b/test/cases/io-format-conversion-keystroke-savers/t/experr new file mode 100644 index 000000000..e69de29bb diff --git a/test/cases/io-format-conversion-keystroke-savers/0050/expout b/test/cases/io-format-conversion-keystroke-savers/t/expout similarity index 100% rename from test/cases/io-format-conversion-keystroke-savers/0050/expout rename to test/cases/io-format-conversion-keystroke-savers/t/expout diff --git a/test/cases/io-format-conversion-keystroke-savers/0014/cmd b/test/cases/io-format-conversion-keystroke-savers/t2c/cmd similarity index 100% rename from test/cases/io-format-conversion-keystroke-savers/0014/cmd rename to test/cases/io-format-conversion-keystroke-savers/t2c/cmd diff --git a/test/cases/io-format-conversion-keystroke-savers/t2c/experr b/test/cases/io-format-conversion-keystroke-savers/t2c/experr new file mode 100644 index 000000000..e69de29bb diff --git a/test/cases/io-format-conversion-keystroke-savers/0049/expout 
b/test/cases/io-format-conversion-keystroke-savers/t2c/expout similarity index 100% rename from test/cases/io-format-conversion-keystroke-savers/0049/expout rename to test/cases/io-format-conversion-keystroke-savers/t2c/expout diff --git a/test/cases/io-format-conversion-keystroke-savers/0015/cmd b/test/cases/io-format-conversion-keystroke-savers/t2d/cmd similarity index 100% rename from test/cases/io-format-conversion-keystroke-savers/0015/cmd rename to test/cases/io-format-conversion-keystroke-savers/t2d/cmd diff --git a/test/cases/io-format-conversion-keystroke-savers/t2d/experr b/test/cases/io-format-conversion-keystroke-savers/t2d/experr new file mode 100644 index 000000000..e69de29bb diff --git a/test/cases/io-format-conversion-keystroke-savers/0044/expout b/test/cases/io-format-conversion-keystroke-savers/t2d/expout similarity index 100% rename from test/cases/io-format-conversion-keystroke-savers/0044/expout rename to test/cases/io-format-conversion-keystroke-savers/t2d/expout diff --git a/test/cases/io-format-conversion-keystroke-savers/0017/cmd b/test/cases/io-format-conversion-keystroke-savers/t2j/cmd similarity index 100% rename from test/cases/io-format-conversion-keystroke-savers/0017/cmd rename to test/cases/io-format-conversion-keystroke-savers/t2j/cmd diff --git a/test/cases/io-format-conversion-keystroke-savers/t2j/experr b/test/cases/io-format-conversion-keystroke-savers/t2j/experr new file mode 100644 index 000000000..e69de29bb diff --git a/test/cases/io-format-conversion-keystroke-savers/t2j/expout b/test/cases/io-format-conversion-keystroke-savers/t2j/expout new file mode 100644 index 000000000..48da25210 --- /dev/null +++ b/test/cases/io-format-conversion-keystroke-savers/t2j/expout @@ -0,0 +1,72 @@ +[ +{ + "a": "pan", + "b": "pan", + "i": 1, + "x": 0.34679014, + "y": 0.72680286 +}, +{ + "a": "eks", + "b": "pan", + "i": 2, + "x": 0.75867996, + "y": 0.52215111 +}, +{ + "a": "wye", + "b": "wye", + "i": 3, + "x": 0.20460331, + "y": 0.33831853 
+}, +{ + "a": "eks", + "b": "wye", + "i": 4, + "x": 0.38139939, + "y": 0.13418874 +}, +{ + "a": "wye", + "b": "pan", + "i": 5, + "x": 0.57328892, + "y": 0.86362447 +}, +{ + "a": "zee", + "b": "pan", + "i": 6, + "x": 0.52712616, + "y": 0.49322129 +}, +{ + "a": "eks", + "b": "zee", + "i": 7, + "x": 0.61178406, + "y": 0.18788492 +}, +{ + "a": "zee", + "b": "wye", + "i": 8, + "x": 0.59855401, + "y": 0.97618139 +}, +{ + "a": "hat", + "b": "wye", + "i": 9, + "x": 0.03144188, + "y": 0.74955076 +}, +{ + "a": "pan", + "b": "wye", + "i": 10, + "x": 0.50262601, + "y": 0.95261836 +} +] diff --git a/test/cases/io-format-conversion-keystroke-savers/0020/cmd b/test/cases/io-format-conversion-keystroke-savers/t2m/cmd similarity index 100% rename from test/cases/io-format-conversion-keystroke-savers/0020/cmd rename to test/cases/io-format-conversion-keystroke-savers/t2m/cmd diff --git a/test/cases/io-format-conversion-keystroke-savers/t2m/experr b/test/cases/io-format-conversion-keystroke-savers/t2m/experr new file mode 100644 index 000000000..e69de29bb diff --git a/test/cases/io-format-conversion-keystroke-savers/0048/expout b/test/cases/io-format-conversion-keystroke-savers/t2m/expout similarity index 100% rename from test/cases/io-format-conversion-keystroke-savers/0048/expout rename to test/cases/io-format-conversion-keystroke-savers/t2m/expout diff --git a/test/cases/io-format-conversion-keystroke-savers/0018/cmd b/test/cases/io-format-conversion-keystroke-savers/t2m/t2p/cmd similarity index 100% rename from test/cases/io-format-conversion-keystroke-savers/0018/cmd rename to test/cases/io-format-conversion-keystroke-savers/t2m/t2p/cmd diff --git a/test/cases/io-format-conversion-keystroke-savers/t2m/t2p/experr b/test/cases/io-format-conversion-keystroke-savers/t2m/t2p/experr new file mode 100644 index 000000000..e69de29bb diff --git a/test/cases/io-format-conversion-keystroke-savers/t2m/t2p/expout b/test/cases/io-format-conversion-keystroke-savers/t2m/t2p/expout new file mode 
100644 index 000000000..b8ac13481 --- /dev/null +++ b/test/cases/io-format-conversion-keystroke-savers/t2m/t2p/expout @@ -0,0 +1,11 @@ +a b i x y +pan pan 1 0.34679014 0.72680286 +eks pan 2 0.75867996 0.52215111 +wye wye 3 0.20460331 0.33831853 +eks wye 4 0.38139939 0.13418874 +wye pan 5 0.57328892 0.86362447 +zee pan 6 0.52712616 0.49322129 +eks zee 7 0.61178406 0.18788492 +zee wye 8 0.59855401 0.97618139 +hat wye 9 0.03144188 0.74955076 +pan wye 10 0.50262601 0.95261836 diff --git a/test/cases/io-format-conversion-keystroke-savers/0016/cmd b/test/cases/io-format-conversion-keystroke-savers/t2n/cmd similarity index 100% rename from test/cases/io-format-conversion-keystroke-savers/0016/cmd rename to test/cases/io-format-conversion-keystroke-savers/t2n/cmd diff --git a/test/cases/io-format-conversion-keystroke-savers/t2n/experr b/test/cases/io-format-conversion-keystroke-savers/t2n/experr new file mode 100644 index 000000000..e69de29bb diff --git a/test/cases/io-format-conversion-keystroke-savers/0045/expout b/test/cases/io-format-conversion-keystroke-savers/t2n/expout similarity index 100% rename from test/cases/io-format-conversion-keystroke-savers/0045/expout rename to test/cases/io-format-conversion-keystroke-savers/t2n/expout diff --git a/test/cases/io-format-conversion-keystroke-savers/t2p/cmd b/test/cases/io-format-conversion-keystroke-savers/t2p/cmd new file mode 100644 index 000000000..7963d101b --- /dev/null +++ b/test/cases/io-format-conversion-keystroke-savers/t2p/cmd @@ -0,0 +1 @@ +mlr --t2p cat test/input/abixy.tsv diff --git a/test/cases/io-format-conversion-keystroke-savers/t2p/experr b/test/cases/io-format-conversion-keystroke-savers/t2p/experr new file mode 100644 index 000000000..e69de29bb diff --git a/test/cases/io-format-conversion-keystroke-savers/t2p/expout b/test/cases/io-format-conversion-keystroke-savers/t2p/expout new file mode 100644 index 000000000..b8ac13481 --- /dev/null +++ b/test/cases/io-format-conversion-keystroke-savers/t2p/expout 
@@ -0,0 +1,11 @@ +a b i x y +pan pan 1 0.34679014 0.72680286 +eks pan 2 0.75867996 0.52215111 +wye wye 3 0.20460331 0.33831853 +eks wye 4 0.38139939 0.13418874 +wye pan 5 0.57328892 0.86362447 +zee pan 6 0.52712616 0.49322129 +eks zee 7 0.61178406 0.18788492 +zee wye 8 0.59855401 0.97618139 +hat wye 9 0.03144188 0.74955076 +pan wye 10 0.50262601 0.95261836 diff --git a/test/cases/io-format-conversion-keystroke-savers/0019/cmd b/test/cases/io-format-conversion-keystroke-savers/t2x/cmd similarity index 100% rename from test/cases/io-format-conversion-keystroke-savers/0019/cmd rename to test/cases/io-format-conversion-keystroke-savers/t2x/cmd diff --git a/test/cases/io-format-conversion-keystroke-savers/t2x/experr b/test/cases/io-format-conversion-keystroke-savers/t2x/experr new file mode 100644 index 000000000..e69de29bb diff --git a/test/cases/io-format-conversion-keystroke-savers/t2x/expout b/test/cases/io-format-conversion-keystroke-savers/t2x/expout new file mode 100644 index 000000000..9c955fb87 --- /dev/null +++ b/test/cases/io-format-conversion-keystroke-savers/t2x/expout @@ -0,0 +1,59 @@ +a pan +b pan +i 1 +x 0.34679014 +y 0.72680286 + +a eks +b pan +i 2 +x 0.75867996 +y 0.52215111 + +a wye +b wye +i 3 +x 0.20460331 +y 0.33831853 + +a eks +b wye +i 4 +x 0.38139939 +y 0.13418874 + +a wye +b pan +i 5 +x 0.57328892 +y 0.86362447 + +a zee +b pan +i 6 +x 0.52712616 +y 0.49322129 + +a eks +b zee +i 7 +x 0.61178406 +y 0.18788492 + +a zee +b wye +i 8 +x 0.59855401 +y 0.97618139 + +a hat +b wye +i 9 +x 0.03144188 +y 0.74955076 + +a pan +b wye +i 10 +x 0.50262601 +y 0.95261836 diff --git a/test/cases/io-format-conversion-keystroke-savers/0055/cmd b/test/cases/io-format-conversion-keystroke-savers/tsv/cmd similarity index 100% rename from test/cases/io-format-conversion-keystroke-savers/0055/cmd rename to test/cases/io-format-conversion-keystroke-savers/tsv/cmd diff --git a/test/cases/io-format-conversion-keystroke-savers/tsv/experr 
b/test/cases/io-format-conversion-keystroke-savers/tsv/experr new file mode 100644 index 000000000..e69de29bb diff --git a/test/cases/io-format-conversion-keystroke-savers/0055/expout b/test/cases/io-format-conversion-keystroke-savers/tsv/expout similarity index 100% rename from test/cases/io-format-conversion-keystroke-savers/0055/expout rename to test/cases/io-format-conversion-keystroke-savers/tsv/expout diff --git a/test/cases/io-format-conversion-keystroke-savers/0054/input.tsv b/test/cases/io-format-conversion-keystroke-savers/tsv/input.tsv similarity index 100% rename from test/cases/io-format-conversion-keystroke-savers/0054/input.tsv rename to test/cases/io-format-conversion-keystroke-savers/tsv/input.tsv diff --git a/test/cases/io-format-conversion-keystroke-savers/0054/cmd b/test/cases/io-format-conversion-keystroke-savers/tsvlite/cmd similarity index 100% rename from test/cases/io-format-conversion-keystroke-savers/0054/cmd rename to test/cases/io-format-conversion-keystroke-savers/tsvlite/cmd diff --git a/test/cases/io-format-conversion-keystroke-savers/tsvlite/experr b/test/cases/io-format-conversion-keystroke-savers/tsvlite/experr new file mode 100644 index 000000000..e69de29bb diff --git a/test/cases/io-format-conversion-keystroke-savers/0054/expout b/test/cases/io-format-conversion-keystroke-savers/tsvlite/expout similarity index 100% rename from test/cases/io-format-conversion-keystroke-savers/0054/expout rename to test/cases/io-format-conversion-keystroke-savers/tsvlite/expout diff --git a/test/cases/io-format-conversion-keystroke-savers/0055/input.tsv b/test/cases/io-format-conversion-keystroke-savers/tsvlite/input.tsv similarity index 100% rename from test/cases/io-format-conversion-keystroke-savers/0055/input.tsv rename to test/cases/io-format-conversion-keystroke-savers/tsvlite/input.tsv diff --git a/test/cases/io-format-conversion-keystroke-savers/0042/cmd b/test/cases/io-format-conversion-keystroke-savers/x2c/cmd similarity index 100% 
rename from test/cases/io-format-conversion-keystroke-savers/0042/cmd rename to test/cases/io-format-conversion-keystroke-savers/x2c/cmd diff --git a/test/cases/io-format-conversion-keystroke-savers/x2c/experr b/test/cases/io-format-conversion-keystroke-savers/x2c/experr new file mode 100644 index 000000000..e69de29bb diff --git a/test/cases/io-format-conversion-keystroke-savers/x2c/expout b/test/cases/io-format-conversion-keystroke-savers/x2c/expout new file mode 100644 index 000000000..37eac50e4 --- /dev/null +++ b/test/cases/io-format-conversion-keystroke-savers/x2c/expout @@ -0,0 +1,11 @@ +a,b,i,x,y +pan,pan,1,0.34679014,0.72680286 +eks,pan,2,0.75867996,0.52215111 +wye,wye,3,0.20460331,0.33831853 +eks,wye,4,0.38139939,0.13418874 +wye,pan,5,0.57328892,0.86362447 +zee,pan,6,0.52712616,0.49322129 +eks,zee,7,0.61178406,0.18788492 +zee,wye,8,0.59855401,0.97618139 +hat,wye,9,0.03144188,0.74955076 +pan,wye,10,0.50262601,0.95261836 diff --git a/test/cases/io-format-conversion-keystroke-savers/0044/cmd b/test/cases/io-format-conversion-keystroke-savers/x2d/cmd similarity index 100% rename from test/cases/io-format-conversion-keystroke-savers/0044/cmd rename to test/cases/io-format-conversion-keystroke-savers/x2d/cmd diff --git a/test/cases/io-format-conversion-keystroke-savers/x2d/experr b/test/cases/io-format-conversion-keystroke-savers/x2d/experr new file mode 100644 index 000000000..e69de29bb diff --git a/test/cases/io-format-conversion-keystroke-savers/x2d/expout b/test/cases/io-format-conversion-keystroke-savers/x2d/expout new file mode 100644 index 000000000..940df3d11 --- /dev/null +++ b/test/cases/io-format-conversion-keystroke-savers/x2d/expout @@ -0,0 +1,10 @@ +a=pan,b=pan,i=1,x=0.34679014,y=0.72680286 +a=eks,b=pan,i=2,x=0.75867996,y=0.52215111 +a=wye,b=wye,i=3,x=0.20460331,y=0.33831853 +a=eks,b=wye,i=4,x=0.38139939,y=0.13418874 +a=wye,b=pan,i=5,x=0.57328892,y=0.86362447 +a=zee,b=pan,i=6,x=0.52712616,y=0.49322129 +a=eks,b=zee,i=7,x=0.61178406,y=0.18788492 
+a=zee,b=wye,i=8,x=0.59855401,y=0.97618139 +a=hat,b=wye,i=9,x=0.03144188,y=0.74955076 +a=pan,b=wye,i=10,x=0.50262601,y=0.95261836 diff --git a/test/cases/io-format-conversion-keystroke-savers/0046/cmd b/test/cases/io-format-conversion-keystroke-savers/x2j/cmd similarity index 100% rename from test/cases/io-format-conversion-keystroke-savers/0046/cmd rename to test/cases/io-format-conversion-keystroke-savers/x2j/cmd diff --git a/test/cases/io-format-conversion-keystroke-savers/x2j/experr b/test/cases/io-format-conversion-keystroke-savers/x2j/experr new file mode 100644 index 000000000..e69de29bb diff --git a/test/cases/io-format-conversion-keystroke-savers/x2j/expout b/test/cases/io-format-conversion-keystroke-savers/x2j/expout new file mode 100644 index 000000000..48da25210 --- /dev/null +++ b/test/cases/io-format-conversion-keystroke-savers/x2j/expout @@ -0,0 +1,72 @@ +[ +{ + "a": "pan", + "b": "pan", + "i": 1, + "x": 0.34679014, + "y": 0.72680286 +}, +{ + "a": "eks", + "b": "pan", + "i": 2, + "x": 0.75867996, + "y": 0.52215111 +}, +{ + "a": "wye", + "b": "wye", + "i": 3, + "x": 0.20460331, + "y": 0.33831853 +}, +{ + "a": "eks", + "b": "wye", + "i": 4, + "x": 0.38139939, + "y": 0.13418874 +}, +{ + "a": "wye", + "b": "pan", + "i": 5, + "x": 0.57328892, + "y": 0.86362447 +}, +{ + "a": "zee", + "b": "pan", + "i": 6, + "x": 0.52712616, + "y": 0.49322129 +}, +{ + "a": "eks", + "b": "zee", + "i": 7, + "x": 0.61178406, + "y": 0.18788492 +}, +{ + "a": "zee", + "b": "wye", + "i": 8, + "x": 0.59855401, + "y": 0.97618139 +}, +{ + "a": "hat", + "b": "wye", + "i": 9, + "x": 0.03144188, + "y": 0.74955076 +}, +{ + "a": "pan", + "b": "wye", + "i": 10, + "x": 0.50262601, + "y": 0.95261836 +} +] diff --git a/test/cases/io-format-conversion-keystroke-savers/0048/cmd b/test/cases/io-format-conversion-keystroke-savers/x2m/cmd similarity index 100% rename from test/cases/io-format-conversion-keystroke-savers/0048/cmd rename to test/cases/io-format-conversion-keystroke-savers/x2m/cmd 
diff --git a/test/cases/io-format-conversion-keystroke-savers/x2m/experr b/test/cases/io-format-conversion-keystroke-savers/x2m/experr new file mode 100644 index 000000000..e69de29bb diff --git a/test/cases/io-format-conversion-keystroke-savers/x2m/expout b/test/cases/io-format-conversion-keystroke-savers/x2m/expout new file mode 100644 index 000000000..3d77a0324 --- /dev/null +++ b/test/cases/io-format-conversion-keystroke-savers/x2m/expout @@ -0,0 +1,12 @@ +| a | b | i | x | y | +| --- | --- | --- | --- | --- | +| pan | pan | 1 | 0.34679014 | 0.72680286 | +| eks | pan | 2 | 0.75867996 | 0.52215111 | +| wye | wye | 3 | 0.20460331 | 0.33831853 | +| eks | wye | 4 | 0.38139939 | 0.13418874 | +| wye | pan | 5 | 0.57328892 | 0.86362447 | +| zee | pan | 6 | 0.52712616 | 0.49322129 | +| eks | zee | 7 | 0.61178406 | 0.18788492 | +| zee | wye | 8 | 0.59855401 | 0.97618139 | +| hat | wye | 9 | 0.03144188 | 0.74955076 | +| pan | wye | 10 | 0.50262601 | 0.95261836 | diff --git a/test/cases/io-format-conversion-keystroke-savers/x2m/x2p/cmd b/test/cases/io-format-conversion-keystroke-savers/x2m/x2p/cmd new file mode 100644 index 000000000..2dad8a232 --- /dev/null +++ b/test/cases/io-format-conversion-keystroke-savers/x2m/x2p/cmd @@ -0,0 +1 @@ +mlr --x2p cat test/input/abixy.xtab diff --git a/test/cases/io-format-conversion-keystroke-savers/x2m/x2p/experr b/test/cases/io-format-conversion-keystroke-savers/x2m/x2p/experr new file mode 100644 index 000000000..e69de29bb diff --git a/test/cases/io-format-conversion-keystroke-savers/x2m/x2p/expout b/test/cases/io-format-conversion-keystroke-savers/x2m/x2p/expout new file mode 100644 index 000000000..b8ac13481 --- /dev/null +++ b/test/cases/io-format-conversion-keystroke-savers/x2m/x2p/expout @@ -0,0 +1,11 @@ +a b i x y +pan pan 1 0.34679014 0.72680286 +eks pan 2 0.75867996 0.52215111 +wye wye 3 0.20460331 0.33831853 +eks wye 4 0.38139939 0.13418874 +wye pan 5 0.57328892 0.86362447 +zee pan 6 0.52712616 0.49322129 +eks zee 7 
0.61178406 0.18788492 +zee wye 8 0.59855401 0.97618139 +hat wye 9 0.03144188 0.74955076 +pan wye 10 0.50262601 0.95261836 diff --git a/test/cases/io-format-conversion-keystroke-savers/0045/cmd b/test/cases/io-format-conversion-keystroke-savers/x2n/cmd similarity index 100% rename from test/cases/io-format-conversion-keystroke-savers/0045/cmd rename to test/cases/io-format-conversion-keystroke-savers/x2n/cmd diff --git a/test/cases/io-format-conversion-keystroke-savers/x2n/experr b/test/cases/io-format-conversion-keystroke-savers/x2n/experr new file mode 100644 index 000000000..e69de29bb diff --git a/test/cases/io-format-conversion-keystroke-savers/x2n/expout b/test/cases/io-format-conversion-keystroke-savers/x2n/expout new file mode 100644 index 000000000..17f7e1ee6 --- /dev/null +++ b/test/cases/io-format-conversion-keystroke-savers/x2n/expout @@ -0,0 +1,10 @@ +pan pan 1 0.34679014 0.72680286 +eks pan 2 0.75867996 0.52215111 +wye wye 3 0.20460331 0.33831853 +eks wye 4 0.38139939 0.13418874 +wye pan 5 0.57328892 0.86362447 +zee pan 6 0.52712616 0.49322129 +eks zee 7 0.61178406 0.18788492 +zee wye 8 0.59855401 0.97618139 +hat wye 9 0.03144188 0.74955076 +pan wye 10 0.50262601 0.95261836 diff --git a/test/cases/io-format-conversion-keystroke-savers/x2p/cmd b/test/cases/io-format-conversion-keystroke-savers/x2p/cmd new file mode 100644 index 000000000..2dad8a232 --- /dev/null +++ b/test/cases/io-format-conversion-keystroke-savers/x2p/cmd @@ -0,0 +1 @@ +mlr --x2p cat test/input/abixy.xtab diff --git a/test/cases/io-format-conversion-keystroke-savers/x2p/experr b/test/cases/io-format-conversion-keystroke-savers/x2p/experr new file mode 100644 index 000000000..e69de29bb diff --git a/test/cases/io-format-conversion-keystroke-savers/x2p/expout b/test/cases/io-format-conversion-keystroke-savers/x2p/expout new file mode 100644 index 000000000..b8ac13481 --- /dev/null +++ b/test/cases/io-format-conversion-keystroke-savers/x2p/expout @@ -0,0 +1,11 @@ +a b i x y +pan pan 1 
0.34679014 0.72680286 +eks pan 2 0.75867996 0.52215111 +wye wye 3 0.20460331 0.33831853 +eks wye 4 0.38139939 0.13418874 +wye pan 5 0.57328892 0.86362447 +zee pan 6 0.52712616 0.49322129 +eks zee 7 0.61178406 0.18788492 +zee wye 8 0.59855401 0.97618139 +hat wye 9 0.03144188 0.74955076 +pan wye 10 0.50262601 0.95261836 diff --git a/test/cases/io-format-conversion-keystroke-savers/0043/cmd b/test/cases/io-format-conversion-keystroke-savers/x2t/cmd similarity index 100% rename from test/cases/io-format-conversion-keystroke-savers/0043/cmd rename to test/cases/io-format-conversion-keystroke-savers/x2t/cmd diff --git a/test/cases/io-format-conversion-keystroke-savers/x2t/experr b/test/cases/io-format-conversion-keystroke-savers/x2t/experr new file mode 100644 index 000000000..e69de29bb diff --git a/test/cases/io-format-conversion-keystroke-savers/x2t/expout b/test/cases/io-format-conversion-keystroke-savers/x2t/expout new file mode 100644 index 000000000..03ac8f384 --- /dev/null +++ b/test/cases/io-format-conversion-keystroke-savers/x2t/expout @@ -0,0 +1,11 @@ +a b i x y +pan pan 1 0.34679014 0.72680286 +eks pan 2 0.75867996 0.52215111 +wye wye 3 0.20460331 0.33831853 +eks wye 4 0.38139939 0.13418874 +wye pan 5 0.57328892 0.86362447 +zee pan 6 0.52712616 0.49322129 +eks zee 7 0.61178406 0.18788492 +zee wye 8 0.59855401 0.97618139 +hat wye 9 0.03144188 0.74955076 +pan wye 10 0.50262601 0.95261836 diff --git a/test/cases/io-json-io/0036/cmd b/test/cases/io-json-io/0036/cmd new file mode 100644 index 000000000..a298f0f2e --- /dev/null +++ b/test/cases/io-json-io/0036/cmd @@ -0,0 +1 @@ +mlr --ijson --opprint cat test/input/binary.json diff --git a/test/cases/io-json-io/0036/experr b/test/cases/io-json-io/0036/experr new file mode 100644 index 000000000..e69de29bb diff --git a/test/cases/io-json-io/0036/expout b/test/cases/io-json-io/0036/expout new file mode 100644 index 000000000..dd883f4e5 --- /dev/null +++ b/test/cases/io-json-io/0036/expout @@ -0,0 +1,2 @@ +msg +X����Y 
diff --git a/test/cases/io-json-io/0037/cmd b/test/cases/io-json-io/0037/cmd new file mode 100644 index 000000000..abcffb242 --- /dev/null +++ b/test/cases/io-json-io/0037/cmd @@ -0,0 +1 @@ +mlr -j cat test/input/binary.json diff --git a/test/cases/io-json-io/0037/experr b/test/cases/io-json-io/0037/experr new file mode 100644 index 000000000..e69de29bb diff --git a/test/cases/io-json-io/0037/expout b/test/cases/io-json-io/0037/expout new file mode 100644 index 000000000..9bf2f47be --- /dev/null +++ b/test/cases/io-json-io/0037/expout @@ -0,0 +1,5 @@ +[ +{ + "msg": "X\u0001\b����\u0012Y" +} +] diff --git a/test/cases/io-jsonl-io/0004/cmd b/test/cases/io-jsonl-io/0004/cmd index 380bba0ca..8aa87f37e 100644 --- a/test/cases/io-jsonl-io/0004/cmd +++ b/test/cases/io-jsonl-io/0004/cmd @@ -1 +1 @@ -mlr --ojsonl cat test/input/json-output-options.dkvp +mlr -o jsonl cat test/input/json-output-options.dkvp diff --git a/test/cases/io-markdown-output/0001/cmd b/test/cases/io-markdown/0001/cmd similarity index 100% rename from test/cases/io-markdown-output/0001/cmd rename to test/cases/io-markdown/0001/cmd diff --git a/test/cases/io-markdown/0001/experr b/test/cases/io-markdown/0001/experr new file mode 100644 index 000000000..e69de29bb diff --git a/test/cases/io-markdown-output/0001/expout b/test/cases/io-markdown/0001/expout similarity index 100% rename from test/cases/io-markdown-output/0001/expout rename to test/cases/io-markdown/0001/expout diff --git a/test/cases/io-markdown-output/0002/cmd b/test/cases/io-markdown/0002/cmd similarity index 100% rename from test/cases/io-markdown-output/0002/cmd rename to test/cases/io-markdown/0002/cmd diff --git a/test/cases/io-markdown/0002/experr b/test/cases/io-markdown/0002/experr new file mode 100644 index 000000000..e69de29bb diff --git a/test/cases/io-markdown-output/0002/expout b/test/cases/io-markdown/0002/expout similarity index 100% rename from test/cases/io-markdown-output/0002/expout rename to 
test/cases/io-markdown/0002/expout diff --git a/test/cases/io-markdown-output/0002/input b/test/cases/io-markdown/0002/input similarity index 100% rename from test/cases/io-markdown-output/0002/input rename to test/cases/io-markdown/0002/input diff --git a/test/cases/io-markdown/markdown-input-headerless/cmd b/test/cases/io-markdown/markdown-input-headerless/cmd new file mode 100644 index 000000000..f38836b97 --- /dev/null +++ b/test/cases/io-markdown/markdown-input-headerless/cmd @@ -0,0 +1 @@ +mlr --hi -i markdown -o json cat test/input/abixy.md diff --git a/test/cases/io-markdown/markdown-input-headerless/experr b/test/cases/io-markdown/markdown-input-headerless/experr new file mode 100644 index 000000000..e69de29bb diff --git a/test/cases/io-markdown/markdown-input-headerless/expout b/test/cases/io-markdown/markdown-input-headerless/expout new file mode 100644 index 000000000..e08b56a90 --- /dev/null +++ b/test/cases/io-markdown/markdown-input-headerless/expout @@ -0,0 +1,79 @@ +[ +{ + "1": "a", + "2": "b", + "3": "i", + "4": "x", + "5": "y" +}, +{ + "1": "pan", + "2": "pan", + "3": 1, + "4": 0.34679014, + "5": 0.72680286 +}, +{ + "1": "eks", + "2": "pan", + "3": 2, + "4": 0.75867996, + "5": 0.52215111 +}, +{ + "1": "wye", + "2": "wye", + "3": 3, + "4": 0.20460331, + "5": 0.33831853 +}, +{ + "1": "eks", + "2": "wye", + "3": 4, + "4": 0.38139939, + "5": 0.13418874 +}, +{ + "1": "wye", + "2": "pan", + "3": 5, + "4": 0.57328892, + "5": 0.86362447 +}, +{ + "1": "zee", + "2": "pan", + "3": 6, + "4": 0.52712616, + "5": 0.49322129 +}, +{ + "1": "eks", + "2": "zee", + "3": 7, + "4": 0.61178406, + "5": 0.18788492 +}, +{ + "1": "zee", + "2": "wye", + "3": 8, + "4": 0.59855401, + "5": 0.97618139 +}, +{ + "1": "hat", + "2": "wye", + "3": 9, + "4": 0.03144188, + "5": 0.74955076 +}, +{ + "1": "pan", + "2": "wye", + "3": 10, + "4": 0.50262601, + "5": 0.95261836 +} +] diff --git a/test/cases/io-markdown/markdown-input/cmd b/test/cases/io-markdown/markdown-input/cmd new file 
mode 100644 index 000000000..5dd4673c5 --- /dev/null +++ b/test/cases/io-markdown/markdown-input/cmd @@ -0,0 +1 @@ +mlr -i markdown -o json cat test/input/abixy.md diff --git a/test/cases/io-markdown/markdown-input/experr b/test/cases/io-markdown/markdown-input/experr new file mode 100644 index 000000000..e69de29bb diff --git a/test/cases/io-markdown/markdown-input/expout b/test/cases/io-markdown/markdown-input/expout new file mode 100644 index 000000000..48da25210 --- /dev/null +++ b/test/cases/io-markdown/markdown-input/expout @@ -0,0 +1,72 @@ +[ +{ + "a": "pan", + "b": "pan", + "i": 1, + "x": 0.34679014, + "y": 0.72680286 +}, +{ + "a": "eks", + "b": "pan", + "i": 2, + "x": 0.75867996, + "y": 0.52215111 +}, +{ + "a": "wye", + "b": "wye", + "i": 3, + "x": 0.20460331, + "y": 0.33831853 +}, +{ + "a": "eks", + "b": "wye", + "i": 4, + "x": 0.38139939, + "y": 0.13418874 +}, +{ + "a": "wye", + "b": "pan", + "i": 5, + "x": 0.57328892, + "y": 0.86362447 +}, +{ + "a": "zee", + "b": "pan", + "i": 6, + "x": 0.52712616, + "y": 0.49322129 +}, +{ + "a": "eks", + "b": "zee", + "i": 7, + "x": 0.61178406, + "y": 0.18788492 +}, +{ + "a": "zee", + "b": "wye", + "i": 8, + "x": 0.59855401, + "y": 0.97618139 +}, +{ + "a": "hat", + "b": "wye", + "i": 9, + "x": 0.03144188, + "y": 0.74955076 +}, +{ + "a": "pan", + "b": "wye", + "i": 10, + "x": 0.50262601, + "y": 0.95261836 +} +] diff --git a/test/cases/io-multi/0010/experr b/test/cases/io-multi/0010/experr index e69de29bb..15e296abb 100644 --- a/test/cases/io-multi/0010/experr +++ b/test/cases/io-multi/0010/experr @@ -0,0 +1,2 @@ +mlr: CSV schema change: first keys "host"; current keys "df/tmp,uptime" +mlr: exiting due to data error. 
diff --git a/test/cases/io-multi/0010/expout b/test/cases/io-multi/0010/expout index 0d20e38d9..57d47ff76 100644 --- a/test/cases/io-multi/0010/expout +++ b/test/cases/io-multi/0010/expout @@ -1,35 +1,2 @@ host jupiter - -df/tmp,uptime -2.43MB,32345sec - -host -saturn - -df/tmp,uptime -1.34MB,234214132sec - -host -mars - -df/tmp,uptime -4.97MB,345089805sec - -host -jupiter - -df/tmp,uptime -0.04MB,890sec - -host -mars - -df/tmp,uptime -8.55MB,787897777sec - -host -saturn - -df/tmp,uptime -9.47MB,234289080sec diff --git a/test/cases/io-multi/0010/should-fail b/test/cases/io-multi/0010/should-fail new file mode 100644 index 000000000..e69de29bb diff --git a/test/cases/io-multi/0033/experr b/test/cases/io-multi/0033/experr index e69de29bb..15e296abb 100644 --- a/test/cases/io-multi/0033/experr +++ b/test/cases/io-multi/0033/experr @@ -0,0 +1,2 @@ +mlr: CSV schema change: first keys "host"; current keys "df/tmp,uptime" +mlr: exiting due to data error. diff --git a/test/cases/io-multi/0033/expout b/test/cases/io-multi/0033/expout index 0d20e38d9..57d47ff76 100644 --- a/test/cases/io-multi/0033/expout +++ b/test/cases/io-multi/0033/expout @@ -1,35 +1,2 @@ host jupiter - -df/tmp,uptime -2.43MB,32345sec - -host -saturn - -df/tmp,uptime -1.34MB,234214132sec - -host -mars - -df/tmp,uptime -4.97MB,345089805sec - -host -jupiter - -df/tmp,uptime -0.04MB,890sec - -host -mars - -df/tmp,uptime -8.55MB,787897777sec - -host -saturn - -df/tmp,uptime -9.47MB,234289080sec diff --git a/test/cases/io-multi/0033/should-fail b/test/cases/io-multi/0033/should-fail new file mode 100644 index 000000000..e69de29bb diff --git a/test/cases/io-multi/0034/experr b/test/cases/io-multi/0034/experr index e69de29bb..15e296abb 100644 --- a/test/cases/io-multi/0034/experr +++ b/test/cases/io-multi/0034/experr @@ -0,0 +1,2 @@ +mlr: CSV schema change: first keys "host"; current keys "df/tmp,uptime" +mlr: exiting due to data error. 
diff --git a/test/cases/io-multi/0034/expout b/test/cases/io-multi/0034/expout index 2a14e7a0b..9ad9ee391 100644 --- a/test/cases/io-multi/0034/expout +++ b/test/cases/io-multi/0034/expout @@ -1,23 +1 @@ jupiter - -2.43MB,32345sec - -saturn - -1.34MB,234214132sec - -mars - -4.97MB,345089805sec - -jupiter - -0.04MB,890sec - -mars - -8.55MB,787897777sec - -saturn - -9.47MB,234289080sec diff --git a/test/cases/io-multi/0034/should-fail b/test/cases/io-multi/0034/should-fail new file mode 100644 index 000000000..e69de29bb diff --git a/test/cases/io-multi/0053/expout b/test/cases/io-multi/0053/expout index 0d4f101c7..e69de29bb 100644 --- a/test/cases/io-multi/0053/expout +++ b/test/cases/io-multi/0053/expout @@ -1,2 +0,0 @@ -[ -] diff --git a/test/cases/io-multi/0057/expout b/test/cases/io-multi/0057/expout index 0d4f101c7..e69de29bb 100644 --- a/test/cases/io-multi/0057/expout +++ b/test/cases/io-multi/0057/expout @@ -1,2 +0,0 @@ -[ -] diff --git a/test/cases/io-skip-pass-comments/pr-1346/cmd b/test/cases/io-skip-pass-comments/pr-1346/cmd new file mode 100644 index 000000000..611187612 --- /dev/null +++ b/test/cases/io-skip-pass-comments/pr-1346/cmd @@ -0,0 +1 @@ +mlr --skip-comments --csv --pass-comments cat test/input/pr-1346.csv diff --git a/test/cases/io-skip-pass-comments/pr-1346/experr b/test/cases/io-skip-pass-comments/pr-1346/experr new file mode 100644 index 000000000..10864f8ab --- /dev/null +++ b/test/cases/io-skip-pass-comments/pr-1346/experr @@ -0,0 +1 @@ +mlr: mlr: CSV header/data length mismatch 2 != 1 at filename test/input/pr-1346.csv row 4. 
diff --git a/test/cases/io-skip-pass-comments/pr-1346/expout b/test/cases/io-skip-pass-comments/pr-1346/expout new file mode 100644 index 000000000..b7872a7a9 --- /dev/null +++ b/test/cases/io-skip-pass-comments/pr-1346/expout @@ -0,0 +1,5 @@ +field1,field2 +a,b +# that was the first record +c,d +# that was the second record, and there is no more data diff --git a/test/cases/io-skip-pass-comments/pr-1346/should-fail b/test/cases/io-skip-pass-comments/pr-1346/should-fail new file mode 100644 index 000000000..e69de29bb diff --git a/test/cases/io-skip-pass-comments/pr-1787-a/cmd b/test/cases/io-skip-pass-comments/pr-1787-a/cmd new file mode 100644 index 000000000..8ecdde63e --- /dev/null +++ b/test/cases/io-skip-pass-comments/pr-1787-a/cmd @@ -0,0 +1 @@ +mlr --csv cat test/input/pr-1787.csv diff --git a/test/cases/io-skip-pass-comments/pr-1787-a/experr b/test/cases/io-skip-pass-comments/pr-1787-a/experr new file mode 100644 index 000000000..9e02e68bc --- /dev/null +++ b/test/cases/io-skip-pass-comments/pr-1787-a/experr @@ -0,0 +1 @@ +mlr: parse error on line 3, column 4: bare " in non-quoted-field. 
diff --git a/test/cases/io-skip-pass-comments/pr-1787-a/expout b/test/cases/io-skip-pass-comments/pr-1787-a/expout new file mode 100644 index 000000000..bfde6bfa0 --- /dev/null +++ b/test/cases/io-skip-pass-comments/pr-1787-a/expout @@ -0,0 +1,2 @@ +a,b,c +1,2,3 diff --git a/test/cases/io-skip-pass-comments/pr-1787-a/should-fail b/test/cases/io-skip-pass-comments/pr-1787-a/should-fail new file mode 100644 index 000000000..e69de29bb diff --git a/test/cases/io-skip-pass-comments/pr-1787-b/cmd b/test/cases/io-skip-pass-comments/pr-1787-b/cmd new file mode 100644 index 000000000..c79588a16 --- /dev/null +++ b/test/cases/io-skip-pass-comments/pr-1787-b/cmd @@ -0,0 +1 @@ +mlr --csv --pass-comments cat test/input/pr-1787.csv diff --git a/test/cases/io-skip-pass-comments/pr-1787-b/experr b/test/cases/io-skip-pass-comments/pr-1787-b/experr new file mode 100644 index 000000000..e69de29bb diff --git a/test/cases/io-skip-pass-comments/pr-1787-b/expout b/test/cases/io-skip-pass-comments/pr-1787-b/expout new file mode 100644 index 000000000..23b8c638c --- /dev/null +++ b/test/cases/io-skip-pass-comments/pr-1787-b/expout @@ -0,0 +1,4 @@ +a,b,c +1,2,3 +# x"y +4,5,6 diff --git a/test/cases/io-skip-pass-comments/pr-1787-c/cmd b/test/cases/io-skip-pass-comments/pr-1787-c/cmd new file mode 100644 index 000000000..8e17a1f3e --- /dev/null +++ b/test/cases/io-skip-pass-comments/pr-1787-c/cmd @@ -0,0 +1 @@ +mlr --csv --skip-comments cat test/input/pr-1787.csv diff --git a/test/cases/io-skip-pass-comments/pr-1787-c/experr b/test/cases/io-skip-pass-comments/pr-1787-c/experr new file mode 100644 index 000000000..e69de29bb diff --git a/test/cases/io-skip-pass-comments/pr-1787-c/expout b/test/cases/io-skip-pass-comments/pr-1787-c/expout new file mode 100644 index 000000000..88700c714 --- /dev/null +++ b/test/cases/io-skip-pass-comments/pr-1787-c/expout @@ -0,0 +1,3 @@ +a,b,c +1,2,3 +4,5,6 diff --git a/test/cases/io-skip-pass-comments/pr-1787-d/cmd 
b/test/cases/io-skip-pass-comments/pr-1787-d/cmd new file mode 100644 index 000000000..9db12e96e --- /dev/null +++ b/test/cases/io-skip-pass-comments/pr-1787-d/cmd @@ -0,0 +1 @@ +mlr --csv --skip-comments-with '##' cat test/input/pr-1787.csv diff --git a/test/cases/io-skip-pass-comments/pr-1787-d/experr b/test/cases/io-skip-pass-comments/pr-1787-d/experr new file mode 100644 index 000000000..f8b7d1e1a --- /dev/null +++ b/test/cases/io-skip-pass-comments/pr-1787-d/experr @@ -0,0 +1 @@ +mlr: for CSV, the comment prefix must be a single character. diff --git a/test/cases/io-skip-pass-comments/pr-1787-d/expout b/test/cases/io-skip-pass-comments/pr-1787-d/expout new file mode 100644 index 000000000..e69de29bb diff --git a/test/cases/io-skip-pass-comments/pr-1787-d/should-fail b/test/cases/io-skip-pass-comments/pr-1787-d/should-fail new file mode 100644 index 000000000..e69de29bb diff --git a/test/cases/io-spec-tsv/0004/experr b/test/cases/io-spec-tsv/0004/experr index 77ead78b2..a996791ac 100644 --- a/test/cases/io-spec-tsv/0004/experr +++ b/test/cases/io-spec-tsv/0004/experr @@ -1,2 +1 @@ -mlr: mlr: TSV header/data length mismatch 1 != 0 at filename test/cases/io-spec-tsv/0004/single-column-with-blank.tsv line 4. -. +mlr: mlr: TSV header/data length mismatch 1 != 0 at filename test/cases/io-spec-tsv/0004/single-column-with-blank.tsv line 4. 
diff --git a/test/cases/io-tsv-auto-unsparsify/at/cmd b/test/cases/io-tsv-auto-unsparsify/at/cmd new file mode 100644 index 000000000..818cba82b --- /dev/null +++ b/test/cases/io-tsv-auto-unsparsify/at/cmd @@ -0,0 +1 @@ +mlr -i json -o tsv cat ${CASEDIR}/input.json diff --git a/test/cases/io-tsv-auto-unsparsify/at/experr b/test/cases/io-tsv-auto-unsparsify/at/experr new file mode 100644 index 000000000..e69de29bb diff --git a/test/cases/io-tsv-auto-unsparsify/at/expout b/test/cases/io-tsv-auto-unsparsify/at/expout new file mode 100644 index 000000000..c0232182d --- /dev/null +++ b/test/cases/io-tsv-auto-unsparsify/at/expout @@ -0,0 +1,4 @@ +a b c +1 2 3 +4 5 6 +7 8 9 diff --git a/test/cases/io-tsv-auto-unsparsify/at/input.json b/test/cases/io-tsv-auto-unsparsify/at/input.json new file mode 100644 index 000000000..832be9c9e --- /dev/null +++ b/test/cases/io-tsv-auto-unsparsify/at/input.json @@ -0,0 +1,17 @@ +[ +{ + "a": 1, + "b": 2, + "c": 3 +}, +{ + "a": 4, + "b": 5, + "c": 6 +}, +{ + "a": 7, + "b": 8, + "c": 9 +} +] diff --git a/test/cases/io-tsv-auto-unsparsify/key-change/cmd b/test/cases/io-tsv-auto-unsparsify/key-change/cmd new file mode 100644 index 000000000..818cba82b --- /dev/null +++ b/test/cases/io-tsv-auto-unsparsify/key-change/cmd @@ -0,0 +1 @@ +mlr -i json -o tsv cat ${CASEDIR}/input.json diff --git a/test/cases/io-tsv-auto-unsparsify/key-change/experr b/test/cases/io-tsv-auto-unsparsify/key-change/experr new file mode 100644 index 000000000..ce615563a --- /dev/null +++ b/test/cases/io-tsv-auto-unsparsify/key-change/experr @@ -0,0 +1,2 @@ +mlr: TSV schema change: first keys "a b c"; current keys "a X c" +mlr: exiting due to data error. 
diff --git a/test/cases/io-tsv-auto-unsparsify/key-change/expout b/test/cases/io-tsv-auto-unsparsify/key-change/expout new file mode 100644 index 000000000..c96a25f19 --- /dev/null +++ b/test/cases/io-tsv-auto-unsparsify/key-change/expout @@ -0,0 +1,3 @@ +a b c +1 2 3 +4 5 6 diff --git a/test/cases/io-tsv-auto-unsparsify/key-change/input.json b/test/cases/io-tsv-auto-unsparsify/key-change/input.json new file mode 100644 index 000000000..841abab57 --- /dev/null +++ b/test/cases/io-tsv-auto-unsparsify/key-change/input.json @@ -0,0 +1,17 @@ +[ +{ + "a": 1, + "b": 2, + "c": 3 +}, +{ + "a": 4, + "b": 5, + "c": 6 +}, +{ + "a": 7, + "X": 8, + "c": 9 +} +] diff --git a/test/cases/io-tsv-auto-unsparsify/key-change/should-fail b/test/cases/io-tsv-auto-unsparsify/key-change/should-fail new file mode 100644 index 000000000..e69de29bb diff --git a/test/cases/io-tsv-auto-unsparsify/over/cmd b/test/cases/io-tsv-auto-unsparsify/over/cmd new file mode 100644 index 000000000..818cba82b --- /dev/null +++ b/test/cases/io-tsv-auto-unsparsify/over/cmd @@ -0,0 +1 @@ +mlr -i json -o tsv cat ${CASEDIR}/input.json diff --git a/test/cases/io-tsv-auto-unsparsify/over/experr b/test/cases/io-tsv-auto-unsparsify/over/experr new file mode 100644 index 000000000..e69de29bb diff --git a/test/cases/io-tsv-auto-unsparsify/over/expout b/test/cases/io-tsv-auto-unsparsify/over/expout new file mode 100644 index 000000000..0a61a2406 --- /dev/null +++ b/test/cases/io-tsv-auto-unsparsify/over/expout @@ -0,0 +1,4 @@ +a b c +1 2 3 +4 5 6 7 +7 8 9 diff --git a/test/cases/io-tsv-auto-unsparsify/over/input.json b/test/cases/io-tsv-auto-unsparsify/over/input.json new file mode 100644 index 000000000..38b47c2f0 --- /dev/null +++ b/test/cases/io-tsv-auto-unsparsify/over/input.json @@ -0,0 +1,18 @@ +[ +{ + "a": 1, + "b": 2, + "c": 3 +}, +{ + "a": 4, + "b": 5, + "c": 6, + "d": 7 +}, +{ + "a": 7, + "b": 8, + "c": 9 +} +] diff --git a/test/cases/io-tsv-auto-unsparsify/under/cmd 
b/test/cases/io-tsv-auto-unsparsify/under/cmd new file mode 100644 index 000000000..818cba82b --- /dev/null +++ b/test/cases/io-tsv-auto-unsparsify/under/cmd @@ -0,0 +1 @@ +mlr -i json -o tsv cat ${CASEDIR}/input.json diff --git a/test/cases/io-tsv-auto-unsparsify/under/experr b/test/cases/io-tsv-auto-unsparsify/under/experr new file mode 100644 index 000000000..e69de29bb diff --git a/test/cases/io-tsv-auto-unsparsify/under/expout b/test/cases/io-tsv-auto-unsparsify/under/expout new file mode 100644 index 000000000..7b24f5bdb --- /dev/null +++ b/test/cases/io-tsv-auto-unsparsify/under/expout @@ -0,0 +1,4 @@ +a b c +1 2 3 +4 5 +7 8 9 diff --git a/test/cases/io-tsv-auto-unsparsify/under/input.json b/test/cases/io-tsv-auto-unsparsify/under/input.json new file mode 100644 index 000000000..e90f7439a --- /dev/null +++ b/test/cases/io-tsv-auto-unsparsify/under/input.json @@ -0,0 +1,16 @@ +[ +{ + "a": 1, + "b": 2, + "c": 3 +}, +{ + "a": 4, + "b": 5 +}, +{ + "a": 7, + "b": 8, + "c": 9 +} +] diff --git a/test/cases/non-windows/file-not-found/csv/cmd b/test/cases/non-windows/file-not-found/csv/cmd new file mode 100644 index 000000000..c0111df1c --- /dev/null +++ b/test/cases/non-windows/file-not-found/csv/cmd @@ -0,0 +1 @@ +mlr --csv cat /nonesuch/nope/never diff --git a/test/cases/non-windows/file-not-found/csv/experr b/test/cases/non-windows/file-not-found/csv/experr new file mode 100644 index 000000000..486e326b3 --- /dev/null +++ b/test/cases/non-windows/file-not-found/csv/experr @@ -0,0 +1 @@ +mlr: open /nonesuch/nope/never: no such file or directory. 
diff --git a/test/cases/non-windows/file-not-found/csv/expout b/test/cases/non-windows/file-not-found/csv/expout new file mode 100644 index 000000000..e69de29bb diff --git a/test/cases/non-windows/file-not-found/csv/should-fail b/test/cases/non-windows/file-not-found/csv/should-fail new file mode 100644 index 000000000..e69de29bb diff --git a/test/cases/non-windows/file-not-found/dkvp/cmd b/test/cases/non-windows/file-not-found/dkvp/cmd new file mode 100644 index 000000000..e0a95d06c --- /dev/null +++ b/test/cases/non-windows/file-not-found/dkvp/cmd @@ -0,0 +1 @@ +mlr --dkvp cat /nonesuch/nope/never diff --git a/test/cases/non-windows/file-not-found/dkvp/experr b/test/cases/non-windows/file-not-found/dkvp/experr new file mode 100644 index 000000000..486e326b3 --- /dev/null +++ b/test/cases/non-windows/file-not-found/dkvp/experr @@ -0,0 +1 @@ +mlr: open /nonesuch/nope/never: no such file or directory. diff --git a/test/cases/non-windows/file-not-found/dkvp/expout b/test/cases/non-windows/file-not-found/dkvp/expout new file mode 100644 index 000000000..e69de29bb diff --git a/test/cases/non-windows/file-not-found/dkvp/should-fail b/test/cases/non-windows/file-not-found/dkvp/should-fail new file mode 100644 index 000000000..e69de29bb diff --git a/test/cases/non-windows/file-not-found/imd/cmd b/test/cases/non-windows/file-not-found/imd/cmd new file mode 100644 index 000000000..53adb3fe5 --- /dev/null +++ b/test/cases/non-windows/file-not-found/imd/cmd @@ -0,0 +1 @@ +mlr --imd cat /nonesuch/nope/never diff --git a/test/cases/non-windows/file-not-found/imd/experr b/test/cases/non-windows/file-not-found/imd/experr new file mode 100644 index 000000000..486e326b3 --- /dev/null +++ b/test/cases/non-windows/file-not-found/imd/experr @@ -0,0 +1 @@ +mlr: open /nonesuch/nope/never: no such file or directory. 
diff --git a/test/cases/non-windows/file-not-found/imd/expout b/test/cases/non-windows/file-not-found/imd/expout new file mode 100644 index 000000000..e69de29bb diff --git a/test/cases/non-windows/file-not-found/imd/should-fail b/test/cases/non-windows/file-not-found/imd/should-fail new file mode 100644 index 000000000..e69de29bb diff --git a/test/cases/non-windows/file-not-found/json/cmd b/test/cases/non-windows/file-not-found/json/cmd new file mode 100644 index 000000000..92c89b17f --- /dev/null +++ b/test/cases/non-windows/file-not-found/json/cmd @@ -0,0 +1 @@ +mlr --json cat /nonesuch/nope/never diff --git a/test/cases/non-windows/file-not-found/json/experr b/test/cases/non-windows/file-not-found/json/experr new file mode 100644 index 000000000..486e326b3 --- /dev/null +++ b/test/cases/non-windows/file-not-found/json/experr @@ -0,0 +1 @@ +mlr: open /nonesuch/nope/never: no such file or directory. diff --git a/test/cases/non-windows/file-not-found/json/expout b/test/cases/non-windows/file-not-found/json/expout new file mode 100644 index 000000000..0d4f101c7 --- /dev/null +++ b/test/cases/non-windows/file-not-found/json/expout @@ -0,0 +1,2 @@ +[ +] diff --git a/test/cases/non-windows/file-not-found/json/should-fail b/test/cases/non-windows/file-not-found/json/should-fail new file mode 100644 index 000000000..e69de29bb diff --git a/test/cases/non-windows/file-not-found/jsonl/cmd b/test/cases/non-windows/file-not-found/jsonl/cmd new file mode 100644 index 000000000..551c0b0a7 --- /dev/null +++ b/test/cases/non-windows/file-not-found/jsonl/cmd @@ -0,0 +1 @@ +mlr --jsonl cat /nonesuch/nope/never diff --git a/test/cases/non-windows/file-not-found/jsonl/experr b/test/cases/non-windows/file-not-found/jsonl/experr new file mode 100644 index 000000000..486e326b3 --- /dev/null +++ b/test/cases/non-windows/file-not-found/jsonl/experr @@ -0,0 +1 @@ +mlr: open /nonesuch/nope/never: no such file or directory. 
diff --git a/test/cases/non-windows/file-not-found/jsonl/expout b/test/cases/non-windows/file-not-found/jsonl/expout new file mode 100644 index 000000000..e69de29bb diff --git a/test/cases/non-windows/file-not-found/jsonl/should-fail b/test/cases/non-windows/file-not-found/jsonl/should-fail new file mode 100644 index 000000000..e69de29bb diff --git a/test/cases/non-windows/file-not-found/nidx/cmd b/test/cases/non-windows/file-not-found/nidx/cmd new file mode 100644 index 000000000..b5b1a2316 --- /dev/null +++ b/test/cases/non-windows/file-not-found/nidx/cmd @@ -0,0 +1 @@ +mlr --nidx cat /nonesuch/nope/never diff --git a/test/cases/non-windows/file-not-found/nidx/experr b/test/cases/non-windows/file-not-found/nidx/experr new file mode 100644 index 000000000..486e326b3 --- /dev/null +++ b/test/cases/non-windows/file-not-found/nidx/experr @@ -0,0 +1 @@ +mlr: open /nonesuch/nope/never: no such file or directory. diff --git a/test/cases/non-windows/file-not-found/nidx/expout b/test/cases/non-windows/file-not-found/nidx/expout new file mode 100644 index 000000000..e69de29bb diff --git a/test/cases/non-windows/file-not-found/nidx/should-fail b/test/cases/non-windows/file-not-found/nidx/should-fail new file mode 100644 index 000000000..e69de29bb diff --git a/test/cases/non-windows/file-not-found/pprint/cmd b/test/cases/non-windows/file-not-found/pprint/cmd new file mode 100644 index 000000000..51a3a4d6e --- /dev/null +++ b/test/cases/non-windows/file-not-found/pprint/cmd @@ -0,0 +1 @@ +mlr --pprint cat /nonesuch/nope/never diff --git a/test/cases/non-windows/file-not-found/pprint/experr b/test/cases/non-windows/file-not-found/pprint/experr new file mode 100644 index 000000000..486e326b3 --- /dev/null +++ b/test/cases/non-windows/file-not-found/pprint/experr @@ -0,0 +1 @@ +mlr: open /nonesuch/nope/never: no such file or directory. 
diff --git a/test/cases/non-windows/file-not-found/pprint/expout b/test/cases/non-windows/file-not-found/pprint/expout new file mode 100644 index 000000000..e69de29bb diff --git a/test/cases/non-windows/file-not-found/pprint/should-fail b/test/cases/non-windows/file-not-found/pprint/should-fail new file mode 100644 index 000000000..e69de29bb diff --git a/test/cases/non-windows/file-not-found/tsv/cmd b/test/cases/non-windows/file-not-found/tsv/cmd new file mode 100644 index 000000000..fbb231b7e --- /dev/null +++ b/test/cases/non-windows/file-not-found/tsv/cmd @@ -0,0 +1 @@ +mlr --tsv cat /nonesuch/nope/never diff --git a/test/cases/non-windows/file-not-found/tsv/experr b/test/cases/non-windows/file-not-found/tsv/experr new file mode 100644 index 000000000..486e326b3 --- /dev/null +++ b/test/cases/non-windows/file-not-found/tsv/experr @@ -0,0 +1 @@ +mlr: open /nonesuch/nope/never: no such file or directory. diff --git a/test/cases/non-windows/file-not-found/tsv/expout b/test/cases/non-windows/file-not-found/tsv/expout new file mode 100644 index 000000000..e69de29bb diff --git a/test/cases/non-windows/file-not-found/tsv/should-fail b/test/cases/non-windows/file-not-found/tsv/should-fail new file mode 100644 index 000000000..e69de29bb diff --git a/test/cases/non-windows/file-not-found/xtab/cmd b/test/cases/non-windows/file-not-found/xtab/cmd new file mode 100644 index 000000000..dca5d1681 --- /dev/null +++ b/test/cases/non-windows/file-not-found/xtab/cmd @@ -0,0 +1 @@ +mlr --xtab cat /nonesuch/nope/never diff --git a/test/cases/non-windows/file-not-found/xtab/experr b/test/cases/non-windows/file-not-found/xtab/experr new file mode 100644 index 000000000..486e326b3 --- /dev/null +++ b/test/cases/non-windows/file-not-found/xtab/experr @@ -0,0 +1 @@ +mlr: open /nonesuch/nope/never: no such file or directory. 
diff --git a/test/cases/non-windows/file-not-found/xtab/expout b/test/cases/non-windows/file-not-found/xtab/expout new file mode 100644 index 000000000..e69de29bb diff --git a/test/cases/non-windows/file-not-found/xtab/should-fail b/test/cases/non-windows/file-not-found/xtab/should-fail new file mode 100644 index 000000000..e69de29bb diff --git a/test/cases/non-windows/io-compressed-input/0017/cmd b/test/cases/non-windows/io-compressed-input/0017/cmd new file mode 100644 index 000000000..71f61cc5e --- /dev/null +++ b/test/cases/non-windows/io-compressed-input/0017/cmd @@ -0,0 +1 @@ +mlr --csv cat test/input/whitespace*.csv diff --git a/test/cases/non-windows/io-compressed-input/0017/experr b/test/cases/non-windows/io-compressed-input/0017/experr new file mode 100644 index 000000000..e69de29bb diff --git a/test/cases/non-windows/io-compressed-input/0017/expout b/test/cases/non-windows/io-compressed-input/0017/expout new file mode 100644 index 000000000..88700c714 --- /dev/null +++ b/test/cases/non-windows/io-compressed-input/0017/expout @@ -0,0 +1,3 @@ +a,b,c +1,2,3 +4,5,6 diff --git a/test/cases/verb-flatten-unflatten/0011/expout b/test/cases/verb-flatten-unflatten/0011/expout index 9a45bc186..18f737223 100644 --- a/test/cases/verb-flatten-unflatten/0011/expout +++ b/test/cases/verb-flatten-unflatten/0011/expout @@ -24,6 +24,13 @@ "wrapper": { "empty3": {}, "emtpy4": [] - } + }, + "x": { + "y": 1 + }, + "@": 2, + "x@": 3, + "@y": 4, + "x@@y": 5 } ] diff --git a/test/cases/verb-format-values/0003/expout b/test/cases/verb-format-values/0003/expout index 06216b5ca..9a45bc186 100644 --- a/test/cases/verb-format-values/0003/expout +++ b/test/cases/verb-format-values/0003/expout @@ -1,7 +1,7 @@ [ { "hostname": "localhost", - "pid": 0x3039, + "pid": 12345, "req": { "id": 6789, "method": "GET", diff --git a/test/cases/verb-label/0009/experr b/test/cases/verb-label/0009/experr index 615cfe2bf..0933a079d 100644 --- a/test/cases/verb-label/0009/experr +++ 
b/test/cases/verb-label/0009/experr @@ -1 +1 @@ -mlr label: labels must be unique; got duplicate "d" +mlr label: labels must be unique; got duplicate "d" \ No newline at end of file diff --git a/test/cases/verb-reorder/regex-after/cmd b/test/cases/verb-reorder/regex-after/cmd new file mode 100644 index 000000000..59a79f7f6 --- /dev/null +++ b/test/cases/verb-reorder/regex-after/cmd @@ -0,0 +1 @@ +mlr --n2x reorder -r 3,9,8 -a 6 test/input/reorder-regex.nidx diff --git a/test/cases/verb-reorder/regex-after/experr b/test/cases/verb-reorder/regex-after/experr new file mode 100644 index 000000000..e69de29bb diff --git a/test/cases/verb-reorder/regex-after/expout b/test/cases/verb-reorder/regex-after/expout new file mode 100644 index 000000000..62cb82ad5 --- /dev/null +++ b/test/cases/verb-reorder/regex-after/expout @@ -0,0 +1,10 @@ +1 a +2 b +4 d +5 e +6 f +3 c +8 h +9 i +7 g +10 j diff --git a/test/cases/verb-reorder/regex-before/cmd b/test/cases/verb-reorder/regex-before/cmd new file mode 100644 index 000000000..f207567a8 --- /dev/null +++ b/test/cases/verb-reorder/regex-before/cmd @@ -0,0 +1 @@ +mlr --n2x reorder -r 3,9,8 -b 6 test/input/reorder-regex.nidx diff --git a/test/cases/verb-reorder/regex-before/experr b/test/cases/verb-reorder/regex-before/experr new file mode 100644 index 000000000..e69de29bb diff --git a/test/cases/verb-reorder/regex-before/expout b/test/cases/verb-reorder/regex-before/expout new file mode 100644 index 000000000..ef4d4f166 --- /dev/null +++ b/test/cases/verb-reorder/regex-before/expout @@ -0,0 +1,10 @@ +1 a +2 b +4 d +5 e +3 c +8 h +9 i +6 f +7 g +10 j diff --git a/test/cases/verb-reorder/regex-end/cmd b/test/cases/verb-reorder/regex-end/cmd new file mode 100644 index 000000000..8c3e21c81 --- /dev/null +++ b/test/cases/verb-reorder/regex-end/cmd @@ -0,0 +1 @@ +mlr --n2x reorder -r 3,9,8 -e test/input/reorder-regex.nidx diff --git a/test/cases/verb-reorder/regex-end/experr b/test/cases/verb-reorder/regex-end/experr new file mode 100644 
index 000000000..e69de29bb diff --git a/test/cases/verb-reorder/regex-end/expout b/test/cases/verb-reorder/regex-end/expout new file mode 100644 index 000000000..7a7424aa9 --- /dev/null +++ b/test/cases/verb-reorder/regex-end/expout @@ -0,0 +1,10 @@ +1 a +2 b +4 d +5 e +6 f +7 g +10 j +3 c +8 h +9 i diff --git a/test/cases/verb-reorder/regex-start/cmd b/test/cases/verb-reorder/regex-start/cmd new file mode 100644 index 000000000..2020a1393 --- /dev/null +++ b/test/cases/verb-reorder/regex-start/cmd @@ -0,0 +1 @@ +mlr --n2x reorder -r 3,9,8 test/input/reorder-regex.nidx diff --git a/test/cases/verb-reorder/regex-start/experr b/test/cases/verb-reorder/regex-start/experr new file mode 100644 index 000000000..e69de29bb diff --git a/test/cases/verb-reorder/regex-start/expout b/test/cases/verb-reorder/regex-start/expout new file mode 100644 index 000000000..ee16332d9 --- /dev/null +++ b/test/cases/verb-reorder/regex-start/expout @@ -0,0 +1,10 @@ +3 c +8 h +9 i +1 a +2 b +4 d +5 e +6 f +7 g +10 j diff --git a/test/cases/verb-sparsify/0001/cmd b/test/cases/verb-sparsify/0001/cmd new file mode 100644 index 000000000..38ec29b15 --- /dev/null +++ b/test/cases/verb-sparsify/0001/cmd @@ -0,0 +1 @@ +mlr --c2j --from test/input/sparsify-input.csv sparsify diff --git a/test/cases/verb-sparsify/0001/experr b/test/cases/verb-sparsify/0001/experr new file mode 100644 index 000000000..e69de29bb diff --git a/test/cases/verb-sparsify/0001/expout b/test/cases/verb-sparsify/0001/expout new file mode 100644 index 000000000..e9c9893a9 --- /dev/null +++ b/test/cases/verb-sparsify/0001/expout @@ -0,0 +1,17 @@ +[ +{ + "a": 1, + "b": 2, + "c": 3 +}, +{ + "a": 4, + "b": 5 +}, +{}, +{ + "a": 7, + "b": 8, + "c": 9 +} +] diff --git a/test/cases/verb-sparsify/0002/cmd b/test/cases/verb-sparsify/0002/cmd new file mode 100644 index 000000000..3ac1c9630 --- /dev/null +++ b/test/cases/verb-sparsify/0002/cmd @@ -0,0 +1 @@ +mlr --c2j --from test/input/sparsify-input.csv sparsify -f a diff --git 
a/test/cases/verb-sparsify/0002/experr b/test/cases/verb-sparsify/0002/experr new file mode 100644 index 000000000..e69de29bb diff --git a/test/cases/verb-sparsify/0002/expout b/test/cases/verb-sparsify/0002/expout new file mode 100644 index 000000000..8bc89d0aa --- /dev/null +++ b/test/cases/verb-sparsify/0002/expout @@ -0,0 +1,21 @@ +[ +{ + "a": 1, + "b": 2, + "c": 3 +}, +{ + "a": 4, + "b": 5, + "c": "" +}, +{ + "b": "", + "c": "" +}, +{ + "a": 7, + "b": 8, + "c": 9 +} +] diff --git a/test/cases/verb-sparsify/0003/cmd b/test/cases/verb-sparsify/0003/cmd new file mode 100644 index 000000000..fc08ebef9 --- /dev/null +++ b/test/cases/verb-sparsify/0003/cmd @@ -0,0 +1 @@ +mlr --c2j --from test/input/sparsify-input.csv sparsify -f b diff --git a/test/cases/verb-sparsify/0003/experr b/test/cases/verb-sparsify/0003/experr new file mode 100644 index 000000000..e69de29bb diff --git a/test/cases/verb-sparsify/0003/expout b/test/cases/verb-sparsify/0003/expout new file mode 100644 index 000000000..b607e3893 --- /dev/null +++ b/test/cases/verb-sparsify/0003/expout @@ -0,0 +1,21 @@ +[ +{ + "a": 1, + "b": 2, + "c": 3 +}, +{ + "a": 4, + "b": 5, + "c": "" +}, +{ + "a": "", + "c": "" +}, +{ + "a": 7, + "b": 8, + "c": 9 +} +] diff --git a/test/cases/verb-sparsify/0004/cmd b/test/cases/verb-sparsify/0004/cmd new file mode 100644 index 000000000..5ea1aa7bd --- /dev/null +++ b/test/cases/verb-sparsify/0004/cmd @@ -0,0 +1 @@ +mlr --c2j --from test/input/sparsify-input.csv sparsify -f b,c diff --git a/test/cases/verb-sparsify/0004/experr b/test/cases/verb-sparsify/0004/experr new file mode 100644 index 000000000..e69de29bb diff --git a/test/cases/verb-sparsify/0004/expout b/test/cases/verb-sparsify/0004/expout new file mode 100644 index 000000000..ebf9878cd --- /dev/null +++ b/test/cases/verb-sparsify/0004/expout @@ -0,0 +1,19 @@ +[ +{ + "a": 1, + "b": 2, + "c": 3 +}, +{ + "a": 4, + "b": 5 +}, +{ + "a": "" +}, +{ + "a": 7, + "b": 8, + "c": 9 +} +] diff --git 
a/test/cases/verb-sparsify/0005/cmd b/test/cases/verb-sparsify/0005/cmd new file mode 100644 index 000000000..012aee2b6 --- /dev/null +++ b/test/cases/verb-sparsify/0005/cmd @@ -0,0 +1 @@ +mlr --c2j --from test/input/sparsify-input.csv sparsify -s 1 diff --git a/test/cases/verb-sparsify/0005/experr b/test/cases/verb-sparsify/0005/experr new file mode 100644 index 000000000..e69de29bb diff --git a/test/cases/verb-sparsify/0005/expout b/test/cases/verb-sparsify/0005/expout new file mode 100644 index 000000000..839476d58 --- /dev/null +++ b/test/cases/verb-sparsify/0005/expout @@ -0,0 +1,21 @@ +[ +{ + "b": 2, + "c": 3 +}, +{ + "a": 4, + "b": 5, + "c": "" +}, +{ + "a": "", + "b": "", + "c": "" +}, +{ + "a": 7, + "b": 8, + "c": 9 +} +] diff --git a/test/cases/verb-sparsify/0006/cmd b/test/cases/verb-sparsify/0006/cmd new file mode 100644 index 000000000..42567786a --- /dev/null +++ b/test/cases/verb-sparsify/0006/cmd @@ -0,0 +1 @@ +mlr --c2j --from test/input/sparsify-input.csv sparsify -f a -s 1 diff --git a/test/cases/verb-sparsify/0006/experr b/test/cases/verb-sparsify/0006/experr new file mode 100644 index 000000000..e69de29bb diff --git a/test/cases/verb-sparsify/0006/expout b/test/cases/verb-sparsify/0006/expout new file mode 100644 index 000000000..839476d58 --- /dev/null +++ b/test/cases/verb-sparsify/0006/expout @@ -0,0 +1,21 @@ +[ +{ + "b": 2, + "c": 3 +}, +{ + "a": 4, + "b": 5, + "c": "" +}, +{ + "a": "", + "b": "", + "c": "" +}, +{ + "a": 7, + "b": 8, + "c": 9 +} +] diff --git a/test/cases/verb-sparsify/0007/cmd b/test/cases/verb-sparsify/0007/cmd new file mode 100644 index 000000000..99b590da4 --- /dev/null +++ b/test/cases/verb-sparsify/0007/cmd @@ -0,0 +1 @@ +mlr --c2j --from test/input/sparsify-input.csv sparsify -f b -s 1 diff --git a/test/cases/verb-sparsify/0007/experr b/test/cases/verb-sparsify/0007/experr new file mode 100644 index 000000000..e69de29bb diff --git a/test/cases/verb-sparsify/0007/expout b/test/cases/verb-sparsify/0007/expout new file 
mode 100644 index 000000000..d7f95feba --- /dev/null +++ b/test/cases/verb-sparsify/0007/expout @@ -0,0 +1,22 @@ +[ +{ + "a": 1, + "b": 2, + "c": 3 +}, +{ + "a": 4, + "b": 5, + "c": "" +}, +{ + "a": "", + "b": "", + "c": "" +}, +{ + "a": 7, + "b": 8, + "c": 9 +} +] diff --git a/test/cases/verb-sparsify/0008/cmd b/test/cases/verb-sparsify/0008/cmd new file mode 100644 index 000000000..b943d2c79 --- /dev/null +++ b/test/cases/verb-sparsify/0008/cmd @@ -0,0 +1 @@ +mlr --c2j --from test/input/sparsify-input.csv sparsify -f b,c -s 1 diff --git a/test/cases/verb-sparsify/0008/experr b/test/cases/verb-sparsify/0008/experr new file mode 100644 index 000000000..e69de29bb diff --git a/test/cases/verb-sparsify/0008/expout b/test/cases/verb-sparsify/0008/expout new file mode 100644 index 000000000..d7f95feba --- /dev/null +++ b/test/cases/verb-sparsify/0008/expout @@ -0,0 +1,22 @@ +[ +{ + "a": 1, + "b": 2, + "c": 3 +}, +{ + "a": 4, + "b": 5, + "c": "" +}, +{ + "a": "", + "b": "", + "c": "" +}, +{ + "a": 7, + "b": 8, + "c": 9 +} +] diff --git a/test/cases/verb-stats1/0001/cmd b/test/cases/verb-stats1/0001/cmd index 1e5931d0f..93753529c 100644 --- a/test/cases/verb-stats1/0001/cmd +++ b/test/cases/verb-stats1/0001/cmd @@ -1 +1 @@ -mlr --oxtab stats1 -a mean,sum,count,min,max,antimode,mode -f i,x,y test/input/abixy +mlr --oxtab stats1 -a mean,sum,count,min,max,antimode,mode,mad -f i,x,y test/input/abixy diff --git a/test/cases/verb-stats1/0001/expout b/test/cases/verb-stats1/0001/expout index e99cdf2b0..8c52b5166 100644 --- a/test/cases/verb-stats1/0001/expout +++ b/test/cases/verb-stats1/0001/expout @@ -5,6 +5,7 @@ i_min 1 i_max 10 i_antimode 1 i_mode 1 +i_mad 2.50000000 x_mean 0.45362938 x_sum 4.53629384 x_count 10 @@ -12,6 +13,7 @@ x_min 0.03144188 x_max 0.75867996 x_antimode 0.34679014 x_mode 0.34679014 +x_mad 0.17005656 y_mean 0.59445424 y_sum 5.94454242 y_count 10 @@ -19,3 +21,4 @@ y_min 0.13418874 y_max 0.97618139 y_antimode 0.72680286 y_mode 0.72680286 +y_mad 0.25930133 
diff --git a/test/cases/verb-stats1/0018/experr b/test/cases/verb-stats1/0018/experr index 03f44410a..43dafc363 100644 --- a/test/cases/verb-stats1/0018/experr +++ b/test/cases/verb-stats1/0018/experr @@ -1 +1 @@ -mlr stats1: accumulator "nonesuch" not found. +mlr stats1: accumulator "nonesuch" not found diff --git a/test/cases/verb-sub-gsub-ssub/0001/cmd b/test/cases/verb-sub-gsub-ssub/0001/cmd deleted file mode 100644 index 7d4cec775..000000000 --- a/test/cases/verb-sub-gsub-ssub/0001/cmd +++ /dev/null @@ -1 +0,0 @@ -mlr --d2p --from test/input/abixy sub -f a,b e X diff --git a/test/cases/verb-sub-gsub-ssub/0001/expout b/test/cases/verb-sub-gsub-ssub/0001/expout deleted file mode 100644 index 917c3f5ed..000000000 --- a/test/cases/verb-sub-gsub-ssub/0001/expout +++ /dev/null @@ -1,11 +0,0 @@ -a b i x y -pan pan 1 0.34679014 0.72680286 -Xks pan 2 0.75867996 0.52215111 -wyX wyX 3 0.20460331 0.33831853 -Xks wyX 4 0.38139939 0.13418874 -wyX pan 5 0.57328892 0.86362447 -zXe pan 6 0.52712616 0.49322129 -Xks zXe 7 0.61178406 0.18788492 -zXe wyX 8 0.59855401 0.97618139 -hat wyX 9 0.03144188 0.74955076 -pan wyX 10 0.50262601 0.95261836 diff --git a/test/cases/verb-sub-gsub-ssub/0002/cmd b/test/cases/verb-sub-gsub-ssub/0002/cmd deleted file mode 100644 index f33200891..000000000 --- a/test/cases/verb-sub-gsub-ssub/0002/cmd +++ /dev/null @@ -1 +0,0 @@ -mlr --d2p --from test/input/abixy gsub -f a,b e X diff --git a/test/cases/verb-sub-gsub-ssub/0002/expout b/test/cases/verb-sub-gsub-ssub/0002/expout deleted file mode 100644 index 49d53727b..000000000 --- a/test/cases/verb-sub-gsub-ssub/0002/expout +++ /dev/null @@ -1,11 +0,0 @@ -a b i x y -pan pan 1 0.34679014 0.72680286 -Xks pan 2 0.75867996 0.52215111 -wyX wyX 3 0.20460331 0.33831853 -Xks wyX 4 0.38139939 0.13418874 -wyX pan 5 0.57328892 0.86362447 -zXX pan 6 0.52712616 0.49322129 -Xks zXX 7 0.61178406 0.18788492 -zXX wyX 8 0.59855401 0.97618139 -hat wyX 9 0.03144188 0.74955076 -pan wyX 10 0.50262601 0.95261836 diff --git 
a/test/cases/verb-sub-gsub-ssub/0003/cmd b/test/cases/verb-sub-gsub-ssub/0003/cmd deleted file mode 100644 index ff6b15c4a..000000000 --- a/test/cases/verb-sub-gsub-ssub/0003/cmd +++ /dev/null @@ -1 +0,0 @@ -mlr --d2p --from test/input/abixy sub -f a,b . X diff --git a/test/cases/verb-sub-gsub-ssub/0003/expout b/test/cases/verb-sub-gsub-ssub/0003/expout deleted file mode 100644 index a8b8e8643..000000000 --- a/test/cases/verb-sub-gsub-ssub/0003/expout +++ /dev/null @@ -1,11 +0,0 @@ -a b i x y -Xan Xan 1 0.34679014 0.72680286 -Xks Xan 2 0.75867996 0.52215111 -Xye Xye 3 0.20460331 0.33831853 -Xks Xye 4 0.38139939 0.13418874 -Xye Xan 5 0.57328892 0.86362447 -Xee Xan 6 0.52712616 0.49322129 -Xks Xee 7 0.61178406 0.18788492 -Xee Xye 8 0.59855401 0.97618139 -Xat Xye 9 0.03144188 0.74955076 -Xan Xye 10 0.50262601 0.95261836 diff --git a/test/cases/verb-sub-gsub-ssub/0004/cmd b/test/cases/verb-sub-gsub-ssub/0004/cmd deleted file mode 100644 index 8770d578d..000000000 --- a/test/cases/verb-sub-gsub-ssub/0004/cmd +++ /dev/null @@ -1 +0,0 @@ -mlr --d2p --from test/input/abixy ssub -f a,b e X diff --git a/test/cases/verb-sub-gsub-ssub/0004/expout b/test/cases/verb-sub-gsub-ssub/0004/expout deleted file mode 100644 index 917c3f5ed..000000000 --- a/test/cases/verb-sub-gsub-ssub/0004/expout +++ /dev/null @@ -1,11 +0,0 @@ -a b i x y -pan pan 1 0.34679014 0.72680286 -Xks pan 2 0.75867996 0.52215111 -wyX wyX 3 0.20460331 0.33831853 -Xks wyX 4 0.38139939 0.13418874 -wyX pan 5 0.57328892 0.86362447 -zXe pan 6 0.52712616 0.49322129 -Xks zXe 7 0.61178406 0.18788492 -zXe wyX 8 0.59855401 0.97618139 -hat wyX 9 0.03144188 0.74955076 -pan wyX 10 0.50262601 0.95261836 diff --git a/test/cases/verb-sub-gsub-ssub/gsub-a/cmd b/test/cases/verb-sub-gsub-ssub/gsub-a/cmd new file mode 100644 index 000000000..21a9e342c --- /dev/null +++ b/test/cases/verb-sub-gsub-ssub/gsub-a/cmd @@ -0,0 +1 @@ +mlr --c2p --from test/input/example.csv gsub -a l X diff --git a/test/cases/verb-sub-gsub-ssub/gsub-a/experr 
b/test/cases/verb-sub-gsub-ssub/gsub-a/experr new file mode 100644 index 000000000..e69de29bb diff --git a/test/cases/verb-sub-gsub-ssub/gsub-a/expout b/test/cases/verb-sub-gsub-ssub/gsub-a/expout new file mode 100644 index 000000000..b75a98d8f --- /dev/null +++ b/test/cases/verb-sub-gsub-ssub/gsub-a/expout @@ -0,0 +1,11 @@ +color shape flag k index quantity rate +yeXXow triangXe true 1 11 43.64980000 9.88700000 +red square true 2 15 79.27780000 0.01300000 +red circXe true 3 16 13.81030000 2.90100000 +red square faXse 4 48 77.55420000 7.46700000 +purpXe triangXe faXse 5 51 81.22900000 8.59100000 +red square faXse 6 64 77.19910000 9.53100000 +purpXe triangXe faXse 7 65 80.14050000 5.82400000 +yeXXow circXe true 8 73 63.97850000 4.23700000 +yeXXow circXe true 9 87 63.50580000 8.33500000 +purpXe square faXse 10 91 72.37350000 8.24300000 diff --git a/test/cases/verb-sub-gsub-ssub/gsub-f/cmd b/test/cases/verb-sub-gsub-ssub/gsub-f/cmd new file mode 100644 index 000000000..a4c3ffc4b --- /dev/null +++ b/test/cases/verb-sub-gsub-ssub/gsub-f/cmd @@ -0,0 +1 @@ +mlr --c2p --from test/input/example.csv gsub -f color,shape,index l X diff --git a/test/cases/verb-sub-gsub-ssub/gsub-f/experr b/test/cases/verb-sub-gsub-ssub/gsub-f/experr new file mode 100644 index 000000000..e69de29bb diff --git a/test/cases/verb-sub-gsub-ssub/gsub-f/expout b/test/cases/verb-sub-gsub-ssub/gsub-f/expout new file mode 100644 index 000000000..fda761674 --- /dev/null +++ b/test/cases/verb-sub-gsub-ssub/gsub-f/expout @@ -0,0 +1,11 @@ +color shape flag k index quantity rate +yeXXow triangXe true 1 11 43.64980000 9.88700000 +red square true 2 15 79.27780000 0.01300000 +red circXe true 3 16 13.81030000 2.90100000 +red square false 4 48 77.55420000 7.46700000 +purpXe triangXe false 5 51 81.22900000 8.59100000 +red square false 6 64 77.19910000 9.53100000 +purpXe triangXe false 7 65 80.14050000 5.82400000 +yeXXow circXe true 8 73 63.97850000 4.23700000 +yeXXow circXe true 9 87 63.50580000 8.33500000 +purpXe 
square false 10 91 72.37350000 8.24300000 diff --git a/test/cases/verb-sub-gsub-ssub/non-windows/gsub-r/cmd b/test/cases/verb-sub-gsub-ssub/non-windows/gsub-r/cmd new file mode 100644 index 000000000..14c697154 --- /dev/null +++ b/test/cases/verb-sub-gsub-ssub/non-windows/gsub-r/cmd @@ -0,0 +1 @@ +mlr --c2p --from test/input/example.csv gsub -r -f '.*e' l X diff --git a/test/cases/verb-sub-gsub-ssub/non-windows/gsub-r/experr b/test/cases/verb-sub-gsub-ssub/non-windows/gsub-r/experr new file mode 100644 index 000000000..e69de29bb diff --git a/test/cases/verb-sub-gsub-ssub/non-windows/gsub-r/expout b/test/cases/verb-sub-gsub-ssub/non-windows/gsub-r/expout new file mode 100644 index 000000000..93b24ea0c --- /dev/null +++ b/test/cases/verb-sub-gsub-ssub/non-windows/gsub-r/expout @@ -0,0 +1,11 @@ +color shape flag k index quantity rate +yellow triangXe true 1 11 43.64980000 9.88700000 +red square true 2 15 79.27780000 0.01300000 +red circXe true 3 16 13.81030000 2.90100000 +red square false 4 48 77.55420000 7.46700000 +purple triangXe false 5 51 81.22900000 8.59100000 +red square false 6 64 77.19910000 9.53100000 +purple triangXe false 7 65 80.14050000 5.82400000 +yellow circXe true 8 73 63.97850000 4.23700000 +yellow circXe true 9 87 63.50580000 8.33500000 +purple square false 10 91 72.37350000 8.24300000 diff --git a/test/cases/verb-sub-gsub-ssub/non-windows/ssub-r/cmd b/test/cases/verb-sub-gsub-ssub/non-windows/ssub-r/cmd new file mode 100644 index 000000000..f6cf74d5e --- /dev/null +++ b/test/cases/verb-sub-gsub-ssub/non-windows/ssub-r/cmd @@ -0,0 +1 @@ +mlr --c2p --from test/input/example.csv ssub -r -f '.*e' l X diff --git a/test/cases/verb-sub-gsub-ssub/non-windows/ssub-r/experr b/test/cases/verb-sub-gsub-ssub/non-windows/ssub-r/experr new file mode 100644 index 000000000..e69de29bb diff --git a/test/cases/verb-sub-gsub-ssub/non-windows/ssub-r/expout b/test/cases/verb-sub-gsub-ssub/non-windows/ssub-r/expout new file mode 100644 index 000000000..93b24ea0c --- 
/dev/null +++ b/test/cases/verb-sub-gsub-ssub/non-windows/ssub-r/expout @@ -0,0 +1,11 @@ +color shape flag k index quantity rate +yellow triangXe true 1 11 43.64980000 9.88700000 +red square true 2 15 79.27780000 0.01300000 +red circXe true 3 16 13.81030000 2.90100000 +red square false 4 48 77.55420000 7.46700000 +purple triangXe false 5 51 81.22900000 8.59100000 +red square false 6 64 77.19910000 9.53100000 +purple triangXe false 7 65 80.14050000 5.82400000 +yellow circXe true 8 73 63.97850000 4.23700000 +yellow circXe true 9 87 63.50580000 8.33500000 +purple square false 10 91 72.37350000 8.24300000 diff --git a/test/cases/verb-sub-gsub-ssub/non-windows/sub-r/cmd b/test/cases/verb-sub-gsub-ssub/non-windows/sub-r/cmd new file mode 100644 index 000000000..cae049e51 --- /dev/null +++ b/test/cases/verb-sub-gsub-ssub/non-windows/sub-r/cmd @@ -0,0 +1 @@ +mlr --c2p --from test/input/example.csv sub -r -f '.*e' l X diff --git a/test/cases/verb-sub-gsub-ssub/non-windows/sub-r/experr b/test/cases/verb-sub-gsub-ssub/non-windows/sub-r/experr new file mode 100644 index 000000000..e69de29bb diff --git a/test/cases/verb-sub-gsub-ssub/non-windows/sub-r/expout b/test/cases/verb-sub-gsub-ssub/non-windows/sub-r/expout new file mode 100644 index 000000000..93b24ea0c --- /dev/null +++ b/test/cases/verb-sub-gsub-ssub/non-windows/sub-r/expout @@ -0,0 +1,11 @@ +color shape flag k index quantity rate +yellow triangXe true 1 11 43.64980000 9.88700000 +red square true 2 15 79.27780000 0.01300000 +red circXe true 3 16 13.81030000 2.90100000 +red square false 4 48 77.55420000 7.46700000 +purple triangXe false 5 51 81.22900000 8.59100000 +red square false 6 64 77.19910000 9.53100000 +purple triangXe false 7 65 80.14050000 5.82400000 +yellow circXe true 8 73 63.97850000 4.23700000 +yellow circXe true 9 87 63.50580000 8.33500000 +purple square false 10 91 72.37350000 8.24300000 diff --git a/test/cases/verb-sub-gsub-ssub/ssub-a/cmd b/test/cases/verb-sub-gsub-ssub/ssub-a/cmd new file mode 100644 
index 000000000..f0af9a1c9 --- /dev/null +++ b/test/cases/verb-sub-gsub-ssub/ssub-a/cmd @@ -0,0 +1 @@ +mlr --c2p --from test/input/example.csv ssub -a l X diff --git a/test/cases/verb-sub-gsub-ssub/ssub-a/experr b/test/cases/verb-sub-gsub-ssub/ssub-a/experr new file mode 100644 index 000000000..e69de29bb diff --git a/test/cases/verb-sub-gsub-ssub/ssub-a/expout b/test/cases/verb-sub-gsub-ssub/ssub-a/expout new file mode 100644 index 000000000..643a0290f --- /dev/null +++ b/test/cases/verb-sub-gsub-ssub/ssub-a/expout @@ -0,0 +1,11 @@ +color shape flag k index quantity rate +yeXlow triangXe true 1 11 43.64980000 9.88700000 +red square true 2 15 79.27780000 0.01300000 +red circXe true 3 16 13.81030000 2.90100000 +red square faXse 4 48 77.55420000 7.46700000 +purpXe triangXe faXse 5 51 81.22900000 8.59100000 +red square faXse 6 64 77.19910000 9.53100000 +purpXe triangXe faXse 7 65 80.14050000 5.82400000 +yeXlow circXe true 8 73 63.97850000 4.23700000 +yeXlow circXe true 9 87 63.50580000 8.33500000 +purpXe square faXse 10 91 72.37350000 8.24300000 diff --git a/test/cases/verb-sub-gsub-ssub/ssub-f/cmd b/test/cases/verb-sub-gsub-ssub/ssub-f/cmd new file mode 100644 index 000000000..26b395415 --- /dev/null +++ b/test/cases/verb-sub-gsub-ssub/ssub-f/cmd @@ -0,0 +1 @@ +mlr --c2p --from test/input/example.csv ssub -f color,shape,index l X diff --git a/test/cases/verb-sub-gsub-ssub/ssub-f/experr b/test/cases/verb-sub-gsub-ssub/ssub-f/experr new file mode 100644 index 000000000..e69de29bb diff --git a/test/cases/verb-sub-gsub-ssub/ssub-f/expout b/test/cases/verb-sub-gsub-ssub/ssub-f/expout new file mode 100644 index 000000000..f4bf55be4 --- /dev/null +++ b/test/cases/verb-sub-gsub-ssub/ssub-f/expout @@ -0,0 +1,11 @@ +color shape flag k index quantity rate +yeXlow triangXe true 1 11 43.64980000 9.88700000 +red square true 2 15 79.27780000 0.01300000 +red circXe true 3 16 13.81030000 2.90100000 +red square false 4 48 77.55420000 7.46700000 +purpXe triangXe false 5 51 81.22900000 
8.59100000 +red square false 6 64 77.19910000 9.53100000 +purpXe triangXe false 7 65 80.14050000 5.82400000 +yeXlow circXe true 8 73 63.97850000 4.23700000 +yeXlow circXe true 9 87 63.50580000 8.33500000 +purpXe square false 10 91 72.37350000 8.24300000 diff --git a/test/cases/verb-sub-gsub-ssub/sub-a/cmd b/test/cases/verb-sub-gsub-ssub/sub-a/cmd new file mode 100644 index 000000000..b0ca748b5 --- /dev/null +++ b/test/cases/verb-sub-gsub-ssub/sub-a/cmd @@ -0,0 +1 @@ +mlr --c2p --from test/input/example.csv sub -a l X diff --git a/test/cases/verb-sub-gsub-ssub/sub-a/experr b/test/cases/verb-sub-gsub-ssub/sub-a/experr new file mode 100644 index 000000000..e69de29bb diff --git a/test/cases/verb-sub-gsub-ssub/sub-a/expout b/test/cases/verb-sub-gsub-ssub/sub-a/expout new file mode 100644 index 000000000..643a0290f --- /dev/null +++ b/test/cases/verb-sub-gsub-ssub/sub-a/expout @@ -0,0 +1,11 @@ +color shape flag k index quantity rate +yeXlow triangXe true 1 11 43.64980000 9.88700000 +red square true 2 15 79.27780000 0.01300000 +red circXe true 3 16 13.81030000 2.90100000 +red square faXse 4 48 77.55420000 7.46700000 +purpXe triangXe faXse 5 51 81.22900000 8.59100000 +red square faXse 6 64 77.19910000 9.53100000 +purpXe triangXe faXse 7 65 80.14050000 5.82400000 +yeXlow circXe true 8 73 63.97850000 4.23700000 +yeXlow circXe true 9 87 63.50580000 8.33500000 +purpXe square faXse 10 91 72.37350000 8.24300000 diff --git a/test/cases/verb-sub-gsub-ssub/sub-f-2/cmd b/test/cases/verb-sub-gsub-ssub/sub-f-2/cmd new file mode 100644 index 000000000..8d5de9b90 --- /dev/null +++ b/test/cases/verb-sub-gsub-ssub/sub-f-2/cmd @@ -0,0 +1 @@ +mlr --c2p --from test/input/example.csv sub -f a,b l X diff --git a/test/cases/verb-sub-gsub-ssub/sub-f-2/experr b/test/cases/verb-sub-gsub-ssub/sub-f-2/experr new file mode 100644 index 000000000..e69de29bb diff --git a/test/cases/verb-sub-gsub-ssub/sub-f-2/expout b/test/cases/verb-sub-gsub-ssub/sub-f-2/expout new file mode 100644 index 
000000000..c3c56133c --- /dev/null +++ b/test/cases/verb-sub-gsub-ssub/sub-f-2/expout @@ -0,0 +1,11 @@ +color shape flag k index quantity rate +yellow triangle true 1 11 43.64980000 9.88700000 +red square true 2 15 79.27780000 0.01300000 +red circle true 3 16 13.81030000 2.90100000 +red square false 4 48 77.55420000 7.46700000 +purple triangle false 5 51 81.22900000 8.59100000 +red square false 6 64 77.19910000 9.53100000 +purple triangle false 7 65 80.14050000 5.82400000 +yellow circle true 8 73 63.97850000 4.23700000 +yellow circle true 9 87 63.50580000 8.33500000 +purple square false 10 91 72.37350000 8.24300000 diff --git a/test/cases/verb-sub-gsub-ssub/sub-f/cmd b/test/cases/verb-sub-gsub-ssub/sub-f/cmd new file mode 100644 index 000000000..605605ad0 --- /dev/null +++ b/test/cases/verb-sub-gsub-ssub/sub-f/cmd @@ -0,0 +1 @@ +mlr --c2p --from test/input/example.csv sub -f color,shape,index l X diff --git a/test/cases/verb-sub-gsub-ssub/sub-f/experr b/test/cases/verb-sub-gsub-ssub/sub-f/experr new file mode 100644 index 000000000..e69de29bb diff --git a/test/cases/verb-sub-gsub-ssub/sub-f/expout b/test/cases/verb-sub-gsub-ssub/sub-f/expout new file mode 100644 index 000000000..f4bf55be4 --- /dev/null +++ b/test/cases/verb-sub-gsub-ssub/sub-f/expout @@ -0,0 +1,11 @@ +color shape flag k index quantity rate +yeXlow triangXe true 1 11 43.64980000 9.88700000 +red square true 2 15 79.27780000 0.01300000 +red circXe true 3 16 13.81030000 2.90100000 +red square false 4 48 77.55420000 7.46700000 +purpXe triangXe false 5 51 81.22900000 8.59100000 +red square false 6 64 77.19910000 9.53100000 +purpXe triangXe false 7 65 80.14050000 5.82400000 +yeXlow circXe true 8 73 63.97850000 4.23700000 +yeXlow circXe true 9 87 63.50580000 8.33500000 +purpXe square false 10 91 72.37350000 8.24300000 diff --git a/test/cases/verb-surv/0001/cmd b/test/cases/verb-surv/0001/cmd new file mode 100644 index 000000000..d50e07397 --- /dev/null +++ b/test/cases/verb-surv/0001/cmd @@ -0,0 +1 @@ +mlr 
--csv --from test/input/surv.csv surv -d duration -s status \ No newline at end of file diff --git a/test/cases/verb-surv/0001/experr b/test/cases/verb-surv/0001/experr new file mode 100644 index 000000000..e69de29bb diff --git a/test/cases/verb-surv/0001/expout b/test/cases/verb-surv/0001/expout new file mode 100644 index 000000000..9b23a50f5 --- /dev/null +++ b/test/cases/verb-surv/0001/expout @@ -0,0 +1,4 @@ +time,survival +1.00000000,0.80000000 +3.00000000,0.53333333 +5.00000000,0.00000000 diff --git a/test/cases/verb-uniq/uniq-c-x-change/cmd b/test/cases/verb-uniq/uniq-c-x-change/cmd new file mode 100644 index 000000000..2f3418461 --- /dev/null +++ b/test/cases/verb-uniq/uniq-c-x-change/cmd @@ -0,0 +1 @@ +mlr --dkvp uniq -c -x flag,k,index,quantity,rate test/input/example-with-changed-keys.dkvp diff --git a/test/cases/verb-uniq/uniq-c-x-change/experr b/test/cases/verb-uniq/uniq-c-x-change/experr new file mode 100644 index 000000000..e69de29bb diff --git a/test/cases/verb-uniq/uniq-c-x-change/expout b/test/cases/verb-uniq/uniq-c-x-change/expout new file mode 100644 index 000000000..a61ce84e8 --- /dev/null +++ b/test/cases/verb-uniq/uniq-c-x-change/expout @@ -0,0 +1,7 @@ +color=yellow,shape=triangle,count=1 +color=red,shape=square,count=2 +weird=red,shape=circle,count=1 +color=purple,shape=triangle,count=2 +color=red,shape=square,odd=77.19910000,count=1 +color=yellow,shape=circle,count=2 +color=purple,shape=square,count=1 diff --git a/test/cases/verb-uniq/uniq-c-x-het/cmd b/test/cases/verb-uniq/uniq-c-x-het/cmd new file mode 100644 index 000000000..051906fe1 --- /dev/null +++ b/test/cases/verb-uniq/uniq-c-x-het/cmd @@ -0,0 +1 @@ +mlr --dkvp uniq -c -x flag,k,index,quantity,rate test/input/example.dkvp diff --git a/test/cases/verb-uniq/uniq-c-x-het/experr b/test/cases/verb-uniq/uniq-c-x-het/experr new file mode 100644 index 000000000..e69de29bb diff --git a/test/cases/verb-uniq/uniq-c-x-het/expout b/test/cases/verb-uniq/uniq-c-x-het/expout new file mode 100644 
index 000000000..5392f140e --- /dev/null +++ b/test/cases/verb-uniq/uniq-c-x-het/expout @@ -0,0 +1,6 @@ +color=yellow,shape=triangle,count=1 +color=red,shape=square,count=3 +color=red,shape=circle,count=1 +color=purple,shape=triangle,count=2 +color=yellow,shape=circle,count=2 +color=purple,shape=square,count=1 diff --git a/test/cases/verb-uniq/uniq-c-x-long/cmd b/test/cases/verb-uniq/uniq-c-x-long/cmd new file mode 100644 index 000000000..38fe9e5c3 --- /dev/null +++ b/test/cases/verb-uniq/uniq-c-x-long/cmd @@ -0,0 +1 @@ +mlr --dkvp uniq -c -x flag,k,index,quantity,rate test/input/example-with-extra-keys.dkvp diff --git a/test/cases/verb-uniq/uniq-c-x-long/experr b/test/cases/verb-uniq/uniq-c-x-long/experr new file mode 100644 index 000000000..e69de29bb diff --git a/test/cases/verb-uniq/uniq-c-x-long/expout b/test/cases/verb-uniq/uniq-c-x-long/expout new file mode 100644 index 000000000..d77e08b27 --- /dev/null +++ b/test/cases/verb-uniq/uniq-c-x-long/expout @@ -0,0 +1,7 @@ +color=yellow,shape=triangle,count=1 +color=red,shape=square,count=3 +color=red,shape=circle,count=1 +color=purple,shape=triangle,extra=here,count=1 +color=purple,shape=triangle,count=1 +color=yellow,shape=circle,count=2 +color=purple,shape=square,count=1 diff --git a/test/cases/verb-uniq/uniq-c-x-short/cmd b/test/cases/verb-uniq/uniq-c-x-short/cmd new file mode 100644 index 000000000..9561cc361 --- /dev/null +++ b/test/cases/verb-uniq/uniq-c-x-short/cmd @@ -0,0 +1 @@ +mlr --dkvp uniq -c -x flag,k,index,quantity,rate test/input/example-with-missing-keys.dkvp diff --git a/test/cases/verb-uniq/uniq-c-x-short/experr b/test/cases/verb-uniq/uniq-c-x-short/experr new file mode 100644 index 000000000..e69de29bb diff --git a/test/cases/verb-uniq/uniq-c-x-short/expout b/test/cases/verb-uniq/uniq-c-x-short/expout new file mode 100644 index 000000000..7e7269533 --- /dev/null +++ b/test/cases/verb-uniq/uniq-c-x-short/expout @@ -0,0 +1,6 @@ +color=yellow,shape=triangle,count=1 +color=red,shape=square,count=3 
+shape=circle,count=1 +color=purple,shape=triangle,count=2 +color=yellow,shape=circle,count=2 +color=purple,shape=square,count=1 diff --git a/test/cases/verb-uniq/uniq-x-change/cmd b/test/cases/verb-uniq/uniq-x-change/cmd new file mode 100644 index 000000000..43006f390 --- /dev/null +++ b/test/cases/verb-uniq/uniq-x-change/cmd @@ -0,0 +1 @@ +mlr --dkvp uniq -x flag,k,index,quantity,rate test/input/example-with-changed-keys.dkvp diff --git a/test/cases/verb-uniq/uniq-x-change/experr b/test/cases/verb-uniq/uniq-x-change/experr new file mode 100644 index 000000000..e69de29bb diff --git a/test/cases/verb-uniq/uniq-x-change/expout b/test/cases/verb-uniq/uniq-x-change/expout new file mode 100644 index 000000000..67f9598af --- /dev/null +++ b/test/cases/verb-uniq/uniq-x-change/expout @@ -0,0 +1,7 @@ +color=yellow,shape=triangle +color=red,shape=square +weird=red,shape=circle +color=purple,shape=triangle +color=red,shape=square,odd=77.19910000 +color=yellow,shape=circle +color=purple,shape=square diff --git a/test/cases/verb-uniq/uniq-x-het/cmd b/test/cases/verb-uniq/uniq-x-het/cmd new file mode 100644 index 000000000..326412e62 --- /dev/null +++ b/test/cases/verb-uniq/uniq-x-het/cmd @@ -0,0 +1 @@ +mlr --dkvp uniq -x flag,k,index,quantity,rate test/input/example.dkvp diff --git a/test/cases/verb-uniq/uniq-x-het/experr b/test/cases/verb-uniq/uniq-x-het/experr new file mode 100644 index 000000000..e69de29bb diff --git a/test/cases/verb-uniq/uniq-x-het/expout b/test/cases/verb-uniq/uniq-x-het/expout new file mode 100644 index 000000000..ddc9002b1 --- /dev/null +++ b/test/cases/verb-uniq/uniq-x-het/expout @@ -0,0 +1,6 @@ +color=yellow,shape=triangle +color=red,shape=square +color=red,shape=circle +color=purple,shape=triangle +color=yellow,shape=circle +color=purple,shape=square diff --git a/test/cases/verb-uniq/uniq-x-long/cmd b/test/cases/verb-uniq/uniq-x-long/cmd new file mode 100644 index 000000000..bcdfe98e0 --- /dev/null +++ b/test/cases/verb-uniq/uniq-x-long/cmd @@ -0,0 
+1 @@ +mlr --dkvp uniq -x flag,k,index,quantity,rate test/input/example-with-extra-keys.dkvp diff --git a/test/cases/verb-uniq/uniq-x-long/experr b/test/cases/verb-uniq/uniq-x-long/experr new file mode 100644 index 000000000..e69de29bb diff --git a/test/cases/verb-uniq/uniq-x-long/expout b/test/cases/verb-uniq/uniq-x-long/expout new file mode 100644 index 000000000..d5b3f26eb --- /dev/null +++ b/test/cases/verb-uniq/uniq-x-long/expout @@ -0,0 +1,7 @@ +color=yellow,shape=triangle +color=red,shape=square +color=red,shape=circle +color=purple,shape=triangle,extra=here +color=purple,shape=triangle +color=yellow,shape=circle +color=purple,shape=square diff --git a/test/cases/verb-uniq/uniq-x-short/cmd b/test/cases/verb-uniq/uniq-x-short/cmd new file mode 100644 index 000000000..5c2f73021 --- /dev/null +++ b/test/cases/verb-uniq/uniq-x-short/cmd @@ -0,0 +1 @@ +mlr --dkvp uniq -x flag,k,index,quantity,rate test/input/example-with-missing-keys.dkvp diff --git a/test/cases/verb-uniq/uniq-x-short/experr b/test/cases/verb-uniq/uniq-x-short/experr new file mode 100644 index 000000000..e69de29bb diff --git a/test/cases/verb-uniq/uniq-x-short/expout b/test/cases/verb-uniq/uniq-x-short/expout new file mode 100644 index 000000000..b566d5536 --- /dev/null +++ b/test/cases/verb-uniq/uniq-x-short/expout @@ -0,0 +1,6 @@ +color=yellow,shape=triangle +color=red,shape=square +shape=circle +color=purple,shape=triangle +color=yellow,shape=circle +color=purple,shape=square diff --git a/test/input/abixy.tbl b/test/input/abixy.tbl new file mode 100644 index 000000000..448a68bf0 --- /dev/null +++ b/test/input/abixy.tbl @@ -0,0 +1,14 @@ ++-----+-----+----+---------------------+---------------------+ +| a | b | i | x | y | ++-----+-----+----+---------------------+---------------------+ +| pan | pan | 1 | 0.3467901443380824 | 0.7268028627434533 | +| eks | pan | 2 | 0.7586799647899636 | 0.5221511083334797 | +| wye | wye | 3 | 0.20460330576630303 | 0.33831852551664776 | +| eks | wye | 4 | 
0.38139939387114097 | 0.13418874328430463 | +| wye | pan | 5 | 0.5732889198020006 | 0.8636244699032729 | +| zee | pan | 6 | 0.5271261600918548 | 0.49322128674835697 | +| eks | zee | 7 | 0.6117840605678454 | 0.1878849191181694 | +| zee | wye | 8 | 0.5985540091064224 | 0.976181385699006 | +| hat | wye | 9 | 0.03144187646093577 | 0.7495507603507059 | +| pan | wye | 10 | 0.5026260055412137 | 0.9526183602969864 | ++-----+-----+----+---------------------+---------------------+ diff --git a/test/input/binary.json b/test/input/binary.json new file mode 100644 index 000000000..9bf2f47be --- /dev/null +++ b/test/input/binary.json @@ -0,0 +1,5 @@ +[ +{ + "msg": "X\u0001\b����\u0012Y" +} +] diff --git a/test/input/example-with-changed-keys.dkvp b/test/input/example-with-changed-keys.dkvp new file mode 100644 index 000000000..4ec2ac863 --- /dev/null +++ b/test/input/example-with-changed-keys.dkvp @@ -0,0 +1,10 @@ +color=yellow,shape=triangle,flag=true,k=1,index=11,quantity=43.6498,rate=9.8870 +color=red,shape=square,flag=true,k=2,index=15,quantity=79.2778,rate=0.0130 +weird=red,shape=circle,flag=true,k=3,index=16,quantity=13.8103,rate=2.9010 +color=red,shape=square,flag=false,k=4,index=48,quantity=77.5542,rate=7.4670 +color=purple,shape=triangle,flag=false,k=5,index=51,quantity=81.2290,rate=8.5910 +color=red,shape=square,flag=false,k=6,index=64,odd=77.1991,rate=9.5310 +color=purple,shape=triangle,flag=false,k=7,index=65,quantity=80.1405,rate=5.8240 +color=yellow,shape=circle,flag=true,k=8,index=73,quantity=63.9785,rate=4.2370 +color=yellow,shape=circle,flag=true,k=9,index=87,quantity=63.5058,rate=8.3350 +color=purple,shape=square,flag=false,k=10,index=91,quantity=72.3735,rate=8.2430 diff --git a/test/input/example-with-extra-keys.dkvp b/test/input/example-with-extra-keys.dkvp new file mode 100644 index 000000000..54ecf74e2 --- /dev/null +++ b/test/input/example-with-extra-keys.dkvp @@ -0,0 +1,10 @@ +color=yellow,shape=triangle,flag=true,k=1,index=11,quantity=43.6498,rate=9.8870 
+color=red,shape=square,flag=true,k=2,index=15,quantity=79.2778,rate=0.0130 +color=red,shape=circle,flag=true,k=3,index=16,quantity=13.8103,rate=2.9010 +color=red,shape=square,flag=false,k=4,index=48,quantity=77.5542,rate=7.4670 +color=purple,shape=triangle,flag=false,k=5,index=51,quantity=81.2290,rate=8.5910,extra=here +color=red,shape=square,flag=false,k=6,index=64,quantity=77.1991,rate=9.5310 +color=purple,shape=triangle,flag=false,k=7,index=65,quantity=80.1405,rate=5.8240 +color=yellow,shape=circle,flag=true,k=8,index=73,quantity=63.9785,rate=4.2370 +color=yellow,shape=circle,flag=true,k=9,index=87,quantity=63.5058,rate=8.3350 +color=purple,shape=square,flag=false,k=10,index=91,quantity=72.3735,rate=8.2430 diff --git a/test/input/example-with-missing-keys.dkvp b/test/input/example-with-missing-keys.dkvp new file mode 100644 index 000000000..ae8632ec0 --- /dev/null +++ b/test/input/example-with-missing-keys.dkvp @@ -0,0 +1,10 @@ +color=yellow,shape=triangle,flag=true,k=1,index=11,quantity=43.6498,rate=9.8870 +color=red,shape=square,flag=true,k=2,index=15,quantity=79.2778,rate=0.0130 +shape=circle,flag=true,k=3,index=16,quantity=13.8103,rate=2.9010 +color=red,shape=square,flag=false,k=4,index=48,quantity=77.5542,rate=7.4670 +color=purple,shape=triangle,flag=false,index=51,quantity=81.2290,rate=8.5910 +color=red,shape=square,flag=false,k=6,index=64,quantity=77.1991,rate=9.5310 +color=purple,shape=triangle,flag=false,k=7,index=65,quantity=80.1405,rate=5.8240 +color=yellow,shape=circle,flag=true,k=8,index=73,quantity=63.9785,rate=4.2370 +color=yellow,shape=circle,flag=true,k=9,index=87,quantity=63.5058,rate=8.3350 +color=purple,shape=square,flag=false,k=10,index=91,quantity=72.3735,rate=8.2430 diff --git a/test/input/example.dkvp b/test/input/example.dkvp new file mode 100644 index 000000000..73bc10242 --- /dev/null +++ b/test/input/example.dkvp @@ -0,0 +1,10 @@ +color=yellow,shape=triangle,flag=true,k=1,index=11,quantity=43.6498,rate=9.8870 
+color=red,shape=square,flag=true,k=2,index=15,quantity=79.2778,rate=0.0130 +color=red,shape=circle,flag=true,k=3,index=16,quantity=13.8103,rate=2.9010 +color=red,shape=square,flag=false,k=4,index=48,quantity=77.5542,rate=7.4670 +color=purple,shape=triangle,flag=false,k=5,index=51,quantity=81.2290,rate=8.5910 +color=red,shape=square,flag=false,k=6,index=64,quantity=77.1991,rate=9.5310 +color=purple,shape=triangle,flag=false,k=7,index=65,quantity=80.1405,rate=5.8240 +color=yellow,shape=circle,flag=true,k=8,index=73,quantity=63.9785,rate=4.2370 +color=yellow,shape=circle,flag=true,k=9,index=87,quantity=63.5058,rate=8.3350 +color=purple,shape=square,flag=false,k=10,index=91,quantity=72.3735,rate=8.2430 diff --git a/test/input/pr-1346.csv b/test/input/pr-1346.csv new file mode 100644 index 000000000..6a46e0994 --- /dev/null +++ b/test/input/pr-1346.csv @@ -0,0 +1,6 @@ +field1,field2 +a,b +# that was the first record +c,d +# that was the second record, and there is no more data + diff --git a/test/input/pr-1787.csv b/test/input/pr-1787.csv new file mode 100644 index 000000000..23b8c638c --- /dev/null +++ b/test/input/pr-1787.csv @@ -0,0 +1,4 @@ +a,b,c +1,2,3 +# x"y +4,5,6 diff --git a/test/input/reorder-regex.nidx b/test/input/reorder-regex.nidx new file mode 100644 index 000000000..6a76ef8fa --- /dev/null +++ b/test/input/reorder-regex.nidx @@ -0,0 +1 @@ +a b c d e f g h i j diff --git a/test/input/sparsify-input.csv b/test/input/sparsify-input.csv new file mode 100644 index 000000000..16916596e --- /dev/null +++ b/test/input/sparsify-input.csv @@ -0,0 +1,5 @@ +a,b,c +1,2,3 +4,5, +,, +7,8,9 diff --git a/test/input/surv.csv b/test/input/surv.csv new file mode 100644 index 000000000..f025f6a5c --- /dev/null +++ b/test/input/surv.csv @@ -0,0 +1,6 @@ +duration,status +1,1 +2,0 +3,1 +4,0 +5,1 \ No newline at end of file diff --git a/test/input/unflatten-input-2.xtab b/test/input/unflatten-input-2.xtab index 97b1941e1..21ea4bd2b 100644 --- a/test/input/unflatten-input-2.xtab 
+++ b/test/input/unflatten-input-2.xtab @@ -13,3 +13,8 @@ empty1 {} empty2 [] wrapper@empty3 {} wrapper@emtpy4 [] +x@y 1 +@ 2 +x@ 3 +@y 4 +x@@y 5 diff --git a/test/input/whitespace 1.csv b/test/input/whitespace 1.csv new file mode 100644 index 000000000..bfde6bfa0 --- /dev/null +++ b/test/input/whitespace 1.csv @@ -0,0 +1,2 @@ +a,b,c +1,2,3 diff --git a/test/input/whitespace 2.csv b/test/input/whitespace 2.csv new file mode 100644 index 000000000..a9411aa9d --- /dev/null +++ b/test/input/whitespace 2.csv @@ -0,0 +1,2 @@ +a,b,c +4,5,6 diff --git a/tools/build-dsl b/tools/build-dsl index e2a6186d2..4cf70cbf5 100755 --- a/tools/build-dsl +++ b/tools/build-dsl @@ -27,8 +27,8 @@ if [ $# -eq 1 ]; then fi fi -# Build the bin/gocc executable: -go install github.com/goccmack/gocc +# Build the bin/gocc executable (use my fork for performance): +go install github.com/johnkerl/gocc go mod tidy bingocc="$HOME/go/bin/gocc" if [ ! -x "$bingocc" ]; then diff --git a/xtodo.txt b/xtodo.txt deleted file mode 100644 index e3dab2ea5..000000000 --- a/xtodo.txt +++ /dev/null @@ -1,70 +0,0 @@ ----------------------------------------------------------------- -* look at: mr -vvv test/cases/io-spec-tsv/0004/cmd - ----------------------------------------------------------------- - -func (keeper *PercentileKeeper) EmitNamed(name string) *mlrval.Mlrval { - if name == "min" { - return keeper.EmitNonInterpolated(0.0) - } else if name == "p25" { - return keeper.EmitNonInterpolated(25.0) - } else if name == "median" { - return keeper.EmitNonInterpolated(50.0) - } else if name == "p75" { - return keeper.EmitNonInterpolated(75.0) - } else if name == "max" { - return keeper.EmitNonInterpolated(100.0) - - } else if name == "iqr" { - p25 := keeper.EmitNonInterpolated(25.0) - p75 := keeper.EmitNonInterpolated(75.0) - if p25.IsNumeric() && p75.IsNumeric() { - return bifs.BIF_minus_binary(p75, p25) - } else { - return mlrval.VOID - } - - } else if name == "lof" { - p25 := keeper.EmitNonInterpolated(25.0) - 
iqr := keeper.EmitNamed("iqr") - if p25.IsNumeric() && iqr.IsNumeric() { - return bifs.BIF_minus_binary(p25, bifs.BIF_times(fenceOuterK, iqr)) - } else { - return mlrval.VOID - } - - } else if name == "lif" { - p25 := keeper.EmitNonInterpolated(25.0) - iqr := keeper.EmitNamed("iqr") - if p25.IsNumeric() && iqr.IsNumeric() { - return bifs.BIF_minus_binary(p25, bifs.BIF_times(fenceInnerK, iqr)) - } else { - return mlrval.VOID - } - - } else if name == "uif" { - p75 := keeper.EmitNonInterpolated(75.0) - iqr := keeper.EmitNamed("iqr") - if p75.IsNumeric() && iqr.IsNumeric() { - return bifs.BIF_plus_binary(p75, bifs.BIF_times(fenceInnerK, iqr)) - } else { - return mlrval.VOID - } - - } else if name == "uof" { - p75 := keeper.EmitNonInterpolated(75.0) - iqr := keeper.EmitNamed("iqr") - if p75.IsNumeric() && iqr.IsNumeric() { - return bifs.BIF_plus_binary(p75, bifs.BIF_times(fenceOuterK, iqr)) - } else { - return mlrval.VOID - } - - } else { - return mlrval.FromError( - errors.New( - "stats1: unrecognized - ), - ) - } -}