mirror of
https://github.com/johnkerl/miller.git
synced 2026-01-23 02:14:13 +00:00
Compare commits
216 commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
f98a35bb05 | ||
|
|
09083a0d25 | ||
|
|
b13037c84f | ||
|
|
8ec8de61e3 | ||
|
|
888d27acdb | ||
|
|
49869ba8e4 | ||
|
|
eb972e19eb | ||
|
|
4ce21e998b | ||
|
|
e08e3ca80c | ||
|
|
1cc17e27b0 | ||
|
|
a504e16b93 | ||
|
|
cee04c0747 | ||
|
|
421042833a | ||
|
|
b8db798a2f | ||
|
|
5b6f64669a | ||
|
|
7b8822e2ef | ||
|
|
ac30743242 | ||
|
|
0b8da34b4a | ||
|
|
dc9105a922 | ||
|
|
38e9ff212b | ||
|
|
8f1e327b4e | ||
|
|
e5d65fd28c | ||
|
|
fe6c8d57bc | ||
|
|
c078c80361 | ||
|
|
34b1f0d4e9 | ||
|
|
9920e28b91 | ||
|
|
1279a9b4a7 | ||
|
|
155227cb4c | ||
|
|
2f46fec72d | ||
|
|
93be5051ff | ||
|
|
df74ffe40d | ||
|
|
439c4a2061 | ||
|
|
efb7b55da5 | ||
|
|
2aa664bfea | ||
|
|
e5218ed8e7 | ||
|
|
a66e45539d | ||
|
|
6351f51eeb | ||
|
|
df8e979b66 | ||
|
|
2a78d165ae | ||
|
|
bc9c718cf9 | ||
|
|
9149fd0d34 | ||
|
|
aea74327ff | ||
|
|
6100f21785 | ||
|
|
3e374f8861 | ||
|
|
74f4901d05 | ||
|
|
e71b36d8c1 | ||
|
|
1557e47ae1 | ||
|
|
8f882b2f75 | ||
|
|
f5226e87fe | ||
|
|
f485bc07a5 | ||
|
|
eac1785756 | ||
|
|
f350581175 | ||
|
|
5c5281fe28 | ||
|
|
14e0229c34 | ||
|
|
fbe1143e8a | ||
|
|
46a86503ea | ||
|
|
2d29beb204 | ||
|
|
aec5c03093 | ||
|
|
26826a0b4b | ||
|
|
46653f0a8f | ||
|
|
d87bd9f7d3 | ||
|
|
3b9f169162 | ||
|
|
05429ee3ba | ||
|
|
2f3b6d38f9 | ||
|
|
74e8e3cef6 | ||
|
|
2f38933a87 | ||
|
|
43f6fa9ea6 | ||
|
|
d0f824aefe | ||
|
|
120e977c1e | ||
|
|
6266a869eb | ||
|
|
6509ed4586 | ||
|
|
db11c17e54 | ||
|
|
3c2d4b22d2 | ||
|
|
3ad00b5686 | ||
|
|
d2925aafe5 | ||
|
|
8b524b3ada | ||
|
|
4d83e88ff6 | ||
|
|
cd6431f7aa | ||
|
|
4ebef873d2 | ||
|
|
06e16ea3ee | ||
|
|
369156b70d | ||
|
|
78da997077 | ||
|
|
d4ace7527b | ||
|
|
f3a8fd42bc | ||
|
|
24a6e98709 | ||
|
|
ab7a80cbf4 | ||
|
|
44ddaea651 | ||
|
|
19e72f9dac | ||
|
|
3b8668d06f | ||
|
|
e6ca3f6856 | ||
|
|
1ef87c6278 | ||
|
|
226c9555ef | ||
|
|
cf03b6d49c | ||
|
|
f3fdfc4e29 | ||
|
|
52b7a47ae9 | ||
|
|
c4c3ae2119 | ||
|
|
b77d9826ea | ||
|
|
9445046bfe | ||
|
|
fccdf215e6 | ||
|
|
d264f562dc | ||
|
|
e7fe363d9a | ||
|
|
865c9cc563 | ||
|
|
23acc8424a | ||
|
|
f673c1a30e | ||
|
|
3137313867 | ||
|
|
0ba6710a79 | ||
|
|
127c4925a2 | ||
|
|
fefb304650 | ||
|
|
7a6958926d | ||
|
|
b7248bae98 | ||
|
|
99a98b0dc7 | ||
|
|
d6cd981c87 | ||
|
|
e67bdef98e | ||
|
|
4d84f99120 | ||
|
|
de05d9665b | ||
|
|
d30501a69b | ||
|
|
34c9d764d8 | ||
|
|
8e07a2f78d | ||
|
|
cc7f72b741 | ||
|
|
68f2845578 | ||
|
|
ea242a242a | ||
|
|
d14dc76318 | ||
|
|
230b348a71 | ||
|
|
e9637bba9d | ||
|
|
df73ad8ec0 | ||
|
|
35c7eeb977 | ||
|
|
ca7d47454d | ||
|
|
bbcf903647 | ||
|
|
34bc8a1c3d | ||
|
|
100166532c | ||
|
|
629aebb989 | ||
|
|
121dd9425f | ||
|
|
07130d8d65 | ||
|
|
b6ee2eb202 | ||
|
|
6e6e893bda | ||
|
|
f13a246754 | ||
|
|
48eba537aa | ||
|
|
1bfb8b0cc4 | ||
|
|
b0addbe4f7 | ||
|
|
d45e7b06a6 | ||
|
|
d08ee47732 | ||
|
|
9963df4090 | ||
|
|
7d51030b88 | ||
|
|
8e11fd36d5 | ||
|
|
4fe7051c1e | ||
|
|
ea0550b09b | ||
|
|
a9a2549074 | ||
|
|
20e1c87801 | ||
|
|
bd2497a285 | ||
|
|
225072384a | ||
|
|
6bed7bb560 | ||
|
|
813a5204dc | ||
|
|
70c485695c | ||
|
|
107e57e3e4 | ||
|
|
cf458f0230 | ||
|
|
3738b617ae | ||
|
|
ce3123b3fa | ||
|
|
e3a1e833f0 | ||
|
|
2b6fa35388 | ||
|
|
9bf883233e | ||
|
|
a83470d16c | ||
|
|
6287b04fa8 | ||
|
|
0060cceafc | ||
|
|
cc1cd954ea | ||
|
|
8088850505 | ||
|
|
06e33c0f82 | ||
|
|
929a2357d0 | ||
|
|
dde2cd20a7 | ||
|
|
8bc3c5f645 | ||
|
|
63654683f0 | ||
|
|
c01fe78fbd | ||
|
|
e62a0b4b20 | ||
|
|
0614b37dfa | ||
|
|
a728524bf3 | ||
|
|
9f77bbe096 | ||
|
|
5c65edba95 | ||
|
|
019b15a310 | ||
|
|
3050e0aeea | ||
|
|
87da641d48 | ||
|
|
2868fb6e7e | ||
|
|
c189b6a2d8 | ||
|
|
b0f9e03609 | ||
|
|
3d17ca117c | ||
|
|
cd3b0a62ab | ||
|
|
214129a95e | ||
|
|
193a2ee37b | ||
|
|
296430fe41 | ||
|
|
5424e753a4 | ||
|
|
41649bf4f9 | ||
|
|
b4ff26a7d0 | ||
|
|
02bd5344b9 | ||
|
|
8c791f5466 | ||
|
|
04a9b9decd | ||
|
|
cc8a3c4b4e | ||
|
|
047cb4bc28 | ||
|
|
d7a5997d70 | ||
|
|
1f6432e260 | ||
|
|
7225f2c094 | ||
|
|
bf320bcc99 | ||
|
|
05aa16cfcf | ||
|
|
07c896833c | ||
|
|
979addd3c3 | ||
|
|
2b4a0c2ca8 | ||
|
|
4e3b500f94 | ||
|
|
acc8a490e8 | ||
|
|
e9fbd9f48d | ||
|
|
6ea8e238db | ||
|
|
fd3e0d8ffc | ||
|
|
bfa1fd4b28 | ||
|
|
e18eac29db | ||
|
|
8789f73d7b | ||
|
|
6eb5721070 | ||
|
|
7a0320fc27 | ||
|
|
39c88041d6 | ||
|
|
a0d65c3035 | ||
|
|
f751084013 |
350 changed files with 2315 additions and 1846 deletions
8
.github/workflows/codeql-analysis.yml
vendored
8
.github/workflows/codeql-analysis.yml
vendored
|
|
@ -36,11 +36,11 @@ jobs:
|
|||
|
||||
steps:
|
||||
- name: Checkout repository
|
||||
uses: actions/checkout@d632683dd7b4114ad314bca15554477dd762a938
|
||||
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
|
||||
|
||||
# Initializes the CodeQL tools for scanning.
|
||||
- name: Initialize CodeQL
|
||||
uses: github/codeql-action/init@6db8d6351fd0be61f9ed8ebd12ccd35dcec51fea
|
||||
uses: github/codeql-action/init@cdefb33c0f6224e58673d9004f47f7cb3e328b89
|
||||
with:
|
||||
languages: ${{ matrix.language }}
|
||||
# If you wish to specify custom queries, you can do so here or in a config file.
|
||||
|
|
@ -51,7 +51,7 @@ jobs:
|
|||
# Autobuild attempts to build any compiled languages (C/C++, C#, or Java).
|
||||
# If this step fails, then you should remove it and run the build manually (see below)
|
||||
- name: Autobuild
|
||||
uses: github/codeql-action/autobuild@6db8d6351fd0be61f9ed8ebd12ccd35dcec51fea
|
||||
uses: github/codeql-action/autobuild@cdefb33c0f6224e58673d9004f47f7cb3e328b89
|
||||
|
||||
# ℹ️ Command-line programs to run using the OS shell.
|
||||
# 📚 https://git.io/JvXDl
|
||||
|
|
@ -65,4 +65,4 @@ jobs:
|
|||
# make release
|
||||
|
||||
- name: Perform CodeQL Analysis
|
||||
uses: github/codeql-action/analyze@6db8d6351fd0be61f9ed8ebd12ccd35dcec51fea
|
||||
uses: github/codeql-action/analyze@cdefb33c0f6224e58673d9004f47f7cb3e328b89
|
||||
|
|
|
|||
4
.github/workflows/codespell.yml
vendored
4
.github/workflows/codespell.yml
vendored
|
|
@ -21,7 +21,7 @@ jobs:
|
|||
steps:
|
||||
# Check out the code base
|
||||
- name: Check out code
|
||||
uses: actions/checkout@d632683dd7b4114ad314bca15554477dd762a938
|
||||
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
|
||||
with:
|
||||
# Full git history is needed to get a proper list of changed files within `super-linter`
|
||||
fetch-depth: 0
|
||||
|
|
@ -29,7 +29,7 @@ jobs:
|
|||
# Run linter against code base
|
||||
# https://github.com/codespell-project/codespell
|
||||
- name: Codespell
|
||||
uses: codespell-project/actions-codespell@406322ec52dd7b488e48c1c4b82e2a8b3a1bf630
|
||||
uses: codespell-project/actions-codespell@8f01853be192eb0f849a5c7d721450e7a467c579
|
||||
with:
|
||||
check_filenames: true
|
||||
ignore_words_file: .codespellignore
|
||||
|
|
|
|||
8
.github/workflows/go.yml
vendored
8
.github/workflows/go.yml
vendored
|
|
@ -15,12 +15,12 @@ jobs:
|
|||
os: [ubuntu-latest, macos-latest, windows-latest]
|
||||
|
||||
steps:
|
||||
- uses: actions/checkout@d632683dd7b4114ad314bca15554477dd762a938
|
||||
- uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
|
||||
|
||||
- name: Set up Go
|
||||
uses: actions/setup-go@0a12ed9d6a96ab950c8f026ed9f722fe0da7ef32
|
||||
uses: actions/setup-go@7a3fe6cf4cb3a834922a1244abfce67bcef6a0c5
|
||||
with:
|
||||
go-version: 1.19
|
||||
go-version: 1.24
|
||||
|
||||
- name: Build
|
||||
run: make build
|
||||
|
|
@ -41,7 +41,7 @@ jobs:
|
|||
if: matrix.os == 'windows-latest'
|
||||
run: mkdir -p bin/${{matrix.os}} && cp mlr.exe bin/${{matrix.os}}
|
||||
|
||||
- uses: actions/upload-artifact@50769540e7f4bd5e21e526ee35c689e35e0d6874
|
||||
- uses: actions/upload-artifact@b7c566a772e6b6bfb58ed0dc250532a479d7789f
|
||||
with:
|
||||
name: mlr-${{matrix.os}}
|
||||
path: bin/${{matrix.os}}/*
|
||||
|
|
|
|||
29
.github/workflows/release-snap.yaml
vendored
Normal file
29
.github/workflows/release-snap.yaml
vendored
Normal file
|
|
@ -0,0 +1,29 @@
|
|||
name: Release for Snap
|
||||
on:
|
||||
push:
|
||||
tags:
|
||||
- v*
|
||||
workflow_dispatch:
|
||||
|
||||
jobs:
|
||||
snap:
|
||||
strategy:
|
||||
matrix:
|
||||
os: [ubuntu-latest, ubuntu-24.04-arm]
|
||||
runs-on: ${{ matrix.os }}
|
||||
steps:
|
||||
- name: Checkout code
|
||||
uses: actions/checkout@v6
|
||||
|
||||
- name: Build snap
|
||||
uses: snapcore/action-build@v1
|
||||
id: build
|
||||
|
||||
- name: Publish to Snap Store
|
||||
uses: snapcore/action-publish@v1
|
||||
env:
|
||||
SNAPCRAFT_STORE_CREDENTIALS: ${{ secrets.SNAPCRAFT_TOKEN }}
|
||||
with:
|
||||
snap: ${{ steps.build.outputs.snap }}
|
||||
# release: stable # or edge, beta, candidate
|
||||
release: stable
|
||||
12
.github/workflows/release.yml
vendored
12
.github/workflows/release.yml
vendored
|
|
@ -1,4 +1,4 @@
|
|||
name: Release
|
||||
name: Release for GitHub
|
||||
on:
|
||||
push:
|
||||
tags:
|
||||
|
|
@ -6,7 +6,7 @@ on:
|
|||
workflow_dispatch:
|
||||
|
||||
env:
|
||||
GO_VERSION: 1.21.1
|
||||
GO_VERSION: 1.24.5
|
||||
|
||||
jobs:
|
||||
release:
|
||||
|
|
@ -17,19 +17,19 @@ jobs:
|
|||
runs-on: ${{ matrix.platform }}
|
||||
steps:
|
||||
- name: Set up Go
|
||||
uses: actions/setup-go@0a12ed9d6a96ab950c8f026ed9f722fe0da7ef32
|
||||
uses: actions/setup-go@7a3fe6cf4cb3a834922a1244abfce67bcef6a0c5
|
||||
with:
|
||||
go-version: ${{ env.GO_VERSION }}
|
||||
id: go
|
||||
|
||||
- name: Check out code into the Go module directory
|
||||
uses: actions/checkout@d632683dd7b4114ad314bca15554477dd762a938
|
||||
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
|
||||
with:
|
||||
fetch-depth: 0
|
||||
|
||||
# https://github.com/marketplace/actions/cache
|
||||
- name: Cache Go modules
|
||||
uses: actions/cache@0c45773b623bea8c8e75f6c82b208c3cf94ea4f9
|
||||
uses: actions/cache@8b402f58fbc84540c8b491a91e594a4576fec3d7
|
||||
with:
|
||||
path: |
|
||||
~/.cache/go-build
|
||||
|
|
@ -40,7 +40,7 @@ jobs:
|
|||
|
||||
# https://goreleaser.com/ci/actions/
|
||||
- name: Run GoReleaser
|
||||
uses: goreleaser/goreleaser-action@286f3b13b1b49da4ac219696163fb8c1c93e1200
|
||||
uses: goreleaser/goreleaser-action@e435ccd777264be153ace6237001ef4d979d3a7a
|
||||
#if: startsWith(github.ref, 'refs/tags/v')
|
||||
with:
|
||||
version: latest
|
||||
|
|
|
|||
28
.github/workflows/test-snap-can-build.yml
vendored
Normal file
28
.github/workflows/test-snap-can-build.yml
vendored
Normal file
|
|
@ -0,0 +1,28 @@
|
|||
name: 🧪 Snap Builds
|
||||
|
||||
on:
|
||||
push:
|
||||
branches: '*'
|
||||
pull_request:
|
||||
branches: '*'
|
||||
|
||||
jobs:
|
||||
build:
|
||||
runs-on: ubuntu-latest
|
||||
strategy:
|
||||
matrix:
|
||||
node-version: [20.x]
|
||||
|
||||
steps:
|
||||
- uses: actions/checkout@v6
|
||||
|
||||
- uses: snapcore/action-build@v1
|
||||
id: build
|
||||
|
||||
- uses: diddlesnaps/snapcraft-review-action@v1
|
||||
with:
|
||||
snap: ${{ steps.build.outputs.snap }}
|
||||
isClassic: 'false'
|
||||
# Plugs and Slots declarations to override default denial (requires store assertion to publish)
|
||||
# plugs: ./plug-declaration.json
|
||||
# slots: ./slot-declaration.json
|
||||
|
|
@ -95,13 +95,14 @@ So, in broad overview, the key packages are:
|
|||
|
||||
* Miller dependencies are all in the Go standard library, except two:
|
||||
* GOCC lexer/parser code-generator from [github.com/goccmack/gocc](https://github.com/goccmack/gocc):
|
||||
* Forked at [github.com/johnkerl/gocc](https://github.com/johnkerl/gocc).
|
||||
* This package defines the grammar for Miller's domain-specific language (DSL) for the Miller `put` and `filter` verbs. And, GOCC is a joy to use. :)
|
||||
* It is used on the terms of its open-source license.
|
||||
* [golang.org/x/term](https://pkg.go.dev/golang.org/x/term):
|
||||
* Just a one-line Miller callsite for is-a-terminal checking for the [Miller REPL](./pkg/terminals/repl/README.md).
|
||||
* It is used on the terms of its open-source license.
|
||||
* See also [./go.mod](go.mod). Setup:
|
||||
* `go get github.com/goccmack/gocc`
|
||||
* `go get github.com/johnkerl/gocc`
|
||||
* `go get golang.org/x/term`
|
||||
|
||||
### Miller per se
|
||||
|
|
|
|||
14
README.md
14
README.md
|
|
@ -29,6 +29,7 @@ key-value-pair data in a variety of data formats.
|
|||
* [Miller in 10 minutes](https://miller.readthedocs.io/en/latest/10min)
|
||||
* [A Guide To Command-Line Data Manipulation](https://www.smashingmagazine.com/2022/12/guide-command-line-data-manipulation-cli-miller)
|
||||
* [A quick tutorial on Miller](https://www.ict4g.net/adolfo/notes/data-analysis/miller-quick-tutorial.html)
|
||||
* [Miller Exercises](https://github.com/GuilloteauQ/miller-exercises)
|
||||
* [Tools to manipulate CSV files from the Command Line](https://www.ict4g.net/adolfo/notes/data-analysis/tools-to-manipulate-csv.html)
|
||||
* [www.togaware.com/linux/survivor/CSV_Files.html](https://www.togaware.com/linux/survivor/CSV_Files.html)
|
||||
* [MLR for CSV manipulation](https://guillim.github.io/terminal/2018/06/19/MLR-for-CSV-manipulation.html)
|
||||
|
|
@ -45,22 +46,18 @@ key-value-pair data in a variety of data formats.
|
|||
* [Active issues](https://github.com/johnkerl/miller/issues?q=is%3Aissue+is%3Aopen+sort%3Aupdated-desc)
|
||||
|
||||
# Installing
|
||||
|
||||
There's a good chance you can get Miller pre-built for your system:
|
||||
|
||||
[](https://launchpad.net/ubuntu/+source/miller)
|
||||
[](https://launchpad.net/ubuntu/xenial/+package/miller)
|
||||
[](https://packages.fedoraproject.org/pkgs/miller/miller/)
|
||||
[](https://packages.debian.org/stable/miller)
|
||||
[](https://packages.gentoo.org/packages/sys-apps/miller)
|
||||
|
||||
[](http://www.pro-linux.de/cgi-bin/DBApp/check.cgi?ShowApp..20427.100)
|
||||
[](https://aur.archlinux.org/packages/miller-git)
|
||||
|
||||
[](http://pkgsrc.se/textproc/miller)
|
||||
[](https://www.freshports.org/textproc/miller/)
|
||||
|
||||
[](https://anaconda.org/conda-forge/miller/)
|
||||
[](https://snapcraft.io/miller)
|
||||
[](https://formulae.brew.sh/formula/miller)
|
||||
[](https://www.macports.org/ports.php?by=name&substr=miller)
|
||||
[](https://chocolatey.org/packages/miller)
|
||||
|
|
@ -68,9 +65,9 @@ There's a good chance you can get Miller pre-built for your system:
|
|||
|
||||
|OS|Installation command|
|
||||
|---|---|
|
||||
|Linux|`yum install miller`<br/> `apt-get install miller`|
|
||||
|Linux|`yum install miller`<br/> `apt-get install miller`<br/> `snap install miller`|
|
||||
|Mac|`brew install miller`<br/>`port install miller`|
|
||||
|Windows|`choco install miller`<br/>`winget install Miller.Miller`|
|
||||
|Windows|`choco install miller`<br/>`winget install Miller.Miller`<br/>`scoop install main/miller`|
|
||||
|
||||
See also [README-versions.md](./README-versions.md) for a full list of package versions. Note that long-term-support (LTS) releases will likely be on older versions.
|
||||
|
||||
|
|
@ -94,6 +91,7 @@ See also [building from source](https://miller.readthedocs.io/en/latest/build.ht
|
|||
[](https://github.com/johnkerl/miller/actions/workflows/go.yml)
|
||||
[](https://github.com/johnkerl/miller/actions/workflows/codeql-analysis.yml)
|
||||
[](https://github.com/johnkerl/miller/actions/workflows/codespell.yml)
|
||||
[](https://github.com/johnkerl/miller/actions/workflows/test-snap-can-build.yml)
|
||||
<!--
|
||||
[](https://github.com/johnkerl/miller/actions/workflows/release.yml)
|
||||
-->
|
||||
|
|
@ -112,7 +110,7 @@ See also [building from source](https://miller.readthedocs.io/en/latest/build.ht
|
|||
* Without `make`:
|
||||
* To build: `go build github.com/johnkerl/miller/v6/cmd/mlr`.
|
||||
* To run tests: `go test github.com/johnkerl/miller/v6/pkg/...` and `mlr regtest`.
|
||||
* To install: `go install github.com/johnkerl/miller/v6/cmd/mlr` will install to _GOPATH_`/bin/mlr`.
|
||||
* To install: `go install github.com/johnkerl/miller/v6/cmd/mlr@latest` will install to _GOPATH_`/bin/mlr`.
|
||||
* See also the doc page on [building from source](https://miller.readthedocs.io/en/latest/build).
|
||||
* For more developer information please see [README-dev.md](./README-dev.md).
|
||||
|
||||
|
|
|
|||
|
|
@ -6,12 +6,15 @@ import (
|
|||
"github.com/johnkerl/miller/v6/pkg/colorizer"
|
||||
)
|
||||
|
||||
const boldString = "\u001b[1m"
|
||||
const underlineString = "\u001b[4m"
|
||||
const reversedString = "\u001b[7m"
|
||||
const redString = "\u001b[1;31m"
|
||||
const blueString = "\u001b[1;34m"
|
||||
const defaultString = "\u001b[0m"
|
||||
const (
|
||||
boldString = "\u001b[1m"
|
||||
reversedString = "\u001b[7m"
|
||||
redString = "\u001b[1;31m"
|
||||
blueString = "\u001b[1;34m"
|
||||
defaultString = "\u001b[0m"
|
||||
|
||||
// underlineString = "\u001b[4m"
|
||||
)
|
||||
|
||||
func main() {
|
||||
fmt.Printf("Hello, world!\n")
|
||||
|
|
|
|||
|
|
@ -28,9 +28,9 @@ mkdir -p $dir
|
|||
# ----------------------------------------------------------------
|
||||
# Run the parser-generator
|
||||
|
||||
# Build the bin/gocc executable:
|
||||
go get github.com/goccmack/gocc
|
||||
#go get github.com/johnkerl/gocc
|
||||
# Build the bin/gocc executable (use my fork for performance):
|
||||
# go get github.com/goccmack/gocc
|
||||
go get github.com/johnkerl/gocc
|
||||
bingocc="$GOPATH/bin/gocc"
|
||||
|
||||
if [ ! -x "$bingocc" ]; then
|
||||
|
|
|
|||
|
|
@ -1,5 +1,5 @@
|
|||
module one
|
||||
|
||||
go 1.16
|
||||
go 1.24
|
||||
|
||||
require github.com/goccmack/gocc v0.0.0-20210322175033-34358ebe5808 // indirect
|
||||
toolchain go1.24.5
|
||||
|
|
|
|||
|
|
@ -1,26 +0,0 @@
|
|||
github.com/goccmack/gocc v0.0.0-20210322175033-34358ebe5808 h1:MBgZdx/wBJWTR2Q79mQfP6c8uXdQiu5JowfEz3KhFac=
|
||||
github.com/goccmack/gocc v0.0.0-20210322175033-34358ebe5808/go.mod h1:dWhnuKE5wcnGTExA2DH6Iicu21YnWwOPMrc/GyhtbCk=
|
||||
github.com/yuin/goldmark v1.2.1/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74=
|
||||
golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w=
|
||||
golang.org/x/crypto v0.0.0-20191011191535-87dc89f01550/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI=
|
||||
golang.org/x/crypto v0.0.0-20200622213623-75b288015ac9/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto=
|
||||
golang.org/x/mod v0.3.0 h1:RM4zey1++hCTbCVQfnWeKs9/IEsaBLA8vTkd0WVtmH4=
|
||||
golang.org/x/mod v0.3.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA=
|
||||
golang.org/x/net v0.0.0-20190404232315-eb5bcb51f2a3/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg=
|
||||
golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s=
|
||||
golang.org/x/net v0.0.0-20201021035429-f5854403a974/go.mod h1:sp8m0HH+o8qH0wwXwYZr8TS3Oi6o0r6Gce1SSxlDquU=
|
||||
golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
|
||||
golang.org/x/sync v0.0.0-20201020160332-67f06af15bc9/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
|
||||
golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
|
||||
golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
|
||||
golang.org/x/sys v0.0.0-20200930185726-fdedc70b468f/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
|
||||
golang.org/x/sys v0.0.0-20210119212857-b64e53b001e4/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
|
||||
golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ=
|
||||
golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ=
|
||||
golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
|
||||
golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo=
|
||||
golang.org/x/tools v0.1.0/go.mod h1:xkSsbof2nBLbhDlRMhhhyNLN/zl3eTqcnHD5viDpcZ0=
|
||||
golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
|
||||
golang.org/x/xerrors v0.0.0-20191011141410-1b5146add898/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
|
||||
golang.org/x/xerrors v0.0.0-20200804184101-5ec99f83aff1 h1:go1bK/D/BFZV2I8cIQd1NKEZ+0owSTG1fDTci4IqFcE=
|
||||
golang.org/x/xerrors v0.0.0-20200804184101-5ec99f83aff1/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
|
||||
|
|
@ -28,9 +28,9 @@ mkdir -p $dir
|
|||
# ----------------------------------------------------------------
|
||||
# Run the parser-generator
|
||||
|
||||
# Build the bin/gocc executable:
|
||||
go get github.com/goccmack/gocc
|
||||
#go get github.com/johnkerl/gocc
|
||||
# Build the bin/gocc executable (use my fork for performance):
|
||||
# go get github.com/goccmack/gocc
|
||||
go get github.com/johnkerl/gocc
|
||||
bingocc="$GOPATH/bin/gocc"
|
||||
if [ ! -x "$bingocc" ]; then
|
||||
exit 1
|
||||
|
|
|
|||
|
|
@ -1,5 +1,5 @@
|
|||
module two
|
||||
|
||||
go 1.16
|
||||
go 1.24
|
||||
|
||||
require github.com/goccmack/gocc v0.0.0-20210322175033-34358ebe5808 // indirect
|
||||
toolchain go1.24.5
|
||||
|
|
|
|||
|
|
@ -1,26 +0,0 @@
|
|||
github.com/goccmack/gocc v0.0.0-20210322175033-34358ebe5808 h1:MBgZdx/wBJWTR2Q79mQfP6c8uXdQiu5JowfEz3KhFac=
|
||||
github.com/goccmack/gocc v0.0.0-20210322175033-34358ebe5808/go.mod h1:dWhnuKE5wcnGTExA2DH6Iicu21YnWwOPMrc/GyhtbCk=
|
||||
github.com/yuin/goldmark v1.2.1/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74=
|
||||
golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w=
|
||||
golang.org/x/crypto v0.0.0-20191011191535-87dc89f01550/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI=
|
||||
golang.org/x/crypto v0.0.0-20200622213623-75b288015ac9/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto=
|
||||
golang.org/x/mod v0.3.0 h1:RM4zey1++hCTbCVQfnWeKs9/IEsaBLA8vTkd0WVtmH4=
|
||||
golang.org/x/mod v0.3.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA=
|
||||
golang.org/x/net v0.0.0-20190404232315-eb5bcb51f2a3/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg=
|
||||
golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s=
|
||||
golang.org/x/net v0.0.0-20201021035429-f5854403a974/go.mod h1:sp8m0HH+o8qH0wwXwYZr8TS3Oi6o0r6Gce1SSxlDquU=
|
||||
golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
|
||||
golang.org/x/sync v0.0.0-20201020160332-67f06af15bc9/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
|
||||
golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
|
||||
golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
|
||||
golang.org/x/sys v0.0.0-20200930185726-fdedc70b468f/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
|
||||
golang.org/x/sys v0.0.0-20210119212857-b64e53b001e4/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
|
||||
golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ=
|
||||
golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ=
|
||||
golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
|
||||
golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo=
|
||||
golang.org/x/tools v0.1.0/go.mod h1:xkSsbof2nBLbhDlRMhhhyNLN/zl3eTqcnHD5viDpcZ0=
|
||||
golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
|
||||
golang.org/x/xerrors v0.0.0-20191011141410-1b5146add898/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
|
||||
golang.org/x/xerrors v0.0.0-20200804184101-5ec99f83aff1 h1:go1bK/D/BFZV2I8cIQd1NKEZ+0owSTG1fDTci4IqFcE=
|
||||
golang.org/x/xerrors v0.0.0-20200804184101-5ec99f83aff1/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
|
||||
5
delve.txt
Normal file
5
delve.txt
Normal file
|
|
@ -0,0 +1,5 @@
|
|||
dlv exec ./mlr -- --csv --from x.csv sub -a def ghi
|
||||
break main.main
|
||||
# or wherever
|
||||
restart
|
||||
continue
|
||||
|
|
@ -20,7 +20,7 @@ Quick links:
|
|||
|
||||
Let's take a quick look at some of the most useful Miller verbs -- file-format-aware, name-index-empowered equivalents of standard system commands.
|
||||
|
||||
For most of this section we'll use our [example.csv](./example.csv).
|
||||
For most of this section, we'll use our [example.csv](./example.csv).
|
||||
|
||||
`mlr cat` is like system `cat` (or `type` on Windows) -- it passes the data through unmodified:
|
||||
|
||||
|
|
|
|||
|
|
@ -4,7 +4,7 @@
|
|||
|
||||
Let's take a quick look at some of the most useful Miller verbs -- file-format-aware, name-index-empowered equivalents of standard system commands.
|
||||
|
||||
For most of this section we'll use our [example.csv](./example.csv).
|
||||
For most of this section, we'll use our [example.csv](./example.csv).
|
||||
|
||||
`mlr cat` is like system `cat` (or `type` on Windows) -- it passes the data through unmodified:
|
||||
|
||||
|
|
|
|||
|
|
@ -18,7 +18,7 @@ Quick links:
|
|||
|
||||
Please also see [Installation](installing-miller.md) for information about pre-built executables.
|
||||
|
||||
You will need to first install Go version 1.15 or higher: please see [https://go.dev](https://go.dev).
|
||||
You will need to first install Go ([this version](https://github.com/johnkerl/miller/blob/main/go.mod#L17)): please see [https://go.dev](https://go.dev).
|
||||
|
||||
## Miller license
|
||||
|
||||
|
|
|
|||
|
|
@ -2,7 +2,7 @@
|
|||
|
||||
Please also see [Installation](installing-miller.md) for information about pre-built executables.
|
||||
|
||||
You will need to first install Go version 1.15 or higher: please see [https://go.dev](https://go.dev).
|
||||
You will need to first install Go ([this version](https://github.com/johnkerl/miller/blob/main/go.mod#L17)): please see [https://go.dev](https://go.dev).
|
||||
|
||||
## Miller license
|
||||
|
||||
|
|
|
|||
|
|
@ -26,7 +26,7 @@ Vertical-tabular format is good for a quick look at CSV data layout -- seeing wh
|
|||
<b>wc -l data/flins.csv</b>
|
||||
</pre>
|
||||
<pre class="pre-non-highlight-in-pair">
|
||||
36635 data/flins.csv
|
||||
36635 data/flins.csv
|
||||
</pre>
|
||||
|
||||
<pre class="pre-highlight-in-pair">
|
||||
|
|
@ -227,7 +227,7 @@ Peek at the data:
|
|||
<b>wc -l data/colored-shapes.dkvp</b>
|
||||
</pre>
|
||||
<pre class="pre-non-highlight-in-pair">
|
||||
10078 data/colored-shapes.dkvp
|
||||
10078 data/colored-shapes.dkvp
|
||||
</pre>
|
||||
|
||||
<pre class="pre-highlight-in-pair">
|
||||
|
|
|
|||
2
docs/src/data/flatten-dots.csv
Normal file
2
docs/src/data/flatten-dots.csv
Normal file
|
|
@ -0,0 +1,2 @@
|
|||
a,b.,.c,.,d..e,f.g
|
||||
1,2,3,4,5,6
|
||||
|
|
|
@ -68,7 +68,7 @@ date,qoh
|
|||
<b>wc -l data/miss-date.csv</b>
|
||||
</pre>
|
||||
<pre class="pre-non-highlight-in-pair">
|
||||
1372 data/miss-date.csv
|
||||
1372 data/miss-date.csv
|
||||
</pre>
|
||||
|
||||
Since there are 1372 lines in the data file, some automation is called for. To find the missing dates, you can convert the dates to seconds since the epoch using `strptime`, then compute adjacent differences (the `cat -n` simply inserts record-counters):
|
||||
|
|
|
|||
|
|
@ -16,7 +16,7 @@ Quick links:
|
|||
</div>
|
||||
# Features
|
||||
|
||||
Miller is like awk, sed, cut, join, and sort for **name-indexed data such as
|
||||
Miller is like awk, sed, cut, join, and sort for **name-indexed data, such as
|
||||
CSV, TSV, JSON, and JSON Lines**. You get to work with your data using named
|
||||
fields, without needing to count positional column indices.
|
||||
|
||||
|
|
@ -36,9 +36,9 @@ including but not limited to the familiar CSV, TSV, JSON, and JSON Lines.
|
|||
|
||||
* Miller complements SQL **databases**: you can slice, dice, and reformat data on the client side on its way into or out of a database. (See [SQL Examples](sql-examples.md).) You can also reap some of the benefits of databases for quick, setup-free one-off tasks when you just need to query some data in disk files in a hurry.
|
||||
|
||||
* Miller also goes beyond the classic Unix tools by stepping fully into our modern, **no-SQL** world: its essential record-heterogeneity property allows Miller to operate on data where records with different schema (field names) are interleaved.
|
||||
* Miller also goes beyond the classic Unix tools by stepping fully into our modern, **no-SQL** world: its essential record-heterogeneity property allows Miller to operate on data where records with different schemas (field names) are interleaved.
|
||||
|
||||
* Miller is **streaming**: most operations need only a single record in memory at a time, rather than ingesting all input before producing any output. For those operations which require deeper retention (`sort`, `tac`, `stats1`), Miller retains only as much data as needed. This means that whenever functionally possible, you can operate on files which are larger than your system's available RAM, and you can use Miller in **tail -f** contexts.
|
||||
* Miller is **streaming**: most operations need only a single record in memory at a time, rather than ingesting all input before producing any output. For those operations that require deeper retention (`sort`, `tac`, `stats1`), Miller retains only as much data as needed. This means that whenever functionally possible, you can operate on files that are larger than your system's available RAM, and you can use Miller in **tail -f** contexts.
|
||||
|
||||
* Miller is **pipe-friendly** and interoperates with the Unix toolkit
|
||||
|
||||
|
|
@ -46,10 +46,10 @@ including but not limited to the familiar CSV, TSV, JSON, and JSON Lines.
|
|||
|
||||
* Miller does **conversion** between formats
|
||||
|
||||
* Miller's **processing is format-aware**: e.g. CSV `sort` and `tac` keep header lines first
|
||||
* Miller's **processing is format-aware**: e.g., CSV `sort` and `tac` keep header lines first
|
||||
|
||||
* Miller has high-throughput **performance** on par with the Unix toolkit
|
||||
|
||||
* Not unlike [jq](https://stedolan.github.io/jq/) (for JSON), Miller is written in Go which is a portable, modern language, and Miller has no runtime dependencies. You can download or compile a single binary, `scp` it to a faraway machine, and expect it to work.
|
||||
* Not unlike [jq](https://stedolan.github.io/jq/) (for JSON), Miller is written in Go, which is a portable, modern language, and Miller has no runtime dependencies. You can download or compile a single binary, `scp` it to a faraway machine, and expect it to work.
|
||||
|
||||
Releases and release notes: [https://github.com/johnkerl/miller/releases](https://github.com/johnkerl/miller/releases).
|
||||
|
|
|
|||
|
|
@ -1,6 +1,6 @@
|
|||
# Features
|
||||
|
||||
Miller is like awk, sed, cut, join, and sort for **name-indexed data such as
|
||||
Miller is like awk, sed, cut, join, and sort for **name-indexed data, such as
|
||||
CSV, TSV, JSON, and JSON Lines**. You get to work with your data using named
|
||||
fields, without needing to count positional column indices.
|
||||
|
||||
|
|
@ -20,9 +20,9 @@ including but not limited to the familiar CSV, TSV, JSON, and JSON Lines.
|
|||
|
||||
* Miller complements SQL **databases**: you can slice, dice, and reformat data on the client side on its way into or out of a database. (See [SQL Examples](sql-examples.md).) You can also reap some of the benefits of databases for quick, setup-free one-off tasks when you just need to query some data in disk files in a hurry.
|
||||
|
||||
* Miller also goes beyond the classic Unix tools by stepping fully into our modern, **no-SQL** world: its essential record-heterogeneity property allows Miller to operate on data where records with different schema (field names) are interleaved.
|
||||
* Miller also goes beyond the classic Unix tools by stepping fully into our modern, **no-SQL** world: its essential record-heterogeneity property allows Miller to operate on data where records with different schemas (field names) are interleaved.
|
||||
|
||||
* Miller is **streaming**: most operations need only a single record in memory at a time, rather than ingesting all input before producing any output. For those operations which require deeper retention (`sort`, `tac`, `stats1`), Miller retains only as much data as needed. This means that whenever functionally possible, you can operate on files which are larger than your system's available RAM, and you can use Miller in **tail -f** contexts.
|
||||
* Miller is **streaming**: most operations need only a single record in memory at a time, rather than ingesting all input before producing any output. For those operations that require deeper retention (`sort`, `tac`, `stats1`), Miller retains only as much data as needed. This means that whenever functionally possible, you can operate on files that are larger than your system's available RAM, and you can use Miller in **tail -f** contexts.
|
||||
|
||||
* Miller is **pipe-friendly** and interoperates with the Unix toolkit
|
||||
|
||||
|
|
@ -30,10 +30,10 @@ including but not limited to the familiar CSV, TSV, JSON, and JSON Lines.
|
|||
|
||||
* Miller does **conversion** between formats
|
||||
|
||||
* Miller's **processing is format-aware**: e.g. CSV `sort` and `tac` keep header lines first
|
||||
* Miller's **processing is format-aware**: e.g., CSV `sort` and `tac` keep header lines first
|
||||
|
||||
* Miller has high-throughput **performance** on par with the Unix toolkit
|
||||
|
||||
* Not unlike [jq](https://stedolan.github.io/jq/) (for JSON), Miller is written in Go which is a portable, modern language, and Miller has no runtime dependencies. You can download or compile a single binary, `scp` it to a faraway machine, and expect it to work.
|
||||
* Not unlike [jq](https://stedolan.github.io/jq/) (for JSON), Miller is written in Go, which is a portable, modern language, and Miller has no runtime dependencies. You can download or compile a single binary, `scp` it to a faraway machine, and expect it to work.
|
||||
|
||||
Releases and release notes: [https://github.com/johnkerl/miller/releases](https://github.com/johnkerl/miller/releases).
|
||||
|
|
|
|||
|
|
@ -20,7 +20,7 @@ Miller handles name-indexed data using several formats: some you probably know
|
|||
by name, such as CSV, TSV, JSON, and JSON Lines -- and other formats you're likely already
|
||||
seeing and using in your structured data.
|
||||
|
||||
Additionally, Miller gives you the option of including comments within your data.
|
||||
Additionally, Miller gives you the option to include comments within your data.
|
||||
|
||||
## Examples
|
||||
|
||||
|
|
@ -102,13 +102,13 @@ NIDX: implicitly numerically indexed (Unix-toolkit style)
|
|||
|
||||
## CSV/TSV/ASV/USV/etc.
|
||||
|
||||
When `mlr` is invoked with the `--csv` or `--csvlite` option, key names are found on the first record and values are taken from subsequent records. This includes the case of CSV-formatted files. See [Record Heterogeneity](record-heterogeneity.md) for how Miller handles changes of field names within a single data stream.
|
||||
When `mlr` is invoked with the `--csv` or `--csvlite` option, key names are found on the first record, and values are taken from subsequent records. This includes the case of CSV-formatted files. See [Record Heterogeneity](record-heterogeneity.md) for how Miller handles changes of field names within a single data stream.
|
||||
|
||||
Miller has record separator `RS` and field separator `FS`, just as `awk` does. (See also the [separators page](reference-main-separators.md).)
|
||||
|
||||
**CSV (comma-separated values):** Miller's `--csv` flag supports [RFC-4180 CSV](https://tools.ietf.org/html/rfc4180).
|
||||
|
||||
* This includes CRLF line-terminators by default, regardless of platform.
|
||||
* This includes CRLF line terminators by default, regardless of platform.
|
||||
* Any cell containing a comma or a carriage return within it must be double-quoted.
|
||||
|
||||
**TSV (tab-separated values):** Miller's `--tsv` supports [IANA TSV](https://www.iana.org/assignments/media-types/text/tab-separated-values).
|
||||
|
|
@ -131,8 +131,8 @@ Here are the differences between CSV and CSV-lite:
|
|||
|
||||
* CSV does not allow heterogeneous data; CSV-lite does (see also [Record Heterogeneity](record-heterogeneity.md)).
|
||||
|
||||
* TSV-lite is simply CSV-lite with field separator set to tab instead of comma.
|
||||
In particular, no encode/decode of `\r`, `\n`, `\t`, or `\\` is done.
|
||||
* TSV-lite is simply CSV-lite with the field separator set to tab instead of a comma.
|
||||
In particular, no encoding/decoding of `\r`, `\n`, `\t`, or `\\` is done.
|
||||
|
||||
* CSV-lite allows changing FS and/or RS to any values, perhaps multi-character.
|
||||
|
||||
|
|
@ -208,21 +208,21 @@ mlr: exiting due to data error.
|
|||
|
||||
CSV, TSV, CSV-lite, and TSV-lite have in common the `--implicit-csv-header` flag for input and the `--headerless-csv-output` flag for output.
|
||||
|
||||
See also the [`--lazy-quotes` flag](reference-main-flag-list.md#csv-only-flags) which can help with CSV files which are not fully compliant with RFC-4180.
|
||||
See also the [`--lazy-quotes` flag](reference-main-flag-list.md#csv-only-flags), which can help with CSV files that are not fully compliant with RFC-4180.
|
||||
|
||||
## JSON
|
||||
|
||||
[JSON](https://json.org) is a format which supports scalars (numbers, strings,
|
||||
boolean, etc.) as well as "objects" (maps) and "arrays" (lists), while Miller
|
||||
booleans, etc.) as well as "objects" (maps) and "arrays" (lists), while Miller
|
||||
is a tool for handling **tabular data** only. By *tabular JSON* I mean the
|
||||
data is either a sequence of one or more objects, or an array consisting of one
|
||||
or more objects. Miller treats JSON objects as name-indexed records.
|
||||
|
||||
This means Miller cannot (and should not) handle arbitrary JSON. In practice,
|
||||
though, Miller can handle single JSON objects as well as list of them. The only
|
||||
kinds of JSON that are unmillerable are single scalars (e.g. file contents `3`)
|
||||
and arrays of non-object (e.g. file contents `[1,2,3,4,5]`). Check out
|
||||
[jq](https://stedolan.github.io/jq/) for a tool which handles all valid JSON.
|
||||
though, Miller can handle single JSON objects as well as lists of them. The only
|
||||
kinds of JSON that are unmillerable are single scalars (e.g., file contents `3`)
|
||||
and arrays of non-objects (e.g., file contents `[1,2,3,4,5]`). Check out
|
||||
[jq](https://stedolan.github.io/jq/) for a tool that handles all valid JSON.
|
||||
|
||||
In short, if you have tabular data represented in JSON -- lists of objects,
|
||||
either with or without outermost `[...]` -- [then Miller can handle that for
|
||||
|
|
@ -336,7 +336,7 @@ input as well as output in JSON format, JSON structure is preserved throughout t
|
|||
]
|
||||
</pre>
|
||||
|
||||
But if the input format is JSON and the output format is not (or vice versa) then key-concatenation applies:
|
||||
But if the input format is JSON and the output format is not (or vice versa), then key-concatenation applies:
|
||||
|
||||
<pre class="pre-highlight-in-pair">
|
||||
<b>mlr --ijson --opprint head -n 4 data/json-example-2.json</b>
|
||||
|
|
@ -355,7 +355,7 @@ Use `--jflatsep yourseparatorhere` to specify the string used for key concatenat
|
|||
|
||||
### JSON-in-CSV
|
||||
|
||||
It's quite common to have CSV data which contains stringified JSON as a column.
|
||||
It's quite common to have CSV data that contains stringified JSON as a column.
|
||||
See the [JSON parse and stringify section](reference-main-data-types.md#json-parse-and-stringify) for ways to
|
||||
decode these in Miller.
|
||||
|
||||
|
|
@ -410,7 +410,7 @@ records; using `--ojsonl`, you get no outermost `[...]`, and one line per record
|
|||
|
||||
## PPRINT: Pretty-printed tabular
|
||||
|
||||
Miller's pretty-print format is like CSV, but column-aligned. For example, compare
|
||||
Miller's pretty-print format is similar to CSV, but with column alignment. For example, compare
|
||||
|
||||
<pre class="pre-highlight-in-pair">
|
||||
<b>mlr --ocsv cat data/small</b>
|
||||
|
|
@ -436,7 +436,7 @@ eks wye 4 0.381399 0.134188
|
|||
wye pan 5 0.573288 0.863624
|
||||
</pre>
|
||||
|
||||
Note that while Miller is a line-at-a-time processor and retains input lines in memory only where necessary (e.g. for sort), pretty-print output requires it to accumulate all input lines (so that it can compute maximum column widths) before producing any output. This has two consequences: (a) pretty-print output won't work on `tail -f` contexts, where Miller will be waiting for an end-of-file marker which never arrives; (b) pretty-print output for large files is constrained by available machine memory.
|
||||
Note that while Miller is a line-at-a-time processor and retains input lines in memory only where necessary (e.g., for sort), pretty-print output requires it to accumulate all input lines (so that it can compute maximum column widths) before producing any output. This has two consequences: (a) Pretty-print output will not work in `tail -f` contexts, where Miller will be waiting for an end-of-file marker that never arrives; (b) Pretty-print output for large files is constrained by the available machine memory.
|
||||
|
||||
See [Record Heterogeneity](record-heterogeneity.md) for how Miller handles changes of field names within a single data stream.
|
||||
|
||||
|
|
@ -505,7 +505,7 @@ Markdown format looks like this:
|
|||
| wye | pan | 5 | 0.573288 | 0.863624 |
|
||||
</pre>
|
||||
|
||||
which renders like this when dropped into various web tools (e.g. github comments):
|
||||
which renders like this when dropped into various web tools (e.g., GitHub comments):
|
||||
|
||||

|
||||
|
||||
|
|
@ -594,7 +594,7 @@ a=eks,b=wye,i=4,x=0.381399,y=0.134188
|
|||
a=wye,b=pan,i=5,x=0.573288,y=0.863624
|
||||
</pre>
|
||||
|
||||
Such data are easy to generate, e.g. in Ruby with
|
||||
Such data is easy to generate, e.g., in Ruby with
|
||||
|
||||
<pre class="pre-non-highlight-non-pair">
|
||||
puts "host=#{hostname},seconds=#{t2-t1},message=#{msg}"
|
||||
|
|
@ -616,7 +616,7 @@ logger.log("type=3,user=$USER,date=$date\n");
|
|||
|
||||
Fields lacking an IPS will have positional index (starting at 1) used as the key, as in NIDX format. For example, `dish=7,egg=8,flint` is parsed as `"dish" => "7", "egg" => "8", "3" => "flint"` and `dish,egg,flint` is parsed as `"1" => "dish", "2" => "egg", "3" => "flint"`.
|
||||
|
||||
As discussed in [Record Heterogeneity](record-heterogeneity.md), Miller handles changes of field names within the same data stream. But using DKVP format this is particularly natural. One of my favorite use-cases for Miller is in application/server logs, where I log all sorts of lines such as
|
||||
As discussed in [Record Heterogeneity](record-heterogeneity.md), Miller handles changes of field names within the same data stream. But using DKVP format, this is particularly natural. One of my favorite use-cases for Miller is in application/server logs, where I log all sorts of lines such as
|
||||
|
||||
<pre class="pre-non-highlight-non-pair">
|
||||
resource=/path/to/file,loadsec=0.45,ok=true
|
||||
|
|
@ -624,10 +624,9 @@ record_count=100, resource=/path/to/file
|
|||
resource=/some/other/path,loadsec=0.97,ok=false
|
||||
</pre>
|
||||
|
||||
etc. and I just log them as needed. Then later, I can use `grep`, `mlr --opprint group-like`, etc.
|
||||
to analyze my logs.
|
||||
etc., and I log them as needed. Then later, I can use `grep`, `mlr --opprint group-like`, etc. to analyze my logs.
|
||||
|
||||
See the [separators page](reference-main-separators.md) regarding how to specify separators other than the default equals-sign and comma.
|
||||
See the [separators page](reference-main-separators.md) regarding how to specify separators other than the default equals sign and comma.
|
||||
|
||||
## NIDX: Index-numbered (toolkit style)
|
||||
|
||||
|
|
@ -712,17 +711,17 @@ As keystroke-savers for format-conversion you may use the following.
|
|||
The letters c, t, j, l, d, n, x, p, and m refer to formats CSV, TSV, JSON, JSON Lines,
|
||||
DKVP, NIDX, XTAB, PPRINT, and markdown, respectively.
|
||||
|
||||
| In\out | CSV | TSV | JSON | JSONL | DKVP | NIDX | XTAB | PPRINT | Markdown |
|
||||
+----------+-------+-------+--------+--------+--------+--------+--------+--------+----------|
|
||||
| CSV | | --c2t | --c2j | --c2l | --c2d | --c2n | --c2x | --c2p | --c2m |
|
||||
| TSV | --t2c | | --t2j | --t2l | --t2d | --t2n | --t2x | --t2p | --t2m |
|
||||
| JSON | --j2c | --j2t | | --j2l | --j2d | --j2n | --j2x | --j2p | --j2m |
|
||||
| JSONL | --l2c | --l2t | | | --l2d | --l2n | --l2x | --l2p | --l2m |
|
||||
| DKVP | --d2c | --d2t | --d2j | --d2l | | --d2n | --d2x | --d2p | --d2m |
|
||||
| NIDX | --n2c | --n2t | --n2j | --n2l | --n2d | | --n2x | --n2p | --n2m |
|
||||
| XTAB | --x2c | --x2t | --x2j | --x2l | --x2d | --x2n | | --x2p | --x2m |
|
||||
| PPRINT | --p2c | --p2t | --p2j | --p2l | --p2d | --p2n | --p2x | | --p2m |
|
||||
| Markdown | --m2c | --m2t | --m2j | --m2l | --m2d | --m2n | --m2x | --m2p | |
|
||||
| In\out | CSV | TSV | JSON | JSONL | DKVP | NIDX | XTAB | PPRINT | Markdown |
|
||||
+----------+----------+----------+----------+-------+-------+-------+-------+--------+----------|
|
||||
| CSV | --c2c,-c | --c2t | --c2j | --c2l | --c2d | --c2n | --c2x | --c2p | --c2m |
|
||||
| TSV | --t2c | --t2t,-t | --t2j | --t2l | --t2d | --t2n | --t2x | --t2p | --t2m |
|
||||
| JSON | --j2c | --j2t | --j2j,-j | --j2l | --j2d | --j2n | --j2x | --j2p | --j2m |
|
||||
| JSONL | --l2c | --l2t | --l2j | --l2l | --l2d | --l2n | --l2x | --l2p | --l2m |
|
||||
| DKVP | --d2c | --d2t | --d2j | --d2l | --d2d | --d2n | --d2x | --d2p | --d2m |
|
||||
| NIDX | --n2c | --n2t | --n2j | --n2l | --n2d | --n2n | --n2x | --n2p | --n2m |
|
||||
| XTAB | --x2c | --x2t | --x2j | --x2l | --x2d | --x2n | --x2x | --x2p | --x2m |
|
||||
| PPRINT   | --p2c    | --p2t    | --p2j    | --p2l | --p2d | --p2n | --p2x | --p2p  | --p2m    |
|
||||
| Markdown | --m2c | --m2t | --m2j | --m2l | --m2d | --m2n | --m2x | --m2p | |
|
||||
|
||||
-p Keystroke-saver for `--nidx --fs space --repifs`.
|
||||
-T Keystroke-saver for `--nidx --fs tab`.
|
||||
|
|
@ -730,7 +729,7 @@ JSON, JSON Lines, XTAB, PPRINT, and markdown, respectively.
|
|||
|
||||
## Comments in data
|
||||
|
||||
You can include comments within your data files, and either have them ignored, or passed directly through to the standard output as soon as they are encountered:
|
||||
You can include comments within your data files, and either have them ignored or passed directly through to the standard output as soon as they are encountered:
|
||||
|
||||
<pre class="pre-highlight-in-pair">
|
||||
<b>mlr help comments-in-data-flags</b>
|
||||
|
|
@ -758,12 +757,14 @@ Notes:
|
|||
within the input.
|
||||
--pass-comments-with {string}
|
||||
Immediately print commented lines within input, with
|
||||
specified prefix.
|
||||
specified prefix. For CSV input format, the prefix
|
||||
must be a single character.
|
||||
--skip-comments Ignore commented lines (prefixed by `#`) within the
|
||||
input.
|
||||
--skip-comments-with {string}
|
||||
Ignore commented lines within input, with specified
|
||||
prefix.
|
||||
prefix. For CSV input format, the prefix must be a
|
||||
single character.
|
||||
</pre>
|
||||
|
||||
Examples:
|
||||
|
|
|
|||
|
|
@ -4,7 +4,7 @@ Miller handles name-indexed data using several formats: some you probably know
|
|||
by name, such as CSV, TSV, JSON, and JSON Lines -- and other formats you're likely already
|
||||
seeing and using in your structured data.
|
||||
|
||||
Additionally, Miller gives you the option of including comments within your data.
|
||||
Additionally, Miller gives you the option to include comments within your data.
|
||||
|
||||
## Examples
|
||||
|
||||
|
|
@ -14,13 +14,13 @@ GENMD-EOF
|
|||
|
||||
## CSV/TSV/ASV/USV/etc.
|
||||
|
||||
When `mlr` is invoked with the `--csv` or `--csvlite` option, key names are found on the first record and values are taken from subsequent records. This includes the case of CSV-formatted files. See [Record Heterogeneity](record-heterogeneity.md) for how Miller handles changes of field names within a single data stream.
|
||||
When `mlr` is invoked with the `--csv` or `--csvlite` option, key names are found on the first record, and values are taken from subsequent records. This includes the case of CSV-formatted files. See [Record Heterogeneity](record-heterogeneity.md) for how Miller handles changes of field names within a single data stream.
|
||||
|
||||
Miller has record separator `RS` and field separator `FS`, just as `awk` does. (See also the [separators page](reference-main-separators.md).)
|
||||
|
||||
**CSV (comma-separated values):** Miller's `--csv` flag supports [RFC-4180 CSV](https://tools.ietf.org/html/rfc4180).
|
||||
|
||||
* This includes CRLF line-terminators by default, regardless of platform.
|
||||
* This includes CRLF line terminators by default, regardless of platform.
|
||||
* Any cell containing a comma or a carriage return within it must be double-quoted.
|
||||
|
||||
**TSV (tab-separated values):** Miller's `--tsv` supports [IANA TSV](https://www.iana.org/assignments/media-types/text/tab-separated-values).
|
||||
|
|
@ -43,8 +43,8 @@ Here are the differences between CSV and CSV-lite:
|
|||
|
||||
* CSV does not allow heterogeneous data; CSV-lite does (see also [Record Heterogeneity](record-heterogeneity.md)).
|
||||
|
||||
* TSV-lite is simply CSV-lite with field separator set to tab instead of comma.
|
||||
In particular, no encode/decode of `\r`, `\n`, `\t`, or `\\` is done.
|
||||
* TSV-lite is simply CSV-lite with the field separator set to tab instead of a comma.
|
||||
In particular, no encoding/decoding of `\r`, `\n`, `\t`, or `\\` is done.
|
||||
|
||||
* CSV-lite allows changing FS and/or RS to any values, perhaps multi-character.
|
||||
|
||||
|
|
@ -77,21 +77,21 @@ GENMD-EOF
|
|||
|
||||
CSV, TSV, CSV-lite, and TSV-lite have in common the `--implicit-csv-header` flag for input and the `--headerless-csv-output` flag for output.
|
||||
|
||||
See also the [`--lazy-quotes` flag](reference-main-flag-list.md#csv-only-flags) which can help with CSV files which are not fully compliant with RFC-4180.
|
||||
See also the [`--lazy-quotes` flag](reference-main-flag-list.md#csv-only-flags), which can help with CSV files that are not fully compliant with RFC-4180.
|
||||
|
||||
## JSON
|
||||
|
||||
[JSON](https://json.org) is a format which supports scalars (numbers, strings,
|
||||
boolean, etc.) as well as "objects" (maps) and "arrays" (lists), while Miller
|
||||
booleans, etc.) as well as "objects" (maps) and "arrays" (lists), while Miller
|
||||
is a tool for handling **tabular data** only. By *tabular JSON* I mean the
|
||||
data is either a sequence of one or more objects, or an array consisting of one
|
||||
or more objects. Miller treats JSON objects as name-indexed records.
|
||||
|
||||
This means Miller cannot (and should not) handle arbitrary JSON. In practice,
|
||||
though, Miller can handle single JSON objects as well as list of them. The only
|
||||
kinds of JSON that are unmillerable are single scalars (e.g. file contents `3`)
|
||||
and arrays of non-object (e.g. file contents `[1,2,3,4,5]`). Check out
|
||||
[jq](https://stedolan.github.io/jq/) for a tool which handles all valid JSON.
|
||||
though, Miller can handle single JSON objects as well as lists of them. The only
|
||||
kinds of JSON that are unmillerable are single scalars (e.g., file contents `3`)
|
||||
and arrays of non-objects (e.g., file contents `[1,2,3,4,5]`). Check out
|
||||
[jq](https://stedolan.github.io/jq/) for a tool that handles all valid JSON.
|
||||
|
||||
In short, if you have tabular data represented in JSON -- lists of objects,
|
||||
either with or without outermost `[...]` -- [then Miller can handle that for
|
||||
|
|
@ -129,7 +129,7 @@ GENMD-RUN-COMMAND
|
|||
mlr --json head -n 2 data/json-example-2.json
|
||||
GENMD-EOF
|
||||
|
||||
But if the input format is JSON and the output format is not (or vice versa) then key-concatenation applies:
|
||||
But if the input format is JSON and the output format is not (or vice versa), then key-concatenation applies:
|
||||
|
||||
GENMD-RUN-COMMAND
|
||||
mlr --ijson --opprint head -n 4 data/json-example-2.json
|
||||
|
|
@ -141,7 +141,7 @@ Use `--jflatsep yourseparatorhere` to specify the string used for key concatenat
|
|||
|
||||
### JSON-in-CSV
|
||||
|
||||
It's quite common to have CSV data which contains stringified JSON as a column.
|
||||
It's quite common to have CSV data that contains stringified JSON as a column.
|
||||
See the [JSON parse and stringify section](reference-main-data-types.md#json-parse-and-stringify) for ways to
|
||||
decode these in Miller.
|
||||
|
||||
|
|
@ -170,7 +170,7 @@ records; using `--ojsonl`, you get no outermost `[...]`, and one line per record
|
|||
|
||||
## PPRINT: Pretty-printed tabular
|
||||
|
||||
Miller's pretty-print format is like CSV, but column-aligned. For example, compare
|
||||
Miller's pretty-print format is similar to CSV, but with column alignment. For example, compare
|
||||
|
||||
GENMD-RUN-COMMAND
|
||||
mlr --ocsv cat data/small
|
||||
|
|
@ -180,7 +180,7 @@ GENMD-RUN-COMMAND
|
|||
mlr --opprint cat data/small
|
||||
GENMD-EOF
|
||||
|
||||
Note that while Miller is a line-at-a-time processor and retains input lines in memory only where necessary (e.g. for sort), pretty-print output requires it to accumulate all input lines (so that it can compute maximum column widths) before producing any output. This has two consequences: (a) pretty-print output won't work on `tail -f` contexts, where Miller will be waiting for an end-of-file marker which never arrives; (b) pretty-print output for large files is constrained by available machine memory.
|
||||
Note that while Miller is a line-at-a-time processor and retains input lines in memory only where necessary (e.g., for sort), pretty-print output requires it to accumulate all input lines (so that it can compute maximum column widths) before producing any output. This has two consequences: (a) Pretty-print output will not work in `tail -f` contexts, where Miller will be waiting for an end-of-file marker that never arrives; (b) Pretty-print output for large files is constrained by the available machine memory.
|
||||
|
||||
See [Record Heterogeneity](record-heterogeneity.md) for how Miller handles changes of field names within a single data stream.
|
||||
|
||||
|
|
@ -204,7 +204,7 @@ GENMD-RUN-COMMAND
|
|||
mlr --omd cat data/small
|
||||
GENMD-EOF
|
||||
|
||||
which renders like this when dropped into various web tools (e.g. github comments):
|
||||
which renders like this when dropped into various web tools (e.g., GitHub comments):
|
||||
|
||||

|
||||
|
||||
|
|
@ -280,7 +280,7 @@ GENMD-RUN-COMMAND
|
|||
mlr cat data/small
|
||||
GENMD-EOF
|
||||
|
||||
Such data are easy to generate, e.g. in Ruby with
|
||||
Such data is easy to generate, e.g., in Ruby with
|
||||
|
||||
GENMD-CARDIFY
|
||||
puts "host=#{hostname},seconds=#{t2-t1},message=#{msg}"
|
||||
|
|
@ -302,7 +302,7 @@ GENMD-EOF
|
|||
|
||||
Fields lacking an IPS will have positional index (starting at 1) used as the key, as in NIDX format. For example, `dish=7,egg=8,flint` is parsed as `"dish" => "7", "egg" => "8", "3" => "flint"` and `dish,egg,flint` is parsed as `"1" => "dish", "2" => "egg", "3" => "flint"`.
|
||||
|
||||
As discussed in [Record Heterogeneity](record-heterogeneity.md), Miller handles changes of field names within the same data stream. But using DKVP format this is particularly natural. One of my favorite use-cases for Miller is in application/server logs, where I log all sorts of lines such as
|
||||
As discussed in [Record Heterogeneity](record-heterogeneity.md), Miller handles changes of field names within the same data stream. But using DKVP format, this is particularly natural. One of my favorite use-cases for Miller is in application/server logs, where I log all sorts of lines such as
|
||||
|
||||
GENMD-CARDIFY
|
||||
resource=/path/to/file,loadsec=0.45,ok=true
|
||||
|
|
@ -310,10 +310,9 @@ record_count=100, resource=/path/to/file
|
|||
resource=/some/other/path,loadsec=0.97,ok=false
|
||||
GENMD-EOF
|
||||
|
||||
etc. and I just log them as needed. Then later, I can use `grep`, `mlr --opprint group-like`, etc.
|
||||
to analyze my logs.
|
||||
etc., and I log them as needed. Then later, I can use `grep`, `mlr --opprint group-like`, etc. to analyze my logs.
|
||||
|
||||
See the [separators page](reference-main-separators.md) regarding how to specify separators other than the default equals-sign and comma.
|
||||
See the [separators page](reference-main-separators.md) regarding how to specify separators other than the default equals sign and comma.
|
||||
|
||||
## NIDX: Index-numbered (toolkit style)
|
||||
|
||||
|
|
@ -361,7 +360,7 @@ GENMD-EOF
|
|||
|
||||
## Comments in data
|
||||
|
||||
You can include comments within your data files, and either have them ignored, or passed directly through to the standard output as soon as they are encountered:
|
||||
You can include comments within your data files, and either have them ignored or passed directly through to the standard output as soon as they are encountered:
|
||||
|
||||
GENMD-RUN-COMMAND
|
||||
mlr help comments-in-data-flags
|
||||
|
|
|
|||
|
|
@ -348,6 +348,50 @@ a.1,a.3,a.5
|
|||
]
|
||||
</pre>
|
||||
|
||||
## Non-inferencing cases
|
||||
|
||||
An additional heuristic is that if a field name starts with a `.`, ends with
|
||||
a `.`, or has two or more consecutive `.` characters, no attempt is made
|
||||
to unflatten it on conversion from non-JSON to JSON.
|
||||
|
||||
<pre class="pre-highlight-in-pair">
|
||||
<b>cat data/flatten-dots.csv</b>
|
||||
</pre>
|
||||
<pre class="pre-non-highlight-in-pair">
|
||||
a,b.,.c,.,d..e,f.g
|
||||
1,2,3,4,5,6
|
||||
</pre>
|
||||
|
||||
<pre class="pre-highlight-in-pair">
|
||||
<b>mlr --icsv --oxtab cat data/flatten-dots.csv</b>
|
||||
</pre>
|
||||
<pre class="pre-non-highlight-in-pair">
|
||||
a 1
|
||||
b. 2
|
||||
.c 3
|
||||
. 4
|
||||
d..e 5
|
||||
f.g 6
|
||||
</pre>
|
||||
|
||||
<pre class="pre-highlight-in-pair">
|
||||
<b>mlr --icsv --ojson cat data/flatten-dots.csv</b>
|
||||
</pre>
|
||||
<pre class="pre-non-highlight-in-pair">
|
||||
[
|
||||
{
|
||||
"a": 1,
|
||||
"b.": 2,
|
||||
".c": 3,
|
||||
".": 4,
|
||||
"d..e": 5,
|
||||
"f": {
|
||||
"g": 6
|
||||
}
|
||||
}
|
||||
]
|
||||
</pre>
|
||||
|
||||
## Manual control
|
||||
|
||||
To see what our options are for manually controlling flattening and
|
||||
|
|
|
|||
|
|
@ -156,6 +156,24 @@ GENMD-RUN-COMMAND
|
|||
mlr --c2j cat data/non-consecutive.csv
|
||||
GENMD-EOF
|
||||
|
||||
## Non-inferencing cases
|
||||
|
||||
An additional heuristic is that if a field name starts with a `.`, ends with
|
||||
a `.`, or has two or more consecutive `.` characters, no attempt is made
|
||||
to unflatten it on conversion from non-JSON to JSON.
|
||||
|
||||
GENMD-RUN-COMMAND
|
||||
cat data/flatten-dots.csv
|
||||
GENMD-EOF
|
||||
|
||||
GENMD-RUN-COMMAND
|
||||
mlr --icsv --oxtab cat data/flatten-dots.csv
|
||||
GENMD-EOF
|
||||
|
||||
GENMD-RUN-COMMAND
|
||||
mlr --icsv --ojson cat data/flatten-dots.csv
|
||||
GENMD-EOF
|
||||
|
||||
## Manual control
|
||||
|
||||
To see what our options are for manually controlling flattening and
|
||||
|
|
|
|||
|
|
@ -40,7 +40,7 @@ In this example I am using version 6.2.0 to 6.3.0; of course that will change fo
|
|||
* This creates `miller-6.3.0.tar.gz` which we'll upload to GitHub, the URL of which will be in our `miller.spec`
|
||||
* Prepare the source RPM following [README-RPM.md](https://github.com/johnkerl/miller/blob/main/README-RPM.md).
|
||||
|
||||
* Create the Github release tag:
|
||||
* Create the GitHub release tag:
|
||||
|
||||
* Don't forget the `v` in `v6.3.0`
|
||||
* Write the release notes -- save as a pre-release until below
|
||||
|
|
@ -48,7 +48,7 @@ In this example I am using version 6.2.0 to 6.3.0; of course that will change fo
|
|||
* Thanks to [PR 822](https://github.com/johnkerl/miller/pull/822) which introduces [goreleaser](https://github.com/johnkerl/miller/blob/main/.goreleaser.yml) there are versions for many platforms auto-built and auto-attached to the GitHub release.
|
||||
* Attach the release tarball and SRPM. Double-check assets were successfully uploaded.
|
||||
* Publish the release in pre-release mode, until all CI jobs finish successfully. Note that goreleaser will create and attach the rest of the binaries.
|
||||
* Before marking the release as public, download an executable from among the generated binaries and make sure its `mlr version` prints what you expect -- else, restart this process.
|
||||
* Before marking the release as public, download an executable from among the generated binaries and make sure its `mlr version` prints what you expect -- else, restart this process. On macOS, run `xattr -d com.apple.quarantine ./mlr` first.
|
||||
* Then mark the release as public.
|
||||
|
||||
* Build the release-specific docs:
|
||||
|
|
|
|||
|
|
@ -24,7 +24,7 @@ In this example I am using version 6.2.0 to 6.3.0; of course that will change fo
|
|||
* This creates `miller-6.3.0.tar.gz` which we'll upload to GitHub, the URL of which will be in our `miller.spec`
|
||||
* Prepare the source RPM following [README-RPM.md](https://github.com/johnkerl/miller/blob/main/README-RPM.md).
|
||||
|
||||
* Create the Github release tag:
|
||||
* Create the GitHub release tag:
|
||||
|
||||
* Don't forget the `v` in `v6.3.0`
|
||||
* Write the release notes -- save as a pre-release until below
|
||||
|
|
@ -32,7 +32,7 @@ In this example I am using version 6.2.0 to 6.3.0; of course that will change fo
|
|||
* Thanks to [PR 822](https://github.com/johnkerl/miller/pull/822) which introduces [goreleaser](https://github.com/johnkerl/miller/blob/main/.goreleaser.yml) there are versions for many platforms auto-built and auto-attached to the GitHub release.
|
||||
* Attach the release tarball and SRPM. Double-check assets were successfully uploaded.
|
||||
* Publish the release in pre-release mode, until all CI jobs finish successfully. Note that goreleaser will create and attach the rest of the binaries.
|
||||
* Before marking the release as public, download an executable from among the generated binaries and make sure its `mlr version` prints what you expect -- else, restart this process.
|
||||
* Before marking the release as public, download an executable from among the generated binaries and make sure its `mlr version` prints what you expect -- else, restart this process. On macOS, run `xattr -d com.apple.quarantine ./mlr` first.
|
||||
* Then mark the release as public.
|
||||
|
||||
* Build the release-specific docs:
|
||||
|
|
|
|||
|
|
@ -16,20 +16,20 @@ Quick links:
|
|||
</div>
|
||||
# Introduction
|
||||
|
||||
**Miller is a command-line tool for querying, shaping, and reformatting data files in various formats including CSV, TSV, JSON, and JSON Lines.**
|
||||
**Miller is a command-line tool for querying, shaping, and reformatting data files in various formats, including CSV, TSV, JSON, and JSON Lines.**
|
||||
|
||||
**The big picture:** Even well into the 21st century, our world is full of text-formatted data like CSV. Google _CSV memes_, for example. We need tooling to _thrive in this world_, nimbly manipulating data which is in CSVs. And we need tooling to _move beyond CSV_, to be able to pull data out and into other storage and processing systems. Miller is designed for both these goals.
|
||||
**The big picture:** Even well into the 21st century, our world is full of text-formatted data such as CSV. Google _CSV memes_, for example. We need tooling to _thrive in this world_, nimbly manipulating data which is in CSVs. And we need tooling to _move beyond CSV_, to be able to pull data out and into other storage and processing systems. Miller is designed for both of these goals.
|
||||
|
||||
In several senses, Miller is more than one tool:
|
||||
|
||||
**Format conversion:** You can convert CSV files to JSON, or vice versa, or
|
||||
pretty-print your data horizontally or vertically to make it easier to read.
|
||||
|
||||
**Data manipulation:** With a few keystrokes you can remove columns you don't care about -- or, make new ones.
|
||||
**Data manipulation:** With a few keystrokes, you can remove columns you don't care about -- or make new ones.
|
||||
|
||||
**Pre-processing/post-processing vs standalone use:** You can use Miller to clean data files and put them into standard formats, perhaps in preparation to load them into a database or a hands-off data-processing pipeline. Or you can use it post-process and summary database-query output. As well, you can use Miller to explore and analyze your data interactively.
|
||||
**Pre-processing/post-processing vs standalone use:** You can use Miller to clean data files and put them into standard formats, perhaps in preparation for loading them into a database or a hands-off data-processing pipeline. Or you can use it to post-process and summarize database-query output. As well, you can use Miller to explore and analyze your data interactively.
|
||||
|
||||
**Compact verbs vs programming language:** For low-keystroking you can do things like
|
||||
**Compact verbs vs programming language:** For low-keystroking, you can do things like
|
||||
|
||||
<pre class="pre-highlight-non-pair">
|
||||
<b>mlr --csv sort -f name input.csv</b>
|
||||
|
|
@ -39,16 +39,16 @@ pretty-print your data horizontally or vertically to make it easier to read.
|
|||
<b>mlr --json head -n 1 myfile.json</b>
|
||||
</pre>
|
||||
|
||||
The `sort`, `head`, etc are called *verbs*. They're analogs of familiar command-line tools like `sort`, `head`, and so on -- but they're aware of name-indexed, multi-line file formats like CSV, TSV, and JSON. In addition, though, using Miller's `put` verb you can use programming-language statements for expressions like
|
||||
The `sort`, `head`, etc., are called *verbs*. They're analogs of familiar command-line tools like `sort`, `head`, and so on -- but they're aware of name-indexed, multi-line file formats like CSV, TSV, and JSON. In addition, though, using Miller's `put` verb, you can use programming-language statements for expressions like
|
||||
|
||||
<pre class="pre-highlight-non-pair">
|
||||
<b>mlr --csv put '$rate = $units / $seconds' input.csv</b>
|
||||
</pre>
|
||||
|
||||
which allow you to succinctly express your own logic.
|
||||
which allow you to express your own logic succinctly.
|
||||
|
||||
**Multiple domains:** People use Miller for data analysis, data science, software engineering, devops/system-administration, journalism, scientific research, and more.
|
||||
|
||||
In the following you can see how CSV, TSV, tabular, JSON, and other **file formats** share a common theme which is **lists of key-value-pairs**. Miller embraces this common theme.
|
||||
In the following, you can see how CSV, TSV, tabular, JSON, and other **file formats** share a common theme which is **lists of key-value-pairs**. Miller embraces this common theme.
|
||||
|
||||

|
||||
|
|
|
|||
|
|
@ -1,19 +1,19 @@
|
|||
# Introduction
|
||||
|
||||
**Miller is a command-line tool for querying, shaping, and reformatting data files in various formats including CSV, TSV, JSON, and JSON Lines.**
|
||||
**Miller is a command-line tool for querying, shaping, and reformatting data files in various formats, including CSV, TSV, JSON, and JSON Lines.**
|
||||
|
||||
**The big picture:** Even well into the 21st century, our world is full of text-formatted data like CSV. Google _CSV memes_, for example. We need tooling to _thrive in this world_, nimbly manipulating data which is in CSVs. And we need tooling to _move beyond CSV_, to be able to pull data out and into other storage and processing systems. Miller is designed for both these goals.
|
||||
**The big picture:** Even well into the 21st century, our world is full of text-formatted data such as CSV. Google _CSV memes_, for example. We need tooling to _thrive in this world_, nimbly manipulating data which is in CSVs. And we need tooling to _move beyond CSV_, to be able to pull data out and into other storage and processing systems. Miller is designed for both of these goals.
|
||||
|
||||
In several senses, Miller is more than one tool:
|
||||
|
||||
**Format conversion:** You can convert CSV files to JSON, or vice versa, or
|
||||
pretty-print your data horizontally or vertically to make it easier to read.
|
||||
|
||||
**Data manipulation:** With a few keystrokes you can remove columns you don't care about -- or, make new ones.
|
||||
**Data manipulation:** With a few keystrokes, you can remove columns you don't care about -- or make new ones.
|
||||
|
||||
**Pre-processing/post-processing vs standalone use:** You can use Miller to clean data files and put them into standard formats, perhaps in preparation to load them into a database or a hands-off data-processing pipeline. Or you can use it post-process and summary database-query output. As well, you can use Miller to explore and analyze your data interactively.
|
||||
**Pre-processing/post-processing vs standalone use:** You can use Miller to clean data files and put them into standard formats, perhaps in preparation for loading them into a database or a hands-off data-processing pipeline. Or you can use it to post-process and summarize database-query output. As well, you can use Miller to explore and analyze your data interactively.
|
||||
|
||||
**Compact verbs vs programming language:** For low-keystroking you can do things like
|
||||
**Compact verbs vs programming language:** For low-keystroking, you can do things like
|
||||
|
||||
GENMD-SHOW-COMMAND
|
||||
mlr --csv sort -f name input.csv
|
||||
|
|
@ -23,16 +23,16 @@ GENMD-SHOW-COMMAND
|
|||
mlr --json head -n 1 myfile.json
|
||||
GENMD-EOF
|
||||
|
||||
The `sort`, `head`, etc are called *verbs*. They're analogs of familiar command-line tools like `sort`, `head`, and so on -- but they're aware of name-indexed, multi-line file formats like CSV, TSV, and JSON. In addition, though, using Miller's `put` verb you can use programming-language statements for expressions like
|
||||
The `sort`, `head`, etc., are called *verbs*. They're analogs of familiar command-line tools like `sort`, `head`, and so on -- but they're aware of name-indexed, multi-line file formats like CSV, TSV, and JSON. In addition, though, using Miller's `put` verb, you can use programming-language statements for expressions like
|
||||
|
||||
GENMD-SHOW-COMMAND
|
||||
mlr --csv put '$rate = $units / $seconds' input.csv
|
||||
GENMD-EOF
|
||||
|
||||
which allow you to succinctly express your own logic.
|
||||
which allow you to express your own logic succinctly.
|
||||
|
||||
**Multiple domains:** People use Miller for data analysis, data science, software engineering, devops/system-administration, journalism, scientific research, and more.
|
||||
|
||||
In the following you can see how CSV, TSV, tabular, JSON, and other **file formats** share a common theme which is **lists of key-value-pairs**. Miller embraces this common theme.
|
||||
In the following, you can see how CSV, TSV, tabular, JSON, and other **file formats** share a common theme which is **lists of key-value-pairs**. Miller embraces this common theme.
|
||||
|
||||

|
||||
|
|
|
|||
|
|
@ -21,7 +21,7 @@ You can install Miller for various platforms as follows.
|
|||
Download a binary:
|
||||
|
||||
* You can get binaries for several platforms on the [releases page](https://github.com/johnkerl/miller/releases).
|
||||
* You can get latest (head) builds for Linux, MacOS, and Windows by visiting [https://github.com/johnkerl/miller/actions](https://github.com/johnkerl/miller/actions), selecting the latest build, and clicking _Artifacts_. (These are retained for 5 days after each commit.)
|
||||
* You can get the latest (head) builds for Linux, MacOS, and Windows by visiting [https://github.com/johnkerl/miller/actions](https://github.com/johnkerl/miller/actions), selecting the latest build, and clicking _Artifacts_. (These are retained for 5 days after each commit.)
|
||||
* See also the [build page](build.md) if you prefer to build from source.
|
||||
|
||||
Using a package manager:
|
||||
|
|
@ -30,6 +30,7 @@ Using a package manager:
|
|||
* MacOS: `brew update` and `brew install miller`, or `sudo port selfupdate` and `sudo port install miller`, depending on your preference of [Homebrew](https://brew.sh) or [MacPorts](https://macports.org).
|
||||
* Windows: `choco install miller` using [Chocolatey](https://chocolatey.org).
|
||||
* Note: Miller 6 was released 2022-01-09; [several platforms](https://github.com/johnkerl/miller/blob/main/README-versions.md) may have Miller 5 available.
|
||||
* As of Miller 6.16.0, you can do `snap install miller`. Note, however, that the executable is named `miller`, _not_ `mlr`. See also [https://snapcraft.io/miller](https://snapcraft.io/miller).
|
||||
|
||||
See also:
|
||||
|
||||
|
|
@ -37,7 +38,7 @@ See also:
|
|||
* [@jauderho](https://github.com/jauderho)'s [docker images](https://hub.docker.com/r/jauderho/miller/tags) as discussed in [GitHub Discussions](https://github.com/johnkerl/miller/discussions/851#discussioncomment-1943255)
|
||||
* Example invocation: `docker run --rm -i jauderho/miller:latest --csv sort -f shape < ./example.csv`
|
||||
|
||||
Note that the [Miller releases page](https://github.com/johnkerl/miller/releases), `brew`, `macports`, `chocolatey`, and `conda` tend to have current versions; `yum` and `apt-get` may have outdate versions depending on your platform.
|
||||
Note that the [Miller releases page](https://github.com/johnkerl/miller/releases), `brew`, `macports`, `chocolatey`, and `conda` tend to have current versions; `yum` and `apt-get` may have outdated versions depending on your platform.
|
||||
|
||||
As a first check, you should be able to run `mlr --version` at your system's command prompt and see something like the following:
|
||||
|
||||
|
|
@ -50,7 +51,7 @@ mlr 6.0.0
|
|||
|
||||
A note on documentation:
|
||||
|
||||
* If you downloaded the Miller binary from a tagged release, or installed it using a package manager, you should see a version like `mlr 6.0.0` or `mlr 5.10.3` -- please see the [release docs page](release-docs.md) to find the documentation for your version.
|
||||
* If you downloaded the Miller binary from a tagged release or installed it using a package manager, you should see a version like `mlr 6.0.0` or `mlr 5.10.3` -- please see the [release docs page](release-docs.md) to find the documentation for your version.
|
||||
* If you installed from source or using a recent build artifact from GitHub Actions, you should see a version like `mlr 6.0.0-dev` -- [https://miller.readthedocs.io](https://miller.readthedocs.io) is the correct reference, since it contains information for the latest contributions to the [Miller repository](https://github.com/johnkerl/miller).
|
||||
|
||||
As a second check, given [example.csv](./example.csv) you should be able to do
|
||||
|
|
@ -89,6 +90,6 @@ yellow circle true 9 87 63.5058 8.3350
|
|||
purple square false 10 91 72.3735 8.2430
|
||||
</pre>
|
||||
|
||||
If you run into issues on these checks, please check out the resources on the [community page](community.md) for help.
|
||||
If you encounter issues with these checks, please refer to the resources on the [community page](community.md) for help.
|
||||
|
||||
Otherwise, let's go on to [Miller in 10 minutes](10min.md)!
|
||||
|
|
|
|||
|
|
@ -5,7 +5,7 @@ You can install Miller for various platforms as follows.
|
|||
Download a binary:
|
||||
|
||||
* You can get binaries for several platforms on the [releases page](https://github.com/johnkerl/miller/releases).
|
||||
* You can get latest (head) builds for Linux, MacOS, and Windows by visiting [https://github.com/johnkerl/miller/actions](https://github.com/johnkerl/miller/actions), selecting the latest build, and clicking _Artifacts_. (These are retained for 5 days after each commit.)
|
||||
* You can get the latest (head) builds for Linux, MacOS, and Windows by visiting [https://github.com/johnkerl/miller/actions](https://github.com/johnkerl/miller/actions), selecting the latest build, and clicking _Artifacts_. (These are retained for 5 days after each commit.)
|
||||
* See also the [build page](build.md) if you prefer to build from source.
|
||||
|
||||
Using a package manager:
|
||||
|
|
@ -14,6 +14,7 @@ Using a package manager:
|
|||
* MacOS: `brew update` and `brew install miller`, or `sudo port selfupdate` and `sudo port install miller`, depending on your preference of [Homebrew](https://brew.sh) or [MacPorts](https://macports.org).
|
||||
* Windows: `choco install miller` using [Chocolatey](https://chocolatey.org).
|
||||
* Note: Miller 6 was released 2022-01-09; [several platforms](https://github.com/johnkerl/miller/blob/main/README-versions.md) may have Miller 5 available.
|
||||
* As of Miller 6.16.0, you can do `snap install miller`. Note, however, that the executable is named `miller`, _not_ `mlr`. See also [https://snapcraft.io/miller](https://snapcraft.io/miller).
|
||||
|
||||
See also:
|
||||
|
||||
|
|
@ -21,7 +22,7 @@ See also:
|
|||
* [@jauderho](https://github.com/jauderho)'s [docker images](https://hub.docker.com/r/jauderho/miller/tags) as discussed in [GitHub Discussions](https://github.com/johnkerl/miller/discussions/851#discussioncomment-1943255)
|
||||
* Example invocation: `docker run --rm -i jauderho/miller:latest --csv sort -f shape < ./example.csv`
|
||||
|
||||
Note that the [Miller releases page](https://github.com/johnkerl/miller/releases), `brew`, `macports`, `chocolatey`, and `conda` tend to have current versions; `yum` and `apt-get` may have outdate versions depending on your platform.
|
||||
Note that the [Miller releases page](https://github.com/johnkerl/miller/releases), `brew`, `macports`, `chocolatey`, and `conda` tend to have current versions; `yum` and `apt-get` may have outdated versions depending on your platform.
|
||||
|
||||
As a first check, you should be able to run `mlr --version` at your system's command prompt and see something like the following:
|
||||
|
||||
|
|
@ -32,7 +33,7 @@ GENMD-EOF
|
|||
|
||||
A note on documentation:
|
||||
|
||||
* If you downloaded the Miller binary from a tagged release, or installed it using a package manager, you should see a version like `mlr 6.0.0` or `mlr 5.10.3` -- please see the [release docs page](release-docs.md) to find the documentation for your version.
|
||||
* If you downloaded the Miller binary from a tagged release or installed it using a package manager, you should see a version like `mlr 6.0.0` or `mlr 5.10.3` -- please see the [release docs page](release-docs.md) to find the documentation for your version.
|
||||
* If you installed from source or using a recent build artifact from GitHub Actions, you should see a version like `mlr 6.0.0-dev` -- [https://miller.readthedocs.io](https://miller.readthedocs.io) is the correct reference, since it contains information for the latest contributions to the [Miller repository](https://github.com/johnkerl/miller).
|
||||
|
||||
As a second check, given [example.csv](./example.csv) you should be able to do
|
||||
|
|
@ -45,6 +46,6 @@ GENMD-RUN-COMMAND
|
|||
mlr --icsv --opprint cat example.csv
|
||||
GENMD-EOF
|
||||
|
||||
If you run into issues on these checks, please check out the resources on the [community page](community.md) for help.
|
||||
If you encounter issues with these checks, please refer to the resources on the [community page](community.md) for help.
|
||||
|
||||
Otherwise, let's go on to [Miller in 10 minutes](10min.md)!
|
||||
|
|
|
|||
|
|
@ -18,7 +18,7 @@ Quick links:
|
|||
|
||||
## Short format specifiers, including --c2p
|
||||
|
||||
In our examples so far we've often made use of `mlr --icsv --opprint` or `mlr --icsv --ojson`. These are such frequently occurring patterns that they have short options like `--c2p` and `--c2j`:
|
||||
In our examples so far, we've often made use of `mlr --icsv --opprint` or `mlr --icsv --ojson`. These are such frequently occurring patterns that they have short options like `--c2p` and `--c2j`:
|
||||
|
||||
<pre class="pre-highlight-in-pair">
|
||||
<b>mlr --c2p head -n 2 example.csv</b>
|
||||
|
|
@ -59,7 +59,7 @@ You can get the full list [here](file-formats.md#data-conversion-keystroke-saver
|
|||
|
||||
## File names up front, including --from
|
||||
|
||||
Already we saw that you can put the filename first using `--from`. When you're interacting with your data at the command line, this makes it easier to up-arrow and append to the previous command:
|
||||
Already, we saw that you can put the filename first using `--from`. When you're interacting with your data at the command line, this makes it easier to up-arrow and append to the previous command:
|
||||
|
||||
<pre class="pre-highlight-in-pair">
|
||||
<b>mlr --c2p --from example.csv sort -nr index then head -n 3</b>
|
||||
|
|
@ -110,7 +110,7 @@ I think `mlr --csv ...` explains itself better than `mlr -c ...`. Nonetheless, t
|
|||
|
||||
## .mlrrc file
|
||||
|
||||
If you want the default file format for Miller to be CSV, you can simply put `--csv` on a line by itself in your `~/.mlrrc` file. Then instead of `mlr --csv cat example.csv` you can just do `mlr cat example.csv`. This is just a personal default, though, so `mlr --opprint cat example.csv` will use default CSV format for input, and PPRINT (tabular) for output.
|
||||
If you want the default file format for Miller to be CSV, you can put `--csv` on a line by itself in your `~/.mlrrc` file. Then, instead of `mlr --csv cat example.csv` you can just do `mlr cat example.csv`. This is just a personal default, though, so `mlr --opprint cat example.csv` will use default CSV format for input, and PPRINT (tabular) for output.
|
||||
|
||||
You can read more about this at the [Customization](customization.md) page.
|
||||
|
||||
|
|
@ -126,6 +126,6 @@ fraction -f count \
|
|||
filename-which-varies.csv
|
||||
</pre>
|
||||
|
||||
Typing this out can get a bit old, if the only thing that changes for you is the filename.
|
||||
Typing this out can get a bit old if the only thing that changes for you is the filename.
|
||||
|
||||
See [Scripting with Miller](scripting.md) for some keystroke-saving options.
|
||||
|
|
|
|||
|
|
@ -2,7 +2,7 @@
|
|||
|
||||
## Short format specifiers, including --c2p
|
||||
|
||||
In our examples so far we've often made use of `mlr --icsv --opprint` or `mlr --icsv --ojson`. These are such frequently occurring patterns that they have short options like `--c2p` and `--c2j`:
|
||||
In our examples so far, we've often made use of `mlr --icsv --opprint` or `mlr --icsv --ojson`. These are such frequently occurring patterns that they have short options like `--c2p` and `--c2j`:
|
||||
|
||||
GENMD-RUN-COMMAND
|
||||
mlr --c2p head -n 2 example.csv
|
||||
|
|
@ -16,7 +16,7 @@ You can get the full list [here](file-formats.md#data-conversion-keystroke-saver
|
|||
|
||||
## File names up front, including --from
|
||||
|
||||
Already we saw that you can put the filename first using `--from`. When you're interacting with your data at the command line, this makes it easier to up-arrow and append to the previous command:
|
||||
Already, we saw that you can put the filename first using `--from`. When you're interacting with your data at the command line, this makes it easier to up-arrow and append to the previous command:
|
||||
|
||||
GENMD-RUN-COMMAND
|
||||
mlr --c2p --from example.csv sort -nr index then head -n 3
|
||||
|
|
@ -55,7 +55,7 @@ I think `mlr --csv ...` explains itself better than `mlr -c ...`. Nonetheless, t
|
|||
|
||||
## .mlrrc file
|
||||
|
||||
If you want the default file format for Miller to be CSV, you can simply put `--csv` on a line by itself in your `~/.mlrrc` file. Then instead of `mlr --csv cat example.csv` you can just do `mlr cat example.csv`. This is just a personal default, though, so `mlr --opprint cat example.csv` will use default CSV format for input, and PPRINT (tabular) for output.
|
||||
If you want the default file format for Miller to be CSV, you can put `--csv` on a line by itself in your `~/.mlrrc` file. Then, instead of `mlr --csv cat example.csv` you can just do `mlr cat example.csv`. This is just a personal default, though, so `mlr --opprint cat example.csv` will use default CSV format for input, and PPRINT (tabular) for output.
|
||||
|
||||
You can read more about this at the [Customization](customization.md) page.
|
||||
|
||||
|
|
@ -71,6 +71,6 @@ fraction -f count \
|
|||
filename-which-varies.csv
|
||||
GENMD-EOF
|
||||
|
||||
Typing this out can get a bit old, if the only thing that changes for you is the filename.
|
||||
Typing this out can get a bit old if the only thing that changes for you is the filename.
|
||||
|
||||
See [Scripting with Miller](scripting.md) for some keystroke-saving options.
|
||||
|
|
|
|||
|
|
@ -48,7 +48,7 @@ This is simply a copy of what you should see on running `man mlr` at a command p
|
|||
insertion-ordered hash map. This encompasses a variety of data
|
||||
formats, including but not limited to the familiar CSV, TSV, and JSON.
|
||||
(Miller can handle positionally-indexed data as a special case.) This
|
||||
manpage documents mlr 6.13.0.
|
||||
manpage documents mlr 6.16.0.
|
||||
|
||||
1mEXAMPLES0m
|
||||
mlr --icsv --opprint cat example.csv
|
||||
|
|
@ -145,6 +145,7 @@ This is simply a copy of what you should see on running `man mlr` at a command p
|
|||
mlr help comments-in-data-flags
|
||||
mlr help compressed-data-flags
|
||||
mlr help csv/tsv-only-flags
|
||||
mlr help dkvp-only-flags
|
||||
mlr help file-format-flags
|
||||
mlr help flatten-unflatten-flags
|
||||
mlr help format-conversion-keystroke-saver-flags
|
||||
|
|
@ -198,8 +199,8 @@ This is simply a copy of what you should see on running `man mlr` at a command p
|
|||
merge-fields most-frequent nest nothing put regularize remove-empty-columns
|
||||
rename reorder repeat reshape sample sec2gmtdate sec2gmt seqgen shuffle
|
||||
skip-trivial-records sort sort-within-records sparsify split ssub stats1
|
||||
stats2 step sub summary tac tail tee template top utf8-to-latin1 unflatten
|
||||
uniq unspace unsparsify
|
||||
stats2 step sub summary surv tac tail tee template top utf8-to-latin1
|
||||
unflatten uniq unspace unsparsify
|
||||
|
||||
1mFUNCTION LIST0m
|
||||
abs acos acosh antimode any append apply arrayify asin asinh asserting_absent
|
||||
|
|
@ -254,12 +255,14 @@ This is simply a copy of what you should see on running `man mlr` at a command p
|
|||
within the input.
|
||||
--pass-comments-with {string}
|
||||
Immediately print commented lines within input, with
|
||||
specified prefix.
|
||||
specified prefix. For CSV input format, the prefix
|
||||
must be a single character.
|
||||
--skip-comments Ignore commented lines (prefixed by `#`) within the
|
||||
input.
|
||||
--skip-comments-with {string}
|
||||
Ignore commented lines within input, with specified
|
||||
prefix.
|
||||
prefix. For CSV input format, the prefix must be a
|
||||
single character.
|
||||
|
||||
1mCOMPRESSED-DATA FLAGS0m
|
||||
Miller offers a few different ways to handle reading data files
|
||||
|
|
@ -356,6 +359,16 @@ This is simply a copy of what you should see on running `man mlr` at a command p
|
|||
-N Keystroke-saver for `--implicit-csv-header
|
||||
--headerless-csv-output`.
|
||||
|
||||
1mDKVP-ONLY FLAGS0m
|
||||
These are flags which are applicable to DKVP format.
|
||||
|
||||
--incr-key Without this option, keyless DKVP fields are keyed by
|
||||
field number. For example: `a=10,b=20,30,d=40,50` is
|
||||
ingested as `$a=10,$b=20,$3=30,$d=40,$5=50`. With
|
||||
this option, they're keyed by a running counter of
|
||||
keyless fields. For example: `a=10,b=20,30,d=40,50`
|
||||
is ingested as `$a=10,$b=20,$1=30,$d=40,$2=50`.
|
||||
|
||||
1mFILE-FORMAT FLAGS0m
|
||||
See the File formats doc page, and/or `mlr help file-formats`, for more
|
||||
about file formats Miller supports.
|
||||
|
|
@ -368,9 +381,9 @@ This is simply a copy of what you should see on running `man mlr` at a command p
|
|||
are overridden in all cases by setting output format to `format2`.
|
||||
|
||||
--asv or --asvlite Use ASV format for input and output data.
|
||||
--csv or -c Use CSV format for input and output data.
|
||||
--csv or -c or --c2c Use CSV format for input and output data.
|
||||
--csvlite Use CSV-lite format for input and output data.
|
||||
--dkvp Use DKVP format for input and output data.
|
||||
--dkvp or --d2d Use DKVP format for input and output data.
|
||||
--gen-field-name Specify field name for --igen. Defaults to "i".
|
||||
--gen-start Specify start value for --igen. Defaults to 1.
|
||||
--gen-step Specify step value for --igen. Defaults to 1.
|
||||
|
|
@ -394,9 +407,9 @@ This is simply a copy of what you should see on running `man mlr` at a command p
|
|||
--itsvlite Use TSV-lite format for input data.
|
||||
--iusv or --iusvlite Use USV format for input data.
|
||||
--ixtab Use XTAB format for input data.
|
||||
--json or -j Use JSON format for input and output data.
|
||||
--jsonl Use JSON Lines format for input and output data.
|
||||
--nidx Use NIDX format for input and output data.
|
||||
--json or -j or --j2j Use JSON format for input and output data.
|
||||
--jsonl or --l2l Use JSON Lines format for input and output data.
|
||||
--nidx or --n2n Use NIDX format for input and output data.
|
||||
--oasv or --oasvlite Use ASV format for output data.
|
||||
--ocsv Use CSV format for output data.
|
||||
--ocsvlite Use CSV-lite format for output data.
|
||||
|
|
@ -410,11 +423,11 @@ This is simply a copy of what you should see on running `man mlr` at a command p
|
|||
--otsvlite Use TSV-lite format for output data.
|
||||
--ousv or --ousvlite Use USV format for output data.
|
||||
--oxtab Use XTAB format for output data.
|
||||
--pprint Use PPRINT format for input and output data.
|
||||
--tsv or -t Use TSV format for input and output data.
|
||||
--pprint or --p2p Use PPRINT format for input and output data.
|
||||
--tsv or -t or --t2t Use TSV format for input and output data.
|
||||
--tsvlite Use TSV-lite format for input and output data.
|
||||
--usv or --usvlite Use USV format for input and output data.
|
||||
--xtab Use XTAB format for input and output data.
|
||||
--xtab or --x2x Use XTAB format for input and output data.
|
||||
--xvright Right-justify values for XTAB format.
|
||||
-i {format name} Use format name for input data. For example: `-i csv`
|
||||
is the same as `--icsv`.
|
||||
|
|
@ -424,7 +437,7 @@ This is simply a copy of what you should see on running `man mlr` at a command p
|
|||
1mFLATTEN-UNFLATTEN FLAGS0m
|
||||
These flags control how Miller converts record values which are maps or arrays, when input is JSON and output is non-JSON (flattening) or input is non-JSON and output is JSON (unflattening).
|
||||
|
||||
See the Flatten/unflatten doc page for more information.
|
||||
See the flatten/unflatten doc page https://miller.readthedocs.io/en/latest/flatten-unflatten for more information.
|
||||
|
||||
--flatsep or --jflatsep {string}
|
||||
Separator for flattening multi-level JSON keys, e.g.
|
||||
|
|
@ -432,32 +445,31 @@ This is simply a copy of what you should see on running `man mlr` at a command p
|
|||
formats. Defaults to `.`.
|
||||
--no-auto-flatten When output is non-JSON, suppress the default
|
||||
auto-flatten behavior. Default: if `$y = [7,8,9]`
|
||||
then this flattens to `y.1=7,y.2=8,y.3=9, and
|
||||
then this flattens to `y.1=7,y.2=8,y.3=9`, and
|
||||
similarly for maps. With `--no-auto-flatten`, instead
|
||||
we get `$y=[7, 8, 9]`.
|
||||
--no-auto-unflatten When input non-JSON and output is JSON, suppress the
|
||||
default auto-unflatten behavior. Default: if the
|
||||
--no-auto-unflatten When input is non-JSON and output is JSON, suppress
|
||||
the default auto-unflatten behavior. Default: if the
|
||||
input has `y.1=7,y.2=8,y.3=9` then this unflattens to
|
||||
`$y=[7,8,9]`. flattens to `y.1=7,y.2=8,y.3=9. With
|
||||
`--no-auto-flatten`, instead we get
|
||||
`${y.1}=7,${y.2}=8,${y.3}=9`.
|
||||
`$y=[7,8,9]`. With `--no-auto-unflatten`, instead we
|
||||
get `${y.1}=7,${y.2}=8,${y.3}=9`.
|
||||
|
||||
1mFORMAT-CONVERSION KEYSTROKE-SAVER FLAGS0m
|
||||
As keystroke-savers for format-conversion you may use the following.
|
||||
The letters c, t, j, l, d, n, x, p, and m refer to formats CSV, TSV, JSON, JSON Lines,
|
||||
DKVP, NIDX, XTAB, PPRINT, and markdown, respectively.
|
||||
|
||||
| In\out | CSV | TSV | JSON | JSONL | DKVP | NIDX | XTAB | PPRINT | Markdown |
|
||||
+----------+-------+-------+--------+--------+--------+--------+--------+--------+----------|
|
||||
| CSV | | --c2t | --c2j | --c2l | --c2d | --c2n | --c2x | --c2p | --c2m |
|
||||
| TSV | --t2c | | --t2j | --t2l | --t2d | --t2n | --t2x | --t2p | --t2m |
|
||||
| JSON | --j2c | --j2t | | --j2l | --j2d | --j2n | --j2x | --j2p | --j2m |
|
||||
| JSONL | --l2c | --l2t | | | --l2d | --l2n | --l2x | --l2p | --l2m |
|
||||
| DKVP | --d2c | --d2t | --d2j | --d2l | | --d2n | --d2x | --d2p | --d2m |
|
||||
| NIDX | --n2c | --n2t | --n2j | --n2l | --n2d | | --n2x | --n2p | --n2m |
|
||||
| XTAB | --x2c | --x2t | --x2j | --x2l | --x2d | --x2n | | --x2p | --x2m |
|
||||
| PPRINT | --p2c | --p2t | --p2j | --p2l | --p2d | --p2n | --p2x | | --p2m |
|
||||
| Markdown | --m2c | --m2t | --m2j | --m2l | --m2d | --m2n | --m2x | --m2p | |
|
||||
| In\out | CSV | TSV | JSON | JSONL | DKVP | NIDX | XTAB | PPRINT | Markdown |
|
||||
+----------+----------+----------+----------+-------+-------+-------+-------+--------+----------|
|
||||
| CSV | --c2c,-c | --c2t | --c2j | --c2l | --c2d | --c2n | --c2x | --c2p | --c2m |
|
||||
| TSV | --t2c | --t2t,-t | --t2j | --t2l | --t2d | --t2n | --t2x | --t2p | --t2m |
|
||||
| JSON | --j2c | --j2t | --j2j,-j | --j2l | --j2d | --j2n | --j2x | --j2p | --j2m |
|
||||
| JSONL | --l2c | --l2t | --l2j | --l2l | --l2d | --l2n | --l2x | --l2p | --l2m |
|
||||
| DKVP | --d2c | --d2t | --d2j | --d2l | --d2d | --d2n | --d2x | --d2p | --d2m |
|
||||
| NIDX | --n2c | --n2t | --n2j | --n2l | --n2d | --n2n | --n2x | --n2p | --n2m |
|
||||
| XTAB | --x2c | --x2t | --x2j | --x2l | --x2d | --x2n | --x2x | --x2p | --x2m |
|
||||
| PPRINT | --p2c | --p2t | --p2j | --p2l | --p2d | --p2n | --p2x | --p2p | --p2m |
|
||||
| Markdown | --m2c | --m2t | --m2j | --m2l | --m2d | --m2n | --m2x | --m2p | |
|
||||
|
||||
-p Keystroke-saver for `--nidx --fs space --repifs`.
|
||||
-T Keystroke-saver for `--nidx --fs tab`.
|
||||
|
|
@ -1033,7 +1045,7 @@ This is simply a copy of what you should see on running `man mlr` at a command p
|
|||
-r Treat field names as regular expressions. "ab", "a.*b" will
|
||||
match any field name containing the substring "ab" or matching
|
||||
"a.*b", respectively; anchors of the form "^ab$", "^a.*b$" may
|
||||
be used. The -o flag is ignored when -r is present.
|
||||
be used.
|
||||
-h|--help Show this message.
|
||||
Examples:
|
||||
mlr cut -f hostname,status
|
||||
|
|
@ -1077,7 +1089,7 @@ This is simply a copy of what you should see on running `man mlr` at a command p
|
|||
|
||||
1mfilter0m
|
||||
Usage: mlr filter [options] {DSL expression}
|
||||
Lets you use a domain-specific language to programatically filter which
|
||||
Lets you use a domain-specific language to programmatically filter which
|
||||
stream records will be output.
|
||||
See also: https://miller.readthedocs.io/en/latest/reference-verbs
|
||||
|
||||
|
|
@ -1275,6 +1287,8 @@ This is simply a copy of what you should see on running `man mlr` at a command p
|
|||
See also the `sub` and `ssub` verbs.
|
||||
Options:
|
||||
-f {a,b,c} Field names to convert.
|
||||
-r {regex} Regular expression for field names to convert.
|
||||
-a Convert all fields.
|
||||
-h|--help Show this message.
|
||||
|
||||
1mhaving-fields0m
|
||||
|
|
@ -1384,7 +1398,7 @@ This is simply a copy of what you should see on running `man mlr` at a command p
|
|||
Likewise, if you have 'mlr --csv --implicit-csv-header ...' then the join-in file will be
|
||||
expected to be headerless as well unless you put '--no-implicit-csv-header' after 'join'.
|
||||
Please use "mlr --usage-separator-options" for information on specifying separators.
|
||||
Please see https://miller.readthedocs.io/en/latest/reference-verbs.html#join for more information
|
||||
Please see https://miller.readthedocs.io/en/latest/reference-verbs#join for more information
|
||||
including examples.
|
||||
|
||||
1mlabel0m
|
||||
|
|
@ -1535,7 +1549,7 @@ This is simply a copy of what you should see on running `man mlr` at a command p
|
|||
|
||||
1mput0m
|
||||
Usage: mlr put [options] {DSL expression}
|
||||
Lets you use a domain-specific language to programatically alter stream records.
|
||||
Lets you use a domain-specific language to programmatically alter stream records.
|
||||
See also: https://miller.readthedocs.io/en/latest/reference-verbs
|
||||
|
||||
Options:
|
||||
|
|
@ -1836,6 +1850,7 @@ This is simply a copy of what you should see on running `man mlr` at a command p
|
|||
-nf {comma-separated field names} Same as -n
|
||||
-nr {comma-separated field names} Numerical descending; nulls sort first
|
||||
-t {comma-separated field names} Natural ascending
|
||||
-b Move sort fields to start of record, as in reorder -b
|
||||
-tr|-rt {comma-separated field names} Natural descending
|
||||
-h|--help Show this message.
|
||||
|
||||
|
|
@ -1910,6 +1925,8 @@ This is simply a copy of what you should see on running `man mlr` at a command p
|
|||
the old string, like the `ssub` DSL function. See also the `gsub` and `sub` verbs.
|
||||
Options:
|
||||
-f {a,b,c} Field names to convert.
|
||||
-r {regex} Regular expression for field names to convert.
|
||||
-a Convert all fields.
|
||||
-h|--help Show this message.
|
||||
|
||||
1mstats10m
|
||||
|
|
@ -2057,6 +2074,8 @@ This is simply a copy of what you should see on running `man mlr` at a command p
|
|||
See also the `gsub` and `ssub` verbs.
|
||||
Options:
|
||||
-f {a,b,c} Field names to convert.
|
||||
-r {regex} Regular expression for field names to convert.
|
||||
-a Convert all fields.
|
||||
-h|--help Show this message.
|
||||
|
||||
1msummary0m
|
||||
|
|
@ -2102,6 +2121,15 @@ This is simply a copy of what you should see on running `man mlr` at a command p
|
|||
--transpose Show output with field names as column names.
|
||||
-h|--help Show this message.
|
||||
|
||||
1msurv0m
|
||||
Usage: mlr surv -d {duration-field} -s {status-field}
|
||||
|
||||
Estimate Kaplan-Meier survival curve (right-censored).
|
||||
Options:
|
||||
-d {field} Name of duration field (time-to-event or censoring).
|
||||
-s {field} Name of status field (0=censored, 1=event).
|
||||
-h, --help Show this message.
|
||||
|
||||
1mtac0m
|
||||
Usage: mlr tac [options]
|
||||
Prints records in reverse order from the order in which they were encountered.
|
||||
|
|
@ -3731,5 +3759,5 @@ This is simply a copy of what you should see on running `man mlr` at a command p
|
|||
MIME Type for Comma-Separated Values (CSV) Files, the Miller docsite
|
||||
https://miller.readthedocs.io
|
||||
|
||||
2024-10-05 4mMILLER24m(1)
|
||||
2026-01-02 4mMILLER24m(1)
|
||||
</pre>
|
||||
|
|
|
|||
|
|
@ -27,7 +27,7 @@
|
|||
insertion-ordered hash map. This encompasses a variety of data
|
||||
formats, including but not limited to the familiar CSV, TSV, and JSON.
|
||||
(Miller can handle positionally-indexed data as a special case.) This
|
||||
manpage documents mlr 6.13.0.
|
||||
manpage documents mlr 6.16.0.
|
||||
|
||||
1mEXAMPLES0m
|
||||
mlr --icsv --opprint cat example.csv
|
||||
|
|
@ -124,6 +124,7 @@
|
|||
mlr help comments-in-data-flags
|
||||
mlr help compressed-data-flags
|
||||
mlr help csv/tsv-only-flags
|
||||
mlr help dkvp-only-flags
|
||||
mlr help file-format-flags
|
||||
mlr help flatten-unflatten-flags
|
||||
mlr help format-conversion-keystroke-saver-flags
|
||||
|
|
@ -177,8 +178,8 @@
|
|||
merge-fields most-frequent nest nothing put regularize remove-empty-columns
|
||||
rename reorder repeat reshape sample sec2gmtdate sec2gmt seqgen shuffle
|
||||
skip-trivial-records sort sort-within-records sparsify split ssub stats1
|
||||
stats2 step sub summary tac tail tee template top utf8-to-latin1 unflatten
|
||||
uniq unspace unsparsify
|
||||
stats2 step sub summary surv tac tail tee template top utf8-to-latin1
|
||||
unflatten uniq unspace unsparsify
|
||||
|
||||
1mFUNCTION LIST0m
|
||||
abs acos acosh antimode any append apply arrayify asin asinh asserting_absent
|
||||
|
|
@ -233,12 +234,14 @@
|
|||
within the input.
|
||||
--pass-comments-with {string}
|
||||
Immediately print commented lines within input, with
|
||||
specified prefix.
|
||||
specified prefix. For CSV input format, the prefix
|
||||
must be a single character.
|
||||
--skip-comments Ignore commented lines (prefixed by `#`) within the
|
||||
input.
|
||||
--skip-comments-with {string}
|
||||
Ignore commented lines within input, with specified
|
||||
prefix.
|
||||
prefix. For CSV input format, the prefix must be a
|
||||
single character.
|
||||
|
||||
1mCOMPRESSED-DATA FLAGS0m
|
||||
Miller offers a few different ways to handle reading data files
|
||||
|
|
@ -335,6 +338,16 @@
|
|||
-N Keystroke-saver for `--implicit-csv-header
|
||||
--headerless-csv-output`.
|
||||
|
||||
1mDKVP-ONLY FLAGS0m
|
||||
These are flags which are applicable to DKVP format.
|
||||
|
||||
--incr-key Without this option, keyless DKVP fields are keyed by
|
||||
field number. For example: `a=10,b=20,30,d=40,50` is
|
||||
ingested as `$a=10,$b=20,$3=30,$d=40,$5=50`. With
|
||||
this option, they're keyed by a running counter of
|
||||
keyless fields. For example: `a=10,b=20,30,d=40,50`
|
||||
is ingested as `$a=10,$b=20,$1=30,$d=40,$2=50`.
|
||||
|
||||
1mFILE-FORMAT FLAGS0m
|
||||
See the File formats doc page, and/or `mlr help file-formats`, for more
|
||||
about file formats Miller supports.
|
||||
|
|
@ -347,9 +360,9 @@
|
|||
are overridden in all cases by setting output format to `format2`.
|
||||
|
||||
--asv or --asvlite Use ASV format for input and output data.
|
||||
--csv or -c Use CSV format for input and output data.
|
||||
--csv or -c or --c2c Use CSV format for input and output data.
|
||||
--csvlite Use CSV-lite format for input and output data.
|
||||
--dkvp Use DKVP format for input and output data.
|
||||
--dkvp or --d2d Use DKVP format for input and output data.
|
||||
--gen-field-name Specify field name for --igen. Defaults to "i".
|
||||
--gen-start Specify start value for --igen. Defaults to 1.
|
||||
--gen-step Specify step value for --igen. Defaults to 1.
|
||||
|
|
@ -373,9 +386,9 @@
|
|||
--itsvlite Use TSV-lite format for input data.
|
||||
--iusv or --iusvlite Use USV format for input data.
|
||||
--ixtab Use XTAB format for input data.
|
||||
--json or -j Use JSON format for input and output data.
|
||||
--jsonl Use JSON Lines format for input and output data.
|
||||
--nidx Use NIDX format for input and output data.
|
||||
--json or -j or --j2j Use JSON format for input and output data.
|
||||
--jsonl or --l2l Use JSON Lines format for input and output data.
|
||||
--nidx or --n2n Use NIDX format for input and output data.
|
||||
--oasv or --oasvlite Use ASV format for output data.
|
||||
--ocsv Use CSV format for output data.
|
||||
--ocsvlite Use CSV-lite format for output data.
|
||||
|
|
@ -389,11 +402,11 @@
|
|||
--otsvlite Use TSV-lite format for output data.
|
||||
--ousv or --ousvlite Use USV format for output data.
|
||||
--oxtab Use XTAB format for output data.
|
||||
--pprint Use PPRINT format for input and output data.
|
||||
--tsv or -t Use TSV format for input and output data.
|
||||
--pprint or --p2p Use PPRINT format for input and output data.
|
||||
--tsv or -t or --t2t Use TSV format for input and output data.
|
||||
--tsvlite Use TSV-lite format for input and output data.
|
||||
--usv or --usvlite Use USV format for input and output data.
|
||||
--xtab Use XTAB format for input and output data.
|
||||
--xtab or --x2x Use XTAB format for input and output data.
|
||||
--xvright Right-justify values for XTAB format.
|
||||
-i {format name} Use format name for input data. For example: `-i csv`
|
||||
is the same as `--icsv`.
|
||||
|
|
@ -403,7 +416,7 @@
|
|||
1mFLATTEN-UNFLATTEN FLAGS0m
|
||||
These flags control how Miller converts record values which are maps or arrays, when input is JSON and output is non-JSON (flattening) or input is non-JSON and output is JSON (unflattening).
|
||||
|
||||
See the Flatten/unflatten doc page for more information.
|
||||
See the flatten/unflatten doc page https://miller.readthedocs.io/en/latest/flatten-unflatten for more information.
|
||||
|
||||
--flatsep or --jflatsep {string}
|
||||
Separator for flattening multi-level JSON keys, e.g.
|
||||
|
|
@ -411,32 +424,31 @@
|
|||
formats. Defaults to `.`.
|
||||
--no-auto-flatten When output is non-JSON, suppress the default
|
||||
auto-flatten behavior. Default: if `$y = [7,8,9]`
|
||||
then this flattens to `y.1=7,y.2=8,y.3=9, and
|
||||
then this flattens to `y.1=7,y.2=8,y.3=9`, and
|
||||
similarly for maps. With `--no-auto-flatten`, instead
|
||||
we get `$y=[7, 8, 9]`.
|
||||
--no-auto-unflatten When input non-JSON and output is JSON, suppress the
|
||||
default auto-unflatten behavior. Default: if the
|
||||
--no-auto-unflatten When input is non-JSON and output is JSON, suppress
|
||||
the default auto-unflatten behavior. Default: if the
|
||||
input has `y.1=7,y.2=8,y.3=9` then this unflattens to
|
||||
`$y=[7,8,9]`. flattens to `y.1=7,y.2=8,y.3=9. With
|
||||
`--no-auto-flatten`, instead we get
|
||||
`${y.1}=7,${y.2}=8,${y.3}=9`.
|
||||
`$y=[7,8,9]`. With `--no-auto-unflatten`, instead we
|
||||
get `${y.1}=7,${y.2}=8,${y.3}=9`.
|
||||
|
||||
1mFORMAT-CONVERSION KEYSTROKE-SAVER FLAGS0m
|
||||
As keystroke-savers for format-conversion you may use the following.
|
||||
The letters c, t, j, l, d, n, x, p, and m refer to formats CSV, TSV, JSON, JSON Lines,
|
||||
DKVP, NIDX, XTAB, PPRINT, and markdown, respectively.
|
||||
|
||||
| In\out | CSV | TSV | JSON | JSONL | DKVP | NIDX | XTAB | PPRINT | Markdown |
|
||||
+----------+-------+-------+--------+--------+--------+--------+--------+--------+----------|
|
||||
| CSV | | --c2t | --c2j | --c2l | --c2d | --c2n | --c2x | --c2p | --c2m |
|
||||
| TSV | --t2c | | --t2j | --t2l | --t2d | --t2n | --t2x | --t2p | --t2m |
|
||||
| JSON | --j2c | --j2t | | --j2l | --j2d | --j2n | --j2x | --j2p | --j2m |
|
||||
| JSONL | --l2c | --l2t | | | --l2d | --l2n | --l2x | --l2p | --l2m |
|
||||
| DKVP | --d2c | --d2t | --d2j | --d2l | | --d2n | --d2x | --d2p | --d2m |
|
||||
| NIDX | --n2c | --n2t | --n2j | --n2l | --n2d | | --n2x | --n2p | --n2m |
|
||||
| XTAB | --x2c | --x2t | --x2j | --x2l | --x2d | --x2n | | --x2p | --x2m |
|
||||
| PPRINT | --p2c | --p2t | --p2j | --p2l | --p2d | --p2n | --p2x | | --p2m |
|
||||
| Markdown | --m2c | --m2t | --m2j | --m2l | --m2d | --m2n | --m2x | --m2p | |
|
||||
| In\out | CSV | TSV | JSON | JSONL | DKVP | NIDX | XTAB | PPRINT | Markdown |
|
||||
+----------+----------+----------+----------+-------+-------+-------+-------+--------+----------|
|
||||
| CSV | --c2c,-c | --c2t | --c2j | --c2l | --c2d | --c2n | --c2x | --c2p | --c2m |
|
||||
| TSV | --t2c | --t2t,-t | --t2j | --t2l | --t2d | --t2n | --t2x | --t2p | --t2m |
|
||||
| JSON | --j2c | --j2t | --j2j,-j | --j2l | --j2d | --j2n | --j2x | --j2p | --j2m |
|
||||
| JSONL | --l2c | --l2t | --l2j | --l2l | --l2d | --l2n | --l2x | --l2p | --l2m |
|
||||
| DKVP | --d2c | --d2t | --d2j | --d2l | --d2d | --d2n | --d2x | --d2p | --d2m |
|
||||
| NIDX | --n2c | --n2t | --n2j | --n2l | --n2d | --n2n | --n2x | --n2p | --n2m |
|
||||
| XTAB | --x2c | --x2t | --x2j | --x2l | --x2d | --x2n | --x2x | --x2p | --x2m |
|
||||
| PPRINT | --p2c | --p2t | --p2j | --p2l | --p2d | --p2n | --p2x | --p2p | --p2m |
|
||||
| Markdown | --m2c | --m2t | --m2j | --m2l | --m2d | --m2n | --m2x | --m2p | |
|
||||
|
||||
-p Keystroke-saver for `--nidx --fs space --repifs`.
|
||||
-T Keystroke-saver for `--nidx --fs tab`.
|
||||
|
|
@ -1012,7 +1024,7 @@
|
|||
-r Treat field names as regular expressions. "ab", "a.*b" will
|
||||
match any field name containing the substring "ab" or matching
|
||||
"a.*b", respectively; anchors of the form "^ab$", "^a.*b$" may
|
||||
be used. The -o flag is ignored when -r is present.
|
||||
be used.
|
||||
-h|--help Show this message.
|
||||
Examples:
|
||||
mlr cut -f hostname,status
|
||||
|
|
@ -1056,7 +1068,7 @@
|
|||
|
||||
1mfilter0m
|
||||
Usage: mlr filter [options] {DSL expression}
|
||||
Lets you use a domain-specific language to programatically filter which
|
||||
Lets you use a domain-specific language to programmatically filter which
|
||||
stream records will be output.
|
||||
See also: https://miller.readthedocs.io/en/latest/reference-verbs
|
||||
|
||||
|
|
@ -1254,6 +1266,8 @@
|
|||
See also the `sub` and `ssub` verbs.
|
||||
Options:
|
||||
-f {a,b,c} Field names to convert.
|
||||
-r {regex} Regular expression for field names to convert.
|
||||
-a Convert all fields.
|
||||
-h|--help Show this message.
|
||||
|
||||
1mhaving-fields0m
|
||||
|
|
@ -1363,7 +1377,7 @@
|
|||
Likewise, if you have 'mlr --csv --implicit-csv-header ...' then the join-in file will be
|
||||
expected to be headerless as well unless you put '--no-implicit-csv-header' after 'join'.
|
||||
Please use "mlr --usage-separator-options" for information on specifying separators.
|
||||
Please see https://miller.readthedocs.io/en/latest/reference-verbs.html#join for more information
|
||||
Please see https://miller.readthedocs.io/en/latest/reference-verbs#join for more information
|
||||
including examples.
|
||||
|
||||
1mlabel0m
|
||||
|
|
@ -1514,7 +1528,7 @@
|
|||
|
||||
1mput0m
|
||||
Usage: mlr put [options] {DSL expression}
|
||||
Lets you use a domain-specific language to programatically alter stream records.
|
||||
Lets you use a domain-specific language to programmatically alter stream records.
|
||||
See also: https://miller.readthedocs.io/en/latest/reference-verbs
|
||||
|
||||
Options:
|
||||
|
|
@ -1815,6 +1829,7 @@
|
|||
-nf {comma-separated field names} Same as -n
|
||||
-nr {comma-separated field names} Numerical descending; nulls sort first
|
||||
-t {comma-separated field names} Natural ascending
|
||||
-b Move sort fields to start of record, as in reorder -b
|
||||
-tr|-rt {comma-separated field names} Natural descending
|
||||
-h|--help Show this message.
|
||||
|
||||
|
|
@ -1889,6 +1904,8 @@
|
|||
the old string, like the `ssub` DSL function. See also the `gsub` and `sub` verbs.
|
||||
Options:
|
||||
-f {a,b,c} Field names to convert.
|
||||
-r {regex} Regular expression for field names to convert.
|
||||
-a Convert all fields.
|
||||
-h|--help Show this message.
|
||||
|
||||
1mstats10m
|
||||
|
|
@ -2036,6 +2053,8 @@
|
|||
See also the `gsub` and `ssub` verbs.
|
||||
Options:
|
||||
-f {a,b,c} Field names to convert.
|
||||
-r {regex} Regular expression for field names to convert.
|
||||
-a Convert all fields.
|
||||
-h|--help Show this message.
|
||||
|
||||
1msummary0m
|
||||
|
|
@ -2081,6 +2100,15 @@
|
|||
--transpose Show output with field names as column names.
|
||||
-h|--help Show this message.
|
||||
|
||||
1msurv0m
|
||||
Usage: mlr surv -d {duration-field} -s {status-field}
|
||||
|
||||
Estimate Kaplan-Meier survival curve (right-censored).
|
||||
Options:
|
||||
-d {field} Name of duration field (time-to-event or censoring).
|
||||
-s {field} Name of status field (0=censored, 1=event).
|
||||
-h, --help Show this message.
|
||||
|
||||
1mtac0m
|
||||
Usage: mlr tac [options]
|
||||
Prints records in reverse order from the order in which they were encountered.
|
||||
|
|
@ -3710,4 +3738,4 @@
|
|||
MIME Type for Comma-Separated Values (CSV) Files, the Miller docsite
|
||||
https://miller.readthedocs.io
|
||||
|
||||
2024-10-05 4mMILLER24m(1)
|
||||
2026-01-02 4mMILLER24m(1)
|
||||
|
|
|
|||
|
|
@ -89,10 +89,10 @@ func convert_csv_to_json(fileNames []string) error {
|
|||
case ierr := <-inputErrorChannel:
|
||||
retval = ierr
|
||||
break
|
||||
case _ = <-dataProcessingErrorChannel:
|
||||
case <-dataProcessingErrorChannel:
|
||||
retval = errors.New("exiting due to data error") // details already printed
|
||||
break
|
||||
case _ = <-doneWritingChannel:
|
||||
case <-doneWritingChannel:
|
||||
done = true
|
||||
break
|
||||
}
|
||||
|
|
|
|||
|
|
@ -18,7 +18,7 @@ Quick links:
|
|||
|
||||
## Native builds as of Miller 6
|
||||
|
||||
Miller was originally developed for Unix-like operating systems including Linux and MacOS. Since Miller 5.2.0 which was the first version to support Windows at all, that support has been partial. But as of version 6.0.0, Miller builds directly on Windows.
|
||||
Miller was originally developed for Unix-like operating systems, including Linux and MacOS. Since Miller 5.2.0, which was the first version to support Windows at all, that support has been partial. But as of version 6.0.0, Miller builds directly on Windows.
|
||||
|
||||
**The experience is now almost the same on Windows as it is on Linux, NetBSD/FreeBSD, and MacOS.**
|
||||
|
||||
|
|
@ -28,7 +28,7 @@ See [Installation](installing-miller.md) for how to get a copy of `mlr.exe`.
|
|||
|
||||
## Setup
|
||||
|
||||
Simply place `mlr.exe` somewhere within your `PATH` variable.
|
||||
Place `mlr.exe` somewhere within your `PATH` variable.
|
||||
|
||||

|
||||
|
||||
|
|
@ -38,7 +38,7 @@ To use Miller from within MSYS2/Cygwin, also make sure `mlr.exe` is within the `
|
|||
|
||||
## Differences
|
||||
|
||||
The Windows-support code within Miller makes effort to support Linux/Unix/MacOS-like command-line syntax including single-quoting of expressions for `mlr put` and `mlr filter` -- and in the examples above, this often works. However, there are still some cases where more complex expressions aren't successfully parsed from the Windows prompt, even though they are from MSYS2:
|
||||
The Windows-support code within Miller makes an effort to support Linux/Unix/MacOS-like command-line syntax, including single-quoting of expressions for `mlr put` and `mlr filter` -- and in the examples above, this often works. However, there are still some cases where more complex expressions aren't successfully parsed from the Windows prompt, even though they are from MSYS2:
|
||||
|
||||

|
||||
|
||||
|
|
|
|||
|
|
@ -2,7 +2,7 @@
|
|||
|
||||
## Native builds as of Miller 6
|
||||
|
||||
Miller was originally developed for Unix-like operating systems including Linux and MacOS. Since Miller 5.2.0 which was the first version to support Windows at all, that support has been partial. But as of version 6.0.0, Miller builds directly on Windows.
|
||||
Miller was originally developed for Unix-like operating systems, including Linux and MacOS. Since Miller 5.2.0, which was the first version to support Windows at all, that support has been partial. But as of version 6.0.0, Miller builds directly on Windows.
|
||||
|
||||
**The experience is now almost the same on Windows as it is on Linux, NetBSD/FreeBSD, and MacOS.**
|
||||
|
||||
|
|
@ -12,7 +12,7 @@ See [Installation](installing-miller.md) for how to get a copy of `mlr.exe`.
|
|||
|
||||
## Setup
|
||||
|
||||
Simply place `mlr.exe` somewhere within your `PATH` variable.
|
||||
Place `mlr.exe` somewhere within your `PATH` variable.
|
||||
|
||||

|
||||
|
||||
|
|
@ -22,7 +22,7 @@ To use Miller from within MSYS2/Cygwin, also make sure `mlr.exe` is within the `
|
|||
|
||||
## Differences
|
||||
|
||||
The Windows-support code within Miller makes effort to support Linux/Unix/MacOS-like command-line syntax including single-quoting of expressions for `mlr put` and `mlr filter` -- and in the examples above, this often works. However, there are still some cases where more complex expressions aren't successfully parsed from the Windows prompt, even though they are from MSYS2:
|
||||
The Windows-support code within Miller makes an effort to support Linux/Unix/MacOS-like command-line syntax, including single-quoting of expressions for `mlr put` and `mlr filter` -- and in the examples above, this often works. However, there are still some cases where more complex expressions aren't successfully parsed from the Windows prompt, even though they are from MSYS2:
|
||||
|
||||

|
||||
|
||||
|
|
|
|||
|
|
@ -16,11 +16,11 @@ Quick links:
|
|||
</div>
|
||||
# Intro to Miller's programming language
|
||||
|
||||
In the [Miller in 10 minutes](10min.md) page we took a tour of some of Miller's most-used [verbs](reference-verbs.md) including `cat`, `head`, `tail`, `cut`, and `sort`. These are analogs of familiar system commands, but empowered by field-name indexing and file-format awareness: the system `sort` command only knows about lines and column names like `1,2,3,4`, while `mlr sort` knows about CSV/TSV/JSON/etc records, and field names like `color,shape,flag,index`.
|
||||
On the [Miller in 10 minutes](10min.md) page, we took a tour of some of Miller's most-used [verbs](reference-verbs.md), including `cat`, `head`, `tail`, `cut`, and `sort`. These are analogs of familiar system commands, but empowered by field-name indexing and file-format awareness: the system `sort` command only knows about lines and column names like `1,2,3,4`, while `mlr sort` knows about CSV/TSV/JSON/etc records, and field names like `color,shape,flag,index`.
|
||||
|
||||
We also caught a glimpse of Miller's `put` and `filter` verbs. These two are special since they let you express statements using Miller's programming language. It's a *embedded domain-specific language* since it's inside Miller: often referred to simply as the *Miller DSL*.
|
||||
We also caught a glimpse of Miller's `put` and `filter` verbs. These two are special because they allow you to express statements using Miller's programming language. It's an *embedded domain-specific language* since it's inside Miller: often referred to simply as the *Miller DSL*.
|
||||
|
||||
In the [DSL reference](reference-dsl.md) page we have a complete reference to Miller's programming language. For now, let's take a quick look at key features -- you can use as few or as many features as you like.
|
||||
On the [DSL reference](reference-dsl.md) page, we have a complete reference to Miller's programming language. For now, let's take a quick look at key features -- you can use as few or as many features as you like.
|
||||
|
||||
## Records and fields
|
||||
|
||||
|
|
@ -45,9 +45,9 @@ purple square false 10 91 72.3735 8.2430 596.5747605000001
|
|||
|
||||
When we type that, a few things are happening:
|
||||
|
||||
* We refer to fields in the input data using a dollar sign and then the field name, e.g. `$quantity`. (If a field name contains special characters like a dot or slash, just use curly braces: `${field.name}`.)
|
||||
* We refer to fields in the input data using a dollar sign and then the field name, e.g., `$quantity`. (If a field name contains special characters like a dot or slash, just use curly braces: `${field.name}`.)
|
||||
* The expression `$cost = $quantity * $rate` is executed once per record of the data file. Our [example.csv](./example.csv) has 10 records so this expression was executed 10 times, with the field names `$quantity` and `$rate` each time bound to the current record's values for those fields.
|
||||
* On the left-hand side we have the new field name `$cost` which didn't come from the input data. Assignments to new variables result in a new field being placed after all the other ones. If we'd assigned to an existing field name, it would have been updated in-place.
|
||||
* On the left-hand side, we have the new field name `$cost`, which didn't come from the input data. Assignments to new variables result in a new field being placed after all the other ones. If we'd assigned to an existing field name, it would have been updated in place.
|
||||
* The entire expression is surrounded by single quotes (with an adjustment needed on [Windows](miller-on-windows.md)), to get it past the system shell. Inside those, only double quotes have meaning in Miller's programming language.
|
||||
|
||||
## Multi-line statements, and statements-from-file
|
||||
|
|
@ -91,9 +91,9 @@ yellow circle true 9 8700 63.5058 8.3350 529.3208430000001
|
|||
purple square false 10 9100 72.3735 8.2430 596.5747605000001
|
||||
</pre>
|
||||
|
||||
Anything from a `#` character to end of line is a code comment.
|
||||
Anything from a `#` character to the end of the line is a code comment.
|
||||
|
||||
One of Miller's key features is the ability to express data-transformation right there at the keyboard, interactively. But if you find yourself using expressions repeatedly, you can put everything between the single quotes into a file and refer to that using `put -f`:
|
||||
One of Miller's key features is the ability to express data transformation right there at the keyboard, interactively. But if you find yourself using expressions repeatedly, you can put everything between the single quotes into a file and refer to that using `put -f`:
|
||||
|
||||
<pre class="pre-highlight-in-pair">
|
||||
<b>cat dsl-example.mlr</b>
|
||||
|
|
@ -120,13 +120,13 @@ yellow circle true 9 8700 63.5058 8.3350 529.3208430000001
|
|||
purple square false 10 9100 72.3735 8.2430 596.5747605000001
|
||||
</pre>
|
||||
|
||||
This becomes particularly important on Windows. Quite a bit of effort was put into making Miller on Windows be able to handle the kinds of single-quoted expressions we're showing here, but if you get syntax-error messages on Windows using examples in this documentation, you can put the parts between single quotes into a file and refer to that using `mlr put -f` -- or, use the triple-double-quote trick as described in the [Miller on Windows page](miller-on-windows.md).
|
||||
This becomes particularly important on Windows. Quite a bit of effort was put into making Miller on Windows be able to handle the kinds of single-quoted expressions we're showing here. Still, if you get syntax-error messages on Windows using examples in this documentation, you can put the parts between single quotes into a file and refer to that using `mlr put -f` -- or, use the triple-double-quote trick as described in the [Miller on Windows page](miller-on-windows.md).
|
||||
|
||||
## Out-of-stream variables, begin, and end
|
||||
|
||||
Above we saw that your expression is executed once per record -- if a file has a million records, your expression will be executed a million times, once for each record. But you can mark statements to only be executed once, either before the record stream begins, or after the record stream is ended. If you know about [AWK](https://en.wikipedia.org/wiki/AWK), you might have noticed that Miller's programming language is loosely inspired by it, including the `begin` and `end` statements.
|
||||
Above, we saw that your expression is executed once per record: if a file has a million records, your expression will be executed a million times, once for each record. But you can mark statements to be executed only once, either before the record stream begins or after the record stream has ended. If you know about [AWK](https://en.wikipedia.org/wiki/AWK), you might have noticed that Miller's programming language is loosely inspired by it, including the `begin` and `end` statements.
|
||||
|
||||
Above we also saw that names like `$quantity` are bound to each record in turn.
|
||||
Above, we also saw that names like `$quantity` are bound to each record in turn.
|
||||
|
||||
To make `begin` and `end` statements useful, we need somewhere to put things that persist across the duration of the record stream, and a way to emit them. Miller uses [**out-of-stream variables**](reference-dsl-variables.md#out-of-stream-variables) (or **oosvars** for short) whose names start with an `@` sigil, along with the [`emit`](reference-dsl-output-statements.md#emit-statements) keyword to write them into the output record stream:
|
||||
|
||||
|
|
@ -210,7 +210,7 @@ Also inspired by [AWK](https://en.wikipedia.org/wiki/AWK), the Miller DSL has th
|
|||
|
||||
* `FILENAME` -- the filename the current record came from. Especially useful in things like `mlr ... *.csv`.
|
||||
* `FILENUM` -- similarly, but integer 1,2,3,... rather than filename.
|
||||
* `NF` -- the number of fields in the current record. Note that if you assign `$newcolumn = some value` then `NF` will increment.
|
||||
* `NF` -- the number of fields in the current record. Note that if you assign `$newcolumn = some value`, then `NF` will increment.
|
||||
* `NR` -- starting from 1, counter of how many records processed so far.
|
||||
* `FNR` -- similar, but resets to 1 at the start of each file.
|
||||
|
||||
|
|
@ -290,12 +290,12 @@ purple square false 10 91 72.3735 8.2430 3628800
|
|||
Note that here we used the `-f` flag to `put` to load our function
|
||||
definition, and also the `-e` flag to add another statement on the command
|
||||
line. (We could have also put `$fact = factorial(NR)` inside
|
||||
`factorial-example.mlr` but that would have made that file less flexible for our
|
||||
`factorial-example.mlr`, but that would have made that file less flexible for our
|
||||
future use.)
|
||||
|
||||
## If-statements, loops, and local variables
|
||||
|
||||
Suppose you want to only compute sums conditionally -- you can use an `if` statement:
|
||||
Suppose you want to compute sums only when some condition holds -- you can use an `if` statement:
|
||||
|
||||
<pre class="pre-highlight-in-pair">
|
||||
<b>cat if-example.mlr</b>
|
||||
|
|
@ -331,7 +331,7 @@ page](reference-dsl-control-structures.md#for-loops), Miller has a few kinds of
|
|||
for-loops. In addition to the usual 3-part `for (i = 0; i < 10; i += 1)` kind
|
||||
that many programming languages have, Miller also lets you loop over
|
||||
[maps](reference-main-maps.md) and [arrays](reference-main-arrays.md). We
|
||||
haven't encountered maps and arrays yet in this introduction, but for now it
|
||||
haven't encountered maps and arrays yet in this introduction, but for now, it
|
||||
suffices to know that `$*` is a special variable holding the current record as
|
||||
a map:
|
||||
|
||||
|
|
@ -375,14 +375,14 @@ Here we used the local variables `k` and `v`. Now we've seen four kinds of varia
|
|||
* Local variables like `k`
|
||||
* Built-in context variables like `NF` and `NR`
|
||||
|
||||
If you're curious about scope and extent of local variables, you can read more in the [section on variables](reference-dsl-variables.md).
|
||||
If you're curious about the scope and extent of local variables, you can read more in the [section on variables](reference-dsl-variables.md).
|
||||
|
||||
## Arithmetic
|
||||
|
||||
Numbers in Miller's programming language are intended to operate with the principle of least surprise:
|
||||
|
||||
* Internally, numbers are either 64-bit signed integers or double-precision floating-point.
|
||||
* Sums, differences, and products of integers are also integers (so `2*3=6` not `6.0`) -- unless the result of the operation would overflow a 64-bit signed integer in which case the result is automatically converted to float. (If you ever want integer-to-integer arithmetic, use `x .+ y`, `x .* y`, etc.)
|
||||
* Sums, differences, and products of integers are also integers (so `2*3=6` not `6.0`) -- unless the result of the operation would overflow a 64-bit signed integer, in which case the result is automatically converted to float. (If you ever want integer-to-integer arithmetic, use `x .+ y`, `x .* y`, etc.)
|
||||
* Quotients of integers are integers if the division is exact, else floating-point: so `6/2=3` but `7/2=3.5`.
|
||||
|
||||
You can read more about this in the [arithmetic reference](reference-main-arithmetic.md).
|
||||
|
|
@ -397,7 +397,7 @@ see more in the [null-data reference](reference-main-null-data.md) but the
|
|||
basic idea is:
|
||||
|
||||
* Adding a number to absent gives the number back. This means you don't have to put `@sum = 0` in your `begin` blocks.
|
||||
* Any variable which has the absent value is not assigned. This means you don't have to check presence of things from one record to the next.
|
||||
* Any variable that has the absent value is not assigned. This means you don't have to check the presence of things from one record to the next.
|
||||
|
||||
For example, you can sum up all the `$a` values across records without having to check whether they're present or not:
|
||||
|
||||
|
|
|
|||
|
|
@ -1,10 +1,10 @@
|
|||
# Intro to Miller's programming language
|
||||
|
||||
In the [Miller in 10 minutes](10min.md) page we took a tour of some of Miller's most-used [verbs](reference-verbs.md) including `cat`, `head`, `tail`, `cut`, and `sort`. These are analogs of familiar system commands, but empowered by field-name indexing and file-format awareness: the system `sort` command only knows about lines and column names like `1,2,3,4`, while `mlr sort` knows about CSV/TSV/JSON/etc records, and field names like `color,shape,flag,index`.
|
||||
On the [Miller in 10 minutes](10min.md) page, we took a tour of some of Miller's most-used [verbs](reference-verbs.md), including `cat`, `head`, `tail`, `cut`, and `sort`. These are analogs of familiar system commands, but empowered by field-name indexing and file-format awareness: the system `sort` command only knows about lines and column names like `1,2,3,4`, while `mlr sort` knows about CSV/TSV/JSON/etc records, and field names like `color,shape,flag,index`.
|
||||
|
||||
We also caught a glimpse of Miller's `put` and `filter` verbs. These two are special since they let you express statements using Miller's programming language. It's a *embedded domain-specific language* since it's inside Miller: often referred to simply as the *Miller DSL*.
|
||||
We also caught a glimpse of Miller's `put` and `filter` verbs. These two are special because they allow you to express statements using Miller's programming language. It's an *embedded domain-specific language* since it's inside Miller: often referred to simply as the *Miller DSL*.
|
||||
|
||||
In the [DSL reference](reference-dsl.md) page we have a complete reference to Miller's programming language. For now, let's take a quick look at key features -- you can use as few or as many features as you like.
|
||||
On the [DSL reference](reference-dsl.md) page, we have a complete reference to Miller's programming language. For now, let's take a quick look at key features -- you can use as few or as many features as you like.
|
||||
|
||||
## Records and fields
|
||||
|
||||
|
|
@ -16,9 +16,9 @@ GENMD-EOF
|
|||
|
||||
When we type that, a few things are happening:
|
||||
|
||||
* We refer to fields in the input data using a dollar sign and then the field name, e.g. `$quantity`. (If a field name contains special characters like a dot or slash, just use curly braces: `${field.name}`.)
|
||||
* We refer to fields in the input data using a dollar sign and then the field name, e.g., `$quantity`. (If a field name contains special characters like a dot or slash, just use curly braces: `${field.name}`.)
|
||||
* The expression `$cost = $quantity * $rate` is executed once per record of the data file. Our [example.csv](./example.csv) has 10 records so this expression was executed 10 times, with the field names `$quantity` and `$rate` each time bound to the current record's values for those fields.
|
||||
* On the left-hand side we have the new field name `$cost` which didn't come from the input data. Assignments to new variables result in a new field being placed after all the other ones. If we'd assigned to an existing field name, it would have been updated in-place.
|
||||
* On the left-hand side, we have the new field name `$cost`, which didn't come from the input data. Assignments to new variables result in a new field being placed after all the other ones. If we'd assigned to an existing field name, it would have been updated in place.
|
||||
* The entire expression is surrounded by single quotes (with an adjustment needed on [Windows](miller-on-windows.md)), to get it past the system shell. Inside those, only double quotes have meaning in Miller's programming language.
|
||||
|
||||
## Multi-line statements, and statements-from-file
|
||||
|
|
@ -36,9 +36,9 @@ mlr --c2p put '
|
|||
' example.csv
|
||||
GENMD-EOF
|
||||
|
||||
Anything from a `#` character to end of line is a code comment.
|
||||
Anything from a `#` character to the end of the line is a code comment.
|
||||
|
||||
One of Miller's key features is the ability to express data-transformation right there at the keyboard, interactively. But if you find yourself using expressions repeatedly, you can put everything between the single quotes into a file and refer to that using `put -f`:
|
||||
One of Miller's key features is the ability to express data transformation right there at the keyboard, interactively. But if you find yourself using expressions repeatedly, you can put everything between the single quotes into a file and refer to that using `put -f`:
|
||||
|
||||
GENMD-RUN-COMMAND
|
||||
cat dsl-example.mlr
|
||||
|
|
@ -48,13 +48,13 @@ GENMD-RUN-COMMAND
|
|||
mlr --c2p put -f dsl-example.mlr example.csv
|
||||
GENMD-EOF
|
||||
|
||||
This becomes particularly important on Windows. Quite a bit of effort was put into making Miller on Windows be able to handle the kinds of single-quoted expressions we're showing here, but if you get syntax-error messages on Windows using examples in this documentation, you can put the parts between single quotes into a file and refer to that using `mlr put -f` -- or, use the triple-double-quote trick as described in the [Miller on Windows page](miller-on-windows.md).
|
||||
This becomes particularly important on Windows. Quite a bit of effort was put into making Miller on Windows be able to handle the kinds of single-quoted expressions we're showing here. Still, if you get syntax-error messages on Windows using examples in this documentation, you can put the parts between single quotes into a file and refer to that using `mlr put -f` -- or, use the triple-double-quote trick as described in the [Miller on Windows page](miller-on-windows.md).
|
||||
|
||||
## Out-of-stream variables, begin, and end
|
||||
|
||||
Above we saw that your expression is executed once per record -- if a file has a million records, your expression will be executed a million times, once for each record. But you can mark statements to only be executed once, either before the record stream begins, or after the record stream is ended. If you know about [AWK](https://en.wikipedia.org/wiki/AWK), you might have noticed that Miller's programming language is loosely inspired by it, including the `begin` and `end` statements.
|
||||
Above, we saw that your expression is executed once per record: if a file has a million records, your expression will be executed a million times, once for each record. But you can mark statements to be executed only once, either before the record stream begins or after the record stream is ended. If you know about [AWK](https://en.wikipedia.org/wiki/AWK), you might have noticed that Miller's programming language is loosely inspired by it, including the `begin` and `end` statements.
|
||||
|
||||
Above we also saw that names like `$quantity` are bound to each record in turn.
|
||||
Above, we also saw that names like `$quantity` are bound to each record in turn.
|
||||
|
||||
To make `begin` and `end` statements useful, we need somewhere to put things that persist across the duration of the record stream, and a way to emit them. Miller uses [**out-of-stream variables**](reference-dsl-variables.md#out-of-stream-variables) (or **oosvars** for short) whose names start with an `@` sigil, along with the [`emit`](reference-dsl-output-statements.md#emit-statements) keyword to write them into the output record stream:
|
||||
|
||||
|
|
@ -95,7 +95,7 @@ Also inspired by [AWK](https://en.wikipedia.org/wiki/AWK), the Miller DSL has th
|
|||
|
||||
* `FILENAME` -- the filename the current record came from. Especially useful in things like `mlr ... *.csv`.
|
||||
* `FILENUM` -- similarly, but integer 1,2,3,... rather than filename.
|
||||
* `NF` -- the number of fields in the current record. Note that if you assign `$newcolumn = some value` then `NF` will increment.
|
||||
* `NF` -- the number of fields in the current record. Note that if you assign `$newcolumn = some value`, then `NF` will increment.
|
||||
* `NR` -- starting from 1, counter of how many records processed so far.
|
||||
* `FNR` -- similar, but resets to 1 at the start of each file.
|
||||
|
||||
|
|
@ -130,12 +130,12 @@ GENMD-EOF
|
|||
Note that here we used the `-f` flag to `put` to load our function
|
||||
definition, and also the `-e` flag to add another statement on the command
|
||||
line. (We could have also put `$fact = factorial(NR)` inside
|
||||
`factorial-example.mlr` but that would have made that file less flexible for our
|
||||
`factorial-example.mlr`, but that would have made that file less flexible for our
|
||||
future use.)
|
||||
|
||||
## If-statements, loops, and local variables
|
||||
|
||||
Suppose you want to only compute sums conditionally -- you can use an `if` statement:
|
||||
Suppose you want to compute sums only conditionally -- you can use an `if` statement:
|
||||
|
||||
GENMD-RUN-COMMAND
|
||||
cat if-example.mlr
|
||||
|
|
@ -152,7 +152,7 @@ page](reference-dsl-control-structures.md#for-loops), Miller has a few kinds of
|
|||
for-loops. In addition to the usual 3-part `for (i = 0; i < 10; i += 1)` kind
|
||||
that many programming languages have, Miller also lets you loop over
|
||||
[maps](reference-main-maps.md) and [arrays](reference-main-arrays.md). We
|
||||
haven't encountered maps and arrays yet in this introduction, but for now it
|
||||
haven't encountered maps and arrays yet in this introduction, but for now, it
|
||||
suffices to know that `$*` is a special variable holding the current record as
|
||||
a map:
|
||||
|
||||
|
|
@ -175,14 +175,14 @@ Here we used the local variables `k` and `v`. Now we've seen four kinds of varia
|
|||
* Local variables like `k`
|
||||
* Built-in context variables like `NF` and `NR`
|
||||
|
||||
If you're curious about scope and extent of local variables, you can read more in the [section on variables](reference-dsl-variables.md).
|
||||
If you're curious about the scope and extent of local variables, you can read more in the [section on variables](reference-dsl-variables.md).
|
||||
|
||||
## Arithmetic
|
||||
|
||||
Numbers in Miller's programming language are intended to operate with the principle of least surprise:
|
||||
|
||||
* Internally, numbers are either 64-bit signed integers or double-precision floating-point.
|
||||
* Sums, differences, and products of integers are also integers (so `2*3=6` not `6.0`) -- unless the result of the operation would overflow a 64-bit signed integer in which case the result is automatically converted to float. (If you ever want integer-to-integer arithmetic, use `x .+ y`, `x .* y`, etc.)
|
||||
* Sums, differences, and products of integers are also integers (so `2*3=6` not `6.0`) -- unless the result of the operation would overflow a 64-bit signed integer, in which case the result is automatically converted to float. (If you ever want integer-to-integer arithmetic, use `x .+ y`, `x .* y`, etc.)
|
||||
* Quotients of integers are integers if the division is exact, else floating-point: so `6/2=3` but `7/2=3.5`.
|
||||
|
||||
You can read more about this in the [arithmetic reference](reference-main-arithmetic.md).
|
||||
|
|
@ -197,7 +197,7 @@ see more in the [null-data reference](reference-main-null-data.md) but the
|
|||
basic idea is:
|
||||
|
||||
* Adding a number to absent gives the number back. This means you don't have to put `@sum = 0` in your `begin` blocks.
|
||||
* Any variable which has the absent value is not assigned. This means you don't have to check presence of things from one record to the next.
|
||||
* Any variable that has the absent value is not assigned. This means you don't have to check the presence of things from one record to the next.
|
||||
|
||||
For example, you can sum up all the `$a` values across records without having to check whether they're present or not:
|
||||
|
||||
|
|
|
|||
|
|
@ -722,7 +722,7 @@ Passes through input records with specified fields included/excluded.
|
|||
-r Treat field names as regular expressions. "ab", "a.*b" will
|
||||
match any field name containing the substring "ab" or matching
|
||||
"a.*b", respectively; anchors of the form "^ab$", "^a.*b$" may
|
||||
be used. The -o flag is ignored when -r is present.
|
||||
be used.
|
||||
Examples:
|
||||
mlr cut -f hostname,status
|
||||
mlr cut -x -f hostname,status
|
||||
|
|
|
|||
|
|
@ -24,43 +24,23 @@ TL;DRs: [install](installing-miller.md), [binaries](https://github.com/johnkerl/
|
|||
|
||||
### Performance
|
||||
|
||||
Performance is on par with Miller 5 for simple processing, and is far better than Miller 5 for
|
||||
complex processing chains -- the latter due to improved multicore utilization. CSV I/O is notably
|
||||
improved. See the [Performance benchmarks](#performance-benchmarks) section at the bottom of this
|
||||
page for details.
|
||||
Performance is on par with Miller 5 for simple processing, and is far better than Miller 5 for complex processing chains -- the latter due to improved multicore utilization. CSV I/O is notably improved. See the [Performance benchmarks](#performance-benchmarks) section at the bottom of this page for details.
|
||||
|
||||
### Documentation improvements
|
||||
|
||||
Documentation (what you're reading here) and online help (`mlr --help`) have been completely reworked.
|
||||
|
||||
In the initial release, the focus was convincing users already familiar with
|
||||
`awk`/`grep`/`cut` that Miller was a viable alternative -- but over time it's
|
||||
become clear that many Miller users aren't expert with those tools. The focus
|
||||
has shifted toward a higher quantity of more introductory/accessible material
|
||||
for command-line data processing.
|
||||
In the initial release, the focus was on convincing users already familiar with `awk`, `grep`, and `cut` that Miller was a viable alternative; however, over time, it has become clear that many Miller users aren't experts with those tools. The focus has shifted toward a higher quantity of more introductory/accessible material for command-line data processing.
|
||||
|
||||
Similarly, the FAQ/recipe material has been expanded to include more, and
|
||||
simpler, use-cases including resolved questions from
|
||||
[Miller Issues](https://github.com/johnkerl/miller/issues)
|
||||
and
|
||||
[Miller Discussions](https://github.com/johnkerl/miller/discussions);
|
||||
more complex/niche material has been pushed farther down. The long reference
|
||||
pages have been split up into separate pages. (See also
|
||||
[Structure of these documents](structure-of-these-documents.md).)
|
||||
Similarly, the FAQ/recipe material has been expanded to include more, and simpler, use-cases, including resolved questions from [Miller Issues](https://github.com/johnkerl/miller/issues) and [Miller Discussions](https://github.com/johnkerl/miller/discussions); more complex/niche material has been pushed farther down. The lengthy reference pages have been divided into separate pages. (See also [Structure of these documents](structure-of-these-documents.md).)
|
||||
|
||||
One of the main feedback themes from the 2021 Miller User Survey was that some
|
||||
things should be easier to find. Namely, on each doc page there's now a banner
|
||||
across the top with things that should be one click away from the landing page
|
||||
(or any page): command-line flags, verbs, functions, glossary/acronyms, and a
|
||||
finder for docs by release.
|
||||
One of the main feedback themes from the 2021 Miller User Survey was that some things should be easier to find. Namely, on each doc page, there's now a banner across the top with things that should be one click away from the landing page (or any page): command-line flags, verbs, functions, glossary/acronyms, and a finder for docs by release.
|
||||
|
||||
Since CSV is overwhelmingly the most popular data format for Miller, it is
|
||||
now discussed first, and more examples use CSV.
|
||||
Since CSV is overwhelmingly the most popular data format for Miller, it is now discussed first, and more examples use CSV.
|
||||
|
||||
### Improved Windows experience
|
||||
|
||||
Stronger support for Windows (with or without MSYS2), with a couple of
|
||||
exceptions. See [Miller on Windows](miller-on-windows.md) for more information.
|
||||
Stronger support for Windows (with or without MSYS2), with a couple of exceptions. See [Miller on Windows](miller-on-windows.md) for more information.
|
||||
|
||||
Binaries are reliably available using GitHub Actions: see also [Installation](installing-miller.md).
|
||||
|
||||
|
|
@ -89,9 +69,7 @@ Parse error on token ">" at line 63 column 7.
|
|||
|
||||
### Scripting
|
||||
|
||||
Scripting is now easier -- support for `#!` with `sh`, as always, along with now support for `#!` with `mlr -s`. For
|
||||
Windows, `mlr -s` can also be used. These help reduce backslash-clutter and let you do more while typing less.
|
||||
See the [scripting page](scripting.md).
|
||||
Scripting is now easier -- support for `#!` with `sh`, as always, along with new support for `#!` with `mlr -s`. For Windows, `mlr -s` can also be used. These help reduce backslash clutter and let you do more while typing less. See the [scripting page](scripting.md).
|
||||
|
||||
### REPL
|
||||
|
||||
|
|
@ -143,7 +121,7 @@ the `TZ` environment variable. Please see [DSL datetime/timezone functions](refe
|
|||
|
||||
### In-process support for compressed input
|
||||
|
||||
In addition to `--prepipe gunzip`, you can now use the `--gzin` flag. In fact, if your files end in `.gz` you don't even need to do that -- Miller will autodetect by file extension and automatically uncompress `mlr --csv cat foo.csv.gz`. Similarly for `.z`, `.bz2`, and `.zst` files. Please see the page on [Compressed data](reference-main-compressed-data.md) for more information.
|
||||
In addition to `--prepipe gunzip`, you can now use the `--gzin` flag. In fact, if your files end in `.gz` you don't even need to do that -- Miller will autodetect by file extension and automatically uncompress `mlr --csv cat foo.csv.gz`. Similarly, for `.z`, `.bz2`, and `.zst` files. Please refer to the page on [Compressed Data](reference-main-compressed-data.md) for more information.
|
||||
|
||||
### Support for reading web URLs
|
||||
|
||||
|
|
@ -171,9 +149,7 @@ purple,triangle,false,7,65,80.1405,5.8240
|
|||
|
||||
### Improved JSON / JSON Lines support, and arrays
|
||||
|
||||
Arrays are now supported in Miller's `put`/`filter` programming language, as
|
||||
described in the [Arrays reference](reference-main-arrays.md). (Also, `array` is
|
||||
now a keyword so this is no longer usable as a local-variable or UDF name.)
|
||||
Arrays are now supported in Miller's `put`/`filter` programming language, as described in the [Arrays reference](reference-main-arrays.md). (Also, `array` is now a keyword, so this is no longer usable as a local variable or UDF name.)
|
||||
|
||||
JSON support is improved:
|
||||
|
||||
|
|
@ -196,24 +172,13 @@ See also the [Arrays reference](reference-main-arrays.md) for more information.
|
|||
|
||||
### Improved numeric conversion
|
||||
|
||||
The most central part of Miller 6 is a deep refactor of how data values are parsed
|
||||
from file contents, how types are inferred, and how they're converted back to
|
||||
text into output files.
|
||||
The most central part of Miller 6 is a deep refactor of how data values are parsed from file contents, how types are inferred, and how they're converted back to text into output files.
|
||||
|
||||
This was all initiated by [https://github.com/johnkerl/miller/issues/151](https://github.com/johnkerl/miller/issues/151).
|
||||
|
||||
In Miller 5 and below, all values were stored as strings, then only converted
|
||||
to int/float as-needed, for example when a particular field was referenced in
|
||||
the `stats1` or `put` verbs. This led to awkwardnesses such as the `-S`
|
||||
and `-F` flags for `put` and `filter`.
|
||||
In Miller 5 and below, all values were stored as strings, then only converted to int/float as needed, for example, when a particular field was referenced in the `stats1` or `put` verbs. This led to awkwardnesses such as the `-S` and `-F` flags for `put` and `filter`.
|
||||
|
||||
In Miller 6, things parseable as int/float are treated as such from the moment
|
||||
the input data is read, and these are passed along through the verb chain. All
|
||||
values are typed from when they're read, and their types are passed along.
|
||||
Meanwhile the original string representation of each value is also retained. If
|
||||
a numeric field isn't modified during the processing chain, it's printed out
|
||||
the way it arrived. Also, quoted values in JSON strings are flagged as being
|
||||
strings throughout the processing chain.
|
||||
In Miller 6, values parseable as integers or floating-point numbers are treated as such from the moment the input data is read, and these are passed along through the verb chain. All values are typed from when they're read, and their types are passed along. Meanwhile, the original string representation of each value is also retained. If a numeric field isn't modified during the processing chain, it's printed out the way it arrived. Additionally, quoted values in JSON strings are consistently flagged as strings throughout the processing chain.
|
||||
|
||||
For example (see [https://github.com/johnkerl/miller/issues/178](https://github.com/johnkerl/miller/issues/178)) you can now do
|
||||
|
||||
|
|
@ -242,30 +207,21 @@ For example (see [https://github.com/johnkerl/miller/issues/178](https://github.
|
|||
|
||||
### Deduping of repeated field names
|
||||
|
||||
By default, field names are deduped for all file formats except JSON / JSON Lines. So if you
|
||||
have an input record with `x=8,x=9` then the second field's key is renamed to
|
||||
`x_2` and so on -- the record scans as `x=8,x_2=9`. Use `mlr
|
||||
--no-dedupe-field-names` to suppress this, and have the record be scanned as
|
||||
`x=9`.
|
||||
By default, field names are deduplicated for all file formats except JSON / JSON Lines. So if you have an input record with `x=8,x=9`, then the second field's key is renamed to `x_2` and so on -- the record scans as `x=8,x_2=9`. Use `mlr --no-dedupe-field-names` to suppress this, and have the record be scanned as `x=9`.
|
||||
|
||||
For JSON and JSON Lines, the last duplicated key in an input record is always retained,
|
||||
regardless of `mlr --no-dedupe-field-names`: `{"x":8,"x":9}` scans as if it
|
||||
were `{"x":9}`.
|
||||
For JSON and JSON Lines, the last duplicated key in an input record is always retained, regardless of `mlr --no-dedupe-field-names`: `{"x":8,"x":9}` scans as if it were `{"x":9}`.
|
||||
|
||||
### Regex support for IFS and IPS
|
||||
|
||||
You can now split fields on whitespace when whitespace is a mix of tabs and
|
||||
spaces. As well, you can use regular expressions for the input field-separator
|
||||
and the input pair-separator. Please see the section on
|
||||
[multi-character and regular-expression separators](reference-main-separators.md#multi-character-and-regular-expression-separators).
|
||||
You can now split fields on whitespace when whitespace is a mix of tabs and spaces. As well, you can use regular expressions for the input field-separator and the input pair-separator. Please see the section on [multi-character and regular-expression separators](reference-main-separators.md#multi-character-and-regular-expression-separators).
|
||||
|
||||
In particular, for NIDX format, the default IFS now allows splitting on one or more of space or tab.
|
||||
In particular, for NIDX format, the default `IFS` now allows splitting on one or more of space or tab.
|
||||
|
||||
### Case-folded sorting options
|
||||
|
||||
The [sort](reference-verbs.md#sort) verb now accepts `-c` and `-cr` options for case-folded ascending/descending sort, respetively.
|
||||
The [sort](reference-verbs.md#sort) verb now accepts `-c` and `-cr` options for case-folded ascending/descending sort, respectively.
|
||||
|
||||
### New DSL functions / operators
|
||||
### New DSL functions and operators
|
||||
|
||||
* Higher-order functions [`select`](reference-dsl-builtin-functions.md#select), [`apply`](reference-dsl-builtin-functions.md#apply), [`reduce`](reference-dsl-builtin-functions.md#reduce), [`fold`](reference-dsl-builtin-functions.md#fold), and [`sort`](reference-dsl-builtin-functions.md#sort). See the [sorting page](sorting.md) and the [higher-order-functions page](reference-dsl-higher-order-functions.md) for more information.
|
||||
|
||||
|
|
@ -293,30 +249,30 @@ The following differences are rather technical. If they don't sound familiar to
|
|||
|
||||
### Line endings
|
||||
|
||||
The `--auto` flag is now ignored. Before, if a file had CR/LF (Windows-style) line endings on input (on any platform), it would have the same on output; likewise, LF (Unix-style) line endings. Now, files with CR/LF or LF line endings are processed on any platform, but the output line-ending is for the platform. E.g. reading CR/LF files on Linux will now produce LF output.
|
||||
The `--auto` flag is now ignored. Before, if a file had CR/LF (Windows-style) line endings on input (on any platform), it would have the same on output; likewise, LF (Unix-style) line endings. Now, files with CR/LF or LF line endings are processed on any platform, but the output line ending is for the platform. E.g., reading CR/LF files on Linux will now produce LF output.
|
||||
|
||||
### IFS and IPS as regular expressions
|
||||
|
||||
IFS and IPS can be regular expressions now. Please see the section on [multi-character and regular-expression separators](reference-main-separators.md#multi-character-and-regular-expression-separators).
|
||||
IFS and IPS can now be regular expressions. Please see the section on [multi-character and regular-expression separators](reference-main-separators.md#multi-character-and-regular-expression-separators).
|
||||
|
||||
### JSON and JSON Lines formatting
|
||||
|
||||
* `--jknquoteint` and `--jquoteall` are ignored; they were workarounds for the (now much-improved) type-inference and type-tracking in Miller 6.
|
||||
* `--json-fatal-arrays-on-input`, `--json-map-arrays-on-input`, and `--json-skip-arrays-on-input` are ignored; Miller 6 now supports arrays fully.
|
||||
* See also `mlr help legacy-flags` or the [legacy-flags reference](reference-main-flag-list.md#legacy-flags).
|
||||
* Miller 5 accepted input records either with or without enclosing `[...]`; on output, by default it produced single-line records without outermost `[...]`. Miller 5 let you customize output formatting using `--jvstack` (multi-line records) and `--jlistwrap` (write outermost `[...]`). _Thus, Miller 5's JSON output format, with default flags, was in fact [JSON Lines](file-formats.md#json-lines) all along._
|
||||
* Miller 5 accepted input records either with or without enclosing `[...]`; on output, by default, it produced single-line records without outermost `[...]`. Miller 5 let you customize output formatting using `--jvstack` (multi-line records) and `--jlistwrap` (write outermost `[...]`). _Thus, Miller 5's JSON output format, with default flags, was in fact [JSON Lines](file-formats.md#json-lines) all along._
|
||||
* In Miller 6, [JSON Lines](file-formats.md#json-lines) is acknowledged explicitly.
|
||||
* On input, your records are accepted whether or not they have outermost `[...]`, and regardless of line breaks, whether the specified input format is JSON or JSON Lines. (This is similar to [jq](https://stedolan.github.io/jq/).)
|
||||
* With `--ojson`, output records are written multiline (pretty-printed), with outermost `[...]`.
|
||||
* With `--ojsonl`, output records are written single-line, without outermost `[...]`.
|
||||
* This makes `--jvstack` and `--jlistwrap` unnecessary. However, if you want outermost `[...]` with single-line records, you can use `--ojson --no-jvstack`.
|
||||
* Miller 5 tolerated trailing commas, which are not compliant with the JSON specification: for example, `{"x":1,"y":2,}`. Miller 6 uses a JSON parser which is compliant with the JSON specification and does not accept trailing commas.
|
||||
* Miller 5 tolerated trailing commas, which are not compliant with the JSON specification: for example, `{"x":1,"y":2,}`. Miller 6 uses a JSON parser that is compliant with the JSON specification and does not accept trailing commas.
|
||||
|
||||
### Type-inference
|
||||
|
||||
* The `-S` and `-F` flags to `mlr put` and `mlr filter` are ignored, since type-inference is no longer done in `mlr put` and `mlr filter`, but rather, when records are first read. You can use `mlr -S` and `mlr -A`, respectively, instead to control type-inference within the record-readers.
|
||||
* Octal numbers like `0123` and `07` are type-inferred as string. Use `mlr -O` to infer them as octal integers. Note that `08` and `09` will then infer as decimal integers.
|
||||
* Any numbers prefix with `0o`, e.g. `0o377`, are already treated as octal regardless of `mlr -O` -- `mlr -O` only affects how leading-zero integers are handled.
|
||||
* Any numbers prefixed with `0o`, e.g. `0o377`, are already treated as octal, regardless of `mlr -O` -- `mlr -O` only affects how leading-zero integers are handled.
|
||||
* See also the [miscellaneous-flags reference](reference-main-flag-list.md#miscellaneous-flags).
|
||||
|
||||
### Emit statements
|
||||
|
|
@ -341,13 +297,12 @@ This works in Miller 6 (and worked in Miller 5 as well) and is supported:
|
|||
input=1
|
||||
</pre>
|
||||
|
||||
Please see the [section on emit statements](reference-dsl-output-statements.md#emit1-and-emitemitpemitf)
|
||||
for more information.
|
||||
Please see the [section on emit statements](reference-dsl-output-statements.md#emit1-and-emitemitpemitf) for more information.
|
||||
|
||||
## Developer-specific aspects
|
||||
|
||||
* Miller has been ported from C to Go. Developer notes: [https://github.com/johnkerl/miller/blob/main/README-dev.md](https://github.com/johnkerl/miller/blob/main/README-dev.md).
|
||||
* Regression testing has been completely reworked, including regression-testing now running fully on Windows (alongside Linux and Mac) [on each GitHub commit](https://github.com/johnkerl/miller/actions).
|
||||
* Regression testing has been completely reworked, including regression-testing now running fully on Windows (alongside Linux and Mac) [on each GitHub commit](https://github.com/johnkerl/miller/actions).
|
||||
|
||||
## Performance benchmarks
|
||||
|
||||
|
|
|
|||
|
|
@ -8,43 +8,23 @@ TL;DRs: [install](installing-miller.md), [binaries](https://github.com/johnkerl/
|
|||
|
||||
### Performance
|
||||
|
||||
Performance is on par with Miller 5 for simple processing, and is far better than Miller 5 for
|
||||
complex processing chains -- the latter due to improved multicore utilization. CSV I/O is notably
|
||||
improved. See the [Performance benchmarks](#performance-benchmarks) section at the bottom of this
|
||||
page for details.
|
||||
Performance is on par with Miller 5 for simple processing, and is far better than Miller 5 for complex processing chains -- the latter due to improved multicore utilization. CSV I/O is notably improved. See the [Performance benchmarks](#performance-benchmarks) section at the bottom of this page for details.
|
||||
|
||||
### Documentation improvements
|
||||
|
||||
Documentation (what you're reading here) and online help (`mlr --help`) have been completely reworked.
|
||||
|
||||
In the initial release, the focus was convincing users already familiar with
|
||||
`awk`/`grep`/`cut` that Miller was a viable alternative -- but over time it's
|
||||
become clear that many Miller users aren't expert with those tools. The focus
|
||||
has shifted toward a higher quantity of more introductory/accessible material
|
||||
for command-line data processing.
|
||||
In the initial release, the focus was on convincing users already familiar with `awk`, `grep`, and `cut` that Miller was a viable alternative; however, over time, it has become clear that many Miller users aren't experts with those tools. The focus has shifted toward a higher quantity of more introductory/accessible material for command-line data processing.
|
||||
|
||||
Similarly, the FAQ/recipe material has been expanded to include more, and
|
||||
simpler, use-cases including resolved questions from
|
||||
[Miller Issues](https://github.com/johnkerl/miller/issues)
|
||||
and
|
||||
[Miller Discussions](https://github.com/johnkerl/miller/discussions);
|
||||
more complex/niche material has been pushed farther down. The long reference
|
||||
pages have been split up into separate pages. (See also
|
||||
[Structure of these documents](structure-of-these-documents.md).)
|
||||
Similarly, the FAQ/recipe material has been expanded to include more, and simpler, use-cases, including resolved questions from [Miller Issues](https://github.com/johnkerl/miller/issues) and [Miller Discussions](https://github.com/johnkerl/miller/discussions); more complex/niche material has been pushed farther down. The lengthy reference pages have been divided into separate pages. (See also [Structure of these documents](structure-of-these-documents.md).)
|
||||
|
||||
One of the main feedback themes from the 2021 Miller User Survey was that some
|
||||
things should be easier to find. Namely, on each doc page there's now a banner
|
||||
across the top with things that should be one click away from the landing page
|
||||
(or any page): command-line flags, verbs, functions, glossary/acronyms, and a
|
||||
finder for docs by release.
|
||||
One of the main feedback themes from the 2021 Miller User Survey was that some things should be easier to find. Namely, on each doc page, there's now a banner across the top with things that should be one click away from the landing page (or any page): command-line flags, verbs, functions, glossary/acronyms, and a finder for docs by release.
|
||||
|
||||
Since CSV is overwhelmingly the most popular data format for Miller, it is
|
||||
now discussed first, and more examples use CSV.
|
||||
Since CSV is overwhelmingly the most popular data format for Miller, it is now discussed first, and more examples use CSV.
|
||||
|
||||
### Improved Windows experience
|
||||
|
||||
Stronger support for Windows (with or without MSYS2), with a couple of
|
||||
exceptions. See [Miller on Windows](miller-on-windows.md) for more information.
|
||||
Stronger support for Windows (with or without MSYS2), with a couple of exceptions. See [Miller on Windows](miller-on-windows.md) for more information.
|
||||
|
||||
Binaries are reliably available using GitHub Actions: see also [Installation](installing-miller.md).
|
||||
|
||||
|
|
@ -73,9 +53,7 @@ GENMD-EOF
|
|||
|
||||
### Scripting
|
||||
|
||||
Scripting is now easier -- support for `#!` with `sh`, as always, along with now support for `#!` with `mlr -s`. For
|
||||
Windows, `mlr -s` can also be used. These help reduce backslash-clutter and let you do more while typing less.
|
||||
See the [scripting page](scripting.md).
|
||||
Scripting is now easier -- support for `#!` with `sh`, as always, along with new support for `#!` with `mlr -s`. For Windows, `mlr -s` can also be used. These help reduce backslash clutter and let you do more while typing less. See the [scripting page](scripting.md).
|
||||
|
||||
### REPL
|
||||
|
||||
|
|
@ -125,7 +103,7 @@ the `TZ` environment variable. Please see [DSL datetime/timezone functions](refe
|
|||
|
||||
### In-process support for compressed input
|
||||
|
||||
In addition to `--prepipe gunzip`, you can now use the `--gzin` flag. In fact, if your files end in `.gz` you don't even need to do that -- Miller will autodetect by file extension and automatically uncompress `mlr --csv cat foo.csv.gz`. Similarly for `.z`, `.bz2`, and `.zst` files. Please see the page on [Compressed data](reference-main-compressed-data.md) for more information.
|
||||
In addition to `--prepipe gunzip`, you can now use the `--gzin` flag. In fact, if your files end in `.gz` you don't even need to do that -- Miller will autodetect by file extension and automatically uncompress `mlr --csv cat foo.csv.gz`. The same applies to `.z`, `.bz2`, and `.zst` files. Please refer to the page on [Compressed Data](reference-main-compressed-data.md) for more information.
|
||||
|
||||
### Support for reading web URLs
|
||||
|
||||
|
|
@ -140,9 +118,7 @@ GENMD-EOF
|
|||
|
||||
### Improved JSON / JSON Lines support, and arrays
|
||||
|
||||
Arrays are now supported in Miller's `put`/`filter` programming language, as
|
||||
described in the [Arrays reference](reference-main-arrays.md). (Also, `array` is
|
||||
now a keyword so this is no longer usable as a local-variable or UDF name.)
|
||||
Arrays are now supported in Miller's `put`/`filter` programming language, as described in the [Arrays reference](reference-main-arrays.md). (Also, `array` is now a keyword, so this is no longer usable as a local variable or UDF name.)
|
||||
|
||||
JSON support is improved:
|
||||
|
||||
|
|
@ -165,24 +141,13 @@ See also the [Arrays reference](reference-main-arrays.md) for more information.
|
|||
|
||||
### Improved numeric conversion
|
||||
|
||||
The most central part of Miller 6 is a deep refactor of how data values are parsed
|
||||
from file contents, how types are inferred, and how they're converted back to
|
||||
text into output files.
|
||||
The most central part of Miller 6 is a deep refactor of how data values are parsed from file contents, how types are inferred, and how they're converted back to text into output files.
|
||||
|
||||
This was all initiated by [https://github.com/johnkerl/miller/issues/151](https://github.com/johnkerl/miller/issues/151).
|
||||
|
||||
In Miller 5 and below, all values were stored as strings, then only converted
|
||||
to int/float as-needed, for example when a particular field was referenced in
|
||||
the `stats1` or `put` verbs. This led to awkwardnesses such as the `-S`
|
||||
and `-F` flags for `put` and `filter`.
|
||||
In Miller 5 and below, all values were stored as strings, then only converted to int/float as needed, for example, when a particular field was referenced in the `stats1` or `put` verbs. This led to awkwardnesses such as the `-S` and `-F` flags for `put` and `filter`.
|
||||
|
||||
In Miller 6, things parseable as int/float are treated as such from the moment
|
||||
the input data is read, and these are passed along through the verb chain. All
|
||||
values are typed from when they're read, and their types are passed along.
|
||||
Meanwhile the original string representation of each value is also retained. If
|
||||
a numeric field isn't modified during the processing chain, it's printed out
|
||||
the way it arrived. Also, quoted values in JSON strings are flagged as being
|
||||
strings throughout the processing chain.
|
||||
In Miller 6, values parseable as integers or floating-point numbers are treated as such from the moment the input data is read, and these are passed along through the verb chain. All values are typed from when they're read, and their types are passed along. Meanwhile, the original string representation of each value is also retained. If a numeric field isn't modified during the processing chain, it's printed out the way it arrived. Additionally, quoted values in JSON strings are consistently flagged as strings throughout the processing chain.
|
||||
|
||||
For example (see [https://github.com/johnkerl/miller/issues/178](https://github.com/johnkerl/miller/issues/178)) you can now do
|
||||
|
||||
|
|
@ -196,30 +161,21 @@ GENMD-EOF
|
|||
|
||||
### Deduping of repeated field names
|
||||
|
||||
By default, field names are deduped for all file formats except JSON / JSON Lines. So if you
|
||||
have an input record with `x=8,x=9` then the second field's key is renamed to
|
||||
`x_2` and so on -- the record scans as `x=8,x_2=9`. Use `mlr
|
||||
--no-dedupe-field-names` to suppress this, and have the record be scanned as
|
||||
`x=9`.
|
||||
By default, field names are deduplicated for all file formats except JSON / JSON Lines. So if you have an input record with `x=8,x=9`, then the second field's key is renamed to `x_2` and so on -- the record scans as `x=8,x_2=9`. Use `mlr --no-dedupe-field-names` to suppress this, and have the record be scanned as `x=9`.
|
||||
|
||||
For JSON and JSON Lines, the last duplicated key in an input record is always retained,
|
||||
regardless of `mlr --no-dedupe-field-names`: `{"x":8,"x":9}` scans as if it
|
||||
were `{"x":9}`.
|
||||
For JSON and JSON Lines, the last duplicated key in an input record is always retained, regardless of `mlr --no-dedupe-field-names`: `{"x":8,"x":9}` scans as if it were `{"x":9}`.
|
||||
|
||||
### Regex support for IFS and IPS
|
||||
|
||||
You can now split fields on whitespace when whitespace is a mix of tabs and
|
||||
spaces. As well, you can use regular expressions for the input field-separator
|
||||
and the input pair-separator. Please see the section on
|
||||
[multi-character and regular-expression separators](reference-main-separators.md#multi-character-and-regular-expression-separators).
|
||||
You can now split fields on whitespace when whitespace is a mix of tabs and spaces. As well, you can use regular expressions for the input field-separator and the input pair-separator. Please see the section on [multi-character and regular-expression separators](reference-main-separators.md#multi-character-and-regular-expression-separators).
|
||||
|
||||
In particular, for NIDX format, the default IFS now allows splitting on one or more of space or tab.
|
||||
In particular, for NIDX format, the default `IFS` now allows splitting on one or more of space or tab.
|
||||
|
||||
### Case-folded sorting options
|
||||
|
||||
The [sort](reference-verbs.md#sort) verb now accepts `-c` and `-cr` options for case-folded ascending/descending sort, respetively.
|
||||
The [sort](reference-verbs.md#sort) verb now accepts `-c` and `-cr` options for case-folded ascending/descending sort, respectively.
|
||||
|
||||
### New DSL functions / operators
|
||||
### New DSL functions and operators
|
||||
|
||||
* Higher-order functions [`select`](reference-dsl-builtin-functions.md#select), [`apply`](reference-dsl-builtin-functions.md#apply), [`reduce`](reference-dsl-builtin-functions.md#reduce), [`fold`](reference-dsl-builtin-functions.md#fold), and [`sort`](reference-dsl-builtin-functions.md#sort). See the [sorting page](sorting.md) and the [higher-order-functions page](reference-dsl-higher-order-functions.md) for more information.
|
||||
|
||||
|
|
@ -247,30 +203,30 @@ The following differences are rather technical. If they don't sound familiar to
|
|||
|
||||
### Line endings
|
||||
|
||||
The `--auto` flag is now ignored. Before, if a file had CR/LF (Windows-style) line endings on input (on any platform), it would have the same on output; likewise, LF (Unix-style) line endings. Now, files with CR/LF or LF line endings are processed on any platform, but the output line-ending is for the platform. E.g. reading CR/LF files on Linux will now produce LF output.
|
||||
The `--auto` flag is now ignored. Before, if a file had CR/LF (Windows-style) line endings on input (on any platform), it would have the same on output; likewise, LF (Unix-style) line endings. Now, files with CR/LF or LF line endings are processed on any platform, but the output line ending is for the platform. E.g., reading CR/LF files on Linux will now produce LF output.
|
||||
|
||||
### IFS and IPS as regular expressions
|
||||
|
||||
IFS and IPS can be regular expressions now. Please see the section on [multi-character and regular-expression separators](reference-main-separators.md#multi-character-and-regular-expression-separators).
|
||||
IFS and IPS can now be regular expressions. Please see the section on [multi-character and regular-expression separators](reference-main-separators.md#multi-character-and-regular-expression-separators).
|
||||
|
||||
### JSON and JSON Lines formatting
|
||||
|
||||
* `--jknquoteint` and `--jquoteall` are ignored; they were workarounds for the (now much-improved) type-inference and type-tracking in Miller 6.
|
||||
* `--json-fatal-arrays-on-input`, `--json-map-arrays-on-input`, and `--json-skip-arrays-on-input` are ignored; Miller 6 now supports arrays fully.
|
||||
* See also `mlr help legacy-flags` or the [legacy-flags reference](reference-main-flag-list.md#legacy-flags).
|
||||
* Miller 5 accepted input records either with or without enclosing `[...]`; on output, by default it produced single-line records without outermost `[...]`. Miller 5 let you customize output formatting using `--jvstack` (multi-line records) and `--jlistwrap` (write outermost `[...]`). _Thus, Miller 5's JSON output format, with default flags, was in fact [JSON Lines](file-formats.md#json-lines) all along._
|
||||
* Miller 5 accepted input records either with or without enclosing `[...]`; on output, by default, it produced single-line records without outermost `[...]`. Miller 5 let you customize output formatting using `--jvstack` (multi-line records) and `--jlistwrap` (write outermost `[...]`). _Thus, Miller 5's JSON output format, with default flags, was in fact [JSON Lines](file-formats.md#json-lines) all along._
|
||||
* In Miller 6, [JSON Lines](file-formats.md#json-lines) is acknowledged explicitly.
|
||||
* On input, your records are accepted whether or not they have outermost `[...]`, and regardless of line breaks, whether the specified input format is JSON or JSON Lines. (This is similar to [jq](https://stedolan.github.io/jq/).)
|
||||
* With `--ojson`, output records are written multiline (pretty-printed), with outermost `[...]`.
|
||||
* With `--ojsonl`, output records are written single-line, without outermost `[...]`.
|
||||
* This makes `--jvstack` and `--jlistwrap` unnecessary. However, if you want outermost `[...]` with single-line records, you can use `--ojson --no-jvstack`.
|
||||
* Miller 5 tolerated trailing commas, which are not compliant with the JSON specification: for example, `{"x":1,"y":2,}`. Miller 6 uses a JSON parser which is compliant with the JSON specification and does not accept trailing commas.
|
||||
* Miller 5 tolerated trailing commas, which are not compliant with the JSON specification: for example, `{"x":1,"y":2,}`. Miller 6 uses a JSON parser that is compliant with the JSON specification and does not accept trailing commas.
|
||||
|
||||
### Type-inference
|
||||
|
||||
* The `-S` and `-F` flags to `mlr put` and `mlr filter` are ignored, since type-inference is no longer done in `mlr put` and `mlr filter`, but rather, when records are first read. You can use `mlr -S` and `mlr -A`, respectively, instead to control type-inference within the record-readers.
|
||||
* Octal numbers like `0123` and `07` are type-inferred as string. Use `mlr -O` to infer them as octal integers. Note that `08` and `09` will then infer as decimal integers.
|
||||
* Any numbers prefix with `0o`, e.g. `0o377`, are already treated as octal regardless of `mlr -O` -- `mlr -O` only affects how leading-zero integers are handled.
|
||||
* Any numbers prefixed with `0o`, e.g. `0o377`, are already treated as octal, regardless of `mlr -O` -- `mlr -O` only affects how leading-zero integers are handled.
|
||||
* See also the [miscellaneous-flags reference](reference-main-flag-list.md#miscellaneous-flags).
|
||||
|
||||
### Emit statements
|
||||
|
|
@ -290,13 +246,12 @@ GENMD-RUN-COMMAND
|
|||
mlr -n put 'end {@input={"a":1}; emit1 {"input":@input["a"]}}'
|
||||
GENMD-EOF
|
||||
|
||||
Please see the [section on emit statements](reference-dsl-output-statements.md#emit1-and-emitemitpemitf)
|
||||
for more information.
|
||||
Please see the [section on emit statements](reference-dsl-output-statements.md#emit1-and-emitemitpemitf) for more information.
|
||||
|
||||
## Developer-specific aspects
|
||||
|
||||
* Miller has been ported from C to Go. Developer notes: [https://github.com/johnkerl/miller/blob/main/README-dev.md](https://github.com/johnkerl/miller/blob/main/README-dev.md).
|
||||
* Regression testing has been completely reworked, including regression-testing now running fully on Windows (alongside Linux and Mac) [on each GitHub commit](https://github.com/johnkerl/miller/actions).
|
||||
* Regression testing has been completely reworked, including regression-testing now running fully on Windows (alongside Linux and Mac) [on each GitHub commit](https://github.com/johnkerl/miller/actions).
|
||||
|
||||
## Performance benchmarks
|
||||
|
||||
|
|
|
|||
|
|
@ -55,6 +55,7 @@ Flags:
|
|||
mlr help comments-in-data-flags
|
||||
mlr help compressed-data-flags
|
||||
mlr help csv/tsv-only-flags
|
||||
mlr help dkvp-only-flags
|
||||
mlr help file-format-flags
|
||||
mlr help flatten-unflatten-flags
|
||||
mlr help format-conversion-keystroke-saver-flags
|
||||
|
|
@ -230,6 +231,7 @@ Options:
|
|||
-nf {comma-separated field names} Same as -n
|
||||
-nr {comma-separated field names} Numerical descending; nulls sort first
|
||||
-t {comma-separated field names} Natural ascending
|
||||
-b Move sort fields to start of record, as in reorder -b
|
||||
-tr|-rt {comma-separated field names} Natural descending
|
||||
-h|--help Show this message.
|
||||
|
||||
|
|
|
|||
|
|
@ -16,7 +16,7 @@ Quick links:
|
|||
</div>
|
||||
# How original is Miller?
|
||||
|
||||
It isn't. Miller is one of many, many participants in the online-analytical-processing culture. Other key participants include `awk`, SQL, spreadsheets, etc. etc. etc. Far from being an original concept, Miller explicitly strives to imitate several existing tools:
|
||||
It isn't. Miller is just one of many participants in the online analytical processing culture. Other key participants include `awk`, SQL, spreadsheets, etc. etc. etc. Far from being an original concept, Miller explicitly strives to imitate several existing tools:
|
||||
|
||||
**The Unix toolkit**: Intentional similarities as described in [Unix-toolkit Context](unix-toolkit-context.md).
|
||||
|
||||
|
|
@ -26,7 +26,7 @@ Recipes abound for command-line data analysis using the Unix toolkit. Here are j
|
|||
* [http://www.gregreda.com/2013/07/15/unix-commands-for-data-science](http://www.gregreda.com/2013/07/15/unix-commands-for-data-science)
|
||||
* [https://github.com/dbohdan/structured-text-tools](https://github.com/dbohdan/structured-text-tools)
|
||||
|
||||
**RecordStream**: Miller owes particular inspiration to [RecordStream](https://github.com/benbernard/RecordStream). The key difference is that RecordStream is a Perl-based tool for manipulating JSON (including requiring it to separately manipulate other formats such as CSV into and out of JSON), while Miller is fast Go which handles its formats natively. The similarities include the `sort`, `stats1` (analog of RecordStream's `collate`), and `delta` operations, as well as `filter` and `put`, and pretty-print formatting.
|
||||
**RecordStream**: Miller owes particular inspiration to [RecordStream](https://github.com/benbernard/RecordStream). The key difference is that RecordStream is a Perl-based tool for manipulating JSON (including requiring it to separately manipulate other formats such as CSV into and out of JSON), while Miller is a fast Go tool that handles its formats natively. The similarities include the `sort`, `stats1` (analogous to RecordStream's `collate`), and `delta` operations, as well as `filter` and `put`, and the use of pretty-print formatting.
|
||||
|
||||
**stats_m**: A third source of lineage is my Python [stats_m](https://github.com/johnkerl/scripts-math/tree/master/stats) module. This includes simple single-pass algorithms which form Miller's `stats1` and `stats2` subcommands.
|
||||
|
||||
|
|
@ -35,21 +35,21 @@ Recipes abound for command-line data analysis using the Unix toolkit. Here are j
|
|||
**Added value**: Miller's added values include:
|
||||
|
||||
* Name-indexing, compared to the Unix toolkit's positional indexing.
|
||||
* Raw speed, compared to `awk`, RecordStream, `stats_m`, or various other kinds of Python/Ruby/etc. scripts one can easily create.
|
||||
* Raw speed, compared to `awk`, RecordStream, `stats_m`, or various other kinds of Python/Ruby/etc. scripts that one can easily create.
|
||||
* Compact keystroking for many common tasks, with a decent amount of flexibility.
|
||||
* Ability to handle text files on the Unix pipe, without need for creating database tables, compared to SQL databases.
|
||||
* Ability to handle text files on the Unix pipe, without the need for creating database tables, compared to SQL databases.
|
||||
* Various file formats, and on-the-fly format conversion.
|
||||
|
||||
**jq**: Miller does for name-indexed text what [jq](https://stedolan.github.io/jq/) does for JSON. If you're not already familiar with `jq`, please check it out!
|
||||
|
||||
**What about similar tools?**
|
||||
|
||||
Here's a comprehensive list: [https://github.com/dbohdan/structured-text-tools](https://github.com/dbohdan/structured-text-tools). Last I knew it doesn't mention [rows](https://github.com/turicas/rows) so here's a plug for that as well. As it turns out, I learned about most of these after writing Miller.
|
||||
Here's a comprehensive list: [https://github.com/dbohdan/structured-text-tools](https://github.com/dbohdan/structured-text-tools). Last I knew, it doesn't mention [rows](https://github.com/turicas/rows) so here's a plug for that as well. As it turns out, I learned about most of these after writing Miller.
|
||||
|
||||
**What about DOTADIW?** One of the key points of the [Unix philosophy](http://en.wikipedia.org/wiki/Unix_philosophy) is that a tool should do one thing and do it well. Hence `sort` and `cut` do just one thing. Why does Miller put `awk`-like processing, a few SQL-like operations, and statistical reduction all into one tool? This is a fair question. First note that many standard tools, such as `awk` and `perl`, do quite a few things -- as does `jq`. But I could have pushed for putting format awareness and name-indexing options into `cut`, `awk`, and so on (so you could do `cut -f hostname,uptime` or `awk '{sum += $x*$y}END{print sum}'`). Patching `cut`, `sort`, etc. on multiple operating systems is a non-starter in terms of uptake. Moreover, it makes sense for me to have Miller be a tool which collects together format-aware record-stream processing into one place, with good reuse of Miller-internal library code for its various features.
|
||||
**What about DOTADIW?** One of the key points of the [Unix philosophy](http://en.wikipedia.org/wiki/Unix_philosophy) is that a tool should do one thing and do it well. Hence, `sort` and `cut` do just one thing. Why does Miller put `awk`-like processing, a few SQL-like operations, and statistical reduction all into one tool? This is a fair question. First, note that many standard tools, such as `awk` and `perl`, do quite a few things -- as does `jq`. But I could have pushed for putting format awareness and name-indexing options into `cut`, `awk`, and so on (so you could do `cut -f hostname,uptime` or `awk '{sum += $x*$y}END{print sum}'`). Patching `cut`, `sort`, etc., on multiple operating systems is a non-starter in terms of uptake. Moreover, it makes sense for me to have Miller be a tool that collects together format-aware record-stream processing into one place, with good reuse of Miller's internal library code for its various features.
|
||||
|
||||
**Why not use Perl/Python/Ruby etc.?** Maybe you should. With those tools you'll get far more expressive power, and sufficiently quick turnaround time for small-to-medium-sized data. Using Miller you'll get something less than a complete programming language, but which is fast, with moderate amounts of flexibility and much less keystroking.
|
||||
**Why not use Perl/Python/Ruby, etc.?** Maybe you should. With those tools, you'll gain significantly more expressive power and a sufficiently quick turnaround time for small to medium-sized datasets. Using Miller, you'll get something less than a complete programming language, but which is fast, with moderate amounts of flexibility and much less keystroking.
|
||||
|
||||
When I was first developing Miller I made a survey of several languages. Using low-level implementation languages like C, Go, Rust, and Nim, I'd need to create my own domain-specific language (DSL) which would always be less featured than a full programming language, but I'd get better performance. Using high-level interpreted languages such as Perl/Python/Ruby I'd get the language's `eval` for free and I wouldn't need a DSL; Miller would have mainly been a set of format-specific I/O hooks. If I'd gotten good enough performance from the latter I'd have done it without question and Miller would be far more flexible. But low-level languages win the performance criteria by a landslide so we have Miller in Go with a custom DSL.
|
||||
When I was first developing Miller, I made a survey of several languages. Using low-level implementation languages like C, Go, Rust, and Nim, I'd need to create my own domain-specific language (DSL), which would always be less featured than a full programming language, but I'd get better performance. Using high-level interpreted languages such as Perl/Python/Ruby, I'd get the language's `eval` for free and I wouldn't need a DSL; Miller would have mainly been a set of format-specific I/O hooks. If I'd gotten good enough performance from the latter, I'd have done it without question, and Miller would be far more flexible. But low-level languages win the performance criteria by a landslide, so we have Miller in Go with a custom DSL.
|
||||
|
||||
**No, really, why one more command-line data-manipulation tool?** I wrote Miller because I was frustrated with tools like `grep`, `sed`, and so on being *line-aware* without being *format-aware*. The single most poignant example I can think of is seeing people grep data lines out of their CSV files and sadly losing their header lines. While some lighter-than-SQL processing is very nice to have, at core I wanted the format-awareness of [RecordStream](https://github.com/benbernard/RecordStream) combined with the raw speed of the Unix toolkit. Miller does precisely that.
|
||||
**No, really, why one more command-line data-manipulation tool?** I wrote Miller because I was frustrated with tools like `grep`, `sed`, and so on being *line-aware* without being *format-aware*. The single most poignant example I can think of is seeing people grep data lines from their CSV files and sadly losing their header lines. While some lighter-than-SQL processing is very nice to have, at core I wanted the format-awareness of [RecordStream](https://github.com/benbernard/RecordStream) combined with the raw speed of the Unix toolkit. Miller does precisely that.
|
||||
|
|
|
|||
|
|
@ -1,6 +1,6 @@
|
|||
# How original is Miller?
|
||||
|
||||
It isn't. Miller is one of many, many participants in the online-analytical-processing culture. Other key participants include `awk`, SQL, spreadsheets, etc. etc. etc. Far from being an original concept, Miller explicitly strives to imitate several existing tools:
|
||||
It isn't. Miller is just one of many participants in the online analytical processing culture. Other key participants include `awk`, SQL, spreadsheets, etc. etc. etc. Far from being an original concept, Miller explicitly strives to imitate several existing tools:
|
||||
|
||||
**The Unix toolkit**: Intentional similarities as described in [Unix-toolkit Context](unix-toolkit-context.md).
|
||||
|
||||
|
|
@ -10,7 +10,7 @@ Recipes abound for command-line data analysis using the Unix toolkit. Here are j
|
|||
* [http://www.gregreda.com/2013/07/15/unix-commands-for-data-science](http://www.gregreda.com/2013/07/15/unix-commands-for-data-science)
|
||||
* [https://github.com/dbohdan/structured-text-tools](https://github.com/dbohdan/structured-text-tools)
|
||||
|
||||
**RecordStream**: Miller owes particular inspiration to [RecordStream](https://github.com/benbernard/RecordStream). The key difference is that RecordStream is a Perl-based tool for manipulating JSON (including requiring it to separately manipulate other formats such as CSV into and out of JSON), while Miller is fast Go which handles its formats natively. The similarities include the `sort`, `stats1` (analog of RecordStream's `collate`), and `delta` operations, as well as `filter` and `put`, and pretty-print formatting.
|
||||
**RecordStream**: Miller owes particular inspiration to [RecordStream](https://github.com/benbernard/RecordStream). The key difference is that RecordStream is a Perl-based tool for manipulating JSON (including requiring it to separately manipulate other formats such as CSV into and out of JSON), while Miller is a fast Go tool that handles its formats natively. The similarities include the `sort`, `stats1` (analogous to RecordStream's `collate`), and `delta` operations, as well as `filter` and `put`, and the use of pretty-print formatting.
|
||||
|
||||
**stats_m**: A third source of lineage is my Python [stats_m](https://github.com/johnkerl/scripts-math/tree/master/stats) module. This includes simple single-pass algorithms which form Miller's `stats1` and `stats2` subcommands.
|
||||
|
||||
|
|
@ -19,21 +19,21 @@ Recipes abound for command-line data analysis using the Unix toolkit. Here are j
|
|||
**Added value**: Miller's added values include:
|
||||
|
||||
* Name-indexing, compared to the Unix toolkit's positional indexing.
|
||||
* Raw speed, compared to `awk`, RecordStream, `stats_m`, or various other kinds of Python/Ruby/etc. scripts one can easily create.
|
||||
* Raw speed, compared to `awk`, RecordStream, `stats_m`, or various other kinds of Python/Ruby/etc. scripts that one can easily create.
|
||||
* Compact keystroking for many common tasks, with a decent amount of flexibility.
|
||||
* Ability to handle text files on the Unix pipe, without need for creating database tables, compared to SQL databases.
|
||||
* Ability to handle text files on the Unix pipe, without the need for creating database tables, compared to SQL databases.
|
||||
* Various file formats, and on-the-fly format conversion.
|
||||
|
||||
**jq**: Miller does for name-indexed text what [jq](https://stedolan.github.io/jq/) does for JSON. If you're not already familiar with `jq`, please check it out!
|
||||
|
||||
**What about similar tools?**
|
||||
|
||||
Here's a comprehensive list: [https://github.com/dbohdan/structured-text-tools](https://github.com/dbohdan/structured-text-tools). Last I knew it doesn't mention [rows](https://github.com/turicas/rows) so here's a plug for that as well. As it turns out, I learned about most of these after writing Miller.
|
||||
Here's a comprehensive list: [https://github.com/dbohdan/structured-text-tools](https://github.com/dbohdan/structured-text-tools). Last I checked, it didn't mention [rows](https://github.com/turicas/rows), so here's a plug for that as well. As it turns out, I learned about most of these after writing Miller.
|
||||
|
||||
**What about DOTADIW?** One of the key points of the [Unix philosophy](http://en.wikipedia.org/wiki/Unix_philosophy) is that a tool should do one thing and do it well. Hence `sort` and `cut` do just one thing. Why does Miller put `awk`-like processing, a few SQL-like operations, and statistical reduction all into one tool? This is a fair question. First note that many standard tools, such as `awk` and `perl`, do quite a few things -- as does `jq`. But I could have pushed for putting format awareness and name-indexing options into `cut`, `awk`, and so on (so you could do `cut -f hostname,uptime` or `awk '{sum += $x*$y}END{print sum}'`). Patching `cut`, `sort`, etc. on multiple operating systems is a non-starter in terms of uptake. Moreover, it makes sense for me to have Miller be a tool which collects together format-aware record-stream processing into one place, with good reuse of Miller-internal library code for its various features.
|
||||
**What about DOTADIW?** One of the key points of the [Unix philosophy](http://en.wikipedia.org/wiki/Unix_philosophy) is that a tool should do one thing and do it well. Hence, `sort` and `cut` do just one thing. Why does Miller put `awk`-like processing, a few SQL-like operations, and statistical reduction all into one tool? This is a fair question. First, note that many standard tools, such as `awk` and `perl`, do quite a few things -- as does `jq`. But I could have pushed for putting format awareness and name-indexing options into `cut`, `awk`, and so on (so you could do `cut -f hostname,uptime` or `awk '{sum += $x*$y}END{print sum}'`). Patching `cut`, `sort`, etc., on multiple operating systems is a non-starter in terms of uptake. Moreover, it makes sense for me to have Miller be a tool that collects together format-aware record-stream processing into one place, with good reuse of Miller's internal library code for its various features.
|
||||
|
||||
**Why not use Perl/Python/Ruby etc.?** Maybe you should. With those tools you'll get far more expressive power, and sufficiently quick turnaround time for small-to-medium-sized data. Using Miller you'll get something less than a complete programming language, but which is fast, with moderate amounts of flexibility and much less keystroking.
|
||||
**Why not use Perl/Python/Ruby, etc.?** Maybe you should. With those tools, you'll gain significantly more expressive power and a sufficiently quick turnaround time for small to medium-sized datasets. Using Miller, you'll get something less than a complete programming language, but which is fast, with moderate amounts of flexibility and much less keystroking.
|
||||
|
||||
When I was first developing Miller I made a survey of several languages. Using low-level implementation languages like C, Go, Rust, and Nim, I'd need to create my own domain-specific language (DSL) which would always be less featured than a full programming language, but I'd get better performance. Using high-level interpreted languages such as Perl/Python/Ruby I'd get the language's `eval` for free and I wouldn't need a DSL; Miller would have mainly been a set of format-specific I/O hooks. If I'd gotten good enough performance from the latter I'd have done it without question and Miller would be far more flexible. But low-level languages win the performance criteria by a landslide so we have Miller in Go with a custom DSL.
|
||||
When I was first developing Miller, I made a survey of several languages. Using low-level implementation languages like C, Go, Rust, and Nim, I'd need to create my own domain-specific language (DSL), which would always be less featured than a full programming language, but I'd get better performance. Using high-level interpreted languages such as Perl/Python/Ruby, I'd get the language's `eval` for free and I wouldn't need a DSL; Miller would have mainly been a set of format-specific I/O hooks. If I'd gotten good enough performance from the latter, I'd have done it without question, and Miller would be far more flexible. But low-level languages win on performance by a landslide, so we have Miller in Go with a custom DSL.
|
||||
|
||||
**No, really, why one more command-line data-manipulation tool?** I wrote Miller because I was frustrated with tools like `grep`, `sed`, and so on being *line-aware* without being *format-aware*. The single most poignant example I can think of is seeing people grep data lines out of their CSV files and sadly losing their header lines. While some lighter-than-SQL processing is very nice to have, at core I wanted the format-awareness of [RecordStream](https://github.com/benbernard/RecordStream) combined with the raw speed of the Unix toolkit. Miller does precisely that.
|
||||
**No, really, why one more command-line data-manipulation tool?** I wrote Miller because I was frustrated with tools like `grep`, `sed`, and so on being *line-aware* without being *format-aware*. The single most poignant example I can think of is seeing people grep data lines from their CSV files and sadly losing their header lines. While some lighter-than-SQL processing is very nice to have, at core I wanted the format-awareness of [RecordStream](https://github.com/benbernard/RecordStream) combined with the raw speed of the Unix toolkit. Miller does precisely that.
|
||||
|
|
|
|||
|
|
@ -16,12 +16,11 @@ Quick links:
|
|||
</div>
|
||||
# Record-heterogeneity
|
||||
|
||||
We think of CSV tables as rectangular: if there are 17 columns in the header
|
||||
then there are 17 columns for every row, else the data have a formatting error.
|
||||
We think of CSV tables as rectangular: if there are 17 columns in the header, then there are 17 columns for every row, else the data has a formatting error.
|
||||
|
||||
But heterogeneous data abound -- log-file entries, JSON documents, no-SQL
|
||||
databases such as MongoDB, etc. -- not to mention **data-cleaning
|
||||
opportunities** we'll look at in this page. Miller offers several ways to
|
||||
opportunities** we'll look at on this page. Miller offers several ways to
|
||||
handle data heterogeneity.
|
||||
|
||||
## Terminology, examples, and solutions
|
||||
|
|
@ -56,7 +55,7 @@ It has three records (written here using JSON Lines formatting):
|
|||
|
||||
Here every row has the same keys, in the same order: `a,b,c`.
|
||||
|
||||
These are also sometimes called **rectangular** since if we pretty-print them we get a nice rectangle:
|
||||
These are also sometimes called **rectangular** since if we pretty-print them, we get a nice rectangle:
|
||||
|
||||
<pre class="pre-highlight-in-pair">
|
||||
<b>mlr --icsv --opprint cat data/het/hom.csv</b>
|
||||
|
|
@ -94,7 +93,7 @@ a,b,c
|
|||
This example is still homogeneous, though: every row has the same keys, in the same order: `a,b,c`.
|
||||
Empty values don't make the data heterogeneous.
|
||||
|
||||
Note however that we can use the [`fill-empty`](reference-verbs.md#fill-empty) verb to make these
|
||||
Note, however, that we can use the [`fill-empty`](reference-verbs.md#fill-empty) verb to make these
|
||||
values non-empty, if we like:
|
||||
|
||||
<pre class="pre-highlight-in-pair">
|
||||
|
|
@ -109,7 +108,7 @@ filler 8 9
|
|||
|
||||
### Ragged data
|
||||
|
||||
Next let's look at non-well-formed CSV files. For a third example:
|
||||
Next, let's look at non-well-formed CSV files. For a third example:
|
||||
|
||||
<pre class="pre-highlight-in-pair">
|
||||
<b>cat data/het/ragged.csv</b>
|
||||
|
|
@ -130,17 +129,11 @@ If you `mlr --csv cat` this, you'll get an error message:
|
|||
a,b,c
|
||||
1,2,3
|
||||
mlr: mlr: CSV header/data length mismatch 3 != 2 at filename data/het/ragged.csv row 3.
|
||||
.
|
||||
</pre>
|
||||
|
||||
There are two kinds of raggedness here. Since CSVs form records by zipping the
|
||||
keys from the header line together with the values from each data line, the
|
||||
second record has a missing value for key `c` (which ought to be fillable),
|
||||
while the third record has a value `10` with no key for it.
|
||||
There are two kinds of raggedness here. Since CSVs form records by zipping the keys from the header line together with the values from each data line, the second record has a missing value for key `c` (which ought to be fillable), while the third record has a value `10` with no key for it.
|
||||
|
||||
Using the [`--allow-ragged-csv-input` flag](reference-main-flag-list.md#csv-only-flags)
|
||||
we can fill values in too-short rows, and provide a key (column number starting
|
||||
with 1) for too-long rows:
|
||||
Using the [`--allow-ragged-csv-input` flag](reference-main-flag-list.md#csv-only-flags), we can fill values in too-short rows and provide a key (column number starting with 1) for too-long rows:
|
||||
|
||||
<pre class="pre-highlight-in-pair">
|
||||
<b>mlr --icsv --ojson --allow-ragged-csv-input cat data/het/ragged.csv</b>
|
||||
|
|
@ -187,7 +180,7 @@ This kind of data arises often in practice. One reason is that, while many
|
|||
programming languages (including the Miller DSL) [preserve insertion
|
||||
order](reference-main-maps.md#insertion-order-is-preserved) in maps; others do
|
||||
not. So someone might have written `{"a":4,"b":5,"c":6}` in the source code,
|
||||
but the data may not have printed that way into a given data file.
|
||||
but the data may not have been printed that way into a given data file.
|
||||
|
||||
We can use the [`regularize`](reference-verbs.md#regularize) or
|
||||
[`sort-within-records`](reference-verbs.md#sort-within-records) verb to order
|
||||
|
|
@ -204,13 +197,13 @@ the keys:
|
|||
|
||||
The `regularize` verb tries to re-order subsequent rows to look like the first
|
||||
(whatever order that is); the `sort-within-records` verb simply uses
|
||||
alphabetical order (which is the same in the above example where the first
|
||||
alphabetical order (which is the same in the above example, where the first
|
||||
record has keys in the order `a,b,c`).
|
||||
|
||||
### Sparse data
|
||||
|
||||
Here's another frequently occurring situation -- quite often, systems will log
|
||||
data for items which are present, but won't log data for items which aren't.
|
||||
data for items that are present, but won't log data for items that aren't.
|
||||
|
||||
<pre class="pre-highlight-in-pair">
|
||||
<b>mlr --json cat data/het/sparse.json</b>
|
||||
|
|
@ -237,8 +230,7 @@ data for items which are present, but won't log data for items which aren't.
|
|||
|
||||
This data is called **sparse** (from the [data-storage term](https://en.wikipedia.org/wiki/Sparse_matrix)).
|
||||
|
||||
We can use the [`unsparsify`](reference-verbs.md#unsparsify) verb to make sure
|
||||
every record has the same keys:
|
||||
We can use the [`unsparsify`](reference-verbs.md#unsparsify) verb to make sure every record has the same keys:
|
||||
|
||||
<pre class="pre-highlight-in-pair">
|
||||
<b>mlr --json unsparsify data/het/sparse.json</b>
|
||||
|
|
@ -283,12 +275,11 @@ xy55.east - /dev/sda1 failover true
|
|||
|
||||
## Reading and writing heterogeneous data
|
||||
|
||||
In the previous sections we saw different kinds of data heterogeneity, and ways
|
||||
to transform the data to make it homogeneous.
|
||||
In the previous sections, we saw different kinds of data heterogeneity and ways to transform the data to make it homogeneous.
|
||||
|
||||
### Non-rectangular file formats: JSON, XTAB, NIDX, DKVP
|
||||
|
||||
For these formats, record-heterogeneity comes naturally:
|
||||
For these formats, record heterogeneity comes naturally:
|
||||
|
||||
<pre class="pre-highlight-in-pair">
|
||||
<b>cat data/het/sparse.json</b>
|
||||
|
|
@ -372,11 +363,11 @@ record_count=150,resource=/path/to/second/file
|
|||
|
||||
### Rectangular file formats: CSV and pretty-print
|
||||
|
||||
CSV and pretty-print formats expect rectangular structure. But Miller lets you
|
||||
CSV and pretty-print formats expect a rectangular structure. But Miller lets you
|
||||
process non-rectangular data using CSV and pretty-print.
|
||||
|
||||
For CSV-lite and TSV-lite, Miller simply prints a newline and a new header when there is a schema
|
||||
change -- where by _schema_ we mean simply the list of record keys in the order they are
|
||||
For CSV-lite and TSV-lite, Miller prints a newline and a new header when there is a schema
|
||||
change -- where by _schema_ we mean the list of record keys in the order they are
|
||||
encountered. When there is no schema change, you get CSV per se as a special case. Likewise, Miller
|
||||
reads heterogeneous CSV or pretty-print input the same way. The difference between CSV and CSV-lite
|
||||
is that the former is [RFC-4180-compliant](file-formats.md#csvtsvasvusvetc), while the latter
|
||||
|
|
@ -471,9 +462,7 @@ mlr: CSV schema change: first keys "resource,loadsec,ok"; current keys "record_c
|
|||
mlr: exiting due to data error.
|
||||
</pre>
|
||||
|
||||
Miller handles explicit header changes as just shown. If your CSV input contains ragged data -- if
|
||||
there are implicit header changes (no intervening blank line and new header line) as seen above --
|
||||
you can use `--allow-ragged-csv-input` (or keystroke-saver `--ragged`).
|
||||
Miller handles explicit header changes as shown. If your CSV input contains ragged data -- if there are implicit header changes (no intervening blank line and new header line) as seen above -- you can use `--allow-ragged-csv-input` (or keystroke-saver `--ragged`).
|
||||
|
||||
<pre class="pre-highlight-in-pair">
|
||||
<b>mlr --csv --allow-ragged-csv-input cat data/het/ragged.csv</b>
|
||||
|
|
@ -488,11 +477,11 @@ a,b,c
|
|||
## Processing heterogeneous data
|
||||
|
||||
Above we saw how to make heterogeneous data homogeneous, and then how to print heterogeneous data.
|
||||
As for other processing, record-heterogeneity is not a problem for Miller.
|
||||
As for other processing, record heterogeneity is not a problem for Miller.
|
||||
|
||||
Miller operates on specified fields and takes the rest along: for example, if
|
||||
you are sorting on the `count` field then all records in the input stream must
|
||||
have a `count` field but the other fields can vary, and moreover the sorted-on
|
||||
you are sorting on the `count` field, then all records in the input stream must
|
||||
have a `count` field, but the other fields can vary -- and moreover, the sorted-on
|
||||
field name(s) don't need to be in the same position on each line:
|
||||
|
||||
<pre class="pre-highlight-in-pair">
|
||||
|
|
|
|||
|
|
@ -1,11 +1,10 @@
|
|||
# Record-heterogeneity
|
||||
|
||||
We think of CSV tables as rectangular: if there are 17 columns in the header
|
||||
then there are 17 columns for every row, else the data have a formatting error.
|
||||
We think of CSV tables as rectangular: if there are 17 columns in the header, then there are 17 columns for every row, else the data has a formatting error.
|
||||
|
||||
But heterogeneous data abound -- log-file entries, JSON documents, no-SQL
|
||||
databases such as MongoDB, etc. -- not to mention **data-cleaning
|
||||
opportunities** we'll look at in this page. Miller offers several ways to
|
||||
opportunities** we'll look at on this page. Miller offers several ways to
|
||||
handle data heterogeneity.
|
||||
|
||||
## Terminology, examples, and solutions
|
||||
|
|
@ -29,7 +28,7 @@ GENMD-EOF
|
|||
|
||||
Here every row has the same keys, in the same order: `a,b,c`.
|
||||
|
||||
These are also sometimes called **rectangular** since if we pretty-print them we get a nice rectangle:
|
||||
These are also sometimes called **rectangular** since if we pretty-print them, we get a nice rectangle:
|
||||
|
||||
GENMD-RUN-COMMAND
|
||||
mlr --icsv --opprint cat data/het/hom.csv
|
||||
|
|
@ -50,7 +49,7 @@ GENMD-EOF
|
|||
This example is still homogeneous, though: every row has the same keys, in the same order: `a,b,c`.
|
||||
Empty values don't make the data heterogeneous.
|
||||
|
||||
Note however that we can use the [`fill-empty`](reference-verbs.md#fill-empty) verb to make these
|
||||
Note, however, that we can use the [`fill-empty`](reference-verbs.md#fill-empty) verb to make these
|
||||
values non-empty, if we like:
|
||||
|
||||
GENMD-RUN-COMMAND
|
||||
|
|
@ -59,7 +58,7 @@ GENMD-EOF
|
|||
|
||||
### Ragged data
|
||||
|
||||
Next let's look at non-well-formed CSV files. For a third example:
|
||||
Next, let's look at non-well-formed CSV files. For a third example:
|
||||
|
||||
GENMD-RUN-COMMAND
|
||||
cat data/het/ragged.csv
|
||||
|
|
@ -71,14 +70,9 @@ GENMD-RUN-COMMAND-TOLERATING-ERROR
|
|||
mlr --csv cat data/het/ragged.csv
|
||||
GENMD-EOF
|
||||
|
||||
There are two kinds of raggedness here. Since CSVs form records by zipping the
|
||||
keys from the header line together with the values from each data line, the
|
||||
second record has a missing value for key `c` (which ought to be fillable),
|
||||
while the third record has a value `10` with no key for it.
|
||||
There are two kinds of raggedness here. Since CSVs form records by zipping the keys from the header line together with the values from each data line, the second record has a missing value for key `c` (which ought to be fillable), while the third record has a value `10` with no key for it.
|
||||
|
||||
Using the [`--allow-ragged-csv-input` flag](reference-main-flag-list.md#csv-only-flags)
|
||||
we can fill values in too-short rows, and provide a key (column number starting
|
||||
with 1) for too-long rows:
|
||||
Using the [`--allow-ragged-csv-input` flag](reference-main-flag-list.md#csv-only-flags), we can fill values in too-short rows and provide a key (column number starting with 1) for too-long rows:
|
||||
|
||||
GENMD-RUN-COMMAND-TOLERATING-ERROR
|
||||
mlr --icsv --ojson --allow-ragged-csv-input cat data/het/ragged.csv
|
||||
|
|
@ -101,7 +95,7 @@ This kind of data arises often in practice. One reason is that, while many
|
|||
programming languages (including the Miller DSL) [preserve insertion
|
||||
order](reference-main-maps.md#insertion-order-is-preserved) in maps; others do
|
||||
not. So someone might have written `{"a":4,"b":5,"c":6}` in the source code,
|
||||
but the data may not have printed that way into a given data file.
|
||||
but the data may not have been printed that way into a given data file.
|
||||
|
||||
We can use the [`regularize`](reference-verbs.md#regularize) or
|
||||
[`sort-within-records`](reference-verbs.md#sort-within-records) verb to order
|
||||
|
|
@ -113,13 +107,13 @@ GENMD-EOF
|
|||
|
||||
The `regularize` verb tries to re-order subsequent rows to look like the first
|
||||
(whatever order that is); the `sort-within-records` verb simply uses
|
||||
alphabetical order (which is the same in the above example where the first
|
||||
alphabetical order (which is the same in the above example, where the first
|
||||
record has keys in the order `a,b,c`).
|
||||
|
||||
### Sparse data
|
||||
|
||||
Here's another frequently occurring situation -- quite often, systems will log
|
||||
data for items which are present, but won't log data for items which aren't.
|
||||
data for items that are present, but won't log data for items that aren't.
|
||||
|
||||
GENMD-RUN-COMMAND
|
||||
mlr --json cat data/het/sparse.json
|
||||
|
|
@ -127,8 +121,7 @@ GENMD-EOF
|
|||
|
||||
This data is called **sparse** (from the [data-storage term](https://en.wikipedia.org/wiki/Sparse_matrix)).
|
||||
|
||||
We can use the [`unsparsify`](reference-verbs.md#unsparsify) verb to make sure
|
||||
every record has the same keys:
|
||||
We can use the [`unsparsify`](reference-verbs.md#unsparsify) verb to make sure every record has the same keys:
|
||||
|
||||
GENMD-RUN-COMMAND
|
||||
mlr --json unsparsify data/het/sparse.json
|
||||
|
|
@ -142,12 +135,11 @@ GENMD-EOF
|
|||
|
||||
## Reading and writing heterogeneous data
|
||||
|
||||
In the previous sections we saw different kinds of data heterogeneity, and ways
|
||||
to transform the data to make it homogeneous.
|
||||
In the previous sections, we saw different kinds of data heterogeneity and ways to transform the data to make it homogeneous.
|
||||
|
||||
### Non-rectangular file formats: JSON, XTAB, NIDX, DKVP
|
||||
|
||||
For these formats, record-heterogeneity comes naturally:
|
||||
For these formats, record heterogeneity comes naturally:
|
||||
|
||||
GENMD-RUN-COMMAND
|
||||
cat data/het/sparse.json
|
||||
|
|
@ -177,11 +169,11 @@ GENMD-EOF
|
|||
|
||||
### Rectangular file formats: CSV and pretty-print
|
||||
|
||||
CSV and pretty-print formats expect rectangular structure. But Miller lets you
|
||||
CSV and pretty-print formats expect a rectangular structure. But Miller lets you
|
||||
process non-rectangular data using CSV and pretty-print.
|
||||
|
||||
For CSV-lite and TSV-lite, Miller simply prints a newline and a new header when there is a schema
|
||||
change -- where by _schema_ we mean simply the list of record keys in the order they are
|
||||
For CSV-lite and TSV-lite, Miller prints a newline and a new header when there is a schema
|
||||
change -- where by _schema_ we mean the list of record keys in the order they are
|
||||
encountered. When there is no schema change, you get CSV per se as a special case. Likewise, Miller
|
||||
reads heterogeneous CSV or pretty-print input the same way. The difference between CSV and CSV-lite
|
||||
is that the former is [RFC-4180-compliant](file-formats.md#csvtsvasvusvetc), while the latter
|
||||
|
|
@ -207,9 +199,7 @@ GENMD-RUN-COMMAND-TOLERATING-ERROR
|
|||
mlr --ijson --ocsv group-like data/het.json
|
||||
GENMD-EOF
|
||||
|
||||
Miller handles explicit header changes as just shown. If your CSV input contains ragged data -- if
|
||||
there are implicit header changes (no intervening blank line and new header line) as seen above --
|
||||
you can use `--allow-ragged-csv-input` (or keystroke-saver `--ragged`).
|
||||
Miller handles explicit header changes as shown. If your CSV input contains ragged data -- if there are implicit header changes (no intervening blank line and new header line) as seen above -- you can use `--allow-ragged-csv-input` (or keystroke-saver `--ragged`).
|
||||
|
||||
GENMD-RUN-COMMAND
|
||||
mlr --csv --allow-ragged-csv-input cat data/het/ragged.csv
|
||||
|
|
@ -218,11 +208,11 @@ GENMD-EOF
|
|||
## Processing heterogeneous data
|
||||
|
||||
Above we saw how to make heterogeneous data homogeneous, and then how to print heterogeneous data.
|
||||
As for other processing, record-heterogeneity is not a problem for Miller.
|
||||
As for other processing, record heterogeneity is not a problem for Miller.
|
||||
|
||||
Miller operates on specified fields and takes the rest along: for example, if
|
||||
you are sorting on the `count` field then all records in the input stream must
|
||||
have a `count` field but the other fields can vary, and moreover the sorted-on
|
||||
you are sorting on the `count` field, then all records in the input stream must
|
||||
have a `count` field, but the other fields can vary -- and moreover, the sorted-on
|
||||
field name(s) don't need to be in the same position on each line:
|
||||
|
||||
GENMD-RUN-COMMAND
|
||||
|
|
|
|||
|
|
@ -16,9 +16,7 @@ Quick links:
|
|||
</div>
|
||||
# DSL built-in functions
|
||||
|
||||
These are functions in the [Miller programming language](miller-programming-language.md)
|
||||
that you can call when you use `mlr put` and `mlr filter`. For example, when you type
|
||||
|
||||
These are functions in the [Miller programming language](miller-programming-language.md) that you can call when you use `mlr put` and `mlr filter`. For example, when you type
|
||||
<pre class="pre-highlight-in-pair">
|
||||
<b>mlr --icsv --opprint --from example.csv put '</b>
|
||||
<b> $color = toupper($color);</b>
|
||||
|
|
@ -43,26 +41,13 @@ the `toupper` and `gsub` bits are _functions_.
|
|||
|
||||
## Overview
|
||||
|
||||
At the command line, you can use `mlr -f` and `mlr -F` for information much
|
||||
like what's on this page.
|
||||
At the command line, you can use `mlr -f` and `mlr -F` for information much like what's on this page.
|
||||
|
||||
Each function takes a specific number of arguments, as shown below, except for
|
||||
functions marked as variadic such as `min` and `max`. (The latter compute min
|
||||
and max of any number of arguments.) There is no notion of optional or
|
||||
default-on-absent arguments. All argument-passing is positional rather than by
|
||||
name; arguments are passed by value, not by reference.
|
||||
Each function takes a specific number of arguments, as shown below, except for functions marked as variadic, such as `min` and `max`. (The latter compute the min and max of any number of arguments.) There is no notion of optional or default-on-absent arguments. All argument-passing is positional rather than by name; arguments are passed by value, not by reference.
|
||||
|
||||
At the command line, you can get a list of all functions using `mlr -f`, with
|
||||
details using `mlr -F`. (Or, `mlr help usage-functions-by-class` to get
|
||||
details in the order shown on this page.) You can get detail for a given
|
||||
function using `mlr help function namegoeshere`, e.g. `mlr help function
|
||||
gsub`.
|
||||
At the command line, you can get a list of all functions using `mlr -f`, with details using `mlr -F`. (Or, `mlr help usage-functions-by-class` to get details in the order shown on this page.) You can get details for a given function using `mlr help function namegoeshere`, e.g., `mlr help function gsub`.
|
||||
|
||||
Operators are listed here along with functions. In this case, the
|
||||
argument-count is the number of items involved in the infix operator, e.g. we
|
||||
say `x+y` so the details for the `+` operator say that its number of arguments
|
||||
is 2. Unary operators such as `!` and `~` show argument-count of 1; the ternary
|
||||
`? :` operator shows an argument-count of 3.
|
||||
Operators are listed here along with functions. In this case, the argument count refers to the number of items involved in the infix operator. For example, we say `x+y`, so the details for the `+` operator indicate that it has two arguments. Unary operators such as `!` and `~` show an argument count of 1; the ternary `? :` operator shows an argument count of 3.
|
||||
|
||||
|
||||
## Functions by class
|
||||
|
|
|
|||
|
|
@ -1,8 +1,6 @@
|
|||
# DSL built-in functions
|
||||
|
||||
These are functions in the [Miller programming language](miller-programming-language.md)
|
||||
that you can call when you use `mlr put` and `mlr filter`. For example, when you type
|
||||
|
||||
These are functions in the [Miller programming language](miller-programming-language.md) that you can call when you use `mlr put` and `mlr filter`. For example, when you type
|
||||
GENMD-RUN-COMMAND
|
||||
mlr --icsv --opprint --from example.csv put '
|
||||
$color = toupper($color);
|
||||
|
|
@ -14,25 +12,12 @@ the `toupper` and `gsub` bits are _functions_.
|
|||
|
||||
## Overview
|
||||
|
||||
At the command line, you can use `mlr -f` and `mlr -F` for information much
|
||||
like what's on this page.
|
||||
At the command line, you can use `mlr -f` and `mlr -F` for information much like what's on this page.
|
||||
|
||||
Each function takes a specific number of arguments, as shown below, except for
|
||||
functions marked as variadic such as `min` and `max`. (The latter compute min
|
||||
and max of any number of arguments.) There is no notion of optional or
|
||||
default-on-absent arguments. All argument-passing is positional rather than by
|
||||
name; arguments are passed by value, not by reference.
|
||||
Each function takes a specific number of arguments, as shown below, except for functions marked as variadic, such as `min` and `max`. (The latter compute the min and max of any number of arguments.) There is no notion of optional or default-on-absent arguments. All argument-passing is positional rather than by name; arguments are passed by value, not by reference.
|
||||
|
||||
At the command line, you can get a list of all functions using `mlr -f`, with
|
||||
details using `mlr -F`. (Or, `mlr help usage-functions-by-class` to get
|
||||
details in the order shown on this page.) You can get detail for a given
|
||||
function using `mlr help function namegoeshere`, e.g. `mlr help function
|
||||
gsub`.
|
||||
At the command line, you can get a list of all functions using `mlr -f`, with details using `mlr -F`. (Or, `mlr help usage-functions-by-class` to get details in the order shown on this page.) You can get details for a given function using `mlr help function namegoeshere`, e.g., `mlr help function gsub`.
|
||||
|
||||
Operators are listed here along with functions. In this case, the
|
||||
argument-count is the number of items involved in the infix operator, e.g. we
|
||||
say `x+y` so the details for the `+` operator say that its number of arguments
|
||||
is 2. Unary operators such as `!` and `~` show argument-count of 1; the ternary
|
||||
`? :` operator shows an argument-count of 3.
|
||||
Operators are listed here along with functions. In this case, the argument count refers to the number of items involved in the infix operator. For example, we say `x+y`, so the details for the `+` operator indicate that it has two arguments. Unary operators such as `!` and `~` show an argument count of 1; the ternary `? :` operator shows an argument count of 3.
|
||||
|
||||
GENMD-RUN-CONTENT-GENERATOR(./mk-func-info.rb)
|
||||
|
|
|
|||
|
|
@ -16,34 +16,9 @@ Quick links:
|
|||
</div>
|
||||
# A note on the complexity of Miller's expression language
|
||||
|
||||
One of Miller's strengths is its brevity: it's much quicker -- and less
|
||||
error-prone -- to type `mlr stats1 -a sum -f x,y -g a,b` than having to track
|
||||
summation variables as in `awk`, or using Miller's [out-of-stream
|
||||
variables](reference-dsl-variables.md#out-of-stream-variables). And the more
|
||||
language features Miller's put-DSL has (for-loops, if-statements, nested
|
||||
control structures, user-defined functions, etc.) then the *less* powerful it
|
||||
begins to seem: because of the other programming-language features it *doesn't*
|
||||
have (classes, exceptions, and so on).
|
||||
One of Miller's strengths is its brevity: it's much quicker -- and less error-prone -- to type `mlr stats1 -a sum -f x,y -g a,b` than having to track summation variables as in `awk`, or using Miller's [out-of-stream variables](reference-dsl-variables.md#out-of-stream-variables). And the more language features Miller's put-DSL has (for-loops, if-statements, nested control structures, user-defined functions, etc.), the *less* powerful it begins to seem, because of the other programming-language features it *doesn't* have (classes, exceptions, and so on).
|
||||
|
||||
When I was originally prototyping Miller in 2015, the primary decision I had
|
||||
was whether to hand-code in a low-level language like C or Rust or Go, with my
|
||||
own hand-rolled DSL, or whether to use a higher-level language (like Python or
|
||||
Lua or Nim) and let the `put` statements be handled by the implementation
|
||||
language's own `eval`: the implementation language would take the place of a
|
||||
DSL. Multiple performance experiments showed me I could get better throughput
|
||||
using the former, by a wide margin. So Miller is Go under the hood with a
|
||||
hand-rolled DSL.
|
||||
When I was initially prototyping Miller in 2015, the primary decision I had was whether to hand-code in a low-level language like C or Rust or Go, with my own hand-rolled DSL, or whether to use a higher-level language (like Python or Lua or Nim) and let the `put` statements be handled by the implementation language's own `eval`: the implementation language would take the place of a DSL. Multiple performance experiments showed me I could get better throughput using the former, by a wide margin. So Miller is Go under the hood with a hand-rolled DSL.
|
||||
|
||||
I do want to keep focusing on what Miller is good at -- concise notation, low
|
||||
latency, and high throughput -- and not add too much in terms of
|
||||
high-level-language features to the DSL. That said, some sort of
|
||||
customizability is a basic thing to want. As of 4.1.0 we have recursive
|
||||
`for`/`while`/`if` [structures](reference-dsl-control-structures.md) on about
|
||||
the same complexity level as `awk`; as of 5.0.0 we have [user-defined
|
||||
functions](reference-dsl-user-defined-functions.md) and [map-valued
|
||||
variables](reference-dsl-variables.md), again on about the same complexity level
|
||||
as `awk` along with optional type-declaration syntax; as of Miller 6 we have
|
||||
full support for [arrays](reference-main-arrays.md). While I'm excited by these
|
||||
powerful language features, I hope to keep new features focused on Miller's
|
||||
sweet spot which is speed plus simplicity.
|
||||
I want to continue focusing on what Miller excels at — concise notation, low latency, and high throughput — and not add too many high-level language features to the DSL. That said, some customizability is a basic thing to want. As of 4.1.0, we have recursive `for`/`while`/`if` [structures](reference-dsl-control-structures.md) on about the same complexity level as `awk`; as of 5.0.0, we have [user-defined functions](reference-dsl-user-defined-functions.md) and [map-valued variables](reference-dsl-variables.md), again on about the same complexity level as `awk` along with optional type-declaration syntax; as of Miller 6, we have full support for [arrays](reference-main-arrays.md). While I'm excited by these powerful language features, I hope to keep new features focused on Miller's sweet spot, which is speed plus simplicity.
|
||||
|
||||
|
|
|
|||
|
|
@ -1,33 +1,8 @@
|
|||
# A note on the complexity of Miller's expression language
|
||||
|
||||
One of Miller's strengths is its brevity: it's much quicker -- and less
|
||||
error-prone -- to type `mlr stats1 -a sum -f x,y -g a,b` than having to track
|
||||
summation variables as in `awk`, or using Miller's [out-of-stream
|
||||
variables](reference-dsl-variables.md#out-of-stream-variables). And the more
|
||||
language features Miller's put-DSL has (for-loops, if-statements, nested
|
||||
control structures, user-defined functions, etc.) then the *less* powerful it
|
||||
begins to seem: because of the other programming-language features it *doesn't*
|
||||
have (classes, exceptions, and so on).
|
||||
One of Miller's strengths is its brevity: it's much quicker -- and less error-prone -- to type `mlr stats1 -a sum -f x,y -g a,b` than having to track summation variables as in `awk`, or using Miller's [out-of-stream variables](reference-dsl-variables.md#out-of-stream-variables). And the more language features Miller's put-DSL has (for-loops, if-statements, nested control structures, user-defined functions, etc.), the *less* powerful it begins to seem, because of the other programming-language features it *doesn't* have (classes, exceptions, and so on).
|
||||
|
||||
When I was originally prototyping Miller in 2015, the primary decision I had
|
||||
was whether to hand-code in a low-level language like C or Rust or Go, with my
|
||||
own hand-rolled DSL, or whether to use a higher-level language (like Python or
|
||||
Lua or Nim) and let the `put` statements be handled by the implementation
|
||||
language's own `eval`: the implementation language would take the place of a
|
||||
DSL. Multiple performance experiments showed me I could get better throughput
|
||||
using the former, by a wide margin. So Miller is Go under the hood with a
|
||||
hand-rolled DSL.
|
||||
When I was initially prototyping Miller in 2015, the primary decision I had was whether to hand-code in a low-level language like C or Rust or Go, with my own hand-rolled DSL, or whether to use a higher-level language (like Python or Lua or Nim) and let the `put` statements be handled by the implementation language's own `eval`: the implementation language would take the place of a DSL. Multiple performance experiments showed me I could get better throughput using the former, by a wide margin. So Miller is Go under the hood with a hand-rolled DSL.
|
||||
|
||||
I do want to keep focusing on what Miller is good at -- concise notation, low
|
||||
latency, and high throughput -- and not add too much in terms of
|
||||
high-level-language features to the DSL. That said, some sort of
|
||||
customizability is a basic thing to want. As of 4.1.0 we have recursive
|
||||
`for`/`while`/`if` [structures](reference-dsl-control-structures.md) on about
|
||||
the same complexity level as `awk`; as of 5.0.0 we have [user-defined
|
||||
functions](reference-dsl-user-defined-functions.md) and [map-valued
|
||||
variables](reference-dsl-variables.md), again on about the same complexity level
|
||||
as `awk` along with optional type-declaration syntax; as of Miller 6 we have
|
||||
full support for [arrays](reference-main-arrays.md). While I'm excited by these
|
||||
powerful language features, I hope to keep new features focused on Miller's
|
||||
sweet spot which is speed plus simplicity.
|
||||
I want to continue focusing on what Miller excels at — concise notation, low latency, and high throughput — and not add too many high-level language features to the DSL. That said, some customizability is a basic thing to want. As of 4.1.0, we have recursive `for`/`while`/`if` [structures](reference-dsl-control-structures.md) on about the same complexity level as `awk`; as of 5.0.0, we have [user-defined functions](reference-dsl-user-defined-functions.md) and [map-valued variables](reference-dsl-variables.md), again on about the same complexity level as `awk` along with optional type-declaration syntax; as of Miller 6, we have full support for [arrays](reference-main-arrays.md). While I'm excited by these powerful language features, I hope to keep new features focused on Miller's sweet spot, which is speed plus simplicity.
|
||||
|
||||
|
|
|
|||
|
|
@ -18,7 +18,7 @@ Quick links:
|
|||
|
||||
## Pattern-action blocks
|
||||
|
||||
These are reminiscent of `awk` syntax. They can be used to allow assignments to be done only when appropriate -- e.g. for math-function domain restrictions, regex-matching, and so on:
|
||||
These are reminiscent of `awk` syntax. They can be used to allow assignments to be done only when appropriate -- e.g., for math-function domain restrictions, regex-matching, and so on:
|
||||
|
||||
<pre class="pre-highlight-in-pair">
|
||||
<b>mlr cat data/put-gating-example-1.dkvp</b>
|
||||
|
|
@ -64,7 +64,7 @@ a=some other name
|
|||
a=xyz_789,b=left_xyz,c=right_789
|
||||
</pre>
|
||||
|
||||
This produces heteregenous output which Miller, of course, has no problems with (see [Record Heterogeneity](record-heterogeneity.md)). But if you want homogeneous output, the curly braces can be replaced with a semicolon between the expression and the body statements. This causes `put` to evaluate the boolean expression (along with any side effects, namely, regex-captures `\1`, `\2`, etc.) but doesn't use it as a criterion for whether subsequent assignments should be executed. Instead, subsequent assignments are done unconditionally:
|
||||
This produces heterogeneous output, which Miller, of course, has no problems with (see [Record Heterogeneity](record-heterogeneity.md)). But if you want homogeneous output, the curly braces can be replaced with a semicolon between the expression and the body statements. This causes `put` to evaluate the boolean expression (along with any side effects, namely, regex-captures `\1`, `\2`, etc.) without using it as a criterion for whether subsequent assignments should be executed. Instead, subsequent assignments are done unconditionally:
|
||||
|
||||
<pre class="pre-highlight-in-pair">
|
||||
<b>mlr --opprint put '</b>
|
||||
|
|
@ -172,7 +172,7 @@ records](operating-on-all-records.md) for some options.
|
|||
|
||||
## For-loops
|
||||
|
||||
While Miller's `while` and `do-while` statements are much as in many other languages, `for` loops are more idiosyncratic to Miller. They are loops over key-value pairs, whether in stream records, out-of-stream variables, local variables, or map-literals: more reminiscent of `foreach`, as in (for example) PHP. There are **for-loops over map keys** and **for-loops over key-value tuples**. Additionally, Miller has a **C-style triple-for loop** with initialize, test, and update statements. Each is described below.
|
||||
While Miller's `while` and `do-while` statements are much like those in many other languages, `for` loops are more idiosyncratic to Miller. They are loops over key-value pairs, whether in stream records, out-of-stream variables, local variables, or map-literals: more reminiscent of `foreach`, as in (for example) PHP. There are **for-loops over map keys** and **for-loops over key-value tuples**. Additionally, Miller has a **C-style triple-for loop** with initialize, test, and update statements. Each is described below.
|
||||
|
||||
As with `while` and `do-while`, a `break` or `continue` within nested control structures will propagate to the innermost loop enclosing them, if any, and a `break` or `continue` outside a loop is a syntax error that will be flagged as soon as the expression is parsed, before any input records are ingested.
|
||||
|
||||
|
|
@ -260,11 +260,9 @@ value: true valuetype: bool
|
|||
|
||||
### Key-value for-loops
|
||||
|
||||
For [maps](reference-main-maps.md), the first loop variable is the key and the
|
||||
second is the value; for [arrays](reference-main-arrays.md), the first loop
|
||||
variable is the (1-up) array index and the second is the value.
|
||||
For [maps](reference-main-maps.md), the first loop variable is the key, and the second is the value. For [arrays](reference-main-arrays.md), the first loop variable is the (1-based) array index, and the second is the value.
|
||||
|
||||
Single-level keys may be gotten at using either `for(k,v)` or `for((k),v)`; multi-level keys may be gotten at using `for((k1,k2,k3),v)` and so on. The `v` variable will be bound to a scalar value (non-array/non-map) if the map stops at that level, or to a map-valued or array-valued variable if the map goes deeper. If the map isn't deep enough then the loop body won't be executed.
|
||||
Single-level keys may be obtained using either `for(k,v)` or `for((k),v)`; multi-level keys may be obtained using `for((k1,k2,k3),v)` and so on. The `v` variable will be bound to a scalar value (non-array/non-map) if the map stops at that level, or to a map-valued or array-valued variable if the map goes deeper. If the map isn't deep enough, then the loop body won't be executed.
|
||||
|
||||
<pre class="pre-highlight-in-pair">
|
||||
<b>cat data/for-srec-example.tbl</b>
|
||||
|
|
@ -333,7 +331,7 @@ eks wye 4 0.381399 0.134188 4.515587 18.062348
|
|||
wye pan 5 0.573288 0.863624 6.4369119999999995 25.747647999999998
|
||||
</pre>
|
||||
|
||||
It can be confusing to modify the stream record while iterating over a copy of it, so instead you might find it simpler to use a local variable in the loop and only update the stream record after the loop:
|
||||
It can be confusing to modify the stream record while iterating over a copy of it, so instead, you might find it simpler to use a local variable in the loop and only update the stream record after the loop:
|
||||
|
||||
<pre class="pre-highlight-in-pair">
|
||||
<b>mlr --from data/small --opprint put '</b>
|
||||
|
|
@ -355,7 +353,7 @@ eks wye 4 0.381399 0.134188 4.515587
|
|||
wye pan 5 0.573288 0.863624 6.4369119999999995
|
||||
</pre>
|
||||
|
||||
You can also start iterating on sub-maps of an out-of-stream or local variable; you can loop over nested keys; you can loop over all out-of-stream variables. The bound variables are bound to a copy of the sub-map as it was before the loop started. The sub-map is specified by square-bracketed indices after `in`, and additional deeper indices are bound to loop key-variables. The terminal values are bound to the loop value-variable whenever the keys are not too shallow. The value-variable may refer to a terminal (string, number) or it may be map-valued if the map goes deeper. Example indexing is as follows:
|
||||
You can also start iterating on sub-maps of an out-of-stream or local variable; you can loop over nested keys; you can loop over all out-of-stream variables. The bound variables are bound to a copy of the sub-map as it was before the loop started. The sub-map is specified by square-bracketed indices after `in`, and additional deeper indices are bound to loop key variables. The terminal values are bound to the loop value variable whenever the keys are not too shallow. The value variable may refer to a terminal (string, number) or it may be map-valued if the map goes deeper. Example indexing is as follows:
|
||||
|
||||
<pre class="pre-non-highlight-non-pair">
|
||||
# Parentheses are optional for single key:
|
||||
|
|
@ -516,15 +514,15 @@ wye pan 5 0.573288 0.863624 15 31
|
|||
|
||||
Notes:
|
||||
|
||||
* In `for (start; continuation; update) { body }`, the start, continuation, and update statements may be empty, single statements, or multiple comma-separated statements. If the continuation is empty (e.g. `for(i=1;;i+=1)`) it defaults to true.
|
||||
* In `for (start; continuation; update) { body }`, the start, continuation, and update statements may be empty, single statements, or multiple comma-separated statements. If the continuation is empty (e.g. `for(i=1;;i+=1)`), it defaults to true.
|
||||
|
||||
* In particular, you may use `$`-variables and/or `@`-variables in the start, continuation, and/or update steps (as well as the body, of course).
|
||||
|
||||
* The typedecls such as `int` or `num` are optional. If a typedecl is provided (for a local variable), it binds a variable scoped to the for-loop regardless of whether a same-name variable is present in outer scope. If a typedecl is not provided, then the variable is scoped to the for-loop if no same-name variable is present in outer scope, or if a same-name variable is present in outer scope then it is modified.
|
||||
* The typedecls such as `int` or `num` are optional. If a typedecl is provided (for a local variable), it binds a variable scoped to the for-loop regardless of whether a same-name variable is present in the outer scope. If a typedecl is not provided, then the variable is scoped to the for-loop if no same-name variable is present in the outer scope, or if a same-name variable is present in the outer scope, then it is modified.
|
||||
|
||||
* Miller has no `++` or `--` operators.
|
||||
|
||||
* As with all `for`/`if`/`while` statements in Miller, the curly braces are required even if the body is a single statement, or empty.
|
||||
* As with all `for`/`if`/`while` statements in Miller, the curly braces are required even if the body is a single statement or empty.
|
||||
|
||||
## Begin/end blocks
|
||||
|
||||
|
|
|
|||
|
|
@ -2,7 +2,7 @@
|
|||
|
||||
## Pattern-action blocks
|
||||
|
||||
These are reminiscent of `awk` syntax. They can be used to allow assignments to be done only when appropriate -- e.g. for math-function domain restrictions, regex-matching, and so on:
|
||||
These are reminiscent of `awk` syntax. They can be used to allow assignments to be done only when appropriate -- e.g., for math-function domain restrictions, regex-matching, and so on:
|
||||
|
||||
GENMD-RUN-COMMAND
|
||||
mlr cat data/put-gating-example-1.dkvp
|
||||
|
|
@ -24,7 +24,7 @@ mlr put '
|
|||
data/put-gating-example-2.dkvp
|
||||
GENMD-EOF
|
||||
|
||||
This produces heteregenous output which Miller, of course, has no problems with (see [Record Heterogeneity](record-heterogeneity.md)). But if you want homogeneous output, the curly braces can be replaced with a semicolon between the expression and the body statements. This causes `put` to evaluate the boolean expression (along with any side effects, namely, regex-captures `\1`, `\2`, etc.) but doesn't use it as a criterion for whether subsequent assignments should be executed. Instead, subsequent assignments are done unconditionally:
|
||||
This produces heterogeneous output, which Miller, of course, has no problems with (see [Record Heterogeneity](record-heterogeneity.md)). But if you want homogeneous output, the curly braces can be replaced with a semicolon between the expression and the body statements. This causes `put` to evaluate the boolean expression (along with any side effects, namely, regex-captures `\1`, `\2`, etc.) without using it as a criterion for whether subsequent assignments should be executed. Instead, subsequent assignments are done unconditionally:
|
||||
|
||||
GENMD-RUN-COMMAND
|
||||
mlr --opprint put '
|
||||
|
|
@ -120,7 +120,7 @@ records](operating-on-all-records.md) for some options.
|
|||
|
||||
## For-loops
|
||||
|
||||
While Miller's `while` and `do-while` statements are much as in many other languages, `for` loops are more idiosyncratic to Miller. They are loops over key-value pairs, whether in stream records, out-of-stream variables, local variables, or map-literals: more reminiscent of `foreach`, as in (for example) PHP. There are **for-loops over map keys** and **for-loops over key-value tuples**. Additionally, Miller has a **C-style triple-for loop** with initialize, test, and update statements. Each is described below.
|
||||
While Miller's `while` and `do-while` statements are much like those in many other languages, `for` loops are more idiosyncratic to Miller. They are loops over key-value pairs, whether in stream records, out-of-stream variables, local variables, or map-literals: more reminiscent of `foreach`, as in (for example) PHP. There are **for-loops over map keys** and **for-loops over key-value tuples**. Additionally, Miller has a **C-style triple-for loop** with initialize, test, and update statements. Each is described below.
|
||||
|
||||
As with `while` and `do-while`, a `break` or `continue` within nested control structures will propagate to the innermost loop enclosing them, if any, and a `break` or `continue` outside a loop is a syntax error that will be flagged as soon as the expression is parsed, before any input records are ingested.
|
||||
|
||||
|
|
@ -165,11 +165,9 @@ GENMD-EOF
|
|||
|
||||
### Key-value for-loops
|
||||
|
||||
For [maps](reference-main-maps.md), the first loop variable is the key and the
|
||||
second is the value; for [arrays](reference-main-arrays.md), the first loop
|
||||
variable is the (1-up) array index and the second is the value.
|
||||
For [maps](reference-main-maps.md), the first loop variable is the key, and the second is the value. For [arrays](reference-main-arrays.md), the first loop variable is the (1-based) array index, and the second is the value.
|
||||
|
||||
Single-level keys may be gotten at using either `for(k,v)` or `for((k),v)`; multi-level keys may be gotten at using `for((k1,k2,k3),v)` and so on. The `v` variable will be bound to a scalar value (non-array/non-map) if the map stops at that level, or to a map-valued or array-valued variable if the map goes deeper. If the map isn't deep enough then the loop body won't be executed.
|
||||
Single-level keys may be obtained using either `for(k,v)` or `for((k),v)`; multi-level keys may be obtained using `for((k1,k2,k3),v)` and so on. The `v` variable will be bound to a scalar value (non-array/non-map) if the map stops at that level, or to a map-valued or array-valued variable if the map goes deeper. If the map isn't deep enough, then the loop body won't be executed.
|
||||
|
||||
GENMD-RUN-COMMAND
|
||||
cat data/for-srec-example.tbl
|
||||
|
|
@ -210,7 +208,7 @@ mlr --from data/small --opprint put '
|
|||
'
|
||||
GENMD-EOF
|
||||
|
||||
It can be confusing to modify the stream record while iterating over a copy of it, so instead you might find it simpler to use a local variable in the loop and only update the stream record after the loop:
|
||||
It can be confusing to modify the stream record while iterating over a copy of it, so instead, you might find it simpler to use a local variable in the loop and only update the stream record after the loop:
|
||||
|
||||
GENMD-RUN-COMMAND
|
||||
mlr --from data/small --opprint put '
|
||||
|
|
@ -224,7 +222,7 @@ mlr --from data/small --opprint put '
|
|||
'
|
||||
GENMD-EOF
|
||||
|
||||
You can also start iterating on sub-maps of an out-of-stream or local variable; you can loop over nested keys; you can loop over all out-of-stream variables. The bound variables are bound to a copy of the sub-map as it was before the loop started. The sub-map is specified by square-bracketed indices after `in`, and additional deeper indices are bound to loop key-variables. The terminal values are bound to the loop value-variable whenever the keys are not too shallow. The value-variable may refer to a terminal (string, number) or it may be map-valued if the map goes deeper. Example indexing is as follows:
|
||||
You can also start iterating on sub-maps of an out-of-stream or local variable; you can loop over nested keys; you can loop over all out-of-stream variables. The bound variables are bound to a copy of the sub-map as it was before the loop started. The sub-map is specified by square-bracketed indices after `in`, and additional deeper indices are bound to loop key variables. The terminal values are bound to the loop value variable whenever the keys are not too shallow. The value variable may refer to a terminal (string, number) or it may be map-valued if the map goes deeper. Example indexing is as follows:
|
||||
|
||||
GENMD-INCLUDE-ESCAPED(data/for-oosvar-example-0a.txt)
|
||||
|
||||
|
|
@ -333,15 +331,15 @@ GENMD-EOF
|
|||
|
||||
Notes:
|
||||
|
||||
* In `for (start; continuation; update) { body }`, the start, continuation, and update statements may be empty, single statements, or multiple comma-separated statements. If the continuation is empty (e.g. `for(i=1;;i+=1)`) it defaults to true.
|
||||
* In `for (start; continuation; update) { body }`, the start, continuation, and update statements may be empty, single statements, or multiple comma-separated statements. If the continuation is empty (e.g. `for(i=1;;i+=1)`), it defaults to true.
|
||||
|
||||
* In particular, you may use `$`-variables and/or `@`-variables in the start, continuation, and/or update steps (as well as the body, of course).
|
||||
|
||||
* The typedecls such as `int` or `num` are optional. If a typedecl is provided (for a local variable), it binds a variable scoped to the for-loop regardless of whether a same-name variable is present in outer scope. If a typedecl is not provided, then the variable is scoped to the for-loop if no same-name variable is present in outer scope, or if a same-name variable is present in outer scope then it is modified.
|
||||
* The typedecls such as `int` or `num` are optional. If a typedecl is provided (for a local variable), it binds a variable scoped to the for-loop regardless of whether a same-name variable is present in the outer scope. If a typedecl is not provided, then the variable is scoped to the for-loop if no same-name variable is present in the outer scope, or if a same-name variable is present in the outer scope, then it is modified.
|
||||
|
||||
* Miller has no `++` or `--` operators.
|
||||
|
||||
* As with all `for`/`if`/`while` statements in Miller, the curly braces are required even if the body is a single statement, or empty.
|
||||
* As with all `for`/`if`/`while` statements in Miller, the curly braces are required even if the body is a single statement or empty.
|
||||
|
||||
## Begin/end blocks
|
||||
|
||||
|
|
|
|||
|
|
@ -36,7 +36,7 @@ red,square,true,2,15,79.2778,0.0130
|
|||
red,circle,true,3,16,13.8103,2.9010
|
||||
</pre>
|
||||
|
||||
The former, of course, is a little easier to type. For another example:
|
||||
The former is a little easier to type. For another example:
|
||||
|
||||
<pre class="pre-highlight-in-pair">
|
||||
<b>mlr --csv put '@running_sum += $quantity; filter @running_sum > 500' example.csv</b>
|
||||
|
|
|
|||
|
|
@ -10,7 +10,7 @@ GENMD-RUN-COMMAND
|
|||
mlr --csv put 'filter NR==2 || NR==3' example.csv
|
||||
GENMD-EOF
|
||||
|
||||
The former, of course, is a little easier to type. For another example:
|
||||
The former is a little easier to type. For another example:
|
||||
|
||||
GENMD-RUN-COMMAND
|
||||
mlr --csv put '@running_sum += $quantity; filter @running_sum > 500' example.csv
|
||||
|
|
|
|||
|
|
@ -29,23 +29,15 @@ As of [Miller 6](new-in-miller-6.md) you can use
|
|||
intuitive operations on arrays and maps, as an alternative to things which
|
||||
would otherwise require for-loops.
|
||||
|
||||
See also the [`get_keys`](reference-dsl-builtin-functions.md#get_keys) and
|
||||
[`get_values`](reference-dsl-builtin-functions.md#get_values) functions which,
|
||||
when given a map, return an array of its keys or an array of its values,
|
||||
respectively.
|
||||
See also the [`get_keys`](reference-dsl-builtin-functions.md#get_keys) and [`get_values`](reference-dsl-builtin-functions.md#get_values) functions which, when given a map, return an array of its keys or an array of its values, respectively.
|
||||
|
||||
## select
|
||||
|
||||
The [`select`](reference-dsl-builtin-functions.md#select) function takes a map
|
||||
or array as its first argument and a function as second argument. It includes
|
||||
each input element in the output if the function returns true.
|
||||
The [`select`](reference-dsl-builtin-functions.md#select) function takes a map or array as its first argument and a function as its second argument. It includes each input element in the output if the function returns true.
|
||||
|
||||
For arrays, that function should take one argument, for array element; for
|
||||
maps, it should take two, for map-element key and value. In either case it
|
||||
should return a boolean.
|
||||
For arrays, that function should take one argument, for an array element; for maps, it should take two, for a map element key and value. In either case, it should return a boolean.
|
||||
|
||||
A perhaps helpful analogy: the `select` function is to arrays and maps as the
|
||||
[`filter`](reference-verbs.md#filter) is to records.
|
||||
A perhaps helpful analogy: the `select` function is to arrays and maps as the [`filter`](reference-verbs.md#filter) verb is to records.
|
||||
|
||||
Array examples:
|
||||
|
||||
|
|
@ -123,16 +115,11 @@ Values with last digit >= 5:
|
|||
|
||||
## apply
|
||||
|
||||
The [`apply`](reference-dsl-builtin-functions.md#apply) function takes a map
|
||||
or array as its first argument and a function as second argument. It applies
|
||||
the function to each element of the array or map.
|
||||
The [`apply`](reference-dsl-builtin-functions.md#apply) function takes a map or array as its first argument and a function as its second argument. It applies the function to each element of the array or map.
|
||||
|
||||
For arrays, the function should take one argument, for array element; it should
|
||||
return a new element. For maps, it should take two, for map-element key and
|
||||
value. It should return a new key-value pair (i.e. a single-entry map).
|
||||
For arrays, the function should take one argument, representing an array element, and return a new element. For maps, it should take two, for the map element key and value. It should return a new key-value pair (i.e., a single-entry map).
|
||||
|
||||
A perhaps helpful analogy: the `apply` function is to arrays and maps as the
|
||||
[`put`](reference-verbs.md#put) is to records.
|
||||
A perhaps helpful analogy: the `apply` function is to arrays and maps as the [`put`](reference-verbs.md#put) verb is to records.
|
||||
|
||||
Array examples:
|
||||
|
||||
|
|
@ -232,17 +219,11 @@ Same, with upcased keys:
|
|||
|
||||
## reduce
|
||||
|
||||
The [`reduce`](reference-dsl-builtin-functions.md#reduce) function takes a map
|
||||
or array as its first argument and a function as second argument. It accumulates entries into a final
|
||||
output -- for example, sum or product.
|
||||
The [`reduce`](reference-dsl-builtin-functions.md#reduce) function takes a map or array as its first argument and a function as its second argument. It accumulates entries into a final output, such as a sum or product.
|
||||
|
||||
For arrays, the function should take two arguments, for accumulated value and
|
||||
array element; for maps, it should take four, for accumulated key and value
|
||||
and map-element key and value. In either case it should return the updated
|
||||
accumulator.
|
||||
For arrays, the function should take two arguments, for the accumulated value and the array element; for maps, it should take four, for the accumulated key and value, and the map-element key and value. In either case, it should return the updated accumulator.
|
||||
|
||||
The start value for the accumulator is the first element for arrays, or the
|
||||
first element's key-value pair for maps.
|
||||
The start value for the accumulator is the first element for arrays, or the first element's key-value pair for maps.
|
||||
|
||||
<pre class="pre-highlight-in-pair">
|
||||
<b>mlr -n put '</b>
|
||||
|
|
@ -370,10 +351,7 @@ String-join of values:
|
|||
|
||||
## fold
|
||||
|
||||
The [`fold`](reference-dsl-builtin-functions.md#fold) function is the same as
|
||||
`reduce`, except that instead of the starting value for the accumulation being
|
||||
taken from the first entry of the array/map, you specify it as the third
|
||||
argument.
|
||||
The [`fold`](reference-dsl-builtin-functions.md#fold) function is the same as `reduce`, except that instead of the starting value for the accumulation being taken from the first entry of the array/map, you specify it as the third argument.
|
||||
|
||||
<pre class="pre-highlight-in-pair">
|
||||
<b>mlr -n put '</b>
|
||||
|
|
@ -469,22 +447,13 @@ Sum of values with fold and 1000000 initial value:
|
|||
|
||||
## sort
|
||||
|
||||
The [`sort`](reference-dsl-builtin-functions.md#sort) function takes a map or
|
||||
array as its first argument, and it can take a function as second argument.
|
||||
Unlike the other higher-order functions, the second argument can be omitted
|
||||
when the natural ordering is desired -- ordered by array element for arrays, or by
|
||||
key for maps.
|
||||
The [`sort`](reference-dsl-builtin-functions.md#sort) function takes a map or array as its first argument, and it can take a function as its second argument. Unlike the other higher-order functions, the second argument can be omitted when the natural ordering is desired -- ordered by array element for arrays, or by key for maps.
|
||||
|
||||
As a second option, character flags such as `r` for reverse or `c` for
|
||||
case-folded lexical sort can be supplied as the second argument.
|
||||
As a second option, character flags such as `r` for reverse or `c` for case-folded lexical sort can be supplied as the second argument.
|
||||
|
||||
As a third option, a function can be supplied as the second argument.
|
||||
|
||||
For arrays, that function should take two arguments `a` and `b`, returning a
|
||||
negative, zero, or positive number as `a<b`, `a==b`, or `a>b` respectively.
|
||||
For maps, the function should take four arguments `ak`, `av`, `bk`, and `bv`,
|
||||
again returning negative, zero, or positive, using `a` and `b`'s keys and
|
||||
values.
|
||||
For arrays, that function should take two arguments `a` and `b`, returning a negative, zero, or positive number as `a<b`, `a==b`, or `a>b` respectively. For maps, the function should take four arguments `ak`, `av`, `bk`, and `bv`, again returning negative, zero, or positive, using `a`'s and `b`'s keys and values.
|
||||
|
||||
Array examples:
|
||||
|
||||
|
|
@ -703,9 +672,7 @@ red square false 6 64 77.1991 9.5310
|
|||
|
||||
## Combined examples
|
||||
|
||||
Using a paradigm from the [page on operating on all
|
||||
records](operating-on-all-records.md), we can retain a column from the input
|
||||
data as an array, then apply some higher-order functions to it:
|
||||
Using a paradigm from the [page on operating on all records](operating-on-all-records.md), we can retain a column from the input data as an array, then apply some higher-order functions to it:
|
||||
|
||||
<pre class="pre-highlight-in-pair">
|
||||
<b>mlr --c2p cat example.csv</b>
|
||||
|
|
@ -776,7 +743,7 @@ Sorted, then cubed, then summed:
|
|||
|
||||
### Remember return
|
||||
|
||||
From other languages it's easy to accidentally write
|
||||
Coming from other languages, it's easy to accidentally write
|
||||
|
||||
<pre class="pre-highlight-in-pair">
|
||||
<b>mlr -n put 'end { print select([1,2,3,4,5], func (e) { e >= 3 })}'</b>
|
||||
|
|
@ -833,7 +800,7 @@ but this does:
|
|||
2187
|
||||
</pre>
|
||||
|
||||
### Built-in functions currently unsupported as arguments
|
||||
### Built-in functions are currently unsupported as arguments
|
||||
|
||||
[Built-in functions](reference-dsl-builtin-functions.md) are, as of
|
||||
September 2021, a bit separate from [user-defined
|
||||
|
|
|
|||
|
|
@ -13,23 +13,15 @@ As of [Miller 6](new-in-miller-6.md) you can use
|
|||
intuitive operations on arrays and maps, as an alternative to things which
|
||||
would otherwise require for-loops.
|
||||
|
||||
See also the [`get_keys`](reference-dsl-builtin-functions.md#get_keys) and
|
||||
[`get_values`](reference-dsl-builtin-functions.md#get_values) functions which,
|
||||
when given a map, return an array of its keys or an array of its values,
|
||||
respectively.
|
||||
See also the [`get_keys`](reference-dsl-builtin-functions.md#get_keys) and [`get_values`](reference-dsl-builtin-functions.md#get_values) functions which, when given a map, return an array of its keys or an array of its values, respectively.
|
||||
|
||||
## select
|
||||
|
||||
The [`select`](reference-dsl-builtin-functions.md#select) function takes a map
|
||||
or array as its first argument and a function as second argument. It includes
|
||||
each input element in the output if the function returns true.
|
||||
The [`select`](reference-dsl-builtin-functions.md#select) function takes a map or array as its first argument and a function as its second argument. It includes each input element in the output if the function returns true.
|
||||
|
||||
For arrays, that function should take one argument, for array element; for
|
||||
maps, it should take two, for map-element key and value. In either case it
|
||||
should return a boolean.
|
||||
For arrays, that function should take one argument, for an array element; for maps, it should take two, for a map element key and value. In either case, it should return a boolean.
|
||||
|
||||
A perhaps helpful analogy: the `select` function is to arrays and maps as the
|
||||
[`filter`](reference-verbs.md#filter) is to records.
|
||||
A perhaps helpful analogy: the `select` function is to arrays and maps as the [`filter`](reference-verbs.md#filter) verb is to records.
|
||||
|
||||
Array examples:
|
||||
|
||||
|
|
@ -75,16 +67,11 @@ GENMD-EOF
|
|||
|
||||
## apply
|
||||
|
||||
The [`apply`](reference-dsl-builtin-functions.md#apply) function takes a map
|
||||
or array as its first argument and a function as second argument. It applies
|
||||
the function to each element of the array or map.
|
||||
The [`apply`](reference-dsl-builtin-functions.md#apply) function takes a map or array as its first argument and a function as its second argument. It applies the function to each element of the array or map.
|
||||
|
||||
For arrays, the function should take one argument, for array element; it should
|
||||
return a new element. For maps, it should take two, for map-element key and
|
||||
value. It should return a new key-value pair (i.e. a single-entry map).
|
||||
For arrays, the function should take one argument, representing an array element, and return a new element. For maps, it should take two, for the map element key and value. It should return a new key-value pair (i.e., a single-entry map).
|
||||
|
||||
A perhaps helpful analogy: the `apply` function is to arrays and maps as the
|
||||
[`put`](reference-verbs.md#put) is to records.
|
||||
A perhaps helpful analogy: the `apply` function is to arrays and maps as the [`put`](reference-verbs.md#put) verb is to records.
|
||||
|
||||
Array examples:
|
||||
|
||||
|
|
@ -134,17 +121,11 @@ GENMD-EOF
|
|||
|
||||
## reduce
|
||||
|
||||
The [`reduce`](reference-dsl-builtin-functions.md#reduce) function takes a map
|
||||
or array as its first argument and a function as second argument. It accumulates entries into a final
|
||||
output -- for example, sum or product.
|
||||
The [`reduce`](reference-dsl-builtin-functions.md#reduce) function takes a map or array as its first argument and a function as its second argument. It accumulates entries into a final output, such as a sum or product.
|
||||
|
||||
For arrays, the function should take two arguments, for accumulated value and
|
||||
array element; for maps, it should take four, for accumulated key and value
|
||||
and map-element key and value. In either case it should return the updated
|
||||
accumulator.
|
||||
For arrays, the function should take two arguments, for the accumulated value and the array element; for maps, it should take four, for the accumulated key and value, and the map-element key and value. In either case, it should return the updated accumulator.
|
||||
|
||||
The start value for the accumulator is the first element for arrays, or the
|
||||
first element's key-value pair for maps.
|
||||
The start value for the accumulator is the first element for arrays, or the first element's key-value pair for maps.
|
||||
|
||||
GENMD-RUN-COMMAND
|
||||
mlr -n put '
|
||||
|
|
@ -213,10 +194,7 @@ GENMD-EOF
|
|||
|
||||
## fold
|
||||
|
||||
The [`fold`](reference-dsl-builtin-functions.md#fold) function is the same as
|
||||
`reduce`, except that instead of the starting value for the accumulation being
|
||||
taken from the first entry of the array/map, you specify it as the third
|
||||
argument.
|
||||
The [`fold`](reference-dsl-builtin-functions.md#fold) function is the same as `reduce`, except that instead of the starting value for the accumulation being taken from the first entry of the array/map, you specify it as the third argument.
|
||||
|
||||
GENMD-RUN-COMMAND
|
||||
mlr -n put '
|
||||
|
|
@ -269,22 +247,13 @@ GENMD-EOF
|
|||
|
||||
## sort
|
||||
|
||||
The [`sort`](reference-dsl-builtin-functions.md#sort) function takes a map or
|
||||
array as its first argument, and it can take a function as second argument.
|
||||
Unlike the other higher-order functions, the second argument can be omitted
|
||||
when the natural ordering is desired -- ordered by array element for arrays, or by
|
||||
key for maps.
|
||||
The [`sort`](reference-dsl-builtin-functions.md#sort) function takes a map or array as its first argument, and it can take a function as its second argument. Unlike the other higher-order functions, the second argument can be omitted when the natural ordering is desired -- ordered by array element for arrays, or by key for maps.
|
||||
|
||||
As a second option, character flags such as `r` for reverse or `c` for
|
||||
case-folded lexical sort can be supplied as the second argument.
|
||||
As a second option, character flags such as `r` for reverse or `c` for case-folded lexical sort can be supplied as the second argument.
|
||||
|
||||
As a third option, a function can be supplied as the second argument.
|
||||
|
||||
For arrays, that function should take two arguments `a` and `b`, returning a
|
||||
negative, zero, or positive number as `a<b`, `a==b`, or `a>b` respectively.
|
||||
For maps, the function should take four arguments `ak`, `av`, `bk`, and `bv`,
|
||||
again returning negative, zero, or positive, using `a` and `b`'s keys and
|
||||
values.
|
||||
For arrays, that function should take two arguments `a` and `b`, returning a negative, zero, or positive number as `a<b`, `a==b`, or `a>b` respectively. For maps, the function should take four arguments `ak`, `av`, `bk`, and `bv`, again returning negative, zero, or positive, using `a`'s and `b`'s keys and values.
|
||||
|
||||
Array examples:
|
||||
|
||||
|
|
@ -379,9 +348,7 @@ GENMD-EOF
|
|||
|
||||
## Combined examples
|
||||
|
||||
Using a paradigm from the [page on operating on all
|
||||
records](operating-on-all-records.md), we can retain a column from the input
|
||||
data as an array, then apply some higher-order functions to it:
|
||||
Using a paradigm from the [page on operating on all records](operating-on-all-records.md), we can retain a column from the input data as an array, then apply some higher-order functions to it:
|
||||
|
||||
GENMD-RUN-COMMAND
|
||||
mlr --c2p cat example.csv
|
||||
|
|
@ -426,7 +393,7 @@ GENMD-EOF
|
|||
|
||||
### Remember return
|
||||
|
||||
From other languages it's easy to accidentally write
|
||||
Coming from other languages, it's easy to accidentally write
|
||||
|
||||
GENMD-RUN-COMMAND-TOLERATING-ERROR
|
||||
mlr -n put 'end { print select([1,2,3,4,5], func (e) { e >= 3 })}'
|
||||
|
|
@ -465,7 +432,7 @@ mlr -n put '
|
|||
'
|
||||
GENMD-EOF
|
||||
|
||||
### Built-in functions currently unsupported as arguments
|
||||
### Built-in functions are currently unsupported as arguments
|
||||
|
||||
[Built-in functions](reference-dsl-builtin-functions.md) are, as of
|
||||
September 2021, a bit separate from [user-defined
|
||||
|
|
|
|||
|
|
@ -22,7 +22,7 @@ Operators are listed on the [DSL built-in functions page](reference-dsl-builtin-
|
|||
|
||||
## Operator precedence
|
||||
|
||||
Operators are listed in order of decreasing precedence, highest first.
|
||||
Operators are listed in order of decreasing precedence, from highest to lowest.
|
||||
|
||||
| Operators | Associativity |
|
||||
|-------------------------------|---------------|
|
||||
|
|
@ -46,14 +46,13 @@ Operators are listed in order of decreasing precedence, highest first.
|
|||
| `? :` | right to left |
|
||||
| `=` | N/A for Miller (there is no $a=$b=$c) |
|
||||
|
||||
See also the [section on parsing and operator precedence in the REPL](repl.md#parsing-and-operator-precedence)
|
||||
for information on how to examine operator precedence interactively.
|
||||
See also the [section on parsing and operator precedence in the REPL](repl.md#parsing-and-operator-precedence) for information on how to examine operator precedence interactively.
|
||||
|
||||
## Operator and function semantics
|
||||
|
||||
* Functions are often pass-throughs straight to the system-standard Go libraries.
|
||||
|
||||
* The [`min`](reference-dsl-builtin-functions.md#min) and [`max`](reference-dsl-builtin-functions.md#max) functions are different from other multi-argument functions which return null if any of their inputs are null: for [`min`](reference-dsl-builtin-functions.md#min) and [`max`](reference-dsl-builtin-functions.md#max), by contrast, if one argument is absent-null, the other is returned. Empty-null loses min or max against numeric or boolean; empty-null is less than any other string.
|
||||
* The [`min`](reference-dsl-builtin-functions.md#min) and [`max`](reference-dsl-builtin-functions.md#max) functions are different from other multi-argument functions, which return null if any of their inputs are null: for [`min`](reference-dsl-builtin-functions.md#min) and [`max`](reference-dsl-builtin-functions.md#max), by contrast, if one argument is absent-null, the other is returned. Empty-null loses min or max against numeric or boolean; empty-null is less than any other string.
|
||||
|
||||
* Symmetrically with respect to the bitwise OR, AND, and XOR operators
|
||||
[`|`](reference-dsl-builtin-functions.md#bitwise-or),
|
||||
|
|
@ -71,7 +70,7 @@ for information on how to examine operator precedence interactively.
|
|||
|
||||
The main use for the `.` operator is for string concatenation: `"abc" . "def"` is `"abcdef"`.
|
||||
|
||||
However, in Miller 6 it has optional use for map traversal. Example:
|
||||
However, in Miller 6, it has an optional use for map traversal. Example:
|
||||
|
||||
<pre class="pre-highlight-in-pair">
|
||||
<b>cat data/server-log.json</b>
|
||||
|
|
@ -146,7 +145,7 @@ This also works on the left-hand sides of assignment statements:
|
|||
|
||||
A few caveats:
|
||||
|
||||
* This is why `.` has higher precedece than `+` in the table above -- in Miller 5 and below, where `.` was only used for concatenation, it had the same precedence as `+`. So you can now do this:
|
||||
* This is why `.` has higher precedence than `+` in the table above -- in Miller 5 and below, where `.` was only used for concatenation, it had the same precedence as `+`. So you can now do this:
|
||||
|
||||
<pre class="pre-highlight-in-pair">
|
||||
<b>mlr --json --from data/server-log.json put -q '</b>
|
||||
|
|
|
|||
|
|
@ -6,7 +6,7 @@ Operators are listed on the [DSL built-in functions page](reference-dsl-builtin-
|
|||
|
||||
## Operator precedence
|
||||
|
||||
Operators are listed in order of decreasing precedence, highest first.
|
||||
Operators are listed in order of decreasing precedence, from highest to lowest.
|
||||
|
||||
| Operators | Associativity |
|
||||
|-------------------------------|---------------|
|
||||
|
|
@ -30,14 +30,13 @@ Operators are listed in order of decreasing precedence, highest first.
|
|||
| `? :` | right to left |
|
||||
| `=` | N/A for Miller (there is no $a=$b=$c) |
|
||||
|
||||
See also the [section on parsing and operator precedence in the REPL](repl.md#parsing-and-operator-precedence)
|
||||
for information on how to examine operator precedence interactively.
|
||||
See also the [section on parsing and operator precedence in the REPL](repl.md#parsing-and-operator-precedence) for information on how to examine operator precedence interactively.
|
||||
|
||||
## Operator and function semantics
|
||||
|
||||
* Functions are often pass-throughs straight to the system-standard Go libraries.
|
||||
|
||||
* The [`min`](reference-dsl-builtin-functions.md#min) and [`max`](reference-dsl-builtin-functions.md#max) functions are different from other multi-argument functions which return null if any of their inputs are null: for [`min`](reference-dsl-builtin-functions.md#min) and [`max`](reference-dsl-builtin-functions.md#max), by contrast, if one argument is absent-null, the other is returned. Empty-null loses min or max against numeric or boolean; empty-null is less than any other string.
|
||||
* The [`min`](reference-dsl-builtin-functions.md#min) and [`max`](reference-dsl-builtin-functions.md#max) functions are different from other multi-argument functions, which return null if any of their inputs are null: for [`min`](reference-dsl-builtin-functions.md#min) and [`max`](reference-dsl-builtin-functions.md#max), by contrast, if one argument is absent-null, the other is returned. Empty-null loses min or max against numeric or boolean; empty-null is less than any other string.
|
||||
|
||||
* Symmetrically with respect to the bitwise OR, AND, and XOR operators
|
||||
[`|`](reference-dsl-builtin-functions.md#bitwise-or),
|
||||
|
|
@ -55,7 +54,7 @@ for information on how to examine operator precedence interactively.
|
|||
|
||||
The main use for the `.` operator is for string concatenation: `"abc" . "def"` is `"abcdef"`.
|
||||
|
||||
However, in Miller 6 it has optional use for map traversal. Example:
|
||||
However, in Miller 6, it has an optional use for map traversal. Example:
|
||||
|
||||
GENMD-RUN-COMMAND
|
||||
cat data/server-log.json
|
||||
|
|
@ -78,7 +77,7 @@ GENMD-EOF
|
|||
|
||||
A few caveats:
|
||||
|
||||
* This is why `.` has higher precedece than `+` in the table above -- in Miller 5 and below, where `.` was only used for concatenation, it had the same precedence as `+`. So you can now do this:
|
||||
* This is why `.` has higher precedence than `+` in the table above -- in Miller 5 and below, where `.` was only used for concatenation, it had the same precedence as `+`. So you can now do this:
|
||||
|
||||
GENMD-RUN-COMMAND
|
||||
mlr --json --from data/server-log.json put -q '
|
||||
|
|
|
|||
|
|
@ -22,15 +22,15 @@ You can **output** variable-values or expressions in **five ways**:
|
|||
|
||||
* Use **emit1**/**emit**/**emitp**/**emitf** to send out-of-stream variables' current values to the output record stream, e.g. `@sum += $x; emit1 @sum` which produces an extra record such as `sum=3.1648382`. These records, just like records from input file(s), participate in downstream [then-chaining](reference-main-then-chaining.md) to other verbs.
|
||||
|
||||
* Use the **print** or **eprint** keywords which immediately print an expression *directly to standard output or standard error*, respectively. Note that `dump`, `edump`, `print`, and `eprint` don't output records which participate in `then`-chaining; rather, they're just immediate prints to stdout/stderr. The `printn` and `eprintn` keywords are the same except that they don't print final newlines. Additionally, you can print to a specified file instead of stdout/stderr.
|
||||
* Use the **print** or **eprint** keywords, which immediately print an expression *directly to standard output or standard error*, respectively. Note that `dump`, `edump`, `print`, and `eprint` don't output records that participate in `then`-chaining; rather, they're just immediate prints to stdout/stderr. The `printn` and `eprintn` keywords are the same except that they don't print final newlines. Additionally, you can print to a specified file instead of stdout/stderr.
|
||||
|
||||
* Use the **dump** or **edump** keywords, which *immediately print all out-of-stream variables as a JSON data structure to the standard output or standard error* (respectively).
|
||||
|
||||
* Use **tee** which formats the current stream record (not just an arbitrary string as with **print**) to a specific file.
|
||||
* Use **tee**, which formats the current stream record (not just an arbitrary string as with **print**) to a specific file.
|
||||
|
||||
For the first two options you are populating the output-records stream which feeds into the next verb in a `then`-chain (if any), or which otherwise is formatted for output using `--o...` flags.
|
||||
For the first two options, you are populating the output-records stream which feeds into the next verb in a `then`-chain (if any), or which otherwise is formatted for output using `--o...` flags.
|
||||
|
||||
For the last three options you are sending output directly to standard output, standard error, or a file.
|
||||
For the last three options, you are sending output directly to standard output, standard error, or a file.
|
||||
|
||||
## Print statements
|
||||
|
||||
|
|
@ -38,7 +38,7 @@ The `print` statement is perhaps self-explanatory, but with a few light caveats:
|
|||
|
||||
* There are four variants: `print` goes to stdout with final newline, `printn` goes to stdout without final newline (you can include one using "\n" in your output string), `eprint` goes to stderr with final newline, and `eprintn` goes to stderr without final newline.
|
||||
|
||||
* Output goes directly to stdout/stderr, respectively: data produced this way do not go downstream to the next verb in a `then`-chain. (Use `emit` for that.)
|
||||
* Output goes directly to stdout/stderr, respectively: data produced this way does not go downstream to the next verb in a `then`-chain. (Use `emit` for that.)
|
||||
|
||||
* Print statements are for strings (`print "hello"`), or things which can be made into strings: numbers (`print 3`, `print $a + $b`), or concatenations thereof (`print "a + b = " . ($a + $b)`). Maps (in `$*`, map-valued out-of-stream or local variables, and map literals) as well as arrays are printed as JSON.
|
||||
|
||||
|
|
@ -62,9 +62,9 @@ The `dump` statement is for printing expressions, including maps, directly to st
|
|||
|
||||
* There are two variants: `dump` prints to stdout; `edump` prints to stderr.
|
||||
|
||||
* Output goes directly to stdout/stderr, respectively: data produced this way do not go downstream to the next verb in a `then`-chain. (Use `emit` for that.)
|
||||
* Output goes directly to stdout/stderr, respectively: data produced this way does not go downstream to the next verb in a `then`-chain. (Use `emit` for that.)
|
||||
|
||||
* You can use `dump` to output single strings, numbers, or expressions including map-valued data. Map-valued data are printed as JSON.
|
||||
* You can use `dump` to output single strings, numbers, or expressions including map-valued data. Map-valued data is printed as JSON.
|
||||
|
||||
* If you use `dump` (or `edump`) with no arguments, you get a JSON structure representing the current values of all out-of-stream variables.
|
||||
|
||||
|
|
@ -76,7 +76,7 @@ The `dump` statement is for printing expressions, including maps, directly to st
|
|||
|
||||
Records produced by a `mlr put` go downstream to the next verb in your `then`-chain, if any, or otherwise to standard output. If you want to additionally copy out records to files, you can do that using `tee`.
|
||||
|
||||
The syntax is, by example:
|
||||
The syntax is, for example:
|
||||
|
||||
<pre class="pre-highlight-non-pair">
|
||||
<b>mlr --from myfile.dat put 'tee > "tap.dat", $*' then sort -n index</b>
|
||||
|
|
@ -84,8 +84,7 @@ The syntax is, by example:
|
|||
|
||||
First is `tee >`, then the filename expression (which can be an expression such as `"tap.".$a.".dat"`), then a comma, then `$*`. (Nothing else but `$*` is teeable.)
|
||||
|
||||
You can also write to a variable file name -- for example, you can split a
|
||||
single file into multiple ones on field names:
|
||||
You can also write to a variable file name -- for example, you can split a single file into multiple ones on field names:
|
||||
|
||||
<pre class="pre-highlight-in-pair">
|
||||
<b>mlr --csv cat example.csv</b>
|
||||
|
|
@ -324,26 +323,12 @@ There are four variants: `emit1`, `emitf`, `emit`, and `emitp`. These are used
|
|||
to insert new records into the record stream -- or, optionally, redirect them
|
||||
to files.
|
||||
|
||||
Keep in mind that out-of-stream variables are a nested, multi-level
|
||||
[map](reference-main-maps.md) (directly viewable as JSON using `dump`), while
|
||||
Miller record values are as well during processing -- but records may be
|
||||
flattened down for output to tabular formats. See the page [Flatten/unflatten:
|
||||
JSON vs. tabular formats](flatten-unflatten.md) for more information.
|
||||
Keep in mind that out-of-stream variables are a nested, multi-level [map](reference-main-maps.md) (directly viewable as JSON using `dump`), while Miller record values are as well during processing -- but records may be flattened down for output to tabular formats. See the page [Flatten/unflatten: JSON vs. tabular formats](flatten-unflatten.md) for more information.
|
||||
|
||||
* You can use `emit1` to emit any map-valued expression, including `$*`,
|
||||
map-valued out-of-stream variables, the entire out-of-stream-variable
|
||||
collection `@*`, map-valued local variables, map literals, or map-valued
|
||||
function return values.
|
||||
* For `emit`, `emitp`, and `emitf`, you can emit map-valued local variables,
|
||||
map-valued field attributes (with `$`), map-valued out-of-stream variables (with
|
||||
`@`), `$*`, `@*`, or map literals (with outermost `{...}`) -- but not arbitrary
|
||||
expressions which evaluate to map (such as function return values).
|
||||
* You can use `emit1` to emit any map-valued expression, including `$*`, map-valued out-of-stream variables, the entire out-of-stream-variable collection `@*`, map-valued local variables, map literals, or map-valued function return values.
|
||||
* For `emit`, `emitp`, and `emitf`, you can emit map-valued local variables, map-valued field attributes (with `$`), map-valued out-of-stream variables (with `@`), `$*`, `@*`, or map literals (with outermost `{...}`) -- but not arbitrary expressions which evaluate to a map (such as function return values).
|
||||
|
||||
The reason for this is part historical and part technical. As we'll see below,
|
||||
you can do lots of syntactical things with `emit`, `emitp`, and `emitf`,
|
||||
including printing them side-by-side, index them, redirect the output to files,
|
||||
etc. What this means syntactically is that Miller's parser needs to handle all
|
||||
sorts of commas, parentheses, and so on:
|
||||
The reason for this is partly historical and partly technical. As we'll see below, you can do lots of syntactical things with `emit`, `emitp`, and `emitf`, including printing them side-by-side, indexing them, redirecting the output to files, etc. What this means syntactically is that Miller's parser needs to handle all sorts of commas, parentheses, and so on:
|
||||
|
||||
<pre class="pre-non-highlight-non-pair">
|
||||
emitf @count, @sum
|
||||
|
|
@ -352,12 +337,7 @@ sorts of commas, parentheses, and so on:
|
|||
# etc
|
||||
</pre>
|
||||
|
||||
When we try to allow `emitf`/`emit`/`emitp` to handle arbitrary map-valued
|
||||
expressions, like `mapexcept($*, mymap)` and so on, this inserts more syntactic
|
||||
complexity in terms of commas, parentheses, and so on. The technical term is
|
||||
_LR-1 shift-reduce conflicts_, but we can simply think of this in terms of the
|
||||
parser not being able to efficiently disambiguate all the punctuational
|
||||
opportunities.
|
||||
When we try to allow `emitf`/`emit`/`emitp` to handle arbitrary map-valued expressions, like `mapexcept($*, mymap)` and so on, this inserts more syntactic complexity in terms of commas, parentheses, and so on. The technical term is _LR-1 shift-reduce conflicts_, but we can think of this in terms of the parser being unable to efficiently disambiguate all the punctuational opportunities.
|
||||
|
||||
So, `emit1` can handle syntactic richness in the one thing being emitted;
|
||||
`emitf`, `emit`, and `emitp` can handle syntactic richness in the side-by-side
|
||||
|
|
@ -365,7 +345,7 @@ placement, indexing, and redirection.
|
|||
|
||||
(Mnemonic: If all you want is to insert a new record into the record stream, `emit1` is probably the _one_ you want.)
|
||||
|
||||
What this means is that if you want to emit an expression which evaluates to a map, you can do quite simply
|
||||
What this means is that if you want to emit an expression that evaluates to a map, you can do it quite simply:
|
||||
|
||||
<pre class="pre-highlight-in-pair">
|
||||
<b>mlr --c2p --from example.csv put -q '</b>
|
||||
|
|
@ -386,7 +366,7 @@ id color shape flag k index quantity rate
|
|||
10 purple square false 10 91 72.3735 8.2430
|
||||
</pre>
|
||||
|
||||
And if you want indexing, redirects, etc., just assign to a temporary variable and use one of the other emit variants:
|
||||
And if you want indexing, redirects, etc., just assign to a temporary variable and use one of the other `emit` variants:
|
||||
|
||||
<pre class="pre-highlight-in-pair">
|
||||
<b>mlr --c2p --from example.csv put -q '</b>
|
||||
|
|
@ -410,7 +390,7 @@ id color shape flag k index quantity rate
|
|||
|
||||
## Emitf statements
|
||||
|
||||
Use **emitf** to output several out-of-stream variables side-by-side in the same output record. For `emitf` these mustn't have indexing using `@name[...]`. Example:
|
||||
Use **emitf** to output several out-of-stream variables side-by-side in the same output record. For `emitf`, these mustn't have indexing using `@name[...]`. Example:
|
||||
|
||||
<pre class="pre-highlight-in-pair">
|
||||
<b>mlr put -q '</b>
|
||||
|
|
@ -426,7 +406,7 @@ count=5,x_sum=2.26476,y_sum=2.585083
|
|||
|
||||
## Emit statements
|
||||
|
||||
Use **emit** to output an out-of-stream variable. If it's non-indexed you'll get a simple key-value pair:
|
||||
Use **emit** to output an out-of-stream variable. If it's non-indexed, you'll get a simple key-value pair:
|
||||
|
||||
<pre class="pre-highlight-in-pair">
|
||||
<b>cat data/small</b>
|
||||
|
|
@ -455,7 +435,7 @@ a=wye,b=pan,i=5,x=0.573288,y=0.863624
|
|||
sum=2.26476
|
||||
</pre>
|
||||
|
||||
If it's indexed then use as many names after `emit` as there are indices:
|
||||
If it's indexed, then use as many names after `emit` as there are indices:
|
||||
|
||||
<pre class="pre-highlight-in-pair">
|
||||
<b>mlr put -q '@sum[$a] += $x; end { dump }' data/small</b>
|
||||
|
|
@ -624,8 +604,7 @@ sum.wye.wye 0.204603
|
|||
sum.wye.pan 0.573288
|
||||
</pre>
|
||||
|
||||
Use **--flatsep** to specify the character which joins multilevel
|
||||
keys for `emitp` (it defaults to a colon):
|
||||
Use **--flatsep** to specify the character that joins multilevel keys for `emitp` (it defaults to a colon):
|
||||
|
||||
<pre class="pre-highlight-in-pair">
|
||||
<b>mlr --flatsep / put -q '@sum[$a][$b] += $x; end { emitp @sum, "a" }' data/small</b>
|
||||
|
|
@ -703,11 +682,11 @@ hat hat 182.8535323148762 381 0.47993053101017374
|
|||
hat pan 168.5538067327806 363 0.4643355557376876
|
||||
</pre>
|
||||
|
||||
What this does is walk through the first out-of-stream variable (`@x_sum` in this example) as usual, then for each keylist found (e.g. `pan,wye`), include the values for the remaining out-of-stream variables (here, `@x_count` and `@x_mean`). You should use this when all out-of-stream variables in the emit statement have **the same shape and the same keylists**.
|
||||
What this does is walk through the first out-of-stream variable (`@x_sum` in this example) as usual, then for each keylist found (e.g., `pan,wye`), include the values for the remaining out-of-stream variables (here, `@x_count` and `@x_mean`). You should use this when all out-of-stream variables in the emit statement have **the same shape and the same keylists**.
|
||||
|
||||
## Emit-all statements
|
||||
|
||||
Use **emit all** (or `emit @*` which is synonymous) to output all out-of-stream variables. You can use the following idiom to get various accumulators output side-by-side (reminiscent of `mlr stats1`):
|
||||
Use **emit all** (or `emit @*`, which is synonymous) to output all out-of-stream variables. You can use the following idiom to get various accumulators' output side-by-side (reminiscent of `mlr stats1`):
|
||||
|
||||
<pre class="pre-highlight-in-pair">
|
||||
<b>mlr --from data/small --opprint put -q '</b>
|
||||
|
|
|
|||
|
|
@ -6,15 +6,15 @@ You can **output** variable-values or expressions in **five ways**:
|
|||
|
||||
* Use **emit1**/**emit**/**emitp**/**emitf** to send out-of-stream variables' current values to the output record stream, e.g. `@sum += $x; emit1 @sum` which produces an extra record such as `sum=3.1648382`. These records, just like records from input file(s), participate in downstream [then-chaining](reference-main-then-chaining.md) to other verbs.
|
||||
|
||||
* Use the **print** or **eprint** keywords which immediately print an expression *directly to standard output or standard error*, respectively. Note that `dump`, `edump`, `print`, and `eprint` don't output records which participate in `then`-chaining; rather, they're just immediate prints to stdout/stderr. The `printn` and `eprintn` keywords are the same except that they don't print final newlines. Additionally, you can print to a specified file instead of stdout/stderr.
|
||||
* Use the **print** or **eprint** keywords, which immediately print an expression *directly to standard output or standard error*, respectively. Note that `dump`, `edump`, `print`, and `eprint` don't output records that participate in `then`-chaining; rather, they're just immediate prints to stdout/stderr. The `printn` and `eprintn` keywords are the same except that they don't print final newlines. Additionally, you can print to a specified file instead of stdout/stderr.
|
||||
|
||||
* Use the **dump** or **edump** keywords, which *immediately print all out-of-stream variables as a JSON data structure to the standard output or standard error* (respectively).
|
||||
|
||||
* Use **tee** which formats the current stream record (not just an arbitrary string as with **print**) to a specific file.
|
||||
* Use **tee**, which formats the current stream record (not just an arbitrary string as with **print**) to a specific file.
|
||||
|
||||
For the first two options you are populating the output-records stream which feeds into the next verb in a `then`-chain (if any), or which otherwise is formatted for output using `--o...` flags.
|
||||
For the first two options, you are populating the output-records stream which feeds into the next verb in a `then`-chain (if any), or which otherwise is formatted for output using `--o...` flags.
|
||||
|
||||
For the last three options you are sending output directly to standard output, standard error, or a file.
|
||||
For the last three options, you are sending output directly to standard output, standard error, or a file.
|
||||
|
||||
## Print statements
|
||||
|
||||
|
|
@ -22,7 +22,7 @@ The `print` statement is perhaps self-explanatory, but with a few light caveats:
|
|||
|
||||
* There are four variants: `print` goes to stdout with final newline, `printn` goes to stdout without final newline (you can include one using "\n" in your output string), `eprint` goes to stderr with final newline, and `eprintn` goes to stderr without final newline.
|
||||
|
||||
* Output goes directly to stdout/stderr, respectively: data produced this way do not go downstream to the next verb in a `then`-chain. (Use `emit` for that.)
|
||||
* Output goes directly to stdout/stderr, respectively: data produced this way does not go downstream to the next verb in a `then`-chain. (Use `emit` for that.)
|
||||
|
||||
* Print statements are for strings (`print "hello"`), or things which can be made into strings: numbers (`print 3`, `print $a + $b`), or concatenations thereof (`print "a + b = " . ($a + $b)`). Maps (in `$*`, map-valued out-of-stream or local variables, and map literals) as well as arrays are printed as JSON.
|
||||
|
||||
|
|
@ -46,9 +46,9 @@ The `dump` statement is for printing expressions, including maps, directly to st
|
|||
|
||||
* There are two variants: `dump` prints to stdout; `edump` prints to stderr.
|
||||
|
||||
* Output goes directly to stdout/stderr, respectively: data produced this way do not go downstream to the next verb in a `then`-chain. (Use `emit` for that.)
|
||||
* Output goes directly to stdout/stderr, respectively: data produced this way does not go downstream to the next verb in a `then`-chain. (Use `emit` for that.)
|
||||
|
||||
* You can use `dump` to output single strings, numbers, or expressions including map-valued data. Map-valued data are printed as JSON.
|
||||
* You can use `dump` to output single strings, numbers, or expressions including map-valued data. Map-valued data is printed as JSON.
|
||||
|
||||
* If you use `dump` (or `edump`) with no arguments, you get a JSON structure representing the current values of all out-of-stream variables.
|
||||
|
||||
|
|
@ -60,7 +60,7 @@ The `dump` statement is for printing expressions, including maps, directly to st
|
|||
|
||||
Records produced by a `mlr put` go downstream to the next verb in your `then`-chain, if any, or otherwise to standard output. If you want to additionally copy out records to files, you can do that using `tee`.
|
||||
|
||||
The syntax is, by example:
|
||||
The syntax is, for example:
|
||||
|
||||
GENMD-CARDIFY-HIGHLIGHT-ONE
|
||||
mlr --from myfile.dat put 'tee > "tap.dat", $*' then sort -n index
|
||||
|
|
@ -68,8 +68,7 @@ GENMD-EOF
|
|||
|
||||
First is `tee >`, then the filename expression (which can be an expression such as `"tap.".$a.".dat"`), then a comma, then `$*`. (Nothing else but `$*` is teeable.)
|
||||
|
||||
You can also write to a variable file name -- for example, you can split a
|
||||
single file into multiple ones on field names:
|
||||
You can also write to a variable file name -- for example, you can split a single file into multiple ones on field names:
|
||||
|
||||
GENMD-RUN-COMMAND
|
||||
mlr --csv cat example.csv
|
||||
|
|
@ -135,26 +134,12 @@ There are four variants: `emit1`, `emitf`, `emit`, and `emitp`. These are used
|
|||
to insert new records into the record stream -- or, optionally, redirect them
|
||||
to files.
|
||||
|
||||
Keep in mind that out-of-stream variables are a nested, multi-level
|
||||
[map](reference-main-maps.md) (directly viewable as JSON using `dump`), while
|
||||
Miller record values are as well during processing -- but records may be
|
||||
flattened down for output to tabular formats. See the page [Flatten/unflatten:
|
||||
JSON vs. tabular formats](flatten-unflatten.md) for more information.
|
||||
Keep in mind that out-of-stream variables are a nested, multi-level [map](reference-main-maps.md) (directly viewable as JSON using `dump`), while Miller record values are as well during processing -- but records may be flattened down for output to tabular formats. See the page [Flatten/unflatten: JSON vs. tabular formats](flatten-unflatten.md) for more information.
|
||||
|
||||
* You can use `emit1` to emit any map-valued expression, including `$*`,
|
||||
map-valued out-of-stream variables, the entire out-of-stream-variable
|
||||
collection `@*`, map-valued local variables, map literals, or map-valued
|
||||
function return values.
|
||||
* For `emit`, `emitp`, and `emitf`, you can emit map-valued local variables,
|
||||
map-valued field attributes (with `$`), map-va out-of-stream variables (with
|
||||
`@`), `$*`, `@*`, or map literals (with outermost `{...}`) -- but not arbitrary
|
||||
expressions which evaluate to map (such as function return values).
|
||||
* You can use `emit1` to emit any map-valued expression, including `$*`, map-valued out-of-stream variables, the entire out-of-stream-variable collection `@*`, map-valued local variables, map literals, or map-valued function return values.
|
||||
* For `emit`, `emitp`, and `emitf`, you can emit map-valued local variables, map-valued field attributes (with `$`), map-valued out-of-stream variables (with `@`), `$*`, `@*`, or map literals (with outermost `{...}`) -- but not arbitrary expressions which evaluate to a map (such as function return values).
|
||||
|
||||
The reason for this is part historical and part technical. As we'll see below,
|
||||
you can do lots of syntactical things with `emit`, `emitp`, and `emitf`,
|
||||
including printing them side-by-side, index them, redirect the output to files,
|
||||
etc. What this means syntactically is that Miller's parser needs to handle all
|
||||
sorts of commas, parentheses, and so on:
|
||||
The reason for this is partly historical and partly technical. As we'll see below, you can do lots of syntactical things with `emit`, `emitp`, and `emitf`, including printing them side-by-side, indexing them, redirecting the output to files, etc. What this means syntactically is that Miller's parser needs to handle all sorts of commas, parentheses, and so on:
|
||||
|
||||
GENMD-CARDIFY
|
||||
emitf @count, @sum
|
||||
|
|
@ -163,12 +148,7 @@ GENMD-CARDIFY
|
|||
# etc
|
||||
GENMD-EOF
|
||||
|
||||
When we try to allow `emitf`/`emit`/`emitp` to handle arbitrary map-valued
|
||||
expressions, like `mapexcept($*, mymap)` and so on, this inserts more syntactic
|
||||
complexity in terms of commas, parentheses, and so on. The technical term is
|
||||
_LR-1 shift-reduce conflicts_, but we can simply think of this in terms of the
|
||||
parser not being able to efficiently disambiguate all the punctuational
|
||||
opportunities.
|
||||
When we try to allow `emitf`/`emit`/`emitp` to handle arbitrary map-valued expressions, like `mapexcept($*, mymap)` and so on, this inserts more syntactic complexity in terms of commas, parentheses, and so on. The technical term is _LR-1 shift-reduce conflicts_, but we can think of this in terms of the parser being unable to efficiently disambiguate all the punctuational opportunities.
|
||||
|
||||
So, `emit1` can handle syntactic richness in the one thing being emitted;
|
||||
`emitf`, `emit`, and `emitp` can handle syntactic richness in the side-by-side
|
||||
|
|
@ -176,7 +156,7 @@ placement, indexing, and redirection.
|
|||
|
||||
(Mnemonic: If all you want is to insert a new record into the record stream, `emit1` is probably the _one_ you want.)
|
||||
|
||||
What this means is that if you want to emit an expression which evaluates to a map, you can do quite simply
|
||||
What this means is that if you want to emit an expression that evaluates to a map, you can do it quite simply:
|
||||
|
||||
GENMD-RUN-COMMAND
|
||||
mlr --c2p --from example.csv put -q '
|
||||
|
|
@ -184,7 +164,7 @@ mlr --c2p --from example.csv put -q '
|
|||
'
|
||||
GENMD-EOF
|
||||
|
||||
And if you want indexing, redirects, etc., just assign to a temporary variable and use one of the other emit variants:
|
||||
And if you want indexing, redirects, etc., just assign to a temporary variable and use one of the other `emit` variants:
|
||||
|
||||
GENMD-RUN-COMMAND
|
||||
mlr --c2p --from example.csv put -q '
|
||||
|
|
@ -195,7 +175,7 @@ GENMD-EOF
|
|||
|
||||
## Emitf statements
|
||||
|
||||
Use **emitf** to output several out-of-stream variables side-by-side in the same output record. For `emitf` these mustn't have indexing using `@name[...]`. Example:
|
||||
Use **emitf** to output several out-of-stream variables side-by-side in the same output record. For `emitf`, these mustn't have indexing using `@name[...]`. Example:
|
||||
|
||||
GENMD-RUN-COMMAND
|
||||
mlr put -q '
|
||||
|
|
@ -208,7 +188,7 @@ GENMD-EOF
|
|||
|
||||
## Emit statements
|
||||
|
||||
Use **emit** to output an out-of-stream variable. If it's non-indexed you'll get a simple key-value pair:
|
||||
Use **emit** to output an out-of-stream variable. If it's non-indexed, you'll get a simple key-value pair:
|
||||
|
||||
GENMD-RUN-COMMAND
|
||||
cat data/small
|
||||
|
|
@ -222,7 +202,7 @@ GENMD-RUN-COMMAND
|
|||
mlr put -q '@sum += $x; end { emit @sum }' data/small
|
||||
GENMD-EOF
|
||||
|
||||
If it's indexed then use as many names after `emit` as there are indices:
|
||||
If it's indexed, then use as many names after `emit` as there are indices:
|
||||
|
||||
GENMD-RUN-COMMAND
|
||||
mlr put -q '@sum[$a] += $x; end { dump }' data/small
|
||||
|
|
@ -277,8 +257,7 @@ GENMD-RUN-COMMAND
|
|||
mlr --oxtab put -q '@sum[$a][$b] += $x; end { emitp @sum }' data/small
|
||||
GENMD-EOF
|
||||
|
||||
Use **--flatsep** to specify the character which joins multilevel
|
||||
keys for `emitp` (it defaults to a colon):
|
||||
Use **--flatsep** to specify the character that joins multilevel keys for `emitp` (it defaults to a colon):
|
||||
|
||||
GENMD-RUN-COMMAND
|
||||
mlr --flatsep / put -q '@sum[$a][$b] += $x; end { emitp @sum, "a" }' data/small
|
||||
|
|
@ -313,11 +292,11 @@ mlr --from data/medium --opprint put -q '
|
|||
'
|
||||
GENMD-EOF
|
||||
|
||||
What this does is walk through the first out-of-stream variable (`@x_sum` in this example) as usual, then for each keylist found (e.g. `pan,wye`), include the values for the remaining out-of-stream variables (here, `@x_count` and `@x_mean`). You should use this when all out-of-stream variables in the emit statement have **the same shape and the same keylists**.
|
||||
What this does is walk through the first out-of-stream variable (`@x_sum` in this example) as usual, then for each keylist found (e.g., `pan,wye`), include the values for the remaining out-of-stream variables (here, `@x_count` and `@x_mean`). You should use this when all out-of-stream variables in the emit statement have **the same shape and the same keylists**.
|
||||
|
||||
## Emit-all statements
|
||||
|
||||
Use **emit all** (or `emit @*` which is synonymous) to output all out-of-stream variables. You can use the following idiom to get various accumulators output side-by-side (reminiscent of `mlr stats1`):
|
||||
Use **emit all** (or `emit @*`, which is synonymous) to output all out-of-stream variables. You can use the following idiom to get various accumulators' output side-by-side (reminiscent of `mlr stats1`):
|
||||
|
||||
GENMD-RUN-COMMAND
|
||||
mlr --from data/small --opprint put -q '
|
||||
|
|
|
|||
|
|
@ -63,7 +63,7 @@ hat wye 10002 0.321507044286237609 0.568893318795083758 5 9 4 2 data/s
|
|||
pan zee 10003 0.272054845593895200 0.425789896597056627 5 10 5 2 data/small2
|
||||
</pre>
|
||||
|
||||
Anything from a `#` character to end of line is a code comment.
|
||||
Anything from a `#` character to the end of the line is a code comment.
|
||||
|
||||
<pre class="pre-highlight-in-pair">
|
||||
<b>mlr --opprint filter '($x > 0.5 && $y < 0.5) || ($x < 0.5 && $y > 0.5)' \</b>
|
||||
|
|
@ -147,11 +147,11 @@ a=eks,b=wye,i=4,x=0.381399,y=0.134188,xy=0.40431623334340655
|
|||
a=wye,b=pan,i=5,x=0.573288,y=0.863624,xy=1.036583592538489
|
||||
</pre>
|
||||
|
||||
A suggested use-case here is defining functions in files, and calling them from command-line expressions.
|
||||
A suggested use case here is defining functions in files and calling them from command-line expressions.
|
||||
|
||||
Another suggested use-case is putting default parameter values in files, e.g. using `begin{@count=is_present(@count)?@count:10}` in the file, where you can precede that using `begin{@count=40}` using `-e`.
|
||||
Another suggested use case is putting default parameter values in files, e.g., using `begin{@count=is_present(@count)?@count:10}` in the file, where you can precede that using `begin{@count=40}` using `-e`.
|
||||
|
||||
Moreover, you can have one or more `-f` expressions (maybe one function per file, for example) and one or more `-e` expressions on the command line. If you mix `-f` and `-e` then the expressions are evaluated in the order encountered.
|
||||
Moreover, you can have one or more `-f` expressions (maybe one function per file, for example) and one or more `-e` expressions on the command line. If you mix `-f` and `-e`, then the expressions are evaluated in the order encountered.
|
||||
|
||||
## Semicolons, commas, newlines, and curly braces
|
||||
|
||||
|
|
@ -180,7 +180,7 @@ x=1,y=2,3=,4=,5=,6=,7=,8=,9=,10=,foo=bar
|
|||
x=1,y=2,3=,4=,5=,6=,7=,8=,9=,10=,foo=bar
|
||||
</pre>
|
||||
|
||||
Semicolons are required between statements even if those statements are on separate lines. **Newlines** are for your convenience but have no syntactic meaning: line endings do not terminate statements. For example, adjacent assignment statements must be separated by semicolons even if those statements are on separate lines:
|
||||
Semicolons are required between statements, even if those statements are on separate lines. **Newlines** are for your convenience but have no syntactic meaning: line endings do not terminate statements. For example, adjacent assignment statements must be separated by semicolons even if those statements are on separate lines:
|
||||
|
||||
<pre class="pre-non-highlight-non-pair">
|
||||
mlr put '
|
||||
|
|
|
|||
|
|
@ -21,7 +21,7 @@ mlr --opprint put '
|
|||
' data/small data/small2
|
||||
GENMD-EOF
|
||||
|
||||
Anything from a `#` character to end of line is a code comment.
|
||||
Anything from a `#` character to the end of the line is a code comment.
|
||||
|
||||
GENMD-RUN-COMMAND
|
||||
mlr --opprint filter '($x > 0.5 && $y < 0.5) || ($x < 0.5 && $y > 0.5)' \
|
||||
|
|
@ -62,11 +62,11 @@ GENMD-RUN-COMMAND
|
|||
mlr --from data/small put -f data/fe-example-4.mlr -e '$xy = f($x, $y)'
|
||||
GENMD-EOF
|
||||
|
||||
A suggested use-case here is defining functions in files, and calling them from command-line expressions.
|
||||
A suggested use case here is defining functions in files and calling them from command-line expressions.
|
||||
|
||||
Another suggested use-case is putting default parameter values in files, e.g. using `begin{@count=is_present(@count)?@count:10}` in the file, where you can precede that using `begin{@count=40}` using `-e`.
|
||||
Another suggested use case is putting default parameter values in files, e.g., using `begin{@count=is_present(@count)?@count:10}` in the file, where you can precede that using `begin{@count=40}` using `-e`.
|
||||
|
||||
Moreover, you can have one or more `-f` expressions (maybe one function per file, for example) and one or more `-e` expressions on the command line. If you mix `-f` and `-e` then the expressions are evaluated in the order encountered.
|
||||
Moreover, you can have one or more `-f` expressions (maybe one function per file, for example) and one or more `-e` expressions on the command line. If you mix `-f` and `-e`, then the expressions are evaluated in the order encountered.
|
||||
|
||||
## Semicolons, commas, newlines, and curly braces
|
||||
|
||||
|
|
@ -84,7 +84,7 @@ GENMD-RUN-COMMAND
|
|||
echo x=1,y=2 | mlr put 'while (NF < 10) { $[NF+1] = ""}; $foo = "bar"'
|
||||
GENMD-EOF
|
||||
|
||||
Semicolons are required between statements even if those statements are on separate lines. **Newlines** are for your convenience but have no syntactic meaning: line endings do not terminate statements. For example, adjacent assignment statements must be separated by semicolons even if those statements are on separate lines:
|
||||
Semicolons are required between statements, even if those statements are on separate lines. **Newlines** are for your convenience but have no syntactic meaning: line endings do not terminate statements. For example, adjacent assignment statements must be separated by semicolons even if those statements are on separate lines:
|
||||
|
||||
GENMD-INCLUDE-ESCAPED(data/newline-example.txt)
|
||||
|
||||
|
|
|
|||
|
|
@ -16,7 +16,7 @@ Quick links:
|
|||
</div>
|
||||
# DSL user-defined functions
|
||||
|
||||
As of Miller 5.0.0 you can define your own functions, as well as subroutines.
|
||||
As of Miller 5.0.0, you can define your own functions, as well as subroutines.
|
||||
|
||||
## User-defined functions
|
||||
|
||||
|
|
@ -49,7 +49,7 @@ wye pan 5 0.573288 0.863624 211.38663947090302 120
|
|||
|
||||
Properties of user-defined functions:
|
||||
|
||||
* Function bodies start with `func` and a parameter list, defined outside of `begin`, `end`, or other `func` or `subr` blocks. (I.e. the Miller DSL has no nested functions.)
|
||||
* Function bodies start with `func` and a parameter list, defined outside of `begin`, `end`, or other `func` or `subr` blocks. (I.e., the Miller DSL has no nested functions.)
|
||||
|
||||
* A function (uniqified by its name) may not be redefined: either by redefining a user-defined function, or by redefining a built-in function. However, functions and subroutines have separate namespaces: you can define a subroutine `log` (for logging messages to stderr, say) which does not clash with the mathematical `log` (logarithm) function.
|
||||
|
||||
|
|
@ -61,7 +61,7 @@ Properties of user-defined functions:
|
|||
|
||||
* When a return value is not explicitly returned, this results in a return value of [absent-null](reference-main-null-data.md). (In the example above, if there were records for which the argument to `f` is non-numeric, the assignments would be skipped.) See also the [null-data reference page](reference-main-null-data.md).
|
||||
|
||||
* See the section on [Local variables](reference-dsl-variables.md#local-variables) for information on scope and extent of arguments, as well as for information on the use of local variables within functions.
|
||||
* See the section on [Local variables](reference-dsl-variables.md#local-variables) for information on the scope and extent of arguments, as well as for information on the use of local variables within functions.
|
||||
|
||||
* See the section on [Expressions from files](reference-dsl-syntax.md#expressions-from-files) for information on the use of `-f` and `-e` flags.
|
||||
|
||||
|
|
@ -103,7 +103,7 @@ numcalls=15
|
|||
|
||||
Properties of user-defined subroutines:
|
||||
|
||||
* Subroutine bodies start with `subr` and a parameter list, defined outside of `begin`, `end`, or other `func` or `subr` blocks. (I.e. the Miller DSL has no nested subroutines.)
|
||||
* Subroutine bodies start with `subr` and a parameter list, defined outside of `begin`, `end`, or other `func` or `subr` blocks. (I.e., the Miller DSL has no nested subroutines.)
|
||||
|
||||
* A subroutine (uniqified by its name) may not be redefined. However, functions and subroutines have separate namespaces: you can define a subroutine `log` which does not clash with the mathematical `log` function.
|
||||
|
||||
|
|
@ -115,7 +115,7 @@ Properties of user-defined subroutines:
|
|||
|
||||
* Argument values may be reassigned: they are not read-only.
|
||||
|
||||
* See the section on [local variables](reference-dsl-variables.md#local-variables) for information on scope and extent of arguments, as well as for information on the use of local variables within functions.
|
||||
* See the section on [local variables](reference-dsl-variables.md#local-variables) for information on the scope and extent of arguments, as well as for information on the use of local variables within functions.
|
||||
|
||||
* See the section on [Expressions from files](reference-dsl-syntax.md#expressions-from-files) for information on the use of `-f` and `-e` flags.
|
||||
|
||||
|
|
@ -123,15 +123,11 @@ Properties of user-defined subroutines:
|
|||
|
||||
Subroutines cannot return values, and they are invoked by the keyword `call`.
|
||||
|
||||
In hindsight, subroutines needn't have been invented. If `foo` is a function
|
||||
then you can write `foo(1,2,3)` while ignoring its return value, and that plays
|
||||
the role of subroutine quite well.
|
||||
In hindsight, subroutines needn't have been invented. If `foo` is a function, then you can write `foo(1,2,3)` while ignoring its return value, and that plays the role of a subroutine quite well.
|
||||
|
||||
## Loading a library of functions
|
||||
|
||||
If you have a file with UDFs you use frequently, say `my-udfs.mlr`, you can use
|
||||
`--load` or `--mload` to define them for your Miller scripts. For example, in
|
||||
your shell,
|
||||
If you have a file with UDFs you use frequently, say `my-udfs.mlr`, you can use `--load` or `--mload` to define them for your Miller scripts. For example, in your shell,
|
||||
|
||||
<pre class="pre-highlight-non-pair">
|
||||
<b>alias mlr='mlr --load ~/my-functions.mlr'</b>
|
||||
|
|
@ -149,8 +145,7 @@ See the [miscellaneous-flags page](reference-main-flag-list.md#miscellaneous-fla
|
|||
|
||||
You can define unnamed functions and assign them to variables, or pass them to functions.
|
||||
|
||||
See also the [page on higher-order functions](reference-dsl-higher-order-functions.md)
|
||||
for more information on
|
||||
See also the [page on higher-order functions](reference-dsl-higher-order-functions.md) for more information on
|
||||
[`select`](reference-dsl-builtin-functions.md#select),
|
||||
[`apply`](reference-dsl-builtin-functions.md#apply),
|
||||
[`reduce`](reference-dsl-builtin-functions.md#reduce),
|
||||
|
|
@ -209,9 +204,7 @@ purple square false 10 91 72.3735 8.2430 purple:square above
|
|||
|
||||
Note that you need a semicolon after the closing curly brace of the function literal.
|
||||
|
||||
Unlike named functions, function literals (also known as unnamed functions)
|
||||
have access to local variables defined in their enclosing scope. That's
|
||||
so you can do things like this:
|
||||
Unlike named functions, function literals (also known as unnamed functions) have access to local variables defined in their enclosing scope. That's so you can do things like this:
|
||||
|
||||
<pre class="pre-highlight-in-pair">
|
||||
<b>mlr --c2p --from example.csv put '</b>
|
||||
|
|
|
|||
|
|
@ -1,6 +1,6 @@
|
|||
# DSL user-defined functions
|
||||
|
||||
As of Miller 5.0.0 you can define your own functions, as well as subroutines.
|
||||
As of Miller 5.0.0, you can define your own functions, as well as subroutines.
|
||||
|
||||
## User-defined functions
|
||||
|
||||
|
|
@ -25,7 +25,7 @@ GENMD-EOF
|
|||
|
||||
Properties of user-defined functions:
|
||||
|
||||
* Function bodies start with `func` and a parameter list, defined outside of `begin`, `end`, or other `func` or `subr` blocks. (I.e. the Miller DSL has no nested functions.)
|
||||
* Function bodies start with `func` and a parameter list, defined outside of `begin`, `end`, or other `func` or `subr` blocks. (I.e., the Miller DSL has no nested functions.)
|
||||
|
||||
* A function (uniqified by its name) may not be redefined: either by redefining a user-defined function, or by redefining a built-in function. However, functions and subroutines have separate namespaces: you can define a subroutine `log` (for logging messages to stderr, say) which does not clash with the mathematical `log` (logarithm) function.
|
||||
|
||||
|
|
@ -37,7 +37,7 @@ Properties of user-defined functions:
|
|||
|
||||
* When a return value is not explicitly returned, this results in a return value of [absent-null](reference-main-null-data.md). (In the example above, if there were records for which the argument to `f` is non-numeric, the assignments would be skipped.) See also the [null-data reference page](reference-main-null-data.md).
|
||||
|
||||
* See the section on [Local variables](reference-dsl-variables.md#local-variables) for information on scope and extent of arguments, as well as for information on the use of local variables within functions.
|
||||
* See the section on [Local variables](reference-dsl-variables.md#local-variables) for information on the scope and extent of arguments, as well as for information on the use of local variables within functions.
|
||||
|
||||
* See the section on [Expressions from files](reference-dsl-syntax.md#expressions-from-files) for information on the use of `-f` and `-e` flags.
|
||||
|
||||
|
|
@ -67,7 +67,7 @@ GENMD-EOF
|
|||
|
||||
Properties of user-defined subroutines:
|
||||
|
||||
* Subroutine bodies start with `subr` and a parameter list, defined outside of `begin`, `end`, or other `func` or `subr` blocks. (I.e. the Miller DSL has no nested subroutines.)
|
||||
* Subroutine bodies start with `subr` and a parameter list, defined outside of `begin`, `end`, or other `func` or `subr` blocks. (I.e., the Miller DSL has no nested subroutines.)
|
||||
|
||||
* A subroutine (uniqified by its name) may not be redefined. However, functions and subroutines have separate namespaces: you can define a subroutine `log` which does not clash with the mathematical `log` function.
|
||||
|
||||
|
|
@ -79,7 +79,7 @@ Properties of user-defined subroutines:
|
|||
|
||||
* Argument values may be reassigned: they are not read-only.
|
||||
|
||||
* See the section on [local variables](reference-dsl-variables.md#local-variables) for information on scope and extent of arguments, as well as for information on the use of local variables within functions.
|
||||
* See the section on [local variables](reference-dsl-variables.md#local-variables) for information on the scope and extent of arguments, as well as for information on the use of local variables within functions.
|
||||
|
||||
* See the section on [Expressions from files](reference-dsl-syntax.md#expressions-from-files) for information on the use of `-f` and `-e` flags.
|
||||
|
||||
|
|
@ -87,15 +87,11 @@ Properties of user-defined subroutines:
|
|||
|
||||
Subroutines cannot return values, and they are invoked by the keyword `call`.
|
||||
|
||||
In hindsight, subroutines needn't have been invented. If `foo` is a function
|
||||
then you can write `foo(1,2,3)` while ignoring its return value, and that plays
|
||||
the role of subroutine quite well.
|
||||
In hindsight, subroutines needn't have been invented. If `foo` is a function, then you can write `foo(1,2,3)` while ignoring its return value, and that plays the role of a subroutine quite well.
|
||||
|
||||
## Loading a library of functions
|
||||
|
||||
If you have a file with UDFs you use frequently, say `my-udfs.mlr`, you can use
|
||||
`--load` or `--mload` to define them for your Miller scripts. For example, in
|
||||
your shell,
|
||||
If you have a file with UDFs you use frequently, say `my-udfs.mlr`, you can use `--load` or `--mload` to define them for your Miller scripts. For example, in your shell,
|
||||
|
||||
GENMD-CARDIFY-HIGHLIGHT-ONE
|
||||
alias mlr='mlr --load ~/my-functions.mlr'
|
||||
|
|
@ -113,8 +109,7 @@ See the [miscellaneous-flags page](reference-main-flag-list.md#miscellaneous-fla
|
|||
|
||||
You can define unnamed functions and assign them to variables, or pass them to functions.
|
||||
|
||||
See also the [page on higher-order functions](reference-dsl-higher-order-functions.md)
|
||||
for more information on
|
||||
See also the [page on higher-order functions](reference-dsl-higher-order-functions.md) for more information on
|
||||
[`select`](reference-dsl-builtin-functions.md#select),
|
||||
[`apply`](reference-dsl-builtin-functions.md#apply),
|
||||
[`reduce`](reference-dsl-builtin-functions.md#reduce),
|
||||
|
|
@ -147,9 +142,7 @@ GENMD-EOF
|
|||
|
||||
Note that you need a semicolon after the closing curly brace of the function literal.
|
||||
|
||||
Unlike named functions, function literals (also known as unnamed functions)
|
||||
have access to local variables defined in their enclosing scope. That's
|
||||
so you can do things like this:
|
||||
Unlike named functions, function literals (also known as unnamed functions) have access to local variables defined in their enclosing scope. That's so you can do things like this:
|
||||
|
||||
GENMD-RUN-COMMAND
|
||||
mlr --c2p --from example.csv put '
|
||||
|
|
|
|||
|
|
@ -18,11 +18,11 @@ Quick links:
|
|||
|
||||
Miller has the following kinds of variables:
|
||||
|
||||
**Fields of stream records**, accessed using the `$` prefix. These refer to fields of the current data-stream record. For example, in `echo x=1,y=2 | mlr put '$z = $x + $y'`, `$x` and `$y` refer to input fields, and `$z` refers to a new, computed output field. In a few contexts, presented below, you can refer to the entire record as `$*`.
|
||||
**Fields of stream records**, accessed using the `$` prefix. These refer to fields of the current data-stream record. For example, in `echo x=1,y=2 | mlr put '$z = $x + $y'`, `$x` and `$y` refer to input fields, and `$z` refers to a new, computed output field. In the following contexts, you can refer to the entire record as `$*`.
|
||||
|
||||
**Out-of-stream variables** accessed using the `@` prefix. These refer to data which persist from one record to the next, including in `begin` and `end` blocks (which execute before/after the record stream is consumed, respectively). You use them to remember values across records, such as sums, differences, counters, and so on. In a few contexts, presented below, you can refer to the entire out-of-stream-variables collection as `@*`.
|
||||
**Out-of-stream variables** accessed using the `@` prefix. These refer to data that persists from one record to the next, including in `begin` and `end` blocks (which execute before/after the record stream is consumed, respectively). You use them to remember values across records, such as sums, differences, and counters, among other things. In the following contexts, you can refer to the entire out-of-stream-variables collection as `@*`.
|
||||
|
||||
**Local variables** are limited in scope and extent to the current statements being executed: these include function arguments, bound variables in for loops, and local variables.
|
||||
**Local variables** are limited in scope and extent to the current statements being executed, including function arguments, bound variables in for loops, and local variables.
|
||||
|
||||
**Built-in variables** such as `NF`, `NR`, `FILENAME`, `M_PI`, and `M_E`. These are all capital letters and are read-only (although some of them change value from one record to another).
|
||||
|
||||
|
|
@ -32,7 +32,7 @@ Miller has the following kinds of variables:
|
|||
|
||||
Names of fields within stream records must be specified using a `$` in [filter and put expressions](reference-dsl.md), even though the dollar signs don't appear in the data stream itself. For integer-indexed data, this looks like `awk`'s `$1,$2,$3`, except that Miller allows non-numeric names such as `$quantity` or `$hostname`. Likewise, enclose string literals in double quotes in `filter` expressions even though they don't appear in file data. In particular, `mlr filter '$x=="abc"'` passes through the record `x=abc`.
|
||||
|
||||
If field names have **special characters** such as `.` then you can use braces, e.g. `'${field.name}'`.
|
||||
If field names have **special characters** such as `.`, then you can use braces, e.g. `'${field.name}'`.
|
||||
|
||||
You may also use a **computed field name** in square brackets, e.g.
|
||||
|
||||
|
|
@ -55,7 +55,7 @@ Their **extent** is limited to the current record; their **scope** is the `filte
|
|||
|
||||
These are **read-write**: you can do `$y=2*$x`, `$x=$x+1`, etc.
|
||||
|
||||
Records are Miller's output: field names present in the input stream are passed through to output (written to standard output) unless fields are removed with `cut`, or records are excluded with `filter` or `put -q`, etc. Simply assign a value to a field and it will be output.
|
||||
Records are Miller's output: field names present in the input stream are passed through to output (written to standard output) unless fields are removed with `cut`, or records are excluded with `filter` or `put -q`, etc. Simply assign a value to a field, and it will be output.
|
||||
|
||||
## Positional field names
|
||||
|
||||
|
|
@ -63,7 +63,7 @@ Even though Miller's main selling point is name-indexing, sometimes you really w
|
|||
|
||||
Use `$[[3]]` to access the name of field 3. More generally, any expression evaluating to an integer can go between `$[[` and `]]`.
|
||||
|
||||
Then using a computed field name, `$[ $[[3]] ]` is the value in the third field. This has the shorter equivalent notation `$[[[3]]]`.
|
||||
Then, using a computed field name, `$[ $[[3]] ]` is the value in the third field. This has the shorter equivalent notation `$[[[3]]]`.
|
||||
|
||||
<pre class="pre-highlight-in-pair">
|
||||
<b>mlr cat data/small</b>
|
||||
|
|
@ -131,7 +131,7 @@ a=eks,b=wye,i=4,x=NEW,y=0.134188
|
|||
a=wye,b=pan,i=5,x=0.573288,y=NEW
|
||||
</pre>
|
||||
|
||||
Right-hand side accesses to non-existent fields -- i.e. with index less than 1 or greater than `NF` -- return an absent value. Likewise, left-hand side accesses only refer to fields which already exist. For example, if a field has 5 records then assigning the name or value of the 6th (or 600th) field results in a no-op.
|
||||
Right-hand side accesses to non-existent fields -- i.e., with index less than 1 or greater than `NF` -- return an absent value. Likewise, left-hand side accesses only refer to fields that already exist. For example, if a record has 5 fields, then assigning the name or value of the 6th (or 600th) field results in a no-op.
|
||||
|
||||
<pre class="pre-highlight-in-pair">
|
||||
<b>mlr put '$[[6]] = "NEW"' data/small</b>
|
||||
|
|
@ -157,13 +157,13 @@ a=wye,b=pan,i=5,x=0.573288,y=0.863624
|
|||
|
||||
!!! note
|
||||
|
||||
You can use positional field names only in the [Miller DSL](reference-dsl.md), i.e. only with the verbs `put` and `filter`.
|
||||
You can use positional field names only in the [Miller DSL](reference-dsl.md), i.e., only with the verbs `put` and `filter`.
|
||||
|
||||
## Out-of-stream variables
|
||||
|
||||
These are prefixed with an at-sign, e.g. `@sum`. Furthermore, unlike built-in variables and stream-record fields, they are maintained in an arbitrarily nested map: you can do `@sum += $quantity`, or `@sum[$color] += $quantity`, or `@sum[$color][$shape] += $quantity`. The keys for the multi-level map can be any expression which evaluates to string or integer: e.g. `@sum[NR] = $a + $b`, `@sum[$a."-".$b] = $x`, etc.
|
||||
These are prefixed with an at-sign, e.g., `@sum`. Furthermore, unlike built-in variables and stream-record fields, they are maintained in an arbitrarily nested map: you can do `@sum += $quantity`, or `@sum[$color] += $quantity`, or `@sum[$color][$shape] += $quantity`. The keys for the multi-level map can be any expression that evaluates to string or integer: e.g. `@sum[NR] = $a + $b`, `@sum[$a."-".$b] = $x`, etc.
|
||||
|
||||
Their names and their values are entirely under your control; they change only when you assign to them.
|
||||
Their names and their values are entirely under your control; they change only when you assign to them.
|
||||
|
||||
Just as for field names in stream records, if you want to define out-of-stream variables with **special characters** such as `.` then you can use braces, e.g. `'@{variable.name}["index"]'`.
|
||||
|
||||
|
|
@ -198,13 +198,13 @@ sum=5
|
|||
sum=50
|
||||
</pre>
|
||||
|
||||
Out-of-stream variables' **extent** is from the start to the end of the record stream, i.e. every time the `put` or `filter` statement referring to them is executed.
|
||||
Out-of-stream variables' **extent** is from the start to the end of the record stream, i.e., every time the `put` or `filter` statement referring to them is executed.
|
||||
|
||||
Out-of-stream variables are **read-write**: you can do `$sum=@sum`, `@sum=$sum`, etc.
|
||||
|
||||
## Indexed out-of-stream variables
|
||||
|
||||
Using an index on the `@count` and `@sum` variables, we get the benefit of the `-g` (group-by) option which `mlr stats1` and various other Miller commands have:
|
||||
Using an index on the `@count` and `@sum` variables, we get the benefit of the `-g` (group-by) option, which `mlr stats1` and various other Miller commands have:
|
||||
|
||||
<pre class="pre-highlight-in-pair">
|
||||
<b>mlr put -q '</b>
|
||||
|
|
@ -309,8 +309,8 @@ Local variables are similar to out-of-stream variables, except that their extent
|
|||
For example:
|
||||
|
||||
<pre class="pre-highlight-in-pair">
|
||||
<b># Here I'm using a specified random-number seed so this example always</b>
|
||||
<b># produces the same output for this web document: in everyday practice we</b>
|
||||
<b># Here I'm using a specified random-number seed, so this example always</b>
|
||||
<b># produces the same output for this web document: in everyday practice, we</b>
|
||||
<b># would leave off the --seed 12345 part.</b>
|
||||
<b>mlr --seed 12345 seqgen --start 1 --stop 10 then put '</b>
|
||||
<b> func f(a, b) { # function arguments a and b</b>
|
||||
|
|
@ -341,7 +341,7 @@ i=10,o=15.37686787628025
|
|||
|
||||
Things which are completely unsurprising, resembling many other languages:
|
||||
|
||||
* Parameter names are bound to their arguments but can be reassigned, e.g. if there is a parameter named `a` then you can reassign the value of `a` to be something else within the function if you like.
|
||||
* Parameter names are bound to their arguments but can be reassigned, e.g., if there is a parameter named `a`, then you can reassign the value of `a` to be something else within the function if you like.
|
||||
|
||||
* However, you cannot redeclare the *type* of an argument or a local: `var a=1; var a=2` is an error but `var a=1; a=2` is OK.
|
||||
|
||||
|
|
@ -355,13 +355,13 @@ Things which are completely unsurprising, resembling many other languages:
|
|||
|
||||
Things which are perhaps surprising compared to other languages:
|
||||
|
||||
* Type declarations using `var`, or typed using `num`, `int`, `float`, `str`, `bool`, `arr`, `map`, `funct` are not necessary to declare local variables. Function arguments and variables bound in for-loops over stream records and out-of-stream variables are *implicitly* declared using `var`. (Some examples are shown below.)
|
||||
* Type declarations using `var`, or typed using `num`, `int`, `float`, `str`, `bool`, `arr`, `map`, `funct`, are not necessary to declare local variables. Function arguments and variables bound in for-loops over stream records and out-of-stream variables are *implicitly* declared using `var`. (Some examples are shown below.)
|
||||
|
||||
* Type-checking is done at assignment time. For example, `float f = 0` is an error (since `0` is an integer), as is `float f = 0.0; f = 1`. For this reason I prefer to use `num` over `float` in most contexts since `num` encompasses integer and floating-point values. More information is at [Type-checking](reference-dsl-variables.md#type-checking).
|
||||
* Type-checking is done at assignment time. For example, `float f = 0` is an error (since `0` is an integer), as is `float f = 0.0; f = 1`. For this reason, I prefer to use `num` over `float` in most contexts, as `num` encompasses both integer and floating-point values. For more information, refer to [Type-checking](reference-dsl-variables.md#type-checking).
|
||||
|
||||
* Bound variables in for-loops over stream records and out-of-stream variables are implicitly local to that block. E.g. in `for (k, v in $*) { ... }` and `for ((k1, k2), v in @*) { ... }`, if there are `k`, `v`, etc. in the enclosing scope, then those will be masked by the loop-local bound variables in the loop, and moreover the values of the loop-local bound variables are not available after the end of the loop.
|
||||
|
||||
* For C-style triple-for loops, if a for-loop variable is defined using `var`, `int`, etc. then it is scoped to that for-loop. E.g. `for (i = 0; i < 10; i += 1) { ... }` and `for (int i = 0; i < 10; i += 1) { ... }`. (This is unsurprising.). If there is no typedecl and an outer-scope variable of that name exists, then it is used. (This is also unsurprising.) But if there is no outer-scope variable of that name, then the variable is scoped to the for-loop only.
|
||||
* For C-style triple-for loops, if a for-loop variable is defined using `var`, `int`, etc., then it is scoped to that for-loop. E.g. `for (i = 0; i < 10; i += 1) { ... }` and `for (int i = 0; i < 10; i += 1) { ... }`. (This is unsurprising.) If there is no typedecl and an outer-scope variable of that name exists, then it is used. (This is also unsurprising.) But if there is no outer-scope variable of that name, then the variable is scoped to the for-loop only.
|
||||
|
||||
The following example demonstrates the scope rules:
|
||||
|
||||
|
|
@ -478,7 +478,7 @@ print "outer j =", j; # j is undefined in this scope.
|
|||
|
||||
## Map literals
|
||||
|
||||
Miller's `put`/`filter` DSL has four kinds of maps. **Stream records** are (single-level) maps from name to value. **Out-of-stream variables** and **local variables** can also be maps, although they can be multi-level maps (e.g. `@sum[$x][$y]`). The fourth kind is **map literals**. These cannot be on the left-hand side of assignment expressions. Syntactically they look like JSON, although Miller allows string and integer keys in its map literals while JSON allows only string keys (e.g. `"3"` rather than `3`). Note though that integer keys become stringified in Miller: `@mymap[3]=4` results in `@mymap` being `{"3":4}`.
|
||||
Miller's `put`/`filter` DSL has four kinds of maps. **Stream records** are (single-level) maps from name to value. **Out-of-stream variables** and **local variables** can also be maps, although they can be multi-level maps (e.g. `@sum[$x][$y]`). The fourth kind is **map literals**. These cannot be on the left-hand side of assignment expressions. Syntactically, they look like JSON, although Miller allows string and integer keys in its map literals while JSON allows only string keys (e.g., `"3"` rather than `3`). Note, though, that integer keys become stringified in Miller: `@mymap[3]=4` results in `@mymap` being `{"3":4}`.
|
||||
|
||||
For example, the following swaps the input stream's `a` and `i` fields, modifies `y`, and drops the rest:
|
||||
|
||||
|
|
@ -565,7 +565,7 @@ there are the read-only separator variables `IRS`, `ORS`, `IFS`, `OFS`, `IPS`,
|
|||
and `OPS` as discussed on the [separators page](reference-main-separators.md),
|
||||
and the flatten/unflatten separator `FLATSEP` discussed on the
|
||||
[flatten/unflatten page](flatten-unflatten.md). Lastly, the `ENV` map allows
|
||||
read/write access to environment variables, e.g. `ENV["HOME"]` or
|
||||
read/write access to environment variables, e.g., `ENV["HOME"]` or
|
||||
`ENV["foo_".$hostname]` or `ENV["VERSION"]="1.2.3"`.
|
||||
|
||||
<!--- TODO: FLATSEP IFLATSEP OFLATSEP --->
|
||||
|
|
@ -608,7 +608,7 @@ system environment variables at the time Miller starts. Any changes made to
|
|||
`ENV` by assigning to it will affect any subprocesses, such as using
|
||||
[piped tee](reference-dsl-output-statements.md#redirected-output-statements).
|
||||
|
||||
Their **scope is global**: you can refer to them in any `filter` or `put` statement. Their values are assigned by the input-record reader:
|
||||
Their **scope is global**: you can refer to them in any `filter` or `put` statement. The input-record reader assigns their values:
|
||||
|
||||
<pre class="pre-highlight-in-pair">
|
||||
<b>mlr --csv put '$nr = NR' data/a.csv</b>
|
||||
|
|
@ -634,11 +634,11 @@ a,b,c,nr
|
|||
|
||||
The **extent** is for the duration of the put/filter: in a `begin` statement (which executes before the first input record is consumed) you will find `NR=1` and in an `end` statement (which is executed after the last input record is consumed) you will find `NR` to be the total number of records ingested.
|
||||
|
||||
These are all **read-only** for the `mlr put` and `mlr filter` DSL: they may be assigned from, e.g. `$nr=NR`, but they may not be assigned to: `NR=100` is a syntax error.
|
||||
These are all **read-only** for the `mlr put` and `mlr filter` DSL: they may be assigned from, e.g., `$nr=NR`, but they may not be assigned to: `NR=100` is a syntax error.
|
||||
|
||||
## Type-checking
|
||||
|
||||
Miller's `put`/`filter` DSL supports two optional kinds of type-checking. One is inline **type-tests** and **type-assertions** within expressions. The other is **type declarations** for assignments to local variables, binding of arguments to user-defined functions, and return values from user-defined functions, These are discussed in the following subsections.
|
||||
Miller's `put`/`filter` DSL supports two optional kinds of type-checking. One is inline **type tests** and **type assertions** within expressions. The other is **type declarations** for assignments to local variables, binding of arguments to user-defined functions, and return values from user-defined functions. These are discussed in the following subsections.
|
||||
|
||||
Use of type-checking is entirely up to you: omit it if you want flexibility with heterogeneous data; use it if you want to help catch misspellings in your DSL code or unexpected irregularities in your input data.
|
||||
|
||||
|
|
@ -699,22 +699,22 @@ asserting_string
|
|||
|
||||
See [Data-cleaning Examples](data-cleaning-examples.md) for examples of how to use these.
|
||||
|
||||
### Type-declarations for local variables, function parameter, and function return values
|
||||
### Type declarations for local variables, function parameters, and function return values
|
||||
|
||||
Local variables can be defined either untyped as in `x = 1`, or typed as in `int x = 1`. Types include **var** (explicitly untyped), **int**, **float**, **num** (int or float), **str**, **bool**, **arr**, **map**, and **funct**. These optional type declarations are enforced at the time values are assigned to variables: whether at the initial value assignment as in `int x = 1` or in any subsequent assignments to the same variable farther down in the scope.
|
||||
|
||||
The reason for `num` is that `int` and `float` typedecls are very precise:
|
||||
|
||||
<pre class="pre-non-highlight-non-pair">
|
||||
float a = 0; # Runtime error since 0 is int not float
|
||||
int b = 1.0; # Runtime error since 1.0 is float not int
|
||||
float a = 0; # Runtime error since 0 is int, not float
|
||||
int b = 1.0; # Runtime error since 1.0 is float, not int
|
||||
num c = 0; # OK
|
||||
num d = 1.0; # OK
|
||||
</pre>
|
||||
|
||||
A suggestion is to use `num` for general use when you want numeric content, and use `int` when you genuinely want integer-only values, e.g. in loop indices or map keys (since Miller map keys can only be strings or ints).
|
||||
A suggestion is to use `num` for general use when you want numeric content, and use `int` when you genuinely want integer-only values, e.g., in loop indices or map keys (since Miller map keys can only be strings or ints).
|
||||
|
||||
The `var` type declaration indicates no type restrictions, e.g. `var x = 1` has the same type restrictions on `x` as `x = 1`. The difference is in intentional shadowing: if you have `x = 1` in outer scope and `x = 2` in inner scope (e.g. within a for-loop or an if-statement) then outer-scope `x` has value 2 after the second assignment. But if you have `var x = 2` in the inner scope, then you are declaring a variable scoped to the inner block.) For example:
|
||||
The `var` type declaration indicates no type restrictions, e.g., `var x = 1` has the same type restrictions on `x` as `x = 1`. The difference is in intentional shadowing: if you have `x = 1` in outer scope and `x = 2` in inner scope (e.g., within a for-loop or an if-statement), then outer-scope `x` has value 2 after the second assignment. But if you have `var x = 2` in the inner scope, then you are declaring a variable scoped to the inner block. For example:
|
||||
|
||||
<pre class="pre-non-highlight-non-pair">
|
||||
x = 1;
|
||||
|
|
@ -732,7 +732,7 @@ if (NR == 4) {
|
|||
print x; # Value of this x is still 1
|
||||
</pre>
|
||||
|
||||
Likewise function arguments can optionally be typed, with type enforced when the function is called:
|
||||
Likewise, function arguments can optionally be typed, with type enforced when the function is called:
|
||||
|
||||
<pre class="pre-non-highlight-non-pair">
|
||||
func f(map m, int i) {
|
||||
|
|
@ -764,7 +764,7 @@ func f(map m, int i): bool {
|
|||
}
|
||||
...
|
||||
...
|
||||
# In Miller if your functions don't explicitly return a value, they return absent-null.
|
||||
# In Miller, if your functions don't explicitly return a value, they return absent-null.
|
||||
# So it would also be a runtime error on reaching the end of this function without
|
||||
# an explicit return statement.
|
||||
}
|
||||
|
|
@ -845,7 +845,7 @@ Example recursive copy of out-of-stream variables:
|
|||
}
|
||||
</pre>
|
||||
|
||||
Example of out-of-stream variable assigned to full stream record, where the 2nd record is stashed, and the 4th record is overwritten with that:
|
||||
Example of an out-of-stream variable assigned to the full stream record, where the 2nd record is stashed, and the 4th record is overwritten with that:
|
||||
|
||||
<pre class="pre-highlight-in-pair">
|
||||
<b>mlr put 'NR == 2 {@keep = $*}; NR == 4 {$* = @keep}' data/small</b>
|
||||
|
|
|
|||
|
|
@ -2,11 +2,11 @@
|
|||
|
||||
Miller has the following kinds of variables:
|
||||
|
||||
**Fields of stream records**, accessed using the `$` prefix. These refer to fields of the current data-stream record. For example, in `echo x=1,y=2 | mlr put '$z = $x + $y'`, `$x` and `$y` refer to input fields, and `$z` refers to a new, computed output field. In a few contexts, presented below, you can refer to the entire record as `$*`.
|
||||
**Fields of stream records**, accessed using the `$` prefix. These refer to fields of the current data-stream record. For example, in `echo x=1,y=2 | mlr put '$z = $x + $y'`, `$x` and `$y` refer to input fields, and `$z` refers to a new, computed output field. In the following contexts, you can refer to the entire record as `$*`.
|
||||
|
||||
**Out-of-stream variables** accessed using the `@` prefix. These refer to data which persist from one record to the next, including in `begin` and `end` blocks (which execute before/after the record stream is consumed, respectively). You use them to remember values across records, such as sums, differences, counters, and so on. In a few contexts, presented below, you can refer to the entire out-of-stream-variables collection as `@*`.
|
||||
**Out-of-stream variables** accessed using the `@` prefix. These refer to data that persists from one record to the next, including in `begin` and `end` blocks (which execute before/after the record stream is consumed, respectively). You use them to remember values across records, such as sums, differences, and counters, among other things. In the following contexts, you can refer to the entire out-of-stream-variables collection as `@*`.
|
||||
|
||||
**Local variables** are limited in scope and extent to the current statements being executed: these include function arguments, bound variables in for loops, and local variables.
|
||||
**Local variables** are limited in scope and extent to the current statements being executed, including function arguments, bound variables in for loops, and local variables.
|
||||
|
||||
**Built-in variables** such as `NF`, `NR`, `FILENAME`, `M_PI`, and `M_E`. These are all capital letters and are read-only (although some of them change value from one record to another).
|
||||
|
||||
|
|
@ -16,7 +16,7 @@ Miller has the following kinds of variables:
|
|||
|
||||
Names of fields within stream records must be specified using a `$` in [filter and put expressions](reference-dsl.md), even though the dollar signs don't appear in the data stream itself. For integer-indexed data, this looks like `awk`'s `$1,$2,$3`, except that Miller allows non-numeric names such as `$quantity` or `$hostname`. Likewise, enclose string literals in double quotes in `filter` expressions even though they don't appear in file data. In particular, `mlr filter '$x=="abc"'` passes through the record `x=abc`.
|
||||
|
||||
If field names have **special characters** such as `.` then you can use braces, e.g. `'${field.name}'`.
|
||||
If field names have **special characters** such as `.`, then you can use braces, e.g. `'${field.name}'`.
|
||||
|
||||
You may also use a **computed field name** in square brackets, e.g.
|
||||
|
||||
|
|
@ -36,7 +36,7 @@ Their **extent** is limited to the current record; their **scope** is the `filte
|
|||
|
||||
These are **read-write**: you can do `$y=2*$x`, `$x=$x+1`, etc.
|
||||
|
||||
Records are Miller's output: field names present in the input stream are passed through to output (written to standard output) unless fields are removed with `cut`, or records are excluded with `filter` or `put -q`, etc. Simply assign a value to a field and it will be output.
|
||||
Records are Miller's output: field names present in the input stream are passed through to output (written to standard output) unless fields are removed with `cut`, or records are excluded with `filter` or `put -q`, etc. Simply assign a value to a field, and it will be output.
|
||||
|
||||
## Positional field names
|
||||
|
||||
|
|
@ -44,7 +44,7 @@ Even though Miller's main selling point is name-indexing, sometimes you really w
|
|||
|
||||
Use `$[[3]]` to access the name of field 3. More generally, any expression evaluating to an integer can go between `$[[` and `]]`.
|
||||
|
||||
Then using a computed field name, `$[ $[[3]] ]` is the value in the third field. This has the shorter equivalent notation `$[[[3]]]`.
|
||||
Then, using a computed field name, `$[ $[[3]] ]` is the value in the third field. This has the shorter equivalent notation `$[[[3]]]`.
|
||||
|
||||
GENMD-RUN-COMMAND
|
||||
mlr cat data/small
|
||||
|
|
@ -70,7 +70,7 @@ GENMD-RUN-COMMAND
|
|||
mlr put '$[[[NR]]] = "NEW"' data/small
|
||||
GENMD-EOF
|
||||
|
||||
Right-hand side accesses to non-existent fields -- i.e. with index less than 1 or greater than `NF` -- return an absent value. Likewise, left-hand side accesses only refer to fields which already exist. For example, if a field has 5 records then assigning the name or value of the 6th (or 600th) field results in a no-op.
|
||||
Right-hand side accesses to non-existent fields -- i.e., with index less than 1 or greater than `NF` -- return an absent value. Likewise, left-hand side accesses only refer to fields that already exist. For example, if a record has 5 fields, then assigning the name or value of the 6th (or 600th) field results in a no-op.
|
||||
|
||||
GENMD-RUN-COMMAND
|
||||
mlr put '$[[6]] = "NEW"' data/small
|
||||
|
|
@ -82,13 +82,13 @@ GENMD-EOF
|
|||
|
||||
!!! note
|
||||
|
||||
You can use positional field names only in the [Miller DSL](reference-dsl.md), i.e. only with the verbs `put` and `filter`.
|
||||
You can use positional field names only in the [Miller DSL](reference-dsl.md), i.e., only with the verbs `put` and `filter`.
|
||||
|
||||
## Out-of-stream variables
|
||||
|
||||
These are prefixed with an at-sign, e.g. `@sum`. Furthermore, unlike built-in variables and stream-record fields, they are maintained in an arbitrarily nested map: you can do `@sum += $quantity`, or `@sum[$color] += $quantity`, or `@sum[$color][$shape] += $quantity`. The keys for the multi-level map can be any expression which evaluates to string or integer: e.g. `@sum[NR] = $a + $b`, `@sum[$a."-".$b] = $x`, etc.
|
||||
These are prefixed with an at-sign, e.g., `@sum`. Furthermore, unlike built-in variables and stream-record fields, they are maintained in an arbitrarily nested map: you can do `@sum += $quantity`, or `@sum[$color] += $quantity`, or `@sum[$color][$shape] += $quantity`. The keys for the multi-level map can be any expression that evaluates to string or integer: e.g. `@sum[NR] = $a + $b`, `@sum[$a."-".$b] = $x`, etc.
|
||||
|
||||
Their names and their values are entirely under your control; they change only when you assign to them.
|
||||
Their names and their values are entirely under your control; they change only when you assign to them.
|
||||
|
||||
Just as for field names in stream records, if you want to define out-of-stream variables with **special characters** such as `.` then you can use braces, e.g. `'@{variable.name}["index"]'`.
|
||||
|
||||
|
|
@ -110,13 +110,13 @@ mlr put '@sum += $a; end {emit @sum}' \
|
|||
data/a.dkvp
|
||||
GENMD-EOF
|
||||
|
||||
Out-of-stream variables' **extent** is from the start to the end of the record stream, i.e. every time the `put` or `filter` statement referring to them is executed.
|
||||
Out-of-stream variables' **extent** is from the start to the end of the record stream, i.e., every time the `put` or `filter` statement referring to them is executed.
|
||||
|
||||
Out-of-stream variables are **read-write**: you can do `$sum=@sum`, `@sum=$sum`, etc.
|
||||
|
||||
## Indexed out-of-stream variables
|
||||
|
||||
Using an index on the `@count` and `@sum` variables, we get the benefit of the `-g` (group-by) option which `mlr stats1` and various other Miller commands have:
|
||||
Using an index on the `@count` and `@sum` variables, we get the benefit of the `-g` (group-by) option, which `mlr stats1` and various other Miller commands have:
|
||||
|
||||
GENMD-RUN-COMMAND
|
||||
mlr put -q '
|
||||
|
|
@ -173,8 +173,8 @@ Local variables are similar to out-of-stream variables, except that their extent
|
|||
For example:
|
||||
|
||||
GENMD-RUN-COMMAND
|
||||
# Here I'm using a specified random-number seed so this example always
|
||||
# produces the same output for this web document: in everyday practice we
|
||||
# Here I'm using a specified random-number seed, so this example always
|
||||
# produces the same output for this web document: in everyday practice, we
|
||||
# would leave off the --seed 12345 part.
|
||||
mlr --seed 12345 seqgen --start 1 --stop 10 then put '
|
||||
func f(a, b) { # function arguments a and b
|
||||
|
|
@ -193,7 +193,7 @@ GENMD-EOF
|
|||
|
||||
Things which are completely unsurprising, resembling many other languages:
|
||||
|
||||
* Parameter names are bound to their arguments but can be reassigned, e.g. if there is a parameter named `a` then you can reassign the value of `a` to be something else within the function if you like.
|
||||
* Parameter names are bound to their arguments but can be reassigned, e.g., if there is a parameter named `a`, then you can reassign the value of `a` to be something else within the function if you like.
|
||||
|
||||
* However, you cannot redeclare the *type* of an argument or a local: `var a=1; var a=2` is an error but `var a=1; a=2` is OK.
|
||||
|
||||
|
|
@ -207,13 +207,13 @@ Things which are completely unsurprising, resembling many other languages:
|
|||
|
||||
Things which are perhaps surprising compared to other languages:
|
||||
|
||||
* Type declarations using `var`, or typed using `num`, `int`, `float`, `str`, `bool`, `arr`, `map`, `funct` are not necessary to declare local variables. Function arguments and variables bound in for-loops over stream records and out-of-stream variables are *implicitly* declared using `var`. (Some examples are shown below.)
|
||||
* Type declarations using `var`, or typed using `num`, `int`, `float`, `str`, `bool`, `arr`, `map`, `funct`, are not necessary to declare local variables. Function arguments and variables bound in for-loops over stream records and out-of-stream variables are *implicitly* declared using `var`. (Some examples are shown below.)
|
||||
|
||||
* Type-checking is done at assignment time. For example, `float f = 0` is an error (since `0` is an integer), as is `float f = 0.0; f = 1`. For this reason I prefer to use `num` over `float` in most contexts since `num` encompasses integer and floating-point values. More information is at [Type-checking](reference-dsl-variables.md#type-checking).
|
||||
* Type-checking is done at assignment time. For example, `float f = 0` is an error (since `0` is an integer), as is `float f = 0.0; f = 1`. For this reason, I prefer to use `num` over `float` in most contexts, as `num` encompasses both integer and floating-point values. For more information, refer to [Type-checking](reference-dsl-variables.md#type-checking).
|
||||
|
||||
* Bound variables in for-loops over stream records and out-of-stream variables are implicitly local to that block. E.g., in `for (k, v in $*) { ... }` or `for ((k1, k2), v in @*) { ... }`, if there are `k`, `v`, etc. in the enclosing scope, then those will be masked by the loop-local bound variables in the loop; moreover, the values of the loop-local bound variables are not available after the end of the loop.
|
||||
|
||||
* For C-style triple-for loops, if a for-loop variable is defined using `var`, `int`, etc. then it is scoped to that for-loop. E.g. `for (i = 0; i < 10; i += 1) { ... }` and `for (int i = 0; i < 10; i += 1) { ... }`. (This is unsurprising.). If there is no typedecl and an outer-scope variable of that name exists, then it is used. (This is also unsurprising.) But if there is no outer-scope variable of that name, then the variable is scoped to the for-loop only.
|
||||
* For C-style triple-for loops, if a for-loop variable is defined using `var`, `int`, etc., then it is scoped to that for-loop. E.g., `for (i = 0; i < 10; i += 1) { ... }` and `for (int i = 0; i < 10; i += 1) { ... }`. (This is unsurprising.) If there is no typedecl and an outer-scope variable of that name exists, then it is used. (This is also unsurprising.) But if there is no outer-scope variable of that name, then the variable is scoped to the for-loop only.
|
||||
|
||||
The following example demonstrates the scope rules:
|
||||
|
||||
|
|
@ -237,7 +237,7 @@ GENMD-EOF
|
|||
|
||||
## Map literals
|
||||
|
||||
Miller's `put`/`filter` DSL has four kinds of maps. **Stream records** are (single-level) maps from name to value. **Out-of-stream variables** and **local variables** can also be maps, although they can be multi-level maps (e.g. `@sum[$x][$y]`). The fourth kind is **map literals**. These cannot be on the left-hand side of assignment expressions. Syntactically they look like JSON, although Miller allows string and integer keys in its map literals while JSON allows only string keys (e.g. `"3"` rather than `3`). Note though that integer keys become stringified in Miller: `@mymap[3]=4` results in `@mymap` being `{"3":4}`.
|
||||
Miller's `put`/`filter` DSL has four kinds of maps. **Stream records** are (single-level) maps from name to value. **Out-of-stream variables** and **local variables** can also be maps, although they can be multi-level maps (e.g. `@sum[$x][$y]`). The fourth kind is **map literals**. These cannot be on the left-hand side of assignment expressions. Syntactically, they look like JSON, although Miller allows string and integer keys in its map literals while JSON allows only string keys (e.g., `"3"` rather than `3`). Note, though, that integer keys become stringified in Miller: `@mymap[3]=4` results in `@mymap` being `{"3":4}`.
|
||||
|
||||
For example, the following swaps the input stream's `a` and `i` fields, modifies `y`, and drops the rest:
|
||||
|
||||
|
|
@ -300,7 +300,7 @@ there are the read-only separator variables `IRS`, `ORS`, `IFS`, `OFS`, `IPS`,
|
|||
and `OPS` as discussed on the [separators page](reference-main-separators.md),
|
||||
and the flatten/unflatten separator `FLATSEP` discussed on the
|
||||
[flatten/unflatten page](flatten-unflatten.md). Lastly, the `ENV` map allows
|
||||
read/write access to environment variables, e.g. `ENV["HOME"]` or
|
||||
read/write access to environment variables, e.g., `ENV["HOME"]` or
|
||||
`ENV["foo_".$hostname]` or `ENV["VERSION"]="1.2.3"`.
|
||||
|
||||
<!--- TODO: FLATSEP IFLATSEP OFLATSEP --->
|
||||
|
|
@ -320,7 +320,7 @@ system environment variables at the time Miller starts. Any changes made to
|
|||
`ENV` by assigning to it will affect any subprocesses, such as using
|
||||
[piped tee](reference-dsl-output-statements.md#redirected-output-statements).
|
||||
|
||||
Their **scope is global**: you can refer to them in any `filter` or `put` statement. Their values are assigned by the input-record reader:
|
||||
Their **scope is global**: you can refer to them in any `filter` or `put` statement. The input-record reader assigns their values:
|
||||
|
||||
GENMD-RUN-COMMAND
|
||||
mlr --csv put '$nr = NR' data/a.csv
|
||||
|
|
@ -332,11 +332,11 @@ GENMD-EOF
|
|||
|
||||
The **extent** is for the duration of the put/filter: in a `begin` statement (which executes before the first input record is consumed) you will find `NR=1` and in an `end` statement (which is executed after the last input record is consumed) you will find `NR` to be the total number of records ingested.
|
||||
|
||||
These are all **read-only** for the `mlr put` and `mlr filter` DSL: they may be assigned from, e.g. `$nr=NR`, but they may not be assigned to: `NR=100` is a syntax error.
|
||||
These are all **read-only** for the `mlr put` and `mlr filter` DSL: they may be assigned from, e.g., `$nr=NR`, but they may not be assigned to: `NR=100` is a syntax error.
|
||||
|
||||
## Type-checking
|
||||
|
||||
Miller's `put`/`filter` DSL supports two optional kinds of type-checking. One is inline **type-tests** and **type-assertions** within expressions. The other is **type declarations** for assignments to local variables, binding of arguments to user-defined functions, and return values from user-defined functions, These are discussed in the following subsections.
|
||||
Miller's `put`/`filter` DSL supports two optional kinds of type-checking. One is inline **type tests** and **type assertions** within expressions. The other is **type declarations** for assignments to local variables, binding of arguments to user-defined functions, and return values from user-defined functions. These are discussed in the following subsections.
|
||||
|
||||
Use of type-checking is entirely up to you: omit it if you want flexibility with heterogeneous data; use it if you want to help catch misspellings in your DSL code or unexpected irregularities in your input data.
|
||||
|
||||
|
|
@ -354,22 +354,22 @@ GENMD-EOF
|
|||
|
||||
See [Data-cleaning Examples](data-cleaning-examples.md) for examples of how to use these.
|
||||
|
||||
### Type-declarations for local variables, function parameter, and function return values
|
||||
### Type declarations for local variables, function parameters, and function return values
|
||||
|
||||
Local variables can be defined either untyped as in `x = 1`, or typed as in `int x = 1`. Types include **var** (explicitly untyped), **int**, **float**, **num** (int or float), **str**, **bool**, **arr**, **map**, and **funct**. These optional type declarations are enforced at the time values are assigned to variables: whether at the initial value assignment as in `int x = 1` or in any subsequent assignments to the same variable farther down in the scope.
|
||||
|
||||
The reason for `num` is that `int` and `float` typedecls are very precise:
|
||||
|
||||
GENMD-CARDIFY
|
||||
float a = 0; # Runtime error since 0 is int not float
|
||||
int b = 1.0; # Runtime error since 1.0 is float not int
|
||||
float a = 0; # Runtime error since 0 is int, not float
|
||||
int b = 1.0; # Runtime error since 1.0 is float, not int
|
||||
num c = 0; # OK
|
||||
num d = 1.0; # OK
|
||||
GENMD-EOF
|
||||
|
||||
A suggestion is to use `num` for general use when you want numeric content, and use `int` when you genuinely want integer-only values, e.g. in loop indices or map keys (since Miller map keys can only be strings or ints).
|
||||
A suggestion is to use `num` for general use when you want numeric content, and use `int` when you genuinely want integer-only values, e.g., in loop indices or map keys (since Miller map keys can only be strings or ints).
|
||||
|
||||
The `var` type declaration indicates no type restrictions, e.g. `var x = 1` has the same type restrictions on `x` as `x = 1`. The difference is in intentional shadowing: if you have `x = 1` in outer scope and `x = 2` in inner scope (e.g. within a for-loop or an if-statement) then outer-scope `x` has value 2 after the second assignment. But if you have `var x = 2` in the inner scope, then you are declaring a variable scoped to the inner block.) For example:
|
||||
The `var` type declaration indicates no type restrictions, e.g., `var x = 1` has the same type restrictions on `x` as `x = 1`. The difference is in intentional shadowing: if you have `x = 1` in outer scope and `x = 2` in inner scope (e.g., within a for-loop or an if-statement), then outer-scope `x` has value 2 after the second assignment. But if you have `var x = 2` in the inner scope, then you are declaring a variable scoped to the inner block. For example:
|
||||
|
||||
GENMD-CARDIFY
|
||||
x = 1;
|
||||
|
|
@ -387,7 +387,7 @@ if (NR == 4) {
|
|||
print x; # Value of this x is still 1
|
||||
GENMD-EOF
|
||||
|
||||
Likewise function arguments can optionally be typed, with type enforced when the function is called:
|
||||
Likewise, function arguments can optionally be typed, with type enforced when the function is called:
|
||||
|
||||
GENMD-CARDIFY
|
||||
func f(map m, int i) {
|
||||
|
|
@ -419,7 +419,7 @@ func f(map m, int i): bool {
|
|||
}
|
||||
...
|
||||
...
|
||||
# In Miller if your functions don't explicitly return a value, they return absent-null.
|
||||
# In Miller, if your functions don't explicitly return a value, they return absent-null.
|
||||
# So it would also be a runtime error on reaching the end of this function without
|
||||
# an explicit return statement.
|
||||
}
|
||||
|
|
@ -482,7 +482,7 @@ mlr --opprint --from data/small put -q '
|
|||
'
|
||||
GENMD-EOF
|
||||
|
||||
Example of out-of-stream variable assigned to full stream record, where the 2nd record is stashed, and the 4th record is overwritten with that:
|
||||
Example of an out-of-stream variable assigned to the full stream record, where the 2nd record is stashed, and the 4th record is overwritten with that:
|
||||
|
||||
GENMD-RUN-COMMAND
|
||||
mlr put 'NR == 2 {@keep = $*}; NR == 4 {$* = @keep}' data/small
|
||||
|
|
|
|||
|
|
@ -63,9 +63,9 @@ Notes:
|
|||
**Flags:**
|
||||
|
||||
* `--pass-comments`: Immediately print commented lines (prefixed by `#`) within the input.
|
||||
* `--pass-comments-with {string}`: Immediately print commented lines within input, with specified prefix.
|
||||
* `--pass-comments-with {string}`: Immediately print commented lines within input, with specified prefix. For CSV input format, the prefix must be a single character.
|
||||
* `--skip-comments`: Ignore commented lines (prefixed by `#`) within the input.
|
||||
* `--skip-comments-with {string}`: Ignore commented lines within input, with specified prefix.
|
||||
* `--skip-comments-with {string}`: Ignore commented lines within input, with specified prefix. For CSV input format, the prefix must be a single character.
|
||||
|
||||
## Compressed-data flags
|
||||
|
||||
|
|
@ -128,6 +128,15 @@ These are flags which are applicable to CSV format.
|
|||
* `--quote-all`: Force double-quoting of CSV fields.
|
||||
* `-N`: Keystroke-saver for `--implicit-csv-header --headerless-csv-output`.
|
||||
|
||||
## DKVP-only flags
|
||||
|
||||
These are flags which are applicable to DKVP format.
|
||||
|
||||
|
||||
**Flags:**
|
||||
|
||||
* `--incr-key`: Without this option, keyless DKVP fields are keyed by field number. For example: `a=10,b=20,30,d=40,50` is ingested as `$a=10,$b=20,$3=30,$d=40,$5=50`. With this option, they're keyed by a running counter of keyless fields. For example: `a=10,b=20,30,d=40,50` is ingested as `$a=10,$b=20,$1=30,$d=40,$2=50`.
|
||||
|
||||
## File-format flags
|
||||
|
||||
See the File formats doc page, and/or `mlr help file-formats`, for more
|
||||
|
|
@ -144,9 +153,9 @@ are overridden in all cases by setting output format to `format2`.
|
|||
**Flags:**
|
||||
|
||||
* `--asv or --asvlite`: Use ASV format for input and output data.
|
||||
* `--csv or -c`: Use CSV format for input and output data.
|
||||
* `--csv or -c or --c2c`: Use CSV format for input and output data.
|
||||
* `--csvlite`: Use CSV-lite format for input and output data.
|
||||
* `--dkvp`: Use DKVP format for input and output data.
|
||||
* `--dkvp or --d2d`: Use DKVP format for input and output data.
|
||||
* `--gen-field-name`: Specify field name for --igen. Defaults to "i".
|
||||
* `--gen-start`: Specify start value for --igen. Defaults to 1.
|
||||
* `--gen-step`: Specify step value for --igen. Defaults to 1.
|
||||
|
|
@ -166,9 +175,9 @@ are overridden in all cases by setting output format to `format2`.
|
|||
* `--itsvlite`: Use TSV-lite format for input data.
|
||||
* `--iusv or --iusvlite`: Use USV format for input data.
|
||||
* `--ixtab`: Use XTAB format for input data.
|
||||
* `--json or -j`: Use JSON format for input and output data.
|
||||
* `--jsonl`: Use JSON Lines format for input and output data.
|
||||
* `--nidx`: Use NIDX format for input and output data.
|
||||
* `--json or -j or --j2j`: Use JSON format for input and output data.
|
||||
* `--jsonl or --l2l`: Use JSON Lines format for input and output data.
|
||||
* `--nidx or --n2n`: Use NIDX format for input and output data.
|
||||
* `--oasv or --oasvlite`: Use ASV format for output data.
|
||||
* `--ocsv`: Use CSV format for output data.
|
||||
* `--ocsvlite`: Use CSV-lite format for output data.
|
||||
|
|
@ -182,11 +191,11 @@ are overridden in all cases by setting output format to `format2`.
|
|||
* `--otsvlite`: Use TSV-lite format for output data.
|
||||
* `--ousv or --ousvlite`: Use USV format for output data.
|
||||
* `--oxtab`: Use XTAB format for output data.
|
||||
* `--pprint`: Use PPRINT format for input and output data.
|
||||
* `--tsv or -t`: Use TSV format for input and output data.
|
||||
* `--pprint or --p2p`: Use PPRINT format for input and output data.
|
||||
* `--tsv or -t or --t2t`: Use TSV format for input and output data.
|
||||
* `--tsvlite`: Use TSV-lite format for input and output data.
|
||||
* `--usv or --usvlite`: Use USV format for input and output data.
|
||||
* `--xtab`: Use XTAB format for input and output data.
|
||||
* `--xtab or --x2x`: Use XTAB format for input and output data.
|
||||
* `--xvright`: Right-justify values for XTAB format.
|
||||
* `-i {format name}`: Use format name for input data. For example: `-i csv` is the same as `--icsv`.
|
||||
* `-o {format name}`: Use format name for output data. For example: `-o csv` is the same as `--ocsv`.
|
||||
|
|
@ -195,14 +204,14 @@ are overridden in all cases by setting output format to `format2`.
|
|||
|
||||
These flags control how Miller converts record values which are maps or arrays, when input is JSON and output is non-JSON (flattening) or input is non-JSON and output is JSON (unflattening).
|
||||
|
||||
See the Flatten/unflatten doc page for more information.
|
||||
See the flatten/unflatten doc page https://miller.readthedocs.io/en/latest/flatten-unflatten for more information.
|
||||
|
||||
|
||||
**Flags:**
|
||||
|
||||
* `--flatsep or --jflatsep {string}`: Separator for flattening multi-level JSON keys, e.g. `{"a":{"b":3}}` becomes `a.b => 3` for non-JSON formats. Defaults to `.`.
|
||||
* `--no-auto-flatten`: When output is non-JSON, suppress the default auto-flatten behavior. Default: if `$y = [7,8,9]` then this flattens to `y.1=7,y.2=8,y.3=9, and similarly for maps. With `--no-auto-flatten`, instead we get `$y=[1, 2, 3]`.
|
||||
* `--no-auto-unflatten`: When input non-JSON and output is JSON, suppress the default auto-unflatten behavior. Default: if the input has `y.1=7,y.2=8,y.3=9` then this unflattens to `$y=[7,8,9]`. flattens to `y.1=7,y.2=8,y.3=9. With `--no-auto-flatten`, instead we get `${y.1}=7,${y.2}=8,${y.3}=9`.
|
||||
* `--no-auto-flatten`: When output is non-JSON, suppress the default auto-flatten behavior. Default: if `$y = [7,8,9]` then this flattens to `y.1=7,y.2=8,y.3=9`, and similarly for maps. With `--no-auto-flatten`, instead we get `$y=[7, 8, 9]`.
|
||||
* `--no-auto-unflatten`: When input is non-JSON and output is JSON, suppress the default auto-unflatten behavior. Default: if the input has `y.1=7,y.2=8,y.3=9` then this unflattens to `$y=[7,8,9]`. With `--no-auto-unflatten`, instead we get `${y.1}=7,${y.2}=8,${y.3}=9`.
|
||||
|
||||
## Format-conversion keystroke-saver flags
|
||||
|
||||
|
|
|
|||
|
|
@ -125,7 +125,7 @@ with the exception that the `min` and `max` functions are special: if one argume
|
|||
x=,y=3,a=3,b=
|
||||
</pre>
|
||||
|
||||
Likewise, empty works like 0 for addition and subtraction, and multiplication:
|
||||
Likewise, empty works like 0 for addition and subtraction, and like 1 for multiplication:
|
||||
|
||||
<pre class="pre-highlight-in-pair">
|
||||
<b>echo 'x=,y=3' | mlr put '$a = $x + $y; $b = $x - $y; $c = $x * $y'</b>
|
||||
|
|
|
|||
|
|
@ -54,7 +54,7 @@ GENMD-RUN-COMMAND
|
|||
echo 'x=,y=3' | mlr put '$a=min($x,$y);$b=max($x,$y)'
|
||||
GENMD-EOF
|
||||
|
||||
Likewise, empty works like 0 for addition and subtraction, and multiplication:
|
||||
Likewise, empty works like 0 for addition and subtraction, and like 1 for multiplication:
|
||||
|
||||
GENMD-RUN-COMMAND
|
||||
echo 'x=,y=3' | mlr put '$a = $x + $y; $b = $x - $y; $c = $x * $y'
|
||||
|
|
|
|||
|
|
@ -66,7 +66,7 @@ See also the [Glossary](glossary.md) for more about terms such as
|
|||
|
||||
When you type `mlr {something} myfile.dat`, the `{something}` part is called a **verb**. It specifies how you want to transform your data. Most of the verbs are counterparts of built-in system tools like `cut` and `sort` -- but with file-format awareness, and giving you the ability to refer to fields by name.
|
||||
|
||||
The verbs `put` and `filter` are special in that they have a rich expression language (domain-specific language, or "DSL"). More information about them can be found at on the [Intro to Miller's programming language page](miller-programming-language.md); see also [DSL reference](reference-dsl.md) for more details.
|
||||
The verbs `put` and `filter` are special in that they have a rich expression language (domain-specific language, or "DSL"). More information about them can be found on the [Intro to Miller's Programming Language page](miller-programming-language.md); see also the [DSL Reference](reference-dsl.md) for more details.
|
||||
|
||||
Here's a comparison of verbs and `put`/`filter` DSL expressions:
|
||||
|
||||
|
|
|
|||
|
|
@ -35,7 +35,7 @@ See also the [Glossary](glossary.md) for more about terms such as
|
|||
|
||||
When you type `mlr {something} myfile.dat`, the `{something}` part is called a **verb**. It specifies how you want to transform your data. Most of the verbs are counterparts of built-in system tools like `cut` and `sort` -- but with file-format awareness, and giving you the ability to refer to fields by name.
|
||||
|
||||
The verbs `put` and `filter` are special in that they have a rich expression language (domain-specific language, or "DSL"). More information about them can be found at on the [Intro to Miller's programming language page](miller-programming-language.md); see also [DSL reference](reference-dsl.md) for more details.
|
||||
The verbs `put` and `filter` are special in that they have a rich expression language (domain-specific language, or "DSL"). More information about them can be found on the [Intro to Miller's Programming Language page](miller-programming-language.md); see also the [DSL Reference](reference-dsl.md) for more details.
|
||||
|
||||
Here's a comparison of verbs and `put`/`filter` DSL expressions:
|
||||
|
||||
|
|
|
|||
|
|
@ -251,7 +251,8 @@ package syntax // import "regexp/syntax"
|
|||
|
||||
Package syntax parses regular expressions into parse trees and compiles parse
|
||||
trees into programs. Most clients of regular expressions will use the facilities
|
||||
of package regexp (such as Compile and Match) instead of this package.
|
||||
of package regexp (such as regexp.Compile and regexp.Match) instead of this
|
||||
package.
|
||||
|
||||
# Syntax
|
||||
|
||||
|
|
@ -301,6 +302,7 @@ Grouping:
|
|||
|
||||
(re) numbered capturing group (submatch)
|
||||
(?P<name>re) named & numbered capturing group (submatch)
|
||||
(?<name>re) named & numbered capturing group (submatch)
|
||||
(?:re) non-capturing group
|
||||
(?flags) set flags within current group; non-capturing
|
||||
(?flags:re) set flags during re; non-capturing
|
||||
|
|
|
|||
|
|
@ -804,7 +804,7 @@ Options:
|
|||
-r Treat field names as regular expressions. "ab", "a.*b" will
|
||||
match any field name containing the substring "ab" or matching
|
||||
"a.*b", respectively; anchors of the form "^ab$", "^a.*b$" may
|
||||
be used. The -o flag is ignored when -r is present.
|
||||
be used.
|
||||
-h|--help Show this message.
|
||||
Examples:
|
||||
mlr cut -f hostname,status
|
||||
|
|
@ -970,7 +970,7 @@ a,b,c
|
|||
</pre>
|
||||
<pre class="pre-non-highlight-in-pair">
|
||||
Usage: mlr filter [options] {DSL expression}
|
||||
Lets you use a domain-specific language to programatically filter which
|
||||
Lets you use a domain-specific language to programmatically filter which
|
||||
stream records will be output.
|
||||
See also: https://miller.readthedocs.io/en/latest/reference-verbs
|
||||
|
||||
|
|
@ -986,7 +986,7 @@ Options:
|
|||
Since the expression pieces are simply concatenated, please be sure to use intervening
|
||||
semicolons to separate expressions.)
|
||||
|
||||
-s name=value: Predefines out-of-stream variable @name to have
|
||||
-s name=value: Predefines out-of-stream variable @name to have the given value.
|
||||
Thus mlr put -s foo=97 '$column += @foo' is like
|
||||
mlr put 'begin {@foo = 97} $column += @foo'.
|
||||
The value part is subject to type-inferencing.
|
||||
|
|
@ -1464,6 +1464,8 @@ for the old string and handling multiple matches, like the `gsub` DSL function.
|
|||
See also the `sub` and `ssub` verbs.
|
||||
Options:
|
||||
-f {a,b,c} Field names to convert.
|
||||
-r {regex} Regular expression for field names to convert.
|
||||
-a Convert all fields.
|
||||
-h|--help Show this message.
|
||||
</pre>
|
||||
|
||||
|
|
@ -1711,7 +1713,7 @@ be specified CSV as well unless you override with 'mlr --csv ... join --ijson -l
|
|||
Likewise, if you have 'mlr --csv --implicit-csv-header ...' then the join-in file will be
|
||||
expected to be headerless as well unless you put '--no-implicit-csv-header' after 'join'.
|
||||
Please use "mlr --usage-separator-options" for information on specifying separators.
|
||||
Please see https://miller.readthedocs.io/en/latest/reference-verbs.html#join for more information
|
||||
Please see https://miller.readthedocs.io/en/latest/reference-verbs#join for more information
|
||||
including examples.
|
||||
</pre>
|
||||
|
||||
|
|
@ -2306,7 +2308,7 @@ Options:
|
|||
</pre>
|
||||
<pre class="pre-non-highlight-in-pair">
|
||||
Usage: mlr put [options] {DSL expression}
|
||||
Lets you use a domain-specific language to programatically alter stream records.
|
||||
Lets you use a domain-specific language to programmatically alter stream records.
|
||||
See also: https://miller.readthedocs.io/en/latest/reference-verbs
|
||||
|
||||
Options:
|
||||
|
|
@ -2321,7 +2323,7 @@ Options:
|
|||
Since the expression pieces are simply concatenated, please be sure to use intervening
|
||||
semicolons to separate expressions.)
|
||||
|
||||
-s name=value: Predefines out-of-stream variable @name to have
|
||||
-s name=value: Predefines out-of-stream variable @name to have the given value.
|
||||
Thus mlr put -s foo=97 '$column += @foo' is like
|
||||
mlr put 'begin {@foo = 97} $column += @foo'.
|
||||
The value part is subject to type-inferencing.
|
||||
|
|
@ -2958,6 +2960,7 @@ Options:
|
|||
-nf {comma-separated field names} Same as -n
|
||||
-nr {comma-separated field names} Numerical descending; nulls sort first
|
||||
-t {comma-separated field names} Natural ascending
|
||||
-b Move sort fields to start of record, as in reorder -b
|
||||
-tr|-rt {comma-separated field names} Natural descending
|
||||
-h|--help Show this message.
|
||||
|
||||
|
|
@ -3210,6 +3213,8 @@ Replaces old string with new string in specified field(s), without regex support
|
|||
the old string, like the `ssub` DSL function. See also the `gsub` and `sub` verbs.
|
||||
Options:
|
||||
-f {a,b,c} Field names to convert.
|
||||
-r {regex} Regular expression for field names to convert.
|
||||
-a Convert all fields.
|
||||
-h|--help Show this message.
|
||||
</pre>
|
||||
|
||||
|
|
@ -3714,6 +3719,8 @@ for the old string and not handling multiple matches, like the `sub` DSL functio
|
|||
See also the `gsub` and `ssub` verbs.
|
||||
Options:
|
||||
-f {a,b,c} Field names to convert.
|
||||
-r {regex} Regular expression for field names to convert.
|
||||
-a Convert all fields.
|
||||
-h|--help Show this message.
|
||||
</pre>
|
||||
|
||||
|
|
@ -3851,6 +3858,21 @@ mean - - 5000.5 0.49860196816795804 0.5062057444929905
|
|||
median pan pan 5001 0.5011592202840128 0.5060212582772865
|
||||
</pre>
|
||||
|
||||
## surv
|
||||
|
||||
<pre class="pre-highlight-in-pair">
|
||||
<b>mlr surv --help</b>
|
||||
</pre>
|
||||
<pre class="pre-non-highlight-in-pair">
|
||||
Usage: mlr surv -d {duration-field} -s {status-field}
|
||||
|
||||
Estimate Kaplan-Meier survival curve (right-censored).
|
||||
Options:
|
||||
-d {field} Name of duration field (time-to-event or censoring).
|
||||
-s {field} Name of status field (0=censored, 1=event).
|
||||
-h, --help Show this message.
|
||||
</pre>
|
||||
|
||||
## tac
|
||||
|
||||
<pre class="pre-highlight-in-pair">
|
||||
|
|
@ -4112,7 +4134,7 @@ There are two main ways to use `mlr uniq`: the first way is with `-g` to specify
|
|||
<b>wc -l data/colored-shapes.csv</b>
|
||||
</pre>
|
||||
<pre class="pre-non-highlight-in-pair">
|
||||
10079 data/colored-shapes.csv
|
||||
10079 data/colored-shapes.csv
|
||||
</pre>
|
||||
|
||||
<pre class="pre-highlight-in-pair">
|
||||
|
|
@ -4269,7 +4291,7 @@ color=purple,shape=square,flag=0
|
|||
<b>wc -l data/repeats.dkvp</b>
|
||||
</pre>
|
||||
<pre class="pre-non-highlight-in-pair">
|
||||
57 data/repeats.dkvp
|
||||
57 data/repeats.dkvp
|
||||
</pre>
|
||||
|
||||
<pre class="pre-highlight-in-pair">
|
||||
|
|
|
|||
|
|
@ -1161,6 +1161,12 @@ GENMD-RUN-COMMAND
|
|||
mlr --from data/medium --opprint summary --transpose -a mean,median,mode
|
||||
GENMD-EOF
|
||||
|
||||
## surv
|
||||
|
||||
GENMD-RUN-COMMAND
|
||||
mlr surv --help
|
||||
GENMD-EOF
|
||||
|
||||
## tac
|
||||
|
||||
GENMD-RUN-COMMAND
|
||||
|
|
|
|||
|
|
@ -24,6 +24,8 @@ If your `mlr version` says something like `Miller v5.10.2` or `mlr 6.0.0`, witho
|
|||
| Release | Docs | Release notes |
|
||||
|---------|---------------------------------------------------------------------|---------------|
|
||||
main | [main branch](https://miller.readthedocs.io/en/main) | N/A |
|
||||
6.14.0 | [Miller 6.14.0](https://miller.readthedocs.io/en/6.14.0) | [Survival curve, misc. features and bugfixes](https://github.com/johnkerl/miller/releases/tag/v6.14.0) |
|
||||
6.13.0 | [Miller 6.13.0](https://miller.readthedocs.io/en/6.13.0) | [File-stat DSL function, new stats accumulator, misc. bugfixes](https://github.com/johnkerl/miller/releases/tag/v6.13.0) |
|
||||
6.12.0 | [Miller 6.12.0](https://miller.readthedocs.io/en/6.12.0) | [New sparsify verb, wide-table performance improvement, thousands separator for fmtnum function](https://github.com/johnkerl/miller/releases/tag/v6.12.0) |
|
||||
6.11.0 | [Miller 6.11.0](https://miller.readthedocs.io/en/6.11.0) | [CSV/TSV auto-unsparsify, regex-fieldname support for reorder/sub/ssub/gsub, strmatch DSL function, and more](https://github.com/johnkerl/miller/releases/tag/v6.11.0) |
|
||||
6.10.0 | [Miller 6.10.0](https://miller.readthedocs.io/en/6.10.0) | [Add --files option; bugfixes; use Go 1.19](https://github.com/johnkerl/miller/releases/tag/v6.10.0) |
|
||||
|
|
|
|||
|
|
@ -8,6 +8,8 @@ If your `mlr version` says something like `Miller v5.10.2` or `mlr 6.0.0`, witho
|
|||
| Release | Docs | Release notes |
|
||||
|---------|---------------------------------------------------------------------|---------------|
|
||||
main | [main branch](https://miller.readthedocs.io/en/main) | N/A |
|
||||
6.14.0 | [Miller 6.14.0](https://miller.readthedocs.io/en/6.14.0) | [Survival curve, misc. features and bugfixes](https://github.com/johnkerl/miller/releases/tag/v6.14.0) |
|
||||
6.13.0 | [Miller 6.13.0](https://miller.readthedocs.io/en/6.13.0) | [File-stat DSL function, new stats accumulator, misc. bugfixes](https://github.com/johnkerl/miller/releases/tag/v6.13.0) |
|
||||
6.12.0 | [Miller 6.12.0](https://miller.readthedocs.io/en/6.12.0) | [New sparsify verb, wide-table performance improvement, thousands separator for fmtnum function](https://github.com/johnkerl/miller/releases/tag/v6.12.0) |
|
||||
6.11.0 | [Miller 6.11.0](https://miller.readthedocs.io/en/6.11.0) | [CSV/TSV auto-unsparsify, regex-fieldname support for reorder/sub/ssub/gsub, strmatch DSL function, and more](https://github.com/johnkerl/miller/releases/tag/v6.11.0) |
|
||||
6.10.0 | [Miller 6.10.0](https://miller.readthedocs.io/en/6.10.0) | [Add --files option; bugfixes; use Go 1.19](https://github.com/johnkerl/miller/releases/tag/v6.10.0) |
|
||||
|
|
|
|||
|
|
@ -234,7 +234,7 @@ then fraction -f count
|
|||
|
||||
Points:
|
||||
|
||||
* Same as above, where the `#!` line isn't needed. (But you can include a `#!` line; `mlr -s` will simply see it as a comment line.).
|
||||
* Same as above, where the `#!` line isn't needed. (But you can include a `#!` line; `mlr -s` will simply see it as a comment line.)
|
||||
* As above, you don't need all the backslashing for line-continuations.
|
||||
* As above, you don't need the explicit `--` or `"$@"`.
|
||||
|
||||
|
|
|
|||
|
|
@ -101,7 +101,7 @@ GENMD-EOF
|
|||
|
||||
Points:
|
||||
|
||||
* Same as above, where the `#!` line isn't needed. (But you can include a `#!` line; `mlr -s` will simply see it as a comment line.).
|
||||
* Same as above, where the `#!` line isn't needed. (But you can include a `#!` line; `mlr -s` will simply see it as a comment line.)
|
||||
* As above, you don't need all the backslashing for line-continuations.
|
||||
* As above, you don't need the explicit `--` or `"$@"`.
|
||||
|
||||
|
|
|
|||
|
|
@ -19,13 +19,13 @@ Quick links:
|
|||
The goal is _multiple levels of detail_.
|
||||
|
||||
* The [Introduction page](index.md) is the shortest: headlines and **essential summary**.
|
||||
* The _Getting started_ section is for **new or near-new users** who want some simple examples along with connecting narrative. The goal is to get a new user up and running, able to do some interesting things with their own data.
|
||||
* The _Getting started_ section is for **new or near-new users** who want some simple examples along with a connecting narrative. The goal is to get a new user up and running, enabling them to perform interesting tasks with their own data.
|
||||
* The _Miller in more detail_ section is just-past-introductory, **tell-me-more material** about some of the things that make Miller unique: what file formats it handles (and how it handles them), how it relates to other tools in the Unix toolkit, and so on.
|
||||
* The _FAQs and examples_ section is non-introductory for people looking for various ways to do things by example. The discussion is pragmatic rather than theoretical, and **use-case-driven**.
|
||||
* The _Background_ section is some **non-essential historical** and meta material on why Miller was created.
|
||||
* The _Reference_ section aims to answer all questions the previous sections didn't. The discussion is **concept-driven**, although there are still plenty of examples throughout for concreteness.
|
||||
* _Main reference_ goes carefully through various aspects of Miller, concept by concept.
|
||||
* _DSL reference_ focuses on the [Miller programming language](miller-programming-language.md), again following a concept-at-a-time approach.
|
||||
* _Misc. reference_ is aptly named, with things like build-from-source notes.
|
||||
* _Documents for previous releases_ is not only for historical curiosity -- experience has shown that various Linux/BSD distros update their Miller versions on their own cadences, so the version on your system (as shown by `mlr --version`) might be best-served by its respective documentation version.
|
||||
* Lastly, new with the Miller 6 documents is a very easy-to-access **Search field** at the top of each page.
|
||||
* The _Reference_ section aims to answer all questions that the previous sections didn't. The discussion is **concept-driven**, although it includes numerous examples throughout for concreteness.
|
||||
* The main reference carefully examines various aspects of Miller, concept by concept.
|
||||
* The _DSL reference_ focuses on the [Miller programming language](miller-programming-language.md), again following a concept-at-a-time approach.
|
||||
* The _miscellaneous reference_ is aptly named, with things like build-from-source notes.
|
||||
* _Documents for previous releases_ is not only for historical curiosity -- experience has shown that various Linux/BSD distros update their Miller versions on their own cadences, so the version on your system (as shown by `mlr --version`) might be best served by its respective documentation version.
|
||||
* Lastly, new with the Miller 6 documents is an easy-to-access **Search field** at the top of each page.
|
||||
|
|
|
|||
|
|
@ -3,13 +3,13 @@
|
|||
The goal is _multiple levels of detail_.
|
||||
|
||||
* The [Introduction page](index.md) is the shortest: headlines and **essential summary**.
|
||||
* The _Getting started_ section is for **new or near-new users** who want some simple examples along with connecting narrative. The goal is to get a new user up and running, able to do some interesting things with their own data.
|
||||
* The _Getting started_ section is for **new or near-new users** who want some simple examples along with a connecting narrative. The goal is to get a new user up and running, enabling them to perform interesting tasks with their own data.
|
||||
* The _Miller in more detail_ section is just-past-introductory, **tell-me-more material** about some of the things that make Miller unique: what file formats it handles (and how it handles them), how it relates to other tools in the Unix toolkit, and so on.
|
||||
* The _FAQs and examples_ section is non-introductory for people looking for various ways to do things by example. The discussion is pragmatic rather than theoretical, and **use-case-driven**.
|
||||
* The _Background_ section is some **non-essential historical** and meta material on why Miller was created.
|
||||
* The _Reference_ section aims to answer all questions the previous sections didn't. The discussion is **concept-driven**, although there are still plenty of examples throughout for concreteness.
|
||||
* _Main reference_ goes carefully through various aspects of Miller, concept by concept.
|
||||
* _DSL reference_ focuses on the [Miller programming language](miller-programming-language.md), again following a concept-at-a-time approach.
|
||||
* _Misc. reference_ is aptly named, with things like build-from-source notes.
|
||||
* _Documents for previous releases_ is not only for historical curiosity -- experience has shown that various Linux/BSD distros update their Miller versions on their own cadences, so the version on your system (as shown by `mlr --version`) might be best-served by its respective documentation version.
|
||||
* Lastly, new with the Miller 6 documents is a very easy-to-access **Search field** at the top of each page.
|
||||
* The _Reference_ section aims to answer all questions that the previous sections didn't. The discussion is **concept-driven**, although it includes numerous examples throughout for concreteness.
|
||||
* The main reference carefully examines various aspects of Miller, concept by concept.
|
||||
* The _DSL reference_ focuses on the [Miller programming language](miller-programming-language.md), again following a concept-at-a-time approach.
|
||||
* The _miscellaneous reference_ is aptly named, with things like build-from-source notes.
|
||||
* _Documents for previous releases_ is not only for historical curiosity -- experience has shown that various Linux/BSD distros update their Miller versions on their own cadences, so the version on your system (as shown by `mlr --version`) might be best served by its respective documentation version.
|
||||
* Lastly, new with the Miller 6 documents is an easy-to-access **Search field** at the top of each page.
|
||||
|
|
|
|||
6
docs/src/swipes.sh
Executable file
6
docs/src/swipes.sh
Executable file
|
|
@ -0,0 +1,6 @@
|
|||
#!/bin/bash
|
||||
|
||||
for x in *.md.in; do
|
||||
sed -i .emd 's/ *$//' $x
|
||||
rm $x.emd
|
||||
done
|
||||
|
|
@ -63,9 +63,9 @@ Likewise with `mlr sort`, `mlr tac`, and so on.
|
|||
|
||||
## awk-like features: mlr filter and mlr put
|
||||
|
||||
* `mlr filter` includes/excludes records based on a filter expression, e.g. `mlr filter '$count > 10'`.
|
||||
* `mlr filter` includes/excludes records based on a filter expression, e.g., `mlr filter '$count > 10'`.
|
||||
|
||||
* `mlr put` adds a new field as a function of others, e.g. `mlr put '$xy = $x * $y'` or `mlr put '$counter = NR'`.
|
||||
* `mlr put` adds a new field as a function of others, e.g., `mlr put '$xy = $x * $y'` or `mlr put '$counter = NR'`.
|
||||
|
||||
* The `$name` syntax is straight from `awk`'s `$1 $2 $3` (adapted to name-based indexing), as are the variables `FS`, `OFS`, `RS`, `ORS`, `NF`, `NR`, and `FILENAME`. The `ENV[...]` syntax is from Ruby.
|
||||
|
||||
|
|
@ -73,7 +73,7 @@ Likewise with `mlr sort`, `mlr tac`, and so on.
|
|||
|
||||
* Like `awk`, Miller (as of v5.0.0) allows you to define new functions within its `put` and `filter` expression language. Further programmability comes from chaining with `then`.
|
||||
|
||||
* As with `awk`, `$`-variables are stream variables and all verbs (such as `cut`, `stats1`, `put`, etc.) as well as `put`/`filter` statements operate on streams. This means that you define actions to be done on each record and then stream your data through those actions. The built-in variables `NF`, `NR`, etc. change from one record to another, `$x` is a label for field `x` in the current record, and the input to `sqrt($x)` changes from one record to the next. The expression language for the `put` and `filter` verbs additionally allows you to define `begin {...}` and `end {...}` blocks for actions to be taken before and after records are processed, respectively.
|
||||
* As with `awk`, `$`-variables are stream variables and all verbs (such as `cut`, `stats1`, `put`, etc.) as well as `put`/`filter` statements operate on streams. This means that you define actions to be done on each record and then stream your data through those actions. The built-in variables `NF`, `NR`, etc., change from one record to another, `$x` is a label for field `x` in the current record, and the input to `sqrt($x)` changes from one record to the next. The expression language for the `put` and `filter` verbs additionally allows you to define `begin {...}` and `end {...}` blocks for actions to be taken before and after records are processed, respectively.
|
||||
|
||||
* As with `awk`, Miller's `put`/`filter` language lets you set `@sum=0` before records are read, then update that sum on each record, then print its value at the end. Unlike `awk`, Miller makes syntactically explicit the difference between variables with extent across all records (names starting with `@`, such as `@sum`) and variables which are local to the current expression invocation (names starting without `@`, such as `sum`).
|
||||
|
||||
|
|
|
|||
|
|
@ -26,9 +26,9 @@ Likewise with `mlr sort`, `mlr tac`, and so on.
|
|||
|
||||
## awk-like features: mlr filter and mlr put
|
||||
|
||||
* `mlr filter` includes/excludes records based on a filter expression, e.g. `mlr filter '$count > 10'`.
|
||||
* `mlr filter` includes/excludes records based on a filter expression, e.g., `mlr filter '$count > 10'`.
|
||||
|
||||
* `mlr put` adds a new field as a function of others, e.g. `mlr put '$xy = $x * $y'` or `mlr put '$counter = NR'`.
|
||||
* `mlr put` adds a new field as a function of others, e.g., `mlr put '$xy = $x * $y'` or `mlr put '$counter = NR'`.
|
||||
|
||||
* The `$name` syntax is straight from `awk`'s `$1 $2 $3` (adapted to name-based indexing), as are the variables `FS`, `OFS`, `RS`, `ORS`, `NF`, `NR`, and `FILENAME`. The `ENV[...]` syntax is from Ruby.
|
||||
|
||||
|
|
@ -36,7 +36,7 @@ Likewise with `mlr sort`, `mlr tac`, and so on.
|
|||
|
||||
* Like `awk`, Miller (as of v5.0.0) allows you to define new functions within its `put` and `filter` expression language. Further programmability comes from chaining with `then`.
|
||||
|
||||
* As with `awk`, `$`-variables are stream variables and all verbs (such as `cut`, `stats1`, `put`, etc.) as well as `put`/`filter` statements operate on streams. This means that you define actions to be done on each record and then stream your data through those actions. The built-in variables `NF`, `NR`, etc. change from one record to another, `$x` is a label for field `x` in the current record, and the input to `sqrt($x)` changes from one record to the next. The expression language for the `put` and `filter` verbs additionally allows you to define `begin {...}` and `end {...}` blocks for actions to be taken before and after records are processed, respectively.
|
||||
* As with `awk`, `$`-variables are stream variables and all verbs (such as `cut`, `stats1`, `put`, etc.) as well as `put`/`filter` statements operate on streams. This means that you define actions to be done on each record and then stream your data through those actions. The built-in variables `NF`, `NR`, etc., change from one record to another, `$x` is a label for field `x` in the current record, and the input to `sqrt($x)` changes from one record to the next. The expression language for the `put` and `filter` verbs additionally allows you to define `begin {...}` and `end {...}` blocks for actions to be taken before and after records are processed, respectively.
|
||||
|
||||
* As with `awk`, Miller's `put`/`filter` language lets you set `@sum=0` before records are read, then update that sum on each record, then print its value at the end. Unlike `awk`, Miller makes syntactically explicit the difference between variables with extent across all records (names starting with `@`, such as `@sum`) and variables which are local to the current expression invocation (names starting without `@`, such as `sum`).
|
||||
|
||||
|
|
|
|||
|
|
@ -20,44 +20,44 @@ Someone asked me the other day about design, tradeoffs, thought process, why I f
|
|||
|
||||
## Who is Miller for?
|
||||
|
||||
For background, I'm a software engineer, with a heavy devops bent and a non-trivial amount of data-engineering in my career. **Initially I wrote Miller mainly for myself:** I'm coder-friendly (being a coder); I'm Github-friendly; most of my data are well-structured or easily structurable (TSV-formatted SQL-query output, CSV files, log files, JSON data structures); I care about interoperability between all the various formats Miller supports (I've encountered them all); I do all my work on Linux or OS X.
|
||||
For background, I'm a software engineer with a heavy devops bent and a non-trivial amount of data engineering in my career. **Initially, I wrote Miller mainly for myself:** I'm coder-friendly (being a coder); I'm Github-friendly; most of my data is either well-structured or easily structurable (TSV-formatted SQL-query output, CSV files, log files, JSON data structures); I care about interoperability between all the various formats Miller supports (I've encountered them all); I do all my work on Linux or OS X.
|
||||
|
||||
But now there's this neat little tool **which seems to be useful for people in various disciplines**. I don't even know entirely *who*. I can click through Github starrers and read a bit about what they seem to do, but not everyone that uses Miller is even *on* Github (or stars things). I've gotten a lot of feature requests through Github -- but only from people who are Github users. Not everyone's a coder (it seems like a lot of Miller's Github starrers are devops folks like myself, or data-science-ish people, or biology/genomics folks.) A lot of people care 100% about CSV. And so on.
|
||||
But now there's this neat little tool **which seems to be useful for people in various disciplines**. I don't even know entirely *who*. I can click through Github starrers and read a bit about what they seem to do, but not everyone who uses Miller is even *on* Github (or stars things). I've gotten a lot of feature requests through Github -- but only from people who are Github users. Not everyone's a coder (it seems like many of Miller's Github starrers are devops folks like myself, or data-science-ish people, or biology/genomics folks). A lot of people care 100% about CSV. And so on.
|
||||
|
||||
So the reason for the [Miller User Survey](https://github.com/johnkerl/miller/discussions/542) is to answer questions such as: does Miller do what you need? Do you use it for all sorts of things, or just one or two nice things? Are there things you wish it did but it doesn't? Is it almost there, or just nowhere near what you want? Are there not enough features or way too many? Are the docs too complicated; do you have a hard time finding out how to do what you want? Should I think differently about what this tool even *is* in the first place? Should I think differently about who it's for?
|
||||
So the reason for the [Miller User Survey](https://github.com/johnkerl/miller/discussions/542) is to answer questions such as: does Miller do what you need? Do you use it for all sorts of things, or just one or two nice things? Are there things you wish it did, but it doesn't? Is it almost there, or just nowhere near what you want? Are there not enough features or way too many? Are the docs too complicated? Do you have a hard time finding out how to do what you want? Should I think differently about what this tool even *is* in the first place? Should I think differently about who it's for?
|
||||
|
||||
## What was Miller created to do?
|
||||
|
||||
First: there are tools like `xsv` which handles CSV marvelously and `jq` which handles JSON marvelously, and so on -- but I over the years of my career in the software industry I've found myself, and others, doing a lot of ad-hoc things which really were fundamentally the same *except* for format. So the number one thing about Miller is doing common things while supporting **multiple formats**: (a) ingest a list of records where a record is a list of key-value pairs (however represented in the input files); (b) transform that stream of records; (c) emit the transformed stream -- either in the same format as input, or in a different format.
|
||||
The first thing: there are tools like `xsv` which handles CSV marvelously and `jq` which handles JSON marvelously, and so on -- but over the years of my career in the software industry I've found myself, and others, doing a lot of ad-hoc things which were fundamentally the same *except* for format. So the number one thing about Miller is doing common things while supporting **multiple formats**: (a) ingest a list of records where a record is a list of key-value pairs (however represented in the input files); (b) transform that stream of records; (c) emit the transformed stream -- either in the same format as input, or in a different format.
|
||||
|
||||
Second thing, a lot like the first: just as I didn't want to build something only for a single file format, I didn't want to build something only for one problem domain. In my work doing software engineering, devops, data engineering, etc. I saw a lot of commonalities and I wanted to **solve as many problems simultaneously as possible**.
|
||||
The second thing is a lot like the first: just as I didn't want to build something only for a single file format, I didn't want to build something only for one problem domain. In my work doing software engineering, devops, data engineering, etc., I saw a lot of commonalities, and I wanted to **solve as many problems simultaneously as possible**.
|
||||
|
||||
Third: it had to be **streaming**. As time goes by and we (some of us, sometimes) have machines with tens or hundreds of GB of RAM, it's maybe less important, but I'm unhappy with tools which ingest all data, then do stuff, then emit all data. One reason is to be able to handle files bigger than available RAM. Another reason is to be able to handle input which trickles in, e.g. you have some process emitting data now and then and you can pipe it to Miller and it will emit transformed records one at a time.
|
||||
Third: it had to be **streaming**. As time goes by and we (some of us, sometimes) have machines with tens or hundreds of GB of RAM, it's less important, but I'm unhappy with tools that ingest all data, then do stuff, then emit all data. One reason is to be able to handle files bigger than available RAM. Another reason is to be able to handle input which trickles in, e.g., you have some process emitting data now and then, and you can pipe it to Miller and it will emit transformed records one at a time.
|
||||
|
||||
Fourth: it had to be **fast**. This precludes all sorts of very nice things written in Ruby, for example. I love Ruby as a very expressive language, and I have several very useful little utility scripts written in Ruby. But a few years ago I ported over some of my old tried-and-true C programs and the lines-of-code count was a *lot* lower -- it was great! Until I ran them on multi-GB files and realized they took 60x as long to complete. So I couldn't write Miller in Ruby, or in languages like it. I was going to have to do something in a low-level language in order to make it performant.
|
||||
Fourth: it had to be **fast**. This precludes all sorts of very nice things written in Ruby, for example. I love Ruby as a very expressive language, and I have several very useful little utility scripts written in Ruby. But a few years ago, I ported some of my old tried-and-true C programs over to Ruby, and the lines-of-code count was a *lot* lower -- it was great! Until I ran them on multi-GB files and realized they took 60x as long to complete. So I couldn't write Miller in Ruby, or languages like it. I was going to have to do something in a low-level language in order to make it performant.
|
||||
|
||||
Fifth thing: I wanted Miller to be **pipe-friendly and interoperate with other command-line tools**. Since the basic paradigm is ingest records, transform records, emit records -- where the input and output formats can be the same or different, and the transform can be complex, or just pass-through -- this means you can use it to transform data, or re-format it, or both. So if you just want to do data-cleaning/prep/formatting and do all the "real" work in R, you can. If you just want a little glue script between other tools you can get that. And if you want to do non-trivial data-reduction in Miller you can.
|
||||
The fifth thing: I wanted Miller to be **pipe-friendly and interoperate with other command-line tools**. Since the basic paradigm is ingest records, transform records, emit records -- where the input and output formats can be the same or different, and the transform can be complex, or just pass-through -- this means you can use it to transform data, or re-format it, or both. So if you just want to do data-cleaning/prep/formatting and do all the "real" work in R, you can. If you want a little glue script between other tools, you can get that. And if you want to do non-trivial data-reduction in Miller, you can.
|
||||
|
||||
Sixth thing: Must have **comprehensive documentation and unit-test**. Since Miller handles a lot of formats and solves a lot of problems, there's a lot to test and a lot to keep working correctly as I add features or optimize. And I wanted it to be able to explain itself -- not only through web docs like the one you're reading but also through `man mlr` and `mlr --help`, `mlr sort --help`, etc.
|
||||
Sixth thing: Must have **comprehensive documentation and unit tests**. Since Miller handles a wide range of formats and solves numerous problems, there's a lot to test and a lot to keep working correctly as I add features or optimize. And I wanted it to be able to explain itself -- not only through web docs like the one you're reading but also through `man mlr` and `mlr --help`, `mlr sort --help`, etc.
|
||||
|
||||
Seventh thing: **Must have a domain-specific language** (DSL) **but also must let you do common things without it**. All those little verbs Miller has to help you *avoid* having to write for-loops are great. I use them for keystroke-saving: `mlr stats1 -a mean,stddev,min,max -f quantity`, for example, without you having to write for-loops or define accumulator variables. But you also have to be able to break out of that and write arbitrary code when you want to: `mlr put '$distance = $rate * $time'` or anything else you can think up. In Perl/AWK/etc. it's all DSL. In xsv et al. it's all verbs. In Miller I like having the combination.
|
||||
Seventh thing: **Must have a domain-specific language** (DSL) **but also must let you do everyday things without it**. All those little verbs Miller has to help you *avoid* having to write for-loops are great. I use them for keystroke-saving: `mlr stats1 -a mean,stddev,min,max -f quantity`, for example, without you having to write for-loops or define accumulator variables. But you also have to be able to break out of that and write arbitrary code when you want to: `mlr put '$distance = $rate * $time'` or anything else you can think up. In Perl/AWK/etc. it's all DSL. In xsv et al. it's all verbs. In Miller, I like having the combination.
|
||||
|
||||
Eighth thing: It's an **awful lot of fun to write**. In my experience I didn't find any tools which do multi-format, streaming, efficient, multi-purpose, with DSL and non-DSL, so I wrote one. But I don't guarantee it's unique in the world. It fills a niche in the world (people use it) but it also fills a niche in my life.
|
||||
Eighth thing: It's an **awful lot of fun to write**. In my experience, I didn't find any tools that do multi-format, streaming, efficient, multi-purpose, with DSL and non-DSL, so I wrote one. But I don't guarantee it's unique in the world. It fills a niche in the world (people use it), but it also fills a niche in my life.
|
||||
|
||||
## Tradeoffs
|
||||
|
||||
Miller is command-line-only by design. People who want a graphical user interface won't find it here. This is in part (a) accommodating my personal preferences, and in part (b) guided by my experience/belief that the command line is very expressive. Steeper learning curve than a GUI, yes. I consider that price worth paying for the tool-niche which Miller occupies.
|
||||
Miller is command-line-only by design. People who want a graphical user interface won't find it here. This is in part (a) accommodating my personal preferences, and in part (b) guided by my experience/belief that the command line is very expressive. Steeper learning curve than a GUI, yes. That price is worth paying for the tool-niche which Miller occupies.
|
||||
|
||||
Another tradeoff: supporting lists of records keeps me supporting only what can be expressed in *all* of those formats. For example, `[1,2,3,4,5]` is valid but unmillerable JSON: the list elements are not records. So Miller can't (and won't) handle arbitrary JSON -- because Miller only handles tabular data which can be expressed in a variety of formats.
|
||||
Another tradeoff: supporting lists of records keeps me supporting only what can be expressed in *all* of those formats. For example, `[1,2,3,4,5]` is valid but unmillerable JSON: the list elements are not records. So Miller can't (and won't) handle arbitrary JSON -- because Miller only handles tabular data, which can be expressed in a variety of formats.
|
||||
|
||||
A third tradeoff is doing build-from-scratch in a low-level language. It'd be quicker to write (but slower to run) if written in a high-level language. If Miller were written in Python, it would be implemented in significantly fewer lines of code than its current Go implementation. The DSL would just be an `eval` of Python code. And it would run slower, but maybe not enough slower to be a problem for most folks. Later I found out about the [rows](https://github.com/turicas/rows) tool -- if you find Miller useful, you should check out `rows` as well.
|
||||
A third tradeoff is building from scratch in a low-level language. It'd be quicker to write (but slower to run) if written in a high-level language. If Miller were written in Python, it would be implemented in significantly fewer lines of code than its current Go implementation. The DSL would be an `eval` of Python code. And it would run slower, but maybe not slow enough to be a problem for most people. Later, I discovered the [rows](https://github.com/turicas/rows) tool -- if you find Miller useful, you should also check out `rows`.
|
||||
|
||||
A fourth tradeoff is in the DSL (more visibly so in 5.0.0 but already in pre-5.0.0): how much to make it dynamically typed -- so you can just say `y=x+1` with a minimum number of keystrokes -- vs. having it do a good job of telling you when you've made a typo. This is a common paradigm across *all* languages. Some like Ruby you don't declare anything and they're quick to code little stuff in but programs of even a few thousand lines (which isn't large in the software world) become insanely unmanageable. Then, Java at the other extreme, does scale and is very typesafe -- but you have to type in a lot of punctuation, angle brackets, datatypes, repetition, etc. just to be able to get anything done. And some in the middle like Go are typesafe but with type-inference which aim to do the best of both. In the Miller (5.0.0) DSL you get `y=x+1` by default but you can have things like `int y = x+1` etc. so the typesafety is opt-in. See also the [Type-checking page](reference-dsl-variables.md#type-checking) for more information on this.
|
||||
A fourth tradeoff is in the DSL (more visibly so in 5.0.0 but already in pre-5.0.0): how much to make it dynamically typed -- so you can just say `y=x+1` with a minimum number of keystrokes -- vs. having it do a good job of telling you when you've made a typo. This is a common paradigm across *all* languages. In some languages, like Ruby, you don't declare anything, and they're quick to code little stuff in, but programs of even a few thousand lines (which isn't large in the software world) become insanely unmanageable. Then Java, at the other extreme, does scale and is very typesafe -- but you have to type in a lot of punctuation, angle brackets, datatypes, repetition, etc., just to be able to get anything done. And some in the middle, like Go, are typesafe but with type inference, which aims to get the best of both. In the Miller (5.0.0) DSL, you get `y=x+1` by default, but you can have things like `int y = x+1` etc., so the typesafety is opt-in. See also the [Type-checking page](reference-dsl-variables.md#type-checking) for more information on this.
|
||||
|
||||
## Related tools
|
||||
|
||||
Here's a comprehensive list: [https://github.com/dbohdan/structured-text-tools](https://github.com/dbohdan/structured-text-tools). It doesn't mention [rows](https://github.com/turicas/rows) so here's a plug for that as well.
|
||||
Here's a comprehensive list: [https://github.com/dbohdan/structured-text-tools](https://github.com/dbohdan/structured-text-tools). It doesn't mention [rows](https://github.com/turicas/rows), so here's a plug for that as well.
|
||||
|
||||
## Moving forward
|
||||
|
||||
I originally aimed Miller at people who already know what `sed`/`awk`/`cut`/`sort`/`join` are and wanted some options. But as time goes by I realize that tools like this can be useful to folks who *don't* know what those things are; people who aren't primarily coders; people who are scientists, or data scientists. These days some journalists do data analysis. So moving forward in terms of docs, I am working on having more cookbook, follow-by-example stuff in addition to the existing language-reference kinds of stuff. And continuing to seek out input from people who use Miller on where to go next.
|
||||
I initially aimed Miller at people who already know what `sed`/`awk`/`cut`/`sort`/`join` are and wanted some options. But as time goes by, I realize that tools like this can be helpful to folks who *don't* know what those things are; people who aren't primarily coders; people who are scientists, or data scientists. These days some journalists do data analysis. Moving forward in terms of docs, I am working on having more cookbook, follow-by-example stuff in addition to the existing language-reference kinds of stuff. And continuing to seek out input from people who use Miller on where to go next.
|
||||
|
|
|
|||
|
|
@ -4,44 +4,44 @@ Someone asked me the other day about design, tradeoffs, thought process, why I f
|
|||
|
||||
## Who is Miller for?
|
||||
|
||||
For background, I'm a software engineer, with a heavy devops bent and a non-trivial amount of data-engineering in my career. **Initially I wrote Miller mainly for myself:** I'm coder-friendly (being a coder); I'm Github-friendly; most of my data are well-structured or easily structurable (TSV-formatted SQL-query output, CSV files, log files, JSON data structures); I care about interoperability between all the various formats Miller supports (I've encountered them all); I do all my work on Linux or OS X.
|
||||
For background, I'm a software engineer with a heavy devops bent and a non-trivial amount of data engineering in my career. **Initially, I wrote Miller mainly for myself:** I'm coder-friendly (being a coder); I'm Github-friendly; most of my data is either well-structured or easily structurable (TSV-formatted SQL-query output, CSV files, log files, JSON data structures); I care about interoperability between all the various formats Miller supports (I've encountered them all); I do all my work on Linux or OS X.
|
||||
|
||||
But now there's this neat little tool **which seems to be useful for people in various disciplines**. I don't even know entirely *who*. I can click through Github starrers and read a bit about what they seem to do, but not everyone that uses Miller is even *on* Github (or stars things). I've gotten a lot of feature requests through Github -- but only from people who are Github users. Not everyone's a coder (it seems like a lot of Miller's Github starrers are devops folks like myself, or data-science-ish people, or biology/genomics folks.) A lot of people care 100% about CSV. And so on.
|
||||
But now there's this neat little tool **which seems to be useful for people in various disciplines**. I don't even know entirely *who*. I can click through Github starrers and read a bit about what they seem to do, but not everyone who uses Miller is even *on* Github (or stars things). I've gotten a lot of feature requests through Github -- but only from people who are Github users. Not everyone's a coder (it seems like many of Miller's Github starrers are devops folks like myself, or data-science-ish people, or biology/genomics folks.) A lot of people care 100% about CSV. And so on.
|
||||
|
||||
So the reason for the [Miller User Survey](https://github.com/johnkerl/miller/discussions/542) is to answer questions such as: does Miller do what you need? Do you use it for all sorts of things, or just one or two nice things? Are there things you wish it did but it doesn't? Is it almost there, or just nowhere near what you want? Are there not enough features or way too many? Are the docs too complicated; do you have a hard time finding out how to do what you want? Should I think differently about what this tool even *is* in the first place? Should I think differently about who it's for?
|
||||
So the reason for the [Miller User Survey](https://github.com/johnkerl/miller/discussions/542) is to answer questions such as: does Miller do what you need? Do you use it for all sorts of things, or just one or two nice things? Are there things you wish it did, but it doesn't? Is it almost there, or just nowhere near what you want? Are there not enough features or way too many? Are the docs too complicated? Do you have a hard time finding out how to do what you want? Should I think differently about what this tool even *is* in the first place? Should I think differently about who it's for?
|
||||
|
||||
## What was Miller created to do?
|
||||
|
||||
First: there are tools like `xsv` which handles CSV marvelously and `jq` which handles JSON marvelously, and so on -- but I over the years of my career in the software industry I've found myself, and others, doing a lot of ad-hoc things which really were fundamentally the same *except* for format. So the number one thing about Miller is doing common things while supporting **multiple formats**: (a) ingest a list of records where a record is a list of key-value pairs (however represented in the input files); (b) transform that stream of records; (c) emit the transformed stream -- either in the same format as input, or in a different format.
|
||||
The first thing: there are tools like `xsv` which handles CSV marvelously and `jq` which handles JSON marvelously, and so on -- but over the years of my career in the software industry I've found myself, and others, doing a lot of ad-hoc things which were fundamentally the same *except* for format. So the number one thing about Miller is doing common things while supporting **multiple formats**: (a) ingest a list of records where a record is a list of key-value pairs (however represented in the input files); (b) transform that stream of records; (c) emit the transformed stream -- either in the same format as input, or in a different format.
|
||||
|
||||
Second thing, a lot like the first: just as I didn't want to build something only for a single file format, I didn't want to build something only for one problem domain. In my work doing software engineering, devops, data engineering, etc. I saw a lot of commonalities and I wanted to **solve as many problems simultaneously as possible**.
|
||||
The second thing is a lot like the first: just as I didn't want to build something only for a single file format, I didn't want to build something only for one problem domain. In my work doing software engineering, devops, data engineering, etc., I saw a lot of commonalities, and I wanted to **solve as many problems simultaneously as possible**.
|
||||
|
||||
Third: it had to be **streaming**. As time goes by and we (some of us, sometimes) have machines with tens or hundreds of GB of RAM, it's maybe less important, but I'm unhappy with tools which ingest all data, then do stuff, then emit all data. One reason is to be able to handle files bigger than available RAM. Another reason is to be able to handle input which trickles in, e.g. you have some process emitting data now and then and you can pipe it to Miller and it will emit transformed records one at a time.
|
||||
Third: it had to be **streaming**. As time goes by and we (some of us, sometimes) have machines with tens or hundreds of GB of RAM, it's less important, but I'm unhappy with tools that ingest all data, then do stuff, then emit all data. One reason is to be able to handle files bigger than available RAM. Another reason is to be able to handle input which trickles in, e.g., you have some process emitting data now and then, and you can pipe it to Miller and it will emit transformed records one at a time.
|
||||
|
||||
Fourth: it had to be **fast**. This precludes all sorts of very nice things written in Ruby, for example. I love Ruby as a very expressive language, and I have several very useful little utility scripts written in Ruby. But a few years ago I ported over some of my old tried-and-true C programs and the lines-of-code count was a *lot* lower -- it was great! Until I ran them on multi-GB files and realized they took 60x as long to complete. So I couldn't write Miller in Ruby, or in languages like it. I was going to have to do something in a low-level language in order to make it performant.
|
||||
Fourth: it had to be **fast**. This precludes all sorts of very nice things written in Ruby, for example. I love Ruby as a very expressive language, and I have several very useful little utility scripts written in Ruby. But a few years ago, I ported some of my old tried-and-true C programs over to Ruby and the lines-of-code count was a *lot* lower -- it was great! Until I ran them on multi-GB files and realized they took 60x as long to complete. So I couldn't write Miller in Ruby, or languages like it. I was going to have to do something in a low-level language in order to make it performant.
|
||||
|
||||
Fifth thing: I wanted Miller to be **pipe-friendly and interoperate with other command-line tools**. Since the basic paradigm is ingest records, transform records, emit records -- where the input and output formats can be the same or different, and the transform can be complex, or just pass-through -- this means you can use it to transform data, or re-format it, or both. So if you just want to do data-cleaning/prep/formatting and do all the "real" work in R, you can. If you just want a little glue script between other tools you can get that. And if you want to do non-trivial data-reduction in Miller you can.
|
||||
The fifth thing: I wanted Miller to be **pipe-friendly and interoperate with other command-line tools**. Since the basic paradigm is ingest records, transform records, emit records -- where the input and output formats can be the same or different, and the transform can be complex, or just pass-through -- this means you can use it to transform data, or re-format it, or both. So if you just want to do data-cleaning/prep/formatting and do all the "real" work in R, you can. If you want a little glue script between other tools, you can get that. And if you want to do non-trivial data-reduction in Miller, you can.
|
||||
|
||||
Sixth thing: Must have **comprehensive documentation and unit-test**. Since Miller handles a lot of formats and solves a lot of problems, there's a lot to test and a lot to keep working correctly as I add features or optimize. And I wanted it to be able to explain itself -- not only through web docs like the one you're reading but also through `man mlr` and `mlr --help`, `mlr sort --help`, etc.
|
||||
Sixth thing: Must have **comprehensive documentation and unit tests**. Since Miller handles a wide range of formats and solves numerous problems, there's a lot to test and a lot to keep working correctly as I add features or optimize. And I wanted it to be able to explain itself -- not only through web docs like the one you're reading but also through `man mlr` and `mlr --help`, `mlr sort --help`, etc.
|
||||
|
||||
Seventh thing: **Must have a domain-specific language** (DSL) **but also must let you do common things without it**. All those little verbs Miller has to help you *avoid* having to write for-loops are great. I use them for keystroke-saving: `mlr stats1 -a mean,stddev,min,max -f quantity`, for example, without you having to write for-loops or define accumulator variables. But you also have to be able to break out of that and write arbitrary code when you want to: `mlr put '$distance = $rate * $time'` or anything else you can think up. In Perl/AWK/etc. it's all DSL. In xsv et al. it's all verbs. In Miller I like having the combination.
|
||||
Seventh thing: **Must have a domain-specific language** (DSL) **but also must let you do everyday things without it**. All those little verbs Miller has to help you *avoid* having to write for-loops are great. I use them for keystroke-saving: `mlr stats1 -a mean,stddev,min,max -f quantity`, for example, without you having to write for-loops or define accumulator variables. But you also have to be able to break out of that and write arbitrary code when you want to: `mlr put '$distance = $rate * $time'` or anything else you can think up. In Perl/AWK/etc. it's all DSL. In xsv et al. it's all verbs. In Miller, I like having the combination.
|
||||
|
||||
Eighth thing: It's an **awful lot of fun to write**. In my experience I didn't find any tools which do multi-format, streaming, efficient, multi-purpose, with DSL and non-DSL, so I wrote one. But I don't guarantee it's unique in the world. It fills a niche in the world (people use it) but it also fills a niche in my life.
|
||||
Eighth thing: It's an **awful lot of fun to write**. In my experience, I didn't find any tools that do multi-format, streaming, efficient, multi-purpose, with DSL and non-DSL, so I wrote one. But I don't guarantee it's unique in the world. It fills a niche in the world (people use it), but it also fills a niche in my life.
|
||||
|
||||
## Tradeoffs
|
||||
|
||||
Miller is command-line-only by design. People who want a graphical user interface won't find it here. This is in part (a) accommodating my personal preferences, and in part (b) guided by my experience/belief that the command line is very expressive. Steeper learning curve than a GUI, yes. I consider that price worth paying for the tool-niche which Miller occupies.
|
||||
Miller is command-line-only by design. People who want a graphical user interface won't find it here. This is in part (a) accommodating my personal preferences, and in part (b) guided by my experience/belief that the command line is very expressive. Steeper learning curve than a GUI, yes. That price is worth paying for the tool-niche which Miller occupies.
|
||||
|
||||
Another tradeoff: supporting lists of records keeps me supporting only what can be expressed in *all* of those formats. For example, `[1,2,3,4,5]` is valid but unmillerable JSON: the list elements are not records. So Miller can't (and won't) handle arbitrary JSON -- because Miller only handles tabular data which can be expressed in a variety of formats.
|
||||
Another tradeoff: supporting lists of records keeps me supporting only what can be expressed in *all* of those formats. For example, `[1,2,3,4,5]` is valid but unmillerable JSON: the list elements are not records. So Miller can't (and won't) handle arbitrary JSON -- because Miller only handles tabular data, which can be expressed in a variety of formats.
|
||||
|
||||
A third tradeoff is doing build-from-scratch in a low-level language. It'd be quicker to write (but slower to run) if written in a high-level language. If Miller were written in Python, it would be implemented in significantly fewer lines of code than its current Go implementation. The DSL would just be an `eval` of Python code. And it would run slower, but maybe not enough slower to be a problem for most folks. Later I found out about the [rows](https://github.com/turicas/rows) tool -- if you find Miller useful, you should check out `rows` as well.
|
||||
A third tradeoff is building from scratch in a low-level language. It'd be quicker to write (but slower to run) if written in a high-level language. If Miller were written in Python, it would be implemented in significantly fewer lines of code than its current Go implementation. The DSL would be an `eval` of Python code. And it would run slower, but maybe not slow enough to be a problem for most people. Later, I discovered the [rows](https://github.com/turicas/rows) tool -- if you find Miller useful, you should also check out `rows`.
|
||||
|
||||
A fourth tradeoff is in the DSL (more visibly so in 5.0.0 but already in pre-5.0.0): how much to make it dynamically typed -- so you can just say `y=x+1` with a minimum number of keystrokes -- vs. having it do a good job of telling you when you've made a typo. This is a common paradigm across *all* languages. Some like Ruby you don't declare anything and they're quick to code little stuff in but programs of even a few thousand lines (which isn't large in the software world) become insanely unmanageable. Then, Java at the other extreme, does scale and is very typesafe -- but you have to type in a lot of punctuation, angle brackets, datatypes, repetition, etc. just to be able to get anything done. And some in the middle like Go are typesafe but with type-inference which aim to do the best of both. In the Miller (5.0.0) DSL you get `y=x+1` by default but you can have things like `int y = x+1` etc. so the typesafety is opt-in. See also the [Type-checking page](reference-dsl-variables.md#type-checking) for more information on this.
|
||||
A fourth tradeoff is in the DSL (more visibly so in 5.0.0 but already in pre-5.0.0): how much to make it dynamically typed -- so you can just say `y=x+1` with a minimum number of keystrokes -- vs. having it do a good job of telling you when you've made a typo. This is a common paradigm across *all* languages. In some languages, like Ruby, you don't declare anything, and they're quick to code little stuff in, but programs of even a few thousand lines (which isn't large in the software world) become insanely unmanageable. Then Java, at the other extreme, does scale and is very typesafe -- but you have to type in a lot of punctuation, angle brackets, datatypes, repetition, etc., just to be able to get anything done. And some in the middle, like Go, are typesafe but with type inference, which aim to do the best of both. In the Miller (5.0.0) DSL, you get `y=x+1` by default, but you can have things like `int y = x+1` etc., so the typesafety is opt-in. See also the [Type-checking page](reference-dsl-variables.md#type-checking) for more information on this.
|
||||
|
||||
## Related tools
|
||||
|
||||
Here's a comprehensive list: [https://github.com/dbohdan/structured-text-tools](https://github.com/dbohdan/structured-text-tools). It doesn't mention [rows](https://github.com/turicas/rows) so here's a plug for that as well.
|
||||
Here's a comprehensive list: [https://github.com/dbohdan/structured-text-tools](https://github.com/dbohdan/structured-text-tools). It doesn't mention [rows](https://github.com/turicas/rows), so here's a plug for that as well.
|
||||
|
||||
## Moving forward
|
||||
|
||||
I originally aimed Miller at people who already know what `sed`/`awk`/`cut`/`sort`/`join` are and wanted some options. But as time goes by I realize that tools like this can be useful to folks who *don't* know what those things are; people who aren't primarily coders; people who are scientists, or data scientists. These days some journalists do data analysis. So moving forward in terms of docs, I am working on having more cookbook, follow-by-example stuff in addition to the existing language-reference kinds of stuff. And continuing to seek out input from people who use Miller on where to go next.
|
||||
I initially aimed Miller at people who already know what `sed`/`awk`/`cut`/`sort`/`join` are and wanted some options. But as time goes by, I realize that tools like this can be helpful to folks who *don't* know what those things are; people who aren't primarily coders; people who are scientists, or data scientists. These days some journalists do data analysis. Moving forward in terms of docs, I am working on having more cookbook, follow-by-example stuff in addition to the existing language-reference kinds of stuff. And continuing to seek out input from people who use Miller on where to go next.
|
||||
|
|
|
|||
19
go.mod
19
go.mod
|
|
@ -14,27 +14,32 @@ module github.com/johnkerl/miller/v6
|
|||
// Local development:
|
||||
// replace github.com/johnkerl/lumin => /Users/kerl/git/johnkerl/lumin
|
||||
|
||||
go 1.21
|
||||
go 1.24.0
|
||||
|
||||
require (
|
||||
github.com/facette/natsort v0.0.0-20181210072756-2cd4dd1e2dcb
|
||||
github.com/johnkerl/lumin v1.0.0
|
||||
github.com/kballard/go-shellquote v0.0.0-20180428030007-95032a82bc51
|
||||
github.com/klauspost/compress v1.17.10
|
||||
github.com/lestrrat-go/strftime v1.1.0
|
||||
github.com/klauspost/compress v1.18.3
|
||||
github.com/kshedden/statmodel v0.0.0-20210519035403-ee97d3e48df1
|
||||
github.com/lestrrat-go/strftime v1.1.1
|
||||
github.com/mattn/go-isatty v0.0.20
|
||||
github.com/nine-lives-later/go-windows-terminal-sequences v1.0.4
|
||||
github.com/pkg/profile v1.7.0
|
||||
github.com/stretchr/testify v1.9.0
|
||||
golang.org/x/sys v0.25.0
|
||||
golang.org/x/term v0.24.0
|
||||
golang.org/x/text v0.18.0
|
||||
github.com/stretchr/testify v1.11.1
|
||||
golang.org/x/sys v0.40.0
|
||||
golang.org/x/term v0.39.0
|
||||
golang.org/x/text v0.33.0
|
||||
)
|
||||
|
||||
require (
|
||||
github.com/davecgh/go-spew v1.1.1 // indirect
|
||||
github.com/felixge/fgprof v0.9.3 // indirect
|
||||
github.com/golang/snappy v1.0.0 // indirect
|
||||
github.com/google/pprof v0.0.0-20211214055906-6f57359322fd // indirect
|
||||
github.com/kshedden/dstream v0.0.0-20190512025041-c4c410631beb // indirect
|
||||
github.com/pmezard/go-difflib v1.0.0 // indirect
|
||||
golang.org/x/tools v0.40.0 // indirect
|
||||
gonum.org/v1/gonum v0.16.0 // indirect
|
||||
gopkg.in/yaml.v3 v3.0.1 // indirect
|
||||
)
|
||||
|
|
|
|||
35
go.sum
35
go.sum
|
|
@ -8,6 +8,8 @@ github.com/facette/natsort v0.0.0-20181210072756-2cd4dd1e2dcb h1:IT4JYU7k4ikYg1S
|
|||
github.com/facette/natsort v0.0.0-20181210072756-2cd4dd1e2dcb/go.mod h1:bH6Xx7IW64qjjJq8M2u4dxNaBiDfKK+z/3eGDpXEQhc=
|
||||
github.com/felixge/fgprof v0.9.3 h1:VvyZxILNuCiUCSXtPtYmmtGvb65nqXh2QFWc0Wpf2/g=
|
||||
github.com/felixge/fgprof v0.9.3/go.mod h1:RdbpDgzqYVh/T9fPELJyV7EYJuHB55UTEULNun8eiPw=
|
||||
github.com/golang/snappy v1.0.0 h1:Oy607GVXHs7RtbggtPBnr2RmDArIsAefDwvrdWvRhGs=
|
||||
github.com/golang/snappy v1.0.0/go.mod h1:/XxbfmMg8lxefKM7IXC3fBNl/7bRcc72aCRzEWrmP2Q=
|
||||
github.com/google/pprof v0.0.0-20211214055906-6f57359322fd h1:1FjCyPC+syAzJ5/2S8fqdZK1R22vvA0J7JZKcuOIQ7Y=
|
||||
github.com/google/pprof v0.0.0-20211214055906-6f57359322fd/go.mod h1:KgnwoLYCZ8IQu3XUZ8Nc/bM9CCZFOyjUNOSygVozoDg=
|
||||
github.com/ianlancetaylor/demangle v0.0.0-20210905161508-09a460cdf81d/go.mod h1:aYm2/VgdVmcIU8iMfdMvDMsRAQjcfZSKFby6HOFvi/w=
|
||||
|
|
@ -15,11 +17,16 @@ github.com/johnkerl/lumin v1.0.0 h1:CV34cHZOJ92Y02RbQ0rd4gA0C06Qck9q8blOyaPoWpU=
|
|||
github.com/johnkerl/lumin v1.0.0/go.mod h1:eLf5AdQOaLvzZ2zVy4REr/DSeEwG+CZreHwNLICqv9E=
|
||||
github.com/kballard/go-shellquote v0.0.0-20180428030007-95032a82bc51 h1:Z9n2FFNUXsshfwJMBgNA0RU6/i7WVaAegv3PtuIHPMs=
|
||||
github.com/kballard/go-shellquote v0.0.0-20180428030007-95032a82bc51/go.mod h1:CzGEWj7cYgsdH8dAjBGEr58BoE7ScuLd+fwFZ44+/x8=
|
||||
github.com/klauspost/compress v1.17.10 h1:oXAz+Vh0PMUvJczoi+flxpnBEPxoER1IaAnU/NMPtT0=
|
||||
github.com/klauspost/compress v1.17.10/go.mod h1:pMDklpSncoRMuLFrf1W9Ss9KT+0rH90U12bZKk7uwG0=
|
||||
github.com/klauspost/compress v1.18.3 h1:9PJRvfbmTabkOX8moIpXPbMMbYN60bWImDDU7L+/6zw=
|
||||
github.com/klauspost/compress v1.18.3/go.mod h1:R0h/fSBs8DE4ENlcrlib3PsXS61voFxhIs2DeRhCvJ4=
|
||||
github.com/kshedden/dstream v0.0.0-20190512025041-c4c410631beb h1:Z5BVHFk/DLOIUAd2NycF0mLtKfhl7ynm4Uy5+AFhT48=
|
||||
github.com/kshedden/dstream v0.0.0-20190512025041-c4c410631beb/go.mod h1:+U+6yzfITr4/teU2YhxWhdyw6YzednT/16/UBMjlDrU=
|
||||
github.com/kshedden/statmodel v0.0.0-20210519035403-ee97d3e48df1 h1:UyIQ1VTQq/0CS/wLYjf3DV6uRKTd1xcsng3BccM4XCY=
|
||||
github.com/kshedden/statmodel v0.0.0-20210519035403-ee97d3e48df1/go.mod h1:uvVFnikBpVz7S1pdsyUI+BBRlz64vmU6Q+kviiB+fpU=
|
||||
github.com/lestrrat-go/envload v0.0.0-20180220234015-a3eb8ddeffcc h1:RKf14vYWi2ttpEmkA4aQ3j4u9dStX2t4M8UM6qqNsG8=
|
||||
github.com/lestrrat-go/strftime v1.1.0 h1:gMESpZy44/4pXLO/m+sL0yBd1W6LjgjrrD4a68Gapyg=
|
||||
github.com/lestrrat-go/strftime v1.1.0/go.mod h1:uzeIB52CeUJenCo1syghlugshMysrqUT51HlxphXVeI=
|
||||
github.com/lestrrat-go/envload v0.0.0-20180220234015-a3eb8ddeffcc/go.mod h1:kopuH9ugFRkIXf3YoqHKyrJ9YfUFsckUU9S7B+XP+is=
|
||||
github.com/lestrrat-go/strftime v1.1.1 h1:zgf8QCsgj27GlKBy3SU9/8MMgegZ8UCzlCyHYrUF0QU=
|
||||
github.com/lestrrat-go/strftime v1.1.1/go.mod h1:YDrzHJAODYQ+xxvrn5SG01uFIQAeDTzpxNVppCz7Nmw=
|
||||
github.com/mattn/go-isatty v0.0.20 h1:xfD0iDuEKnDkl03q4limB+vH+GxLEtL/jb4xVJSWWEY=
|
||||
github.com/mattn/go-isatty v0.0.20/go.mod h1:W+V8PltTTMOvKvAeJH7IuucS94S2C6jfK/D7dTCTo3Y=
|
||||
github.com/nine-lives-later/go-windows-terminal-sequences v1.0.4 h1:NC4H8hewgaktBqMI5yzy6L/Vln5/H7BEziyxaE2fX3Y=
|
||||
|
|
@ -32,16 +39,20 @@ github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+
|
|||
github.com/stretchr/objx v0.4.0/go.mod h1:YvHI0jy2hoMjB+UWwv71VJQ9isScKT/TqJzVSSt89Yw=
|
||||
github.com/stretchr/testify v1.7.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg=
|
||||
github.com/stretchr/testify v1.8.0/go.mod h1:yNjHg4UonilssWZ8iaSj1OCr/vHnekPRkoO+kdMU+MU=
|
||||
github.com/stretchr/testify v1.9.0 h1:HtqpIVDClZ4nwg75+f6Lvsy/wHu+3BoSGCbBAcpTsTg=
|
||||
github.com/stretchr/testify v1.9.0/go.mod h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf8C91i36aY=
|
||||
github.com/stretchr/testify v1.11.1 h1:7s2iGBzp5EwR7/aIZr8ao5+dra3wiQyKjjFuvgVKu7U=
|
||||
github.com/stretchr/testify v1.11.1/go.mod h1:wZwfW3scLgRK+23gO65QZefKpKQRnfz6sD981Nm4B6U=
|
||||
golang.org/x/sys v0.0.0-20211007075335-d3039528d8ac/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
|
||||
golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
|
||||
golang.org/x/sys v0.25.0 h1:r+8e+loiHxRqhXVl6ML1nO3l1+oFoWbnlu2Ehimmi34=
|
||||
golang.org/x/sys v0.25.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA=
|
||||
golang.org/x/term v0.24.0 h1:Mh5cbb+Zk2hqqXNO7S1iTjEphVL+jb8ZWaqh/g+JWkM=
|
||||
golang.org/x/term v0.24.0/go.mod h1:lOBK/LVxemqiMij05LGJ0tzNr8xlmwBRJ81PX6wVLH8=
|
||||
golang.org/x/text v0.18.0 h1:XvMDiNzPAl0jr17s6W9lcaIhGUfUORdGCNsuLmPG224=
|
||||
golang.org/x/text v0.18.0/go.mod h1:BuEKDfySbSR4drPmRPG/7iBdf8hvFMuRexcpahXilzY=
|
||||
golang.org/x/sys v0.40.0 h1:DBZZqJ2Rkml6QMQsZywtnjnnGvHza6BTfYFWY9kjEWQ=
|
||||
golang.org/x/sys v0.40.0/go.mod h1:OgkHotnGiDImocRcuBABYBEXf8A9a87e/uXjp9XT3ks=
|
||||
golang.org/x/term v0.39.0 h1:RclSuaJf32jOqZz74CkPA9qFuVTX7vhLlpfj/IGWlqY=
|
||||
golang.org/x/term v0.39.0/go.mod h1:yxzUCTP/U+FzoxfdKmLaA0RV1WgE0VY7hXBwKtY/4ww=
|
||||
golang.org/x/text v0.33.0 h1:B3njUFyqtHDUI5jMn1YIr5B0IE2U0qck04r6d4KPAxE=
|
||||
golang.org/x/text v0.33.0/go.mod h1:LuMebE6+rBincTi9+xWTY8TztLzKHc/9C1uBCG27+q8=
|
||||
golang.org/x/tools v0.40.0 h1:yLkxfA+Qnul4cs9QA3KnlFu0lVmd8JJfoq+E41uSutA=
|
||||
golang.org/x/tools v0.40.0/go.mod h1:Ik/tzLRlbscWpqqMRjyWYDisX8bG13FrdXp3o4Sr9lc=
|
||||
gonum.org/v1/gonum v0.16.0 h1:5+ul4Swaf3ESvrOnidPp4GZbzf0mxVQpDCYUQE7OJfk=
|
||||
gonum.org/v1/gonum v0.16.0/go.mod h1:fef3am4MQ93R2HHpKnLk4/Tbh/s0+wqD5nfa6Pnwy4E=
|
||||
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405 h1:yhCVgyC4o1eVCa2tZl7eS0r+SDo693bJlVdllGtEeKM=
|
||||
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
|
||||
gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
|
||||
|
|
|
|||
100
man/manpage.txt
100
man/manpage.txt
|
|
@ -27,7 +27,7 @@
|
|||
insertion-ordered hash map. This encompasses a variety of data
|
||||
formats, including but not limited to the familiar CSV, TSV, and JSON.
|
||||
(Miller can handle positionally-indexed data as a special case.) This
|
||||
manpage documents mlr 6.13.0.
|
||||
manpage documents mlr 6.16.0.
|
||||
|
||||
EXAMPLES
|
||||
mlr --icsv --opprint cat example.csv
|
||||
|
|
@ -124,6 +124,7 @@
|
|||
mlr help comments-in-data-flags
|
||||
mlr help compressed-data-flags
|
||||
mlr help csv/tsv-only-flags
|
||||
mlr help dkvp-only-flags
|
||||
mlr help file-format-flags
|
||||
mlr help flatten-unflatten-flags
|
||||
mlr help format-conversion-keystroke-saver-flags
|
||||
|
|
@ -177,8 +178,8 @@
|
|||
merge-fields most-frequent nest nothing put regularize remove-empty-columns
|
||||
rename reorder repeat reshape sample sec2gmtdate sec2gmt seqgen shuffle
|
||||
skip-trivial-records sort sort-within-records sparsify split ssub stats1
|
||||
stats2 step sub summary tac tail tee template top utf8-to-latin1 unflatten
|
||||
uniq unspace unsparsify
|
||||
stats2 step sub summary surv tac tail tee template top utf8-to-latin1
|
||||
unflatten uniq unspace unsparsify
|
||||
|
||||
FUNCTION LIST
|
||||
abs acos acosh antimode any append apply arrayify asin asinh asserting_absent
|
||||
|
|
@ -233,12 +234,14 @@
|
|||
within the input.
|
||||
--pass-comments-with {string}
|
||||
Immediately print commented lines within input, with
|
||||
specified prefix.
|
||||
specified prefix. For CSV input format, the prefix
|
||||
must be a single character.
|
||||
--skip-comments Ignore commented lines (prefixed by `#`) within the
|
||||
input.
|
||||
--skip-comments-with {string}
|
||||
Ignore commented lines within input, with specified
|
||||
prefix.
|
||||
prefix. For CSV input format, the prefix must be a
|
||||
single character.
|
||||
|
||||
COMPRESSED-DATA FLAGS
|
||||
Miller offers a few different ways to handle reading data files
|
||||
|
|
@ -335,6 +338,16 @@
|
|||
-N Keystroke-saver for `--implicit-csv-header
|
||||
--headerless-csv-output`.
|
||||
|
||||
DKVP-ONLY FLAGS
|
||||
These are flags which are applicable to DKVP format.
|
||||
|
||||
--incr-key Without this option, keyless DKVP fields are keyed by
|
||||
field number. For example: `a=10,b=20,30,d=40,50` is
|
||||
ingested as `$a=10,$b=20,$3=30,$d=40,$5=50`. With
|
||||
this option, they're keyed by a running counter of
|
||||
keyless fields. For example: `a=10,b=20,30,d=40,50`
|
||||
is ingested as `$a=10,$b=20,$1=30,$d=40,$2=50`.
|
||||
|
||||
FILE-FORMAT FLAGS
|
||||
See the File formats doc page, and/or `mlr help file-formats`, for more
|
||||
about file formats Miller supports.
|
||||
|
|
@ -347,9 +360,9 @@
|
|||
are overridden in all cases by setting output format to `format2`.
|
||||
|
||||
--asv or --asvlite Use ASV format for input and output data.
|
||||
--csv or -c Use CSV format for input and output data.
|
||||
--csv or -c or --c2c Use CSV format for input and output data.
|
||||
--csvlite Use CSV-lite format for input and output data.
|
||||
--dkvp Use DKVP format for input and output data.
|
||||
--dkvp or --d2d Use DKVP format for input and output data.
|
||||
--gen-field-name Specify field name for --igen. Defaults to "i".
|
||||
--gen-start Specify start value for --igen. Defaults to 1.
|
||||
--gen-step Specify step value for --igen. Defaults to 1.
|
||||
|
|
@ -373,9 +386,9 @@
|
|||
--itsvlite Use TSV-lite format for input data.
|
||||
--iusv or --iusvlite Use USV format for input data.
|
||||
--ixtab Use XTAB format for input data.
|
||||
--json or -j Use JSON format for input and output data.
|
||||
--jsonl Use JSON Lines format for input and output data.
|
||||
--nidx Use NIDX format for input and output data.
|
||||
--json or -j or --j2j Use JSON format for input and output data.
|
||||
--jsonl or --l2l Use JSON Lines format for input and output data.
|
||||
--nidx or --n2n Use NIDX format for input and output data.
|
||||
--oasv or --oasvlite Use ASV format for output data.
|
||||
--ocsv Use CSV format for output data.
|
||||
--ocsvlite Use CSV-lite format for output data.
|
||||
|
|
@ -389,11 +402,11 @@
|
|||
--otsvlite Use TSV-lite format for output data.
|
||||
--ousv or --ousvlite Use USV format for output data.
|
||||
--oxtab Use XTAB format for output data.
|
||||
--pprint Use PPRINT format for input and output data.
|
||||
--tsv or -t Use TSV format for input and output data.
|
||||
--pprint or --p2p Use PPRINT format for input and output data.
|
||||
--tsv or -t or --t2t Use TSV format for input and output data.
|
||||
--tsvlite Use TSV-lite format for input and output data.
|
||||
--usv or --usvlite Use USV format for input and output data.
|
||||
--xtab Use XTAB format for input and output data.
|
||||
--xtab or --x2x Use XTAB format for input and output data.
|
||||
--xvright Right-justify values for XTAB format.
|
||||
-i {format name} Use format name for input data. For example: `-i csv`
|
||||
is the same as `--icsv`.
|
||||
|
|
@ -403,7 +416,7 @@
|
|||
FLATTEN-UNFLATTEN FLAGS
|
||||
These flags control how Miller converts record values which are maps or arrays, when input is JSON and output is non-JSON (flattening) or input is non-JSON and output is JSON (unflattening).
|
||||
|
||||
See the Flatten/unflatten doc page for more information.
|
||||
See the flatten/unflatten doc page https://miller.readthedocs.io/en/latest/flatten-unflatten for more information.
|
||||
|
||||
--flatsep or --jflatsep {string}
|
||||
Separator for flattening multi-level JSON keys, e.g.
|
||||
|
|
@ -411,32 +424,31 @@
|
|||
formats. Defaults to `.`.
|
||||
--no-auto-flatten When output is non-JSON, suppress the default
|
||||
auto-flatten behavior. Default: if `$y = [7,8,9]`
|
||||
then this flattens to `y.1=7,y.2=8,y.3=9, and
|
||||
then this flattens to `y.1=7,y.2=8,y.3=9`, and
|
||||
similarly for maps. With `--no-auto-flatten`, instead
|
||||
we get `$y=[7, 8, 9]`.
|
||||
--no-auto-unflatten When input non-JSON and output is JSON, suppress the
|
||||
default auto-unflatten behavior. Default: if the
|
||||
--no-auto-unflatten When input is non-JSON and output is JSON, suppress
|
||||
the default auto-unflatten behavior. Default: if the
|
||||
input has `y.1=7,y.2=8,y.3=9` then this unflattens to
|
||||
`$y=[7,8,9]`. flattens to `y.1=7,y.2=8,y.3=9. With
|
||||
`--no-auto-flatten`, instead we get
|
||||
`${y.1}=7,${y.2}=8,${y.3}=9`.
|
||||
`$y=[7,8,9]`. With `--no-auto-unflatten`, instead we
|
||||
get `${y.1}=7,${y.2}=8,${y.3}=9`.
|
||||
|
||||
1mFORMAT-CONVERSION KEYSTROKE-SAVER FLAGS0m
|
||||
As keystroke-savers for format-conversion you may use the following.
|
||||
The letters c, t, j, l, d, n, x, p, and m refer to formats CSV, TSV, JSON, JSON Lines,
|
||||
DKVP, NIDX, XTAB, PPRINT, and markdown, respectively.
|
||||
|
||||
| In\out | CSV | TSV | JSON | JSONL | DKVP | NIDX | XTAB | PPRINT | Markdown |
|
||||
+----------+-------+-------+--------+--------+--------+--------+--------+--------+----------|
|
||||
| CSV | | --c2t | --c2j | --c2l | --c2d | --c2n | --c2x | --c2p | --c2m |
|
||||
| TSV | --t2c | | --t2j | --t2l | --t2d | --t2n | --t2x | --t2p | --t2m |
|
||||
| JSON | --j2c | --j2t | | --j2l | --j2d | --j2n | --j2x | --j2p | --j2m |
|
||||
| JSONL | --l2c | --l2t | | | --l2d | --l2n | --l2x | --l2p | --l2m |
|
||||
| DKVP | --d2c | --d2t | --d2j | --d2l | | --d2n | --d2x | --d2p | --d2m |
|
||||
| NIDX | --n2c | --n2t | --n2j | --n2l | --n2d | | --n2x | --n2p | --n2m |
|
||||
| XTAB | --x2c | --x2t | --x2j | --x2l | --x2d | --x2n | | --x2p | --x2m |
|
||||
| PPRINT | --p2c | --p2t | --p2j | --p2l | --p2d | --p2n | --p2x | | --p2m |
|
||||
| Markdown | --m2c | --m2t | --m2j | --m2l | --m2d | --m2n | --m2x | --m2p | |
|
||||
| In\out | CSV | TSV | JSON | JSONL | DKVP | NIDX | XTAB | PPRINT | Markdown |
|
||||
+----------+----------+----------+----------+-------+-------+-------+-------+--------+----------|
|
||||
| CSV | --c2c,-c | --c2t | --c2j | --c2l | --c2d | --c2n | --c2x | --c2p | --c2m |
|
||||
| TSV | --t2c | --t2t,-t | --t2j | --t2l | --t2d | --t2n | --t2x | --t2p | --t2m |
|
||||
| JSON | --j2c | --j2t | --j2j,-j | --j2l | --j2d | --j2n | --j2x | --j2p | --j2m |
|
||||
| JSONL | --l2c | --l2t | --l2j | --l2l | --l2d | --l2n | --l2x | --l2p | --l2m |
|
||||
| DKVP | --d2c | --d2t | --d2j | --d2l | --d2d | --d2n | --d2x | --d2p | --d2m |
|
||||
| NIDX | --n2c | --n2t | --n2j | --n2l | --n2d | --n2n | --n2x | --n2p | --n2m |
|
||||
| XTAB | --x2c | --x2t | --x2j | --x2l | --x2d | --x2n | --x2x | --x2p | --x2m |
|
||||
| PPRINT | --p2c | --p2t | --p2j | --p2l | --p2d | --p2n | --p2x | --p2p | --p2m |
|
||||
| Markdown | --m2c | --m2t | --m2j | --m2l | --m2d | --m2n | --m2x | --m2p | |
|
||||
|
||||
-p Keystroke-saver for `--nidx --fs space --repifs`.
|
||||
-T Keystroke-saver for `--nidx --fs tab`.
|
||||
|
|
@ -1012,7 +1024,7 @@
|
|||
-r Treat field names as regular expressions. "ab", "a.*b" will
|
||||
match any field name containing the substring "ab" or matching
|
||||
"a.*b", respectively; anchors of the form "^ab$", "^a.*b$" may
|
||||
be used. The -o flag is ignored when -r is present.
|
||||
be used.
|
||||
-h|--help Show this message.
|
||||
Examples:
|
||||
mlr cut -f hostname,status
|
||||
|
|
@ -1056,7 +1068,7 @@
|
|||
|
||||
1mfilter0m
|
||||
Usage: mlr filter [options] {DSL expression}
|
||||
Lets you use a domain-specific language to programatically filter which
|
||||
Lets you use a domain-specific language to programmatically filter which
|
||||
stream records will be output.
|
||||
See also: https://miller.readthedocs.io/en/latest/reference-verbs
|
||||
|
||||
|
|
@ -1254,6 +1266,8 @@
|
|||
See also the `sub` and `ssub` verbs.
|
||||
Options:
|
||||
-f {a,b,c} Field names to convert.
|
||||
-r {regex} Regular expression for field names to convert.
|
||||
-a Convert all fields.
|
||||
-h|--help Show this message.
|
||||
|
||||
1mhaving-fields0m
|
||||
|
|
@ -1363,7 +1377,7 @@
|
|||
Likewise, if you have 'mlr --csv --implicit-csv-header ...' then the join-in file will be
|
||||
expected to be headerless as well unless you put '--no-implicit-csv-header' after 'join'.
|
||||
Please use "mlr --usage-separator-options" for information on specifying separators.
|
||||
Please see https://miller.readthedocs.io/en/latest/reference-verbs.html#join for more information
|
||||
Please see https://miller.readthedocs.io/en/latest/reference-verbs#join for more information
|
||||
including examples.
|
||||
|
||||
1mlabel0m
|
||||
|
|
@ -1514,7 +1528,7 @@
|
|||
|
||||
1mput0m
|
||||
Usage: mlr put [options] {DSL expression}
|
||||
Lets you use a domain-specific language to programatically alter stream records.
|
||||
Lets you use a domain-specific language to programmatically alter stream records.
|
||||
See also: https://miller.readthedocs.io/en/latest/reference-verbs
|
||||
|
||||
Options:
|
||||
|
|
@ -1815,6 +1829,7 @@
|
|||
-nf {comma-separated field names} Same as -n
|
||||
-nr {comma-separated field names} Numerical descending; nulls sort first
|
||||
-t {comma-separated field names} Natural ascending
|
||||
-b Move sort fields to start of record, as in reorder -b
|
||||
-tr|-rt {comma-separated field names} Natural descending
|
||||
-h|--help Show this message.
|
||||
|
||||
|
|
@ -1889,6 +1904,8 @@
|
|||
the old string, like the `ssub` DSL function. See also the `gsub` and `sub` verbs.
|
||||
Options:
|
||||
-f {a,b,c} Field names to convert.
|
||||
-r {regex} Regular expression for field names to convert.
|
||||
-a Convert all fields.
|
||||
-h|--help Show this message.
|
||||
|
||||
1mstats10m
|
||||
|
|
@ -2036,6 +2053,8 @@
|
|||
See also the `gsub` and `ssub` verbs.
|
||||
Options:
|
||||
-f {a,b,c} Field names to convert.
|
||||
-r {regex} Regular expression for field names to convert.
|
||||
-a Convert all fields.
|
||||
-h|--help Show this message.
|
||||
|
||||
1msummary0m
|
||||
|
|
@ -2081,6 +2100,15 @@
|
|||
--transpose Show output with field names as column names.
|
||||
-h|--help Show this message.
|
||||
|
||||
1msurv0m
|
||||
Usage: mlr surv -d {duration-field} -s {status-field}
|
||||
|
||||
Estimate Kaplan-Meier survival curve (right-censored).
|
||||
Options:
|
||||
-d {field} Name of duration field (time-to-event or censoring).
|
||||
-s {field} Name of status field (0=censored, 1=event).
|
||||
-h, --help Show this message.
|
||||
|
||||
1mtac0m
|
||||
Usage: mlr tac [options]
|
||||
Prints records in reverse order from the order in which they were encountered.
|
||||
|
|
@ -3710,4 +3738,4 @@
|
|||
MIME Type for Comma-Separated Values (CSV) Files, the Miller docsite
|
||||
https://miller.readthedocs.io
|
||||
|
||||
2024-10-05 4mMILLER24m(1)
|
||||
2026-01-02 4mMILLER24m(1)
|
||||
|
|
|
|||
120
man/mlr.1
120
man/mlr.1
|
|
@ -2,12 +2,12 @@
|
|||
.\" Title: mlr
|
||||
.\" Author: [see the "AUTHOR" section]
|
||||
.\" Generator: ./mkman.rb
|
||||
.\" Date: 2024-10-05
|
||||
.\" Date: 2026-01-02
|
||||
.\" Manual: \ \&
|
||||
.\" Source: \ \&
|
||||
.\" Language: English
|
||||
.\"
|
||||
.TH "MILLER" "1" "2024-10-05" "\ \&" "\ \&"
|
||||
.TH "MILLER" "1" "2026-01-02" "\ \&" "\ \&"
|
||||
.\" -----------------------------------------------------------------
|
||||
.\" * Portability definitions
|
||||
.\" ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
|
@ -47,7 +47,7 @@ on integer-indexed fields: if the natural data structure for the latter is the
|
|||
array, then Miller's natural data structure is the insertion-ordered hash map.
|
||||
This encompasses a variety of data formats, including but not limited to the
|
||||
familiar CSV, TSV, and JSON. (Miller can handle positionally-indexed data as
|
||||
a special case.) This manpage documents mlr 6.13.0.
|
||||
a special case.) This manpage documents mlr 6.16.0.
|
||||
.SH "EXAMPLES"
|
||||
.sp
|
||||
|
||||
|
|
@ -161,6 +161,7 @@ Flags:
|
|||
mlr help comments-in-data-flags
|
||||
mlr help compressed-data-flags
|
||||
mlr help csv/tsv-only-flags
|
||||
mlr help dkvp-only-flags
|
||||
mlr help file-format-flags
|
||||
mlr help flatten-unflatten-flags
|
||||
mlr help format-conversion-keystroke-saver-flags
|
||||
|
|
@ -220,8 +221,8 @@ json-parse json-stringify join label latin1-to-utf8 least-frequent
|
|||
merge-fields most-frequent nest nothing put regularize remove-empty-columns
|
||||
rename reorder repeat reshape sample sec2gmtdate sec2gmt seqgen shuffle
|
||||
skip-trivial-records sort sort-within-records sparsify split ssub stats1
|
||||
stats2 step sub summary tac tail tee template top utf8-to-latin1 unflatten
|
||||
uniq unspace unsparsify
|
||||
stats2 step sub summary surv tac tail tee template top utf8-to-latin1
|
||||
unflatten uniq unspace unsparsify
|
||||
.fi
|
||||
.if n \{\
|
||||
.RE
|
||||
|
|
@ -290,12 +291,14 @@ Notes:
|
|||
within the input.
|
||||
--pass-comments-with {string}
|
||||
Immediately print commented lines within input, with
|
||||
specified prefix.
|
||||
specified prefix. For CSV input format, the prefix
|
||||
must be a single character.
|
||||
--skip-comments Ignore commented lines (prefixed by `#`) within the
|
||||
input.
|
||||
--skip-comments-with {string}
|
||||
Ignore commented lines within input, with specified
|
||||
prefix.
|
||||
prefix. For CSV input format, the prefix must be a
|
||||
single character.
|
||||
.fi
|
||||
.if n \{\
|
||||
.RE
|
||||
|
|
@ -410,6 +413,24 @@ These are flags which are applicable to CSV format.
|
|||
.fi
|
||||
.if n \{\
|
||||
.RE
|
||||
.SH "DKVP-ONLY FLAGS"
|
||||
.sp
|
||||
|
||||
.if n \{\
|
||||
.RS 0
|
||||
.\}
|
||||
.nf
|
||||
These are flags which are applicable to DKVP format.
|
||||
|
||||
--incr-key Without this option, keyless DKVP fields are keyed by
|
||||
field number. For example: `a=10,b=20,30,d=40,50` is
|
||||
ingested as `$a=10,$b=20,$3=30,$d=40,$5=50`. With
|
||||
this option, they're keyed by a running counter of
|
||||
keyless fields. For example: `a=10,b=20,30,d=40,50`
|
||||
is ingested as `$a=10,$b=20,$1=30,$d=40,$2=50`.
|
||||
.fi
|
||||
.if n \{\
|
||||
.RE
|
||||
.SH "FILE-FORMAT FLAGS"
|
||||
.sp
|
||||
|
||||
|
|
@ -428,9 +449,9 @@ The latter sets up input and output flags for `format1`, not all of which
|
|||
are overridden in all cases by setting output format to `format2`.
|
||||
|
||||
--asv or --asvlite Use ASV format for input and output data.
|
||||
--csv or -c Use CSV format for input and output data.
|
||||
--csv or -c or --c2c Use CSV format for input and output data.
|
||||
--csvlite Use CSV-lite format for input and output data.
|
||||
--dkvp Use DKVP format for input and output data.
|
||||
--dkvp or --d2d Use DKVP format for input and output data.
|
||||
--gen-field-name Specify field name for --igen. Defaults to "i".
|
||||
--gen-start Specify start value for --igen. Defaults to 1.
|
||||
--gen-step Specify step value for --igen. Defaults to 1.
|
||||
|
|
@ -454,9 +475,9 @@ are overridden in all cases by setting output format to `format2`.
|
|||
--itsvlite Use TSV-lite format for input data.
|
||||
--iusv or --iusvlite Use USV format for input data.
|
||||
--ixtab Use XTAB format for input data.
|
||||
--json or -j Use JSON format for input and output data.
|
||||
--jsonl Use JSON Lines format for input and output data.
|
||||
--nidx Use NIDX format for input and output data.
|
||||
--json or -j or --j2j Use JSON format for input and output data.
|
||||
--jsonl or --l2l Use JSON Lines format for input and output data.
|
||||
--nidx or --n2n Use NIDX format for input and output data.
|
||||
--oasv or --oasvlite Use ASV format for output data.
|
||||
--ocsv Use CSV format for output data.
|
||||
--ocsvlite Use CSV-lite format for output data.
|
||||
|
|
@ -470,11 +491,11 @@ are overridden in all cases by setting output format to `format2`.
|
|||
--otsvlite Use TSV-lite format for output data.
|
||||
--ousv or --ousvlite Use USV format for output data.
|
||||
--oxtab Use XTAB format for output data.
|
||||
--pprint Use PPRINT format for input and output data.
|
||||
--tsv or -t Use TSV format for input and output data.
|
||||
--pprint or --p2p Use PPRINT format for input and output data.
|
||||
--tsv or -t or --t2t Use TSV format for input and output data.
|
||||
--tsvlite Use TSV-lite format for input and output data.
|
||||
--usv or --usvlite Use USV format for input and output data.
|
||||
--xtab Use XTAB format for input and output data.
|
||||
--xtab or --x2x Use XTAB format for input and output data.
|
||||
--xvright Right-justify values for XTAB format.
|
||||
-i {format name} Use format name for input data. For example: `-i csv`
|
||||
is the same as `--icsv`.
|
||||
|
|
@ -492,7 +513,7 @@ are overridden in all cases by setting output format to `format2`.
|
|||
.nf
|
||||
These flags control how Miller converts record values which are maps or arrays, when input is JSON and output is non-JSON (flattening) or input is non-JSON and output is JSON (unflattening).
|
||||
|
||||
See the Flatten/unflatten doc page for more information.
|
||||
See the flatten/unflatten doc page https://miller.readthedocs.io/en/latest/flatten-unflatten for more information.
|
||||
|
||||
--flatsep or --jflatsep {string}
|
||||
Separator for flattening multi-level JSON keys, e.g.
|
||||
|
|
@ -500,15 +521,14 @@ See the Flatten/unflatten doc page for more information.
|
|||
formats. Defaults to `.`.
|
||||
--no-auto-flatten When output is non-JSON, suppress the default
|
||||
auto-flatten behavior. Default: if `$y = [7,8,9]`
|
||||
then this flattens to `y.1=7,y.2=8,y.3=9, and
|
||||
then this flattens to `y.1=7,y.2=8,y.3=9`, and
|
||||
similarly for maps. With `--no-auto-flatten`, instead
|
||||
we get `$y=[7, 8, 9]`.
|
||||
--no-auto-unflatten When input non-JSON and output is JSON, suppress the
|
||||
default auto-unflatten behavior. Default: if the
|
||||
--no-auto-unflatten When input is non-JSON and output is JSON, suppress
|
||||
the default auto-unflatten behavior. Default: if the
|
||||
input has `y.1=7,y.2=8,y.3=9` then this unflattens to
|
||||
`$y=[7,8,9]`. flattens to `y.1=7,y.2=8,y.3=9. With
|
||||
`--no-auto-flatten`, instead we get
|
||||
`${y.1}=7,${y.2}=8,${y.3}=9`.
|
||||
`$y=[7,8,9]`. With `--no-auto-unflatten`, instead we
|
||||
get `${y.1}=7,${y.2}=8,${y.3}=9`.
|
||||
.fi
|
||||
.if n \{\
|
||||
.RE
|
||||
|
|
@ -523,17 +543,17 @@ As keystroke-savers for format-conversion you may use the following.
|
|||
The letters c, t, j, l, d, n, x, p, and m refer to formats CSV, TSV, JSON, JSON Lines,
|
||||
DKVP, NIDX, XTAB, PPRINT, and markdown, respectively.
|
||||
|
||||
| In\eout | CSV | TSV | JSON | JSONL | DKVP | NIDX | XTAB | PPRINT | Markdown |
|
||||
+----------+-------+-------+--------+--------+--------+--------+--------+--------+----------|
|
||||
| CSV | | --c2t | --c2j | --c2l | --c2d | --c2n | --c2x | --c2p | --c2m |
|
||||
| TSV | --t2c | | --t2j | --t2l | --t2d | --t2n | --t2x | --t2p | --t2m |
|
||||
| JSON | --j2c | --j2t | | --j2l | --j2d | --j2n | --j2x | --j2p | --j2m |
|
||||
| JSONL | --l2c | --l2t | | | --l2d | --l2n | --l2x | --l2p | --l2m |
|
||||
| DKVP | --d2c | --d2t | --d2j | --d2l | | --d2n | --d2x | --d2p | --d2m |
|
||||
| NIDX | --n2c | --n2t | --n2j | --n2l | --n2d | | --n2x | --n2p | --n2m |
|
||||
| XTAB | --x2c | --x2t | --x2j | --x2l | --x2d | --x2n | | --x2p | --x2m |
|
||||
| PPRINT | --p2c | --p2t | --p2j | --p2l | --p2d | --p2n | --p2x | | --p2m |
|
||||
| Markdown | --m2c | --m2t | --m2j | --m2l | --m2d | --m2n | --m2x | --m2p | |
|
||||
| In\eout | CSV | TSV | JSON | JSONL | DKVP | NIDX | XTAB | PPRINT | Markdown |
|
||||
+----------+----------+----------+----------+-------+-------+-------+-------+--------+----------|
|
||||
| CSV | --c2c,-c | --c2t | --c2j | --c2l | --c2d | --c2n | --c2x | --c2p | --c2m |
|
||||
| TSV | --t2c | --t2t,-t | --t2j | --t2l | --t2d | --t2n | --t2x | --t2p | --t2m |
|
||||
| JSON | --j2c | --j2t | --j2j,-j | --j2l | --j2d | --j2n | --j2x | --j2p | --j2m |
|
||||
| JSONL | --l2c | --l2t | --l2j | --l2l | --l2d | --l2n | --l2x | --l2p | --l2m |
|
||||
| DKVP | --d2c | --d2t | --d2j | --d2l | --d2d | --d2n | --d2x | --d2p | --d2m |
|
||||
| NIDX | --n2c | --n2t | --n2j | --n2l | --n2d | --n2n | --n2x | --n2p | --n2m |
|
||||
| XTAB | --x2c | --x2t | --x2j | --x2l | --x2d | --x2n | --x2x | --x2p | --x2m |
|
||||
| PPRINT | --p2c | --p2t | --p2j | --p2l | --p2d | --p2n | --p2x | --p2p | --p2m |
|
||||
| Markdown | --m2c | --m2t | --m2j | --m2l | --m2d | --m2n | --m2x | --m2p | |
|
||||
|
||||
-p Keystroke-saver for `--nidx --fs space --repifs`.
|
||||
-T Keystroke-saver for `--nidx --fs tab`.
|
||||
|
|
@ -1251,7 +1271,7 @@ Options:
|
|||
-r Treat field names as regular expressions. "ab", "a.*b" will
|
||||
match any field name containing the substring "ab" or matching
|
||||
"a.*b", respectively; anchors of the form "^ab$", "^a.*b$" may
|
||||
be used. The -o flag is ignored when -r is present.
|
||||
be used.
|
||||
-h|--help Show this message.
|
||||
Examples:
|
||||
mlr cut -f hostname,status
|
||||
|
|
@ -1319,7 +1339,7 @@ Options:
|
|||
.\}
|
||||
.nf
|
||||
Usage: mlr filter [options] {DSL expression}
|
||||
Lets you use a domain-specific language to programatically filter which
|
||||
Lets you use a domain-specific language to programmatically filter which
|
||||
stream records will be output.
|
||||
See also: https://miller.readthedocs.io/en/latest/reference-verbs
|
||||
|
||||
|
|
@ -1335,7 +1355,7 @@ Options:
|
|||
Since the expression pieces are simply concatenated, please be sure to use intervening
|
||||
semicolons to separate expressions.)
|
||||
|
||||
-s name=value: Predefines out-of-stream variable @name to have
|
||||
-s name=value: Predefines out-of-stream variable @name to have the given value.
|
||||
Thus mlr put -s foo=97 '$column += @foo' is like
|
||||
mlr put 'begin {@foo = 97} $column += @foo'.
|
||||
The value part is subject to type-inferencing.
|
||||
|
|
@ -1565,6 +1585,8 @@ for the old string and handling multiple matches, like the `gsub` DSL function.
|
|||
See also the `sub` and `ssub` verbs.
|
||||
Options:
|
||||
-f {a,b,c} Field names to convert.
|
||||
-r {regex} Regular expression for field names to convert.
|
||||
-a Convert all fields.
|
||||
-h|--help Show this message.
|
||||
.fi
|
||||
.if n \{\
|
||||
|
|
@ -1710,7 +1732,7 @@ be specified CSV as well unless you override with 'mlr --csv ... join --ijson -l
|
|||
Likewise, if you have 'mlr --csv --implicit-csv-header ...' then the join-in file will be
|
||||
expected to be headerless as well unless you put '--no-implicit-csv-header' after 'join'.
|
||||
Please use "mlr --usage-separator-options" for information on specifying separators.
|
||||
Please see https://miller.readthedocs.io/en/latest/reference-verbs.html#join for more information
|
||||
Please see https://miller.readthedocs.io/en/latest/reference-verbs#join for more information
|
||||
including examples.
|
||||
.fi
|
||||
.if n \{\
|
||||
|
|
@ -1909,7 +1931,7 @@ Options:
|
|||
.\}
|
||||
.nf
|
||||
Usage: mlr put [options] {DSL expression}
|
||||
Lets you use a domain-specific language to programatically alter stream records.
|
||||
Lets you use a domain-specific language to programmatically alter stream records.
|
||||
See also: https://miller.readthedocs.io/en/latest/reference-verbs
|
||||
|
||||
Options:
|
||||
|
|
@ -1924,7 +1946,7 @@ Options:
|
|||
Since the expression pieces are simply concatenated, please be sure to use intervening
|
||||
semicolons to separate expressions.)
|
||||
|
||||
-s name=value: Predefines out-of-stream variable @name to have
|
||||
-s name=value: Predefines out-of-stream variable @name to have the given value.
|
||||
Thus mlr put -s foo=97 '$column += @foo' is like
|
||||
mlr put 'begin {@foo = 97} $column += @foo'.
|
||||
The value part is subject to type-inferencing.
|
||||
|
|
@ -2288,6 +2310,7 @@ Options:
|
|||
-nf {comma-separated field names} Same as -n
|
||||
-nr {comma-separated field names} Numerical descending; nulls sort first
|
||||
-t {comma-separated field names} Natural ascending
|
||||
-b Move sort fields to start of record, as in reorder -b
|
||||
-tr|-rt {comma-separated field names} Natural descending
|
||||
-h|--help Show this message.
|
||||
|
||||
|
|
@ -2386,6 +2409,8 @@ Replaces old string with new string in specified field(s), without regex support
|
|||
the old string, like the `ssub` DSL function. See also the `gsub` and `sub` verbs.
|
||||
Options:
|
||||
-f {a,b,c} Field names to convert.
|
||||
-r {regex} Regular expression for field names to convert.
|
||||
-a Convert all fields.
|
||||
-h|--help Show this message.
|
||||
.fi
|
||||
.if n \{\
|
||||
|
|
@ -2557,6 +2582,8 @@ for the old string and not handling multiple matches, like the `sub` DSL functio
|
|||
See also the `gsub` and `ssub` verbs.
|
||||
Options:
|
||||
-f {a,b,c} Field names to convert.
|
||||
-r {regex} Regular expression for field names to convert.
|
||||
-a Convert all fields.
|
||||
-h|--help Show this message.
|
||||
.fi
|
||||
.if n \{\
|
||||
|
|
@ -2610,6 +2637,21 @@ Options:
|
|||
.fi
|
||||
.if n \{\
|
||||
.RE
|
||||
.SS "surv"
|
||||
.if n \{\
|
||||
.RS 0
|
||||
.\}
|
||||
.nf
|
||||
Usage: mlr surv -d {duration-field} -s {status-field}
|
||||
|
||||
Estimate Kaplan-Meier survival curve (right-censored).
|
||||
Options:
|
||||
-d {field} Name of duration field (time-to-event or censoring).
|
||||
-s {field} Name of status field (0=censored, 1=event).
|
||||
-h, --help Show this message.
|
||||
.fi
|
||||
.if n \{\
|
||||
.RE
|
||||
.SS "tac"
|
||||
.if n \{\
|
||||
.RS 0
|
||||
|
|
|
|||
11
miller.spec
11
miller.spec
|
|
@ -1,6 +1,6 @@
|
|||
Summary: Name-indexed data processing tool
|
||||
Name: miller
|
||||
Version: 6.13.0
|
||||
Version: 6.16.0
|
||||
Release: 1%{?dist}
|
||||
License: BSD
|
||||
Source: https://github.com/johnkerl/miller/releases/download/%{version}/miller-%{version}.tar.gz
|
||||
|
|
@ -36,6 +36,15 @@ make install
|
|||
%{_mandir}/man1/mlr.1*
|
||||
|
||||
%changelog
|
||||
* Fri Jan 2 2026 John Kerl <kerl.john.r@gmail.com> - 6.16.0-1
|
||||
- 6.16.0 release
|
||||
|
||||
* Thu Aug 14 2025 John Kerl <kerl.john.r@gmail.com> - 6.15.0-1
|
||||
- 6.15.0 release
|
||||
|
||||
* Fri Jul 4 2025 John Kerl <kerl.john.r@gmail.com> - 6.14.0-1
|
||||
- 6.14.0 release
|
||||
|
||||
* Sat Oct 5 2024 John Kerl <kerl.john.r@gmail.com> - 6.13.0-1
|
||||
- 6.13.0 release
|
||||
|
||||
|
|
|
|||
|
|
@ -834,7 +834,7 @@ func min_i_ii(input1, input2 *mlrval.Mlrval) *mlrval.Mlrval {
|
|||
// a=F | min=a min=a
|
||||
// a=T | min=b min=b
|
||||
func min_b_bb(input1, input2 *mlrval.Mlrval) *mlrval.Mlrval {
|
||||
if input1.AcquireBoolValue() == false {
|
||||
if !input1.AcquireBoolValue() {
|
||||
return input1
|
||||
} else {
|
||||
return input2
|
||||
|
|
@ -946,7 +946,7 @@ func BIF_minlen_variadic(mlrvals []*mlrval.Mlrval) *mlrval.Mlrval {
|
|||
}
|
||||
// Do the bulk arithmetic on native ints not Mlrvals, to avoid unnecessary allocation.
|
||||
retval := lib.UTF8Strlen(mlrvals[0].OriginalString())
|
||||
for i, _ := range mlrvals {
|
||||
for i := range mlrvals {
|
||||
clen := lib.UTF8Strlen(mlrvals[i].OriginalString())
|
||||
if clen < retval {
|
||||
retval = clen
|
||||
|
|
@ -1004,7 +1004,7 @@ func max_i_ii(input1, input2 *mlrval.Mlrval) *mlrval.Mlrval {
|
|||
// a=F | max=a max=b
|
||||
// a=T | max=a max=b
|
||||
func max_b_bb(input1, input2 *mlrval.Mlrval) *mlrval.Mlrval {
|
||||
if input2.AcquireBoolValue() == false {
|
||||
if !input2.AcquireBoolValue() {
|
||||
return input1
|
||||
} else {
|
||||
return input2
|
||||
|
|
@ -1116,7 +1116,7 @@ func BIF_maxlen_variadic(mlrvals []*mlrval.Mlrval) *mlrval.Mlrval {
|
|||
}
|
||||
// Do the bulk arithmetic on native ints not Mlrvals, to avoid unnecessary allocation.
|
||||
retval := lib.UTF8Strlen(mlrvals[0].OriginalString())
|
||||
for i, _ := range mlrvals {
|
||||
for i := range mlrvals {
|
||||
clen := lib.UTF8Strlen(mlrvals[i].OriginalString())
|
||||
if clen > retval {
|
||||
retval = clen
|
||||
|
|
|
|||
|
|
@ -104,6 +104,8 @@ func _zero1(input1 *mlrval.Mlrval) *mlrval.Mlrval {
|
|||
}
|
||||
|
||||
// Return one (unary)
|
||||
//
|
||||
//lint:ignore U1000 util function might be used later
|
||||
func __one1(input1 *mlrval.Mlrval) *mlrval.Mlrval {
|
||||
return mlrval.FromInt(1)
|
||||
}
|
||||
|
|
|
|||
|
|
@ -246,7 +246,7 @@ func eq_b_aa(input1, input2 *mlrval.Mlrval) *mlrval.Mlrval {
|
|||
for i := range a {
|
||||
eq := BIF_equals(a[i], b[i])
|
||||
lib.InternalCodingErrorIf(eq.Type() != mlrval.MT_BOOL)
|
||||
if eq.AcquireBoolValue() == false {
|
||||
if !eq.AcquireBoolValue() {
|
||||
return mlrval.FALSE
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -373,7 +373,7 @@ func BIF_joink(input1, input2 *mlrval.Mlrval) *mlrval.Mlrval {
|
|||
// joinv([3,4,5], ",") -> "3,4,5"
|
||||
// joinv({"a":3,"b":4,"c":5}, ",") -> "3,4,5"
|
||||
func BIF_joinv(input1, input2 *mlrval.Mlrval) *mlrval.Mlrval {
|
||||
if !input2.IsString() {
|
||||
if !input2.IsStringOrVoid() {
|
||||
return mlrval.FromNotStringError("joinv", input2)
|
||||
}
|
||||
fieldSeparator := input2.AcquireStringValue()
|
||||
|
|
|
|||
Some files were not shown because too many files have changed in this diff Show more
Loading…
Add table
Add a link
Reference in a new issue