mirror of
https://github.com/johnkerl/miller.git
synced 2026-01-23 18:25:45 +00:00
1541 lines
63 KiB
HTML
1541 lines
63 KiB
HTML
<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
|
|
<html lang="en">
|
|
|
|
<!-- PAGE GENERATED FROM template.html and content-for-reference.html BY poki. -->
|
|
<!-- PLEASE MAKE CHANGES THERE AND THEN RE-RUN poki. -->
|
|
<head>
|
|
<meta http-equiv="Content-type" content="text/html;charset=UTF-8"/>
|
|
<meta name="description" content="Miller documentation"/>
|
|
<meta name="viewport" content="width=device-width, initial-scale=1.0"/> <!-- mobile-friendly -->
|
|
<meta name="keywords"
|
|
content="John Kerl, Kerl, Miller, miller, mlr, OLAP, data analysis software, regression, correlation, variance, data tools, " />
|
|
|
|
<title> Reference </title>
|
|
<link rel="stylesheet" type="text/css" href="css/miller.css"/>
|
|
<link rel="stylesheet" type="text/css" href="css/poki-callbacks.css"/>
|
|
</head>
|
|
|
|
<!-- ================================================================ -->
|
|
<script type="text/javascript">
|
|
// Legacy Google Analytics (ga.js) loader.
// Choose the analytics host that matches the protocol this page was served over.
var gaJsHost = (("https:" == document.location.protocol) ? "https://ssl." : "http://www.");
// The angle brackets of the injected tag are percent-encoded and decoded at
// run time -- presumably so that no literal closing script tag appears inside
// this inline script. decodeURIComponent replaces the deprecated unescape();
// for the plain-ASCII %3C / %3E escapes used here the two are equivalent.
document.write(decodeURIComponent("%3Cscript src='" + gaJsHost + "google-analytics.com/ga.js' type='text/javascript'%3E%3C/script%3E"));
|
|
</script>
|
|
<script type="text/javascript">
|
|
// Best-effort page-view tracking. _gat is not defined anywhere in this page;
// presumably it is provided by the ga.js script requested above (TODO confirm).
try {
  var pageTracker = _gat._getTracker("UA-15651652-1");
  pageTracker._trackPageview();
} catch(err) {
  // Deliberately ignored: a failure here (e.g. _gat being undefined) must not
  // stop the rest of the page's scripts from running.
}
|
|
</script>
|
|
|
|
<!-- ================================================================ -->
|
|
<script type="text/javascript">
|
|
/**
 * Flip the visibility of one collapsible section.
 *
 * Elements whose id does not begin with "section_toggle_" are left alone, as
 * is a null/undefined argument (e.g. the result of a failed lookup).
 * An inline display of "block" becomes "none"; every other value, including
 * the empty string of an element with no inline display, becomes "block".
 *
 * @param div  the element to toggle, or null
 */
function toggle_div(div) {
  if (div == null) {
    return;
  }
  if (!div.id.startsWith("section_toggle_")) {
    return;
  }
  div.style.display = (div.style.display === "block") ? "none" : "block";
}
|
|
/**
 * Make one collapsible section visible by setting its inline display to
 * "block".
 *
 * Elements whose id does not begin with "section_toggle_" are left alone, as
 * is a null/undefined argument.
 *
 * @param div  the element to show, or null
 */
function expand_div(div) {
  if (div == null) {
    return;
  }
  if (!div.id.startsWith("section_toggle_")) {
    return;
  }
  div.style.display = "block";
}
|
|
/**
 * Hide one collapsible section by setting its inline display to "none".
 *
 * Elements whose id does not begin with "section_toggle_" are left alone, as
 * is a null/undefined argument.
 *
 * @param div  the element to hide, or null
 */
function collapse_div(div) {
  if (div == null) {
    return;
  }
  if (!div.id.startsWith("section_toggle_")) {
    return;
  }
  div.style.display = "none";
}
|
|
|
|
/**
 * Toggle the section whose element id is divName.
 * The lookup result is handed straight to toggle_div, which ignores null, so
 * an unknown id is a no-op.
 *
 * @param divName  id of the section element, e.g. "section_toggle_overview"
 */
function toggle_by_name(divName) {
  var target = document.getElementById(divName);
  toggle_div(target);
}
|
|
/**
 * Show the section whose element id is divName.
 * The lookup result is handed straight to expand_div, which ignores null, so
 * an unknown id is a no-op.
 *
 * @param divName  id of the section element
 */
function expand_by_name(divName) {
  var target = document.getElementById(divName);
  expand_div(target);
}
|
|
/**
 * Hide the section whose element id is divName.
 * The lookup result is handed straight to collapse_div, which ignores null,
 * so an unknown id is a no-op.
 *
 * @param divName  id of the section element
 */
function collapse_by_name(divName) {
  var target = document.getElementById(divName);
  collapse_div(target);
}
|
|
|
|
/**
 * Show every collapsible section on the page.
 * Every div in the document is visited; expand_div itself skips those whose
 * id does not start with "section_toggle_".
 */
function expand_all() {
  var allDivs = document.getElementsByTagName("div");
  for (var idx = 0; idx < allDivs.length; idx++) {
    expand_div(allDivs[idx]);
  }
}
|
|
/**
 * Hide every collapsible section on the page.
 * Every div in the document is visited; collapse_div itself skips those whose
 * id does not start with "section_toggle_".
 */
function collapse_all() {
  var allDivs = document.getElementsByTagName("div");
  for (var idx = 0; idx < allDivs.length; idx++) {
    collapse_div(allDivs[idx]);
  }
}
|
|
</script>
|
|
|
|
<!--
|
|
The background image is from a screenshot of a Google search for "data analysis
|
|
tools", lightened and sepia-toned. Over this was placed a Mac Terminal app with
|
|
very light-grey font and translucent background, in which a few statistical
|
|
Miller commands were run with pretty-print-tabular output format.
|
|
<body background="pix/sepia-overlay.jpg">
|
|
-->
|
|
<body bgcolor="#ffffff">
|
|
|
|
<!-- ================================================================ -->
|
|
<table width="100%">
|
|
<tr>
|
|
|
|
<!-- navbar -->
|
|
<td width="15%">
|
|
<!--
|
|
<img src="pix/mlr.jpg" />
|
|
<img style="border-width:1px; color:black;" src="pix/mlr.jpg" />
|
|
-->
|
|
|
|
<div class="pokinav">
|
|
<center><titleinbody>Miller</titleinbody></center>
|
|
|
|
<!-- PAGE LIST GENERATED FROM template.html BY poki -->
|
|
<br/><b>Overview:</b>
|
|
<br/>• <a href="index.html">About Miller</a>
|
|
<br/>• <a href="10-min.html">Miller in 10 minutes</a>
|
|
<br/>• <a href="file-formats.html">File formats</a>
|
|
<br/>• <a href="feature-comparison.html">Miller features in the context of the Unix toolkit</a>
|
|
<br/>• <a href="record-heterogeneity.html">Record-heterogeneity</a>
|
|
<br/>• <a href="internationalization.html">Internationalization</a>
|
|
<br/><b>Using Miller:</b>
|
|
<br/>• <a href="faq.html">FAQ</a>
|
|
<br/>• <a href="cookbook.html">Cookbook part 1</a>
|
|
<br/>• <a href="cookbook2.html">Cookbook part 2</a>
|
|
<br/>• <a href="cookbook3.html">Cookbook part 3</a>
|
|
<br/>• <a href="data-examples.html">Data-diving examples</a>
|
|
<br/>• <a href="manpage.html">Manpage</a>
|
|
<br/>• <a href="reference.html"><b>Reference</b></a>
|
|
<br/>• <a href="reference-verbs.html">Reference: Verbs</a>
|
|
<br/>• <a href="reference-dsl.html">Reference: DSL</a>
|
|
<br/>• <a href="release-docs.html">Documents by release</a>
|
|
<br/>• <a href="build.html">Installation, portability, dependencies, and testing</a>
|
|
<br/><b>Background:</b>
|
|
<br/>• <a href="why.html">Why?</a>
|
|
<br/>• <a href="whyc.html">Why C?</a>
|
|
<br/>• <a href="etymology.html">Why call it Miller?</a>
|
|
<br/>• <a href="originality.html">How original is Miller?</a>
|
|
<br/>• <a href="performance.html">Performance</a>
|
|
<br/><b>Repository:</b>
|
|
<br/>• <a href="to-do.html">Things to do</a>
|
|
<br/>• <a href="contact.html">Contact information</a>
|
|
<br/>• <a href="https://github.com/johnkerl/miller">GitHub repo</a>
|
|
<br/> <br/> <br/> <br/> <br/> <br/> <br/> <br/> <br/> <br/> <br/> <br/>
|
|
<br/> <br/> <br/> <br/> <br/> <br/> <br/> <br/> <br/> <br/> <br/> <br/>
|
|
<br/> <br/> <br/> <br/> <br/> <br/>
|
|
</div>
|
|
</td>
|
|
|
|
<!-- page body -->
|
|
<td>
|
|
<!--
|
|
This is a visually gorgeous feature (here & in the CSS): it allows for
|
|
independent scroll of the nav and body panels. In particular the nav
|
|
stays on-screen as you scroll the body.
|
|
|
|
However, two problems:
|
|
|
|
(1) In Firefox & Chrome both I get janky end-of-body scrolls: there is
|
|
more content but I can't scroll down to it unless I repeatedly retry the
|
|
scrolldown. Which is weird.
|
|
|
|
(2) Worse, only the first page renders in PDF (again, Firefox & Chrome).
|
|
|
|
For now I'm disabling this separate-scroll feature. A frontender, I am
|
|
not ... maybe someday I'll find a config which gets *all* the features
|
|
I want; for now, it's a tradeoff.
|
|
-->
|
|
|
|
<!-- Implementation details: one bit is right here:
|
|
|
|
div style="overflow-y:scroll;height:1500px"
|
|
|
|
and the other bit is in css/poki-callbacks.css:
|
|
|
|
.pokinav {
|
|
display: inline-block;
|
|
background: #e8d9bc;
|
|
border: 1;
|
|
box-shadow: 0px 0px 3px 3px #C9C9C9;
|
|
margin: 10px;
|
|
padding-top: 10px;
|
|
padding-bottom: 10px;
|
|
padding-left: 10px;
|
|
padding-right: 10px;
|
|
overflow-y: scroll; < - - - - - - here
|
|
height: 1500px;
|
|
}
|
|
|
|
-->
|
|
<div>
|
|
<center> <titleinbody> Reference </titleinbody> </center>
|
|
<p/>
|
|
|
|
<!-- BODY COPIED FROM content-for-reference.html BY poki -->
|
|
<div class="pokitoc">
|
|
<center><b>Contents:</b></center>
|
|
• <a href="#Command_overview">Command overview</a><br/>
|
|
• <a href="#I/O_options">I/O options</a><br/>
|
|
• <a href="#Formats">Formats</a><br/>
|
|
• <a href="#In-place_mode">In-place mode</a><br/>
|
|
• <a href="#Compression">Compression</a><br/>
|
|
• <a href="#Record/field/pair_separators">Record/field/pair separators</a><br/>
|
|
• <a href="#Number_formatting">Number formatting</a><br/>
|
|
• <a href="#Data_transformations_(verbs)">Data transformations (verbs)</a><br/>
|
|
• <a href="#Expression_language_for_filter_and_put">Expression language for filter and put</a><br/>
|
|
• <a href="#then-chaining">then-chaining</a><br/>
|
|
• <a href="#Data_types">Data types</a><br/>
|
|
• <a href="#Null_data:_empty_and_absent">Null data: empty and absent</a><br/>
|
|
• <a href="#String_literals">String literals</a><br/>
|
|
• <a href="#Regular_expressions">Regular expressions</a><br/>
|
|
• <a href="#Regex_captures">Regex captures</a><br/>
|
|
• <a href="#Arithmetic">Arithmetic</a><br/>
|
|
• <a href="#Input_scanning">Input scanning</a><br/>
|
|
• <a href="#Conversion_by_math_routines">Conversion by math routines</a><br/>
|
|
• <a href="#Conversion_by_arithmetic_operators">Conversion by arithmetic operators</a><br/>
|
|
• <a href="#Pythonic_division">Pythonic division</a><br/>
|
|
• <a href="#On-line_help">On-line help</a><br/>
|
|
</div>
|
|
<p/>
|
|
|
|
<p/>
|
|
<button style="font-weight:bold;color:maroon;border:0" onclick="expand_all();" href="javascript:;">Expand all sections</button>
|
|
<button style="font-weight:bold;color:maroon;border:0" onclick="collapse_all();" href="javascript:;">Collapse all sections</button>
|
|
|
|
<a id="Command_overview"></a><h1>Command overview</h1>
|
|
<button style="font-weight:bold;color:maroon;border:0" padding=0 onclick="toggle_by_name('section_toggle_overview');" href="javascript:;">Toggle section visibility</button>
|
|
<div id="section_toggle_overview" style="display: block">
|
|
|
|
<p>
|
|
Whereas the Unix toolkit is made of the separate executables <tt>cat</tt>, <tt>tail</tt>, <tt>cut</tt>,
|
|
<tt>sort</tt>, etc., Miller has subcommands, invoked as follows:
|
|
|
|
<p/>
|
|
<div class="pokipanel">
|
|
<pre>
|
|
mlr tac *.dat
|
|
mlr cut --complement -f os_version *.dat
|
|
mlr sort -f hostname,uptime *.dat
|
|
</pre>
|
|
</div>
|
|
<p/>
|
|
|
|
<p/>These fall into categories as follows:
|
|
|
|
<table border=1>
|
|
<tr class="mlrbg">
|
|
<th>Commands </th>
|
|
<th>Description</th>
|
|
</tr>
|
|
<tr>
|
|
<td>
|
|
<a href="reference-verbs.html#cat"><tt>cat</tt></a>,
|
|
<a href="reference-verbs.html#cut"><tt>cut</tt></a>,
|
|
<a href="reference-verbs.html#grep"><tt>grep</tt></a>,
|
|
<a href="reference-verbs.html#head"><tt>head</tt></a>,
|
|
<a href="reference-verbs.html#join"><tt>join</tt></a>,
|
|
<a href="reference-verbs.html#sort"><tt>sort</tt></a>,
|
|
<a href="reference-verbs.html#tac"><tt>tac</tt></a>,
|
|
<a href="reference-verbs.html#tail"><tt>tail</tt></a>,
|
|
<a href="reference-verbs.html#top"><tt>top</tt></a>,
|
|
<a href="reference-verbs.html#uniq"><tt>uniq</tt></a>
|
|
</td>
|
|
<td> Analogs of their Unix-toolkit namesakes, discussed below as well as in
|
|
<a href="feature-comparison.html">Miller features in the context of the Unix toolkit</a> </td>
|
|
</tr>
|
|
|
|
<tr>
|
|
<td>
|
|
<a href="reference-verbs.html#filter"><tt>filter</tt></a>,
|
|
<a href="reference-verbs.html#put"><tt>put</tt></a>,
|
|
<a href="reference-verbs.html#sec2gmt"><tt>sec2gmt</tt></a>,
|
|
<a href="reference-verbs.html#sec2gmtdate"><tt>sec2gmtdate</tt></a>,
|
|
<a href="reference-verbs.html#step"><tt>step</tt></a>,
|
|
<a href="reference-verbs.html#tee"><tt>tee</tt></a>
|
|
</td>
|
|
<td> <tt>awk</tt>-like functionality </td>
|
|
</tr>
|
|
|
|
<tr>
|
|
<td>
|
|
<a href="reference-verbs.html#bar"><tt>bar</tt></a>,
|
|
<a href="reference-verbs.html#bootstrap"><tt>bootstrap</tt></a>,
|
|
<a href="reference-verbs.html#decimate"><tt>decimate</tt></a>,
|
|
<a href="reference-verbs.html#histogram"><tt>histogram</tt></a>,
|
|
<a href="reference-verbs.html#least-frequent"><tt>least-frequent</tt></a>,
|
|
<a href="reference-verbs.html#most-frequent"><tt>most-frequent</tt></a>,
|
|
<a href="reference-verbs.html#sample"><tt>sample</tt></a>,
|
|
<a href="reference-verbs.html#shuffle"><tt>shuffle</tt></a>,
|
|
<a href="reference-verbs.html#stats1"><tt>stats1</tt></a>,
|
|
<a href="reference-verbs.html#stats2"><tt>stats2</tt></a>
|
|
</td>
|
|
<td> Statistically oriented </td>
|
|
</tr>
|
|
|
|
<tr>
|
|
<td>
|
|
<a href="reference-verbs.html#group-by"><tt>group-by</tt></a>,
|
|
<a href="reference-verbs.html#group-like"><tt>group-like</tt></a>,
|
|
<a href="reference-verbs.html#having-fields"><tt>having-fields</tt></a>
|
|
</td>
|
|
<td> Particularly oriented toward <a href="record-heterogeneity.html">Record-heterogeneity</a>, although
|
|
all Miller commands can handle heterogeneous records </td>
|
|
</tr>
|
|
|
|
<tr>
|
|
<td>
|
|
<a href="reference-verbs.html#check"><tt>check</tt></a>,
|
|
<a href="reference-verbs.html#count-distinct"><tt>count-distinct</tt></a>,
|
|
<a href="reference-verbs.html#label"><tt>label</tt></a>,
|
|
<a href="reference-verbs.html#merge-fields"><tt>merge-fields</tt></a>,
|
|
<a href="reference-verbs.html#nest"><tt>nest</tt></a>,
|
|
<a href="reference-verbs.html#nothing"><tt>nothing</tt></a>,
|
|
<a href="reference-verbs.html#regularize"><tt>regularize</tt></a>,
|
|
<a href="reference-verbs.html#rename"><tt>rename</tt></a>,
|
|
<a href="reference-verbs.html#reorder"><tt>reorder</tt></a>,
|
|
<a href="reference-verbs.html#reshape"><tt>reshape</tt></a>,
|
|
<a href="reference-verbs.html#seqgen"><tt>seqgen</tt></a>
|
|
</td>
|
|
<td> These draw from other sources (see also <a href="originality.html">How original is Miller?</a>):
|
|
<a href="reference-verbs.html#count-distinct"><tt>count-distinct</tt></a> is SQL-ish, and
|
|
<a href="reference-verbs.html#rename"><tt>rename</tt></a> can be done by <tt>sed</tt> (which does it faster:
|
|
see <a href="performance.html">Performance</a>).
|
|
</td>
|
|
</tr>
|
|
|
|
</table>
|
|
|
|
</div>
|
|
<!-- ================================================================ -->
|
|
<a id="I/O_options"></a><h1>I/O options</h1>
|
|
<button style="font-weight:bold;color:maroon;border:0" padding=0 onclick="toggle_by_name('section_toggle_io_options');" href="javascript:;">Toggle section visibility</button>
|
|
<div id="section_toggle_io_options" style="display: block">
|
|
|
|
<!-- ================================================================ -->
|
|
<a id="Formats"></a><h2>Formats</h2>
|
|
|
|
<p/> Options:
|
|
|
|
<pre>
|
|
--dkvp --idkvp --odkvp
|
|
--nidx --inidx --onidx
|
|
--csv --icsv --ocsv
|
|
--csvlite --icsvlite --ocsvlite
|
|
--pprint --ipprint --opprint --right
|
|
--xtab --ixtab --oxtab
|
|
--json --ijson --ojson
|
|
</pre>
|
|
|
|
<p/> These are as discussed in <a href="file-formats.html">File formats</a>, with the exception of <tt>--right</tt>
|
|
which makes pretty-printed output right-aligned:
|
|
|
|
<table><tr><td>
|
|
<p/>
|
|
<div class="pokipanel">
|
|
<pre>
|
|
$ mlr --opprint cat data/small
|
|
a b i x y
|
|
pan pan 1 0.3467901443380824 0.7268028627434533
|
|
eks pan 2 0.7586799647899636 0.5221511083334797
|
|
wye wye 3 0.20460330576630303 0.33831852551664776
|
|
eks wye 4 0.38139939387114097 0.13418874328430463
|
|
wye pan 5 0.5732889198020006 0.8636244699032729
|
|
</pre>
|
|
</div>
|
|
<p/>
|
|
</td><td>
|
|
<p/>
|
|
<div class="pokipanel">
|
|
<pre>
|
|
$ mlr --opprint --right cat data/small
|
|
a b i x y
|
|
pan pan 1 0.3467901443380824 0.7268028627434533
|
|
eks pan 2 0.7586799647899636 0.5221511083334797
|
|
wye wye 3 0.20460330576630303 0.33831852551664776
|
|
eks wye 4 0.38139939387114097 0.13418874328430463
|
|
wye pan 5 0.5732889198020006 0.8636244699032729
|
|
</pre>
|
|
</div>
|
|
<p/>
|
|
</td></tr></table>
|
|
|
|
<p/>Additional notes:
|
|
|
|
<ul>
|
|
|
|
<li/> Use <tt>--csv</tt>, <tt>--pprint</tt>, etc. when the input and output formats are the same.
|
|
|
|
<li/> Use <tt>--icsv --opprint</tt>, etc. when you want format conversion as part of what Miller does to your data.
|
|
|
|
<li/> DKVP (key-value-pair) format is the default for input and output. So,
|
|
<tt>--oxtab</tt> is the same as <tt>--idkvp --oxtab</tt>.
|
|
|
|
</ul>
|
|
|
|
<!-- ================================================================ -->
|
|
<a id="In-place_mode"></a><h2>In-place mode</h2>
|
|
|
|
<p/> Use the <tt>mlr -I</tt> flag to process files in-place. For example,
|
|
<tt>mlr -I --csv cut -x -f unwanted_column_name mydata/*.csv</tt> will remove
|
|
<tt>unwanted_column_name</tt> from all your <tt>*.csv</tt> files in your
|
|
<tt>mydata/</tt> subdirectory.
|
|
|
|
<p/> By default, Miller output goes to the screen (or you can redirect it to a file
|
|
using <tt>></tt> or to another process using <tt>|</tt>). With <tt>-I</tt>,
|
|
for each file name on the command line, output is written to a temporary file
|
|
in the same directory. Miller writes its output into that temp file, which is
|
|
then renamed over the original. Then, processing continues on the next file.
|
|
Each file is processed in isolation: if the output format is CSV, CSV headers
|
|
will be present in each output file; statistics are only over each file's own
|
|
records; and so on.
|
|
|
|
<p/> Please see <a href="10-min.html#Choices_for_printing_to_files">here</a>
|
|
for examples.
|
|
|
|
<!-- ================================================================ -->
|
|
<a id="Compression"></a><h2>Compression</h2>
|
|
|
|
<p/> Options:
|
|
|
|
<pre>
|
|
--prepipe {command}
|
|
</pre>
|
|
|
|
<p/>The prepipe command is anything which reads from standard input and produces data acceptable to
|
|
Miller. Nominally this allows you to use whichever decompression utilities you have installed on your
|
|
system, on a per-file basis. If the command has flags, quote them: e.g. <tt>mlr --prepipe 'zcat -cf'</tt>. Examples:
|
|
|
|
<p/>
|
|
<div class="pokipanel">
|
|
<pre>
|
|
# These two produce the same output:
|
|
$ gunzip < myfile1.csv.gz | mlr cut -f hostname,uptime
|
|
$ mlr --prepipe gunzip cut -f hostname,uptime myfile1.csv.gz
|
|
# With multiple input files you need --prepipe:
|
|
$ mlr --prepipe gunzip cut -f hostname,uptime myfile1.csv.gz myfile2.csv.gz
|
|
$ mlr --prepipe gunzip --idkvp --oxtab cut -f hostname,uptime myfile1.dat.gz myfile2.dat.gz
|
|
|
|
# Similar to the above, but with compressed output as well as input:
|
|
$ gunzip < myfile1.csv.gz | mlr cut -f hostname,uptime | gzip > outfile.csv.gz
|
|
$ mlr --prepipe gunzip cut -f hostname,uptime myfile1.csv.gz | gzip > outfile.csv.gz
|
|
$ mlr --prepipe gunzip cut -f hostname,uptime myfile1.csv.gz myfile2.csv.gz | gzip > outfile.csv.gz
|
|
|
|
# Similar to the above, but with different compression tools for input and output:
|
|
$ gunzip < myfile1.csv.gz | mlr cut -f hostname,uptime | xz -z > outfile.csv.xz
|
|
$ xz -cd < myfile1.csv.xz | mlr cut -f hostname,uptime | gzip > outfile.csv.gz
|
|
$ mlr --prepipe 'xz -cd' cut -f hostname,uptime myfile1.csv.xz myfile2.csv.xz | xz -z > outfile.csv.xz
|
|
|
|
... etc.
|
|
</pre>
|
|
</div>
|
|
|
|
<!-- ================================================================ -->
|
|
<a id="Record/field/pair_separators"></a><h2>Record/field/pair separators</h2>
|
|
|
|
<p/> Miller has record separators <tt>IRS</tt> and <tt>ORS</tt>, field
|
|
separators <tt>IFS</tt> and <tt>OFS</tt>, and pair separators <tt>IPS</tt> and
|
|
<tt>OPS</tt>. For example, in the DKVP line <tt>a=1,b=2,c=3</tt>, the record
|
|
separator is newline, field separator is comma, and pair separator is the
|
|
equals sign. These are the default values.
|
|
|
|
<p/> Options:
|
|
<pre>
|
|
--rs --irs --ors
|
|
--fs --ifs --ofs --repifs
|
|
--ps --ips --ops
|
|
</pre>
|
|
|
|
<ul>
|
|
|
|
<li/> You can change a separator from input to output via e.g. <tt>--ifs =
|
|
--ofs :</tt>. Or, you can specify that the same separator is to be used for
|
|
input and output via e.g. <tt>--fs :</tt>.
|
|
|
|
<li/> The pair separator is only relevant to DKVP format.
|
|
|
|
<li/> Pretty-print and xtab formats ignore the separator arguments altogether.
|
|
|
|
<li/> The <tt>--repifs</tt> means that multiple successive occurrences of the
|
|
field separator count as one. For example, in CSV data we often signify nulls
|
|
by empty strings, e.g. <tt>2,9,,,,,6,5,4</tt>. On the other hand, if the field
|
|
separator is a space, it might be more natural to parse <tt style="white-space:pre">2 4    5</tt> the
|
|
same as <tt>2 4 5</tt>: <tt>--repifs --ifs ' '</tt> lets this happen. In fact,
|
|
the <tt>--ipprint</tt> option above is internally implemented in terms of
|
|
<tt>--repifs</tt>.
|
|
|
|
<li/> Just write out the desired separator, e.g. <tt>--ofs '|'</tt>. But you
|
|
may use the symbolic names <tt>newline</tt>, <tt>space</tt>, <tt>tab</tt>,
|
|
<tt>pipe</tt>, or <tt>semicolon</tt> if you like.
|
|
|
|
</ul>
|
|
|
|
<!-- ================================================================ -->
|
|
<a id="Number_formatting"></a><h2>Number formatting</h2>
|
|
|
|
<p/> The command-line option <tt>--ofmt {format string}</tt> is the global
|
|
number format for commands which generate numeric output, e.g.
|
|
<tt>stats1</tt>, <tt>stats2</tt>, <tt>histogram</tt>, and <tt>step</tt>, as
|
|
well as <tt>mlr put</tt>. Examples:
|
|
|
|
<p/>
|
|
<div class="pokipanel">
|
|
<pre>
|
|
--ofmt %.9le --ofmt %.6lf --ofmt %.0lf
|
|
</pre>
|
|
</div>
|
|
<p/>
|
|
|
|
<p/> These are just C <tt>printf</tt> formats applied to double-precision
|
|
numbers. Please don’t use <tt>%s</tt> or <tt>%d</tt>. Additionally, if
|
|
you use leading width (e.g. <tt>%18.12lf</tt>) then the output will contain
|
|
embedded whitespace, which may not be what you want if you pipe the output to
|
|
something else, particularly CSV. I use Miller’s pretty-print format
|
|
(<tt>mlr --opprint</tt>) to column-align numerical data.
|
|
|
|
<p/> To apply formatting to a single field, overriding the global
|
|
<tt>ofmt</tt>, use the <tt>fmtnum</tt> function within <tt>mlr put</tt>. For example:
|
|
<p/>
|
|
<div class="pokipanel">
|
|
<pre>
|
|
$ echo 'x=3.1,y=4.3' | mlr put '$z=fmtnum($x*$y,"%08lf")'
|
|
x=3.1,y=4.3,z=13.330000
|
|
</pre>
|
|
</div>
|
|
<p/>
|
|
<p/>
|
|
<div class="pokipanel">
|
|
<pre>
|
|
$ echo 'x=0xffff,y=0xff' | mlr put '$z=fmtnum(int($x*$y),"%08llx")'
|
|
x=0xffff,y=0xff,z=00feff01
|
|
</pre>
|
|
</div>
|
|
<p/>
|
|
|
|
<p/>Input conversion from hexadecimal is done automatically on fields handled
|
|
by <tt>mlr put</tt> and <tt>mlr filter</tt> as long as the field value begins
|
|
with "0x". To apply output conversion to hexadecimal on a single column, you
|
|
may use <tt>fmtnum</tt>, or the keystroke-saving <tt>hexfmt</tt> function.
|
|
Example:
|
|
|
|
<p/>
|
|
<div class="pokipanel">
|
|
<pre>
|
|
$ echo 'x=0xffff,y=0xff' | mlr put '$z=hexfmt($x*$y)'
|
|
x=0xffff,y=0xff,z=0xfeff01
|
|
</pre>
|
|
</div>
|
|
<p/>
|
|
|
|
<!-- ================================================================ -->
|
|
</div>
|
|
<a id="Data_transformations_(verbs)"></a><h1>Data transformations (verbs)</h1>
|
|
<button style="font-weight:bold;color:maroon;border:0" padding=0 onclick="toggle_by_name('section_toggle_data_transformations');" href="javascript:;">Toggle section visibility</button>
|
|
<div id="section_toggle_data_transformations" style="display: block">
|
|
|
|
<p/> Please see <a href="reference-verbs.html">the separate page here</a>.
|
|
|
|
<!-- ================================================================ -->
|
|
</div>
|
|
<a id="Expression_language_for_filter_and_put"></a><h1>Expression language for filter and put</h1>
|
|
<button style="font-weight:bold;color:maroon;border:0" padding=0 onclick="toggle_by_name('section_toggle_dsl_ref');" href="javascript:;">Toggle section visibility</button>
|
|
<div id="section_toggle_dsl_ref" style="display: block">
|
|
|
|
<p/> Please see <a href="reference-dsl.html">the separate page here</a>.
|
|
|
|
<!-- ================================================================ -->
|
|
</div>
|
|
<a id="then-chaining"></a><h1>then-chaining</h1>
|
|
<button style="font-weight:bold;color:maroon;border:0" padding=0 onclick="toggle_by_name('section_toggle_then_chaining');" href="javascript:;">Toggle section visibility</button>
|
|
<div id="section_toggle_then_chaining" style="display: block">
|
|
|
|
<p/>
|
|
In accord with the
|
|
<a href="http://en.wikipedia.org/wiki/Unix_philosophy">Unix philosophy</a>, you can pipe data into or out of
|
|
Miller. For example:
|
|
|
|
<p/>
|
|
<div class="pokipanel">
|
|
<pre>
|
|
mlr cut --complement -f os_version *.dat | mlr sort -f hostname,uptime
|
|
</pre>
|
|
</div>
|
|
<p/>
|
|
|
|
<p/>
|
|
You can, if you like, instead simply chain commands together using the
|
|
<tt>then</tt> keyword:
|
|
|
|
<p/>
|
|
<div class="pokipanel">
|
|
<pre>
|
|
mlr cut --complement -f os_version then sort -f hostname,uptime *.dat
|
|
</pre>
|
|
</div>
|
|
<p/>
|
|
|
|
<p/>(You can precede the very first verb with <tt>then</tt>, if you like, for symmetry.)
|
|
|
|
Here’s a performance comparison:
|
|
|
|
<p/>
|
|
<div class="pokipanel">
|
|
<pre>
|
|
% cat piped.sh
|
|
mlr cut -x -f i,y data/big | mlr sort -n y > /dev/null
|
|
|
|
% time sh piped.sh
|
|
real 0m2.828s
|
|
user 0m3.183s
|
|
sys 0m0.137s
|
|
|
|
|
|
% cat chained.sh
|
|
mlr cut -x -f i,y then sort -n y data/big > /dev/null
|
|
|
|
% time sh chained.sh
|
|
real 0m2.082s
|
|
user 0m1.933s
|
|
sys 0m0.137s
|
|
</pre>
|
|
</div>
|
|
<p/>
|
|
|
|
There are two reasons to use then-chaining: one is for performance, although I
|
|
don’t expect this to be a win in all cases. Using then-chaining avoids
|
|
redundant string-parsing and string-formatting at each pipeline step: instead
|
|
input records are parsed once, they are fed through each pipeline stage in
|
|
memory, and then output records are formatted once. On the other hand, Miller
|
|
is single-threaded, while modern systems are usually multi-processor, and when
|
|
streaming-data programs operate through pipes, each one can use a CPU. Rest
|
|
assured you get the same results either way.
|
|
|
|
<p/>The other reason to use then-chaining is for simplicity: you don’t
|
|
have to re-type formatting flags (e.g. <tt>--csv --fs tab</tt>) at every
|
|
pipeline stage.
|
|
|
|
<!-- ================================================================ -->
|
|
</div>
|
|
<a id="Data_types"></a><h1>Data types</h1>
|
|
<button style="font-weight:bold;color:maroon;border:0" padding=0 onclick="toggle_by_name('section_toggle_data_types');" href="javascript:;">Toggle section visibility</button>
|
|
<div id="section_toggle_data_types" style="display: block">
|
|
|
|
<p/> Miller’s input and output are all string-oriented: there is (as of
|
|
August 2015 anyway) no support for binary record packing. In this sense,
|
|
everything is a string in and out of Miller. During processing, field names
|
|
are always strings, even if they have names like "3"; field values are usually
|
|
strings. Field values’ ability to be interpreted as a non-string type
|
|
only has meaning when comparison or function operations are done on them. And
|
|
it is an error condition if Miller encounters non-numeric (or otherwise
|
|
mistyped) data in a field in which it has been asked to do numeric (or
|
|
otherwise type-specific) operations.
|
|
|
|
<p/> Field values are treated as numeric for the following:
|
|
<ul>
|
|
<li/> Numeric sort: <tt>mlr sort -n</tt>, <tt>mlr sort -nr</tt>.
|
|
<li/> Statistics: <tt>mlr histogram</tt>, <tt>mlr stats1</tt>, <tt>mlr stats2</tt>.
|
|
<li/> Cross-record arithmetic: <tt>mlr step</tt>.
|
|
</ul>
|
|
|
|
<p/>For <tt>mlr put</tt> and <tt>mlr filter</tt>:
|
|
|
|
<ul>
|
|
|
|
<li/> Miller’s types for function processing are <b>empty-null</b> (empty
|
|
string), <b>absent-null</b> (reads of unset right-hand sides, or fall-through
|
|
non-explicit return values from user-defined functions), <b>error</b>,
|
|
<b>string</b>, <b>float</b> (double-precision), <b>int</b> (64-bit signed), and
|
|
<b>boolean</b>.
|
|
|
|
<li/> On input, string values representable as numbers, e.g. "3" or "3.1", are
|
|
treated as int or float, respectively. If a record has <tt>x=1,y=2</tt> then
|
|
<tt>mlr put '$z=$x+$y'</tt> will produce <tt>x=1,y=2,z=3</tt>, and <tt>mlr put
|
|
'$z=$x.$y'</tt> does not give an error simply because the dot operator has been
|
|
generalized to stringify non-strings. To coerce back to string for processing,
|
|
use the <tt>string</tt> function: <tt>mlr put '$z=string($x).string($y)'</tt>
|
|
will produce <tt>x=1,y=2,z=12</tt>.
|
|
|
|
<li/> On input, string values representable as boolean (e.g. <tt>"true"</tt>,
|
|
<tt>"false"</tt>) are <i>not</i> automatically treated as boolean. (This is
|
|
because <tt>"true"</tt> and <tt>"false"</tt> are ordinary words, and auto
|
|
string-to-boolean on a column consisting of words would result in some strings
|
|
mixed with some booleans.) Use the <tt>boolean</tt> function to coerce: e.g.
|
|
giving the record <tt>x=1,y=2,w=false</tt> to <tt>mlr put '$z=($x<$y) ||
|
|
boolean($w)'</tt>.
|
|
|
|
<li/> Functions take types as described in <tt>mlr --help-all-functions</tt>:
|
|
for example, <tt>log10</tt> takes float input and produces float output,
|
|
<tt>gmt2sec</tt> maps string to int, and <tt>sec2gmt</tt> maps int to string.
|
|
|
|
<li/> All math functions described in <tt>mlr --help-all-functions</tt> take
|
|
integer as well as float input.
|
|
|
|
</ul>
|
|
|
|
<!-- ================================================================ -->
|
|
</div>
|
|
<a id="Null_data:_empty_and_absent"></a><h1>Null data: empty and absent</h1>
|
|
<button style="font-weight:bold;color:maroon;border:0" padding=0 onclick="toggle_by_name('section_toggle_null_data');" href="javascript:;">Toggle section visibility</button>
|
|
<div id="section_toggle_null_data" style="display: block">
|
|
|
|
<p/> One of Miller’s key features is its support for <b>heterogeneous</b>
|
|
data. For example, take <tt>mlr sort</tt>: if you try to sort on field
|
|
<tt>hostname</tt> when not all records in the data stream <i>have</i> a field
|
|
named <tt>hostname</tt>, it is not an error (although you could pre-filter the
|
|
data stream using <tt>mlr having-fields --at-least hostname then sort
|
|
...</tt>). Rather, records lacking one or more sort keys are simply output
|
|
contiguously by <tt>mlr sort</tt>.
|
|
|
|
<p/> Miller has two kinds of null data:
|
|
|
|
<ul>
|
|
|
|
<li/> <b>Empty (key present, value empty)</b>: a field name is present in a
|
|
record (or in an out-of-stream variable) with empty value: e.g. <tt>x=,y=2</tt>
|
|
in the data input stream, or assignment <tt>$x=""</tt> or <tt>@x=""</tt> in
|
|
<tt>mlr put</tt>.
|
|
|
|
<li/> <b>Absent (key not present)</b>: a field name is not present, e.g. input
|
|
record is <tt>x=1,y=2</tt> and a <tt>put</tt> or <tt>filter</tt> expression
|
|
refers to <tt>$z</tt>. Or, reading an out-of-stream variable which hasn’t
|
|
been assigned a value yet, e.g. <tt>mlr put -q '@sum += $x; end{emit
|
|
@sum}'</tt> or <tt>mlr put -q '@sum[$a][$b] += $x; end{emit @sum, "a",
|
|
"b"}'</tt>.
|
|
|
|
</ul>
|
|
|
|
<p/>You can test these programmatically using the functions
|
|
<tt>is_empty</tt>/<tt>is_not_empty</tt>, <tt>is_absent</tt>/<tt>is_present</tt>, and
|
|
<tt>is_null</tt>/<tt>is_not_null</tt>. For the last pair, note that null means
|
|
either empty or absent.
|
|
|
|
<p/>
|
|
Rules for null-handling:
|
|
|
|
<ul>
|
|
|
|
<li/> Records with one or more empty sort-field values sort after records with
|
|
all sort-field values present (or before them, for descending numerical sort with <tt>sort -nr</tt>):
|
|
<p/>
|
|
<div class="pokipanel">
|
|
<pre>
|
|
$ mlr cat data/sort-null.dat
|
|
a=3,b=2
|
|
a=1,b=8
|
|
a=,b=4
|
|
x=9,b=10
|
|
a=5,b=7
|
|
</pre>
|
|
</div>
|
|
<p/>
|
|
<p/>
|
|
<div class="pokipanel">
|
|
<pre>
|
|
$ mlr sort -n a data/sort-null.dat
|
|
a=1,b=8
|
|
a=3,b=2
|
|
a=5,b=7
|
|
a=,b=4
|
|
x=9,b=10
|
|
</pre>
|
|
</div>
|
|
<p/>
|
|
<p/>
|
|
<div class="pokipanel">
|
|
<pre>
|
|
$ mlr sort -nr a data/sort-null.dat
|
|
a=,b=4
|
|
a=5,b=7
|
|
a=3,b=2
|
|
a=1,b=8
|
|
x=9,b=10
|
|
</pre>
|
|
</div>
|
|
<p/>
|
|
|
|
<li/> Functions/operators which have one or more <i>empty</i> arguments produce empty output: e.g.
|
|
<p/>
|
|
<div class="pokipanel">
|
|
<pre>
|
|
$ echo 'x=2,y=3' | mlr put '$a=$x+$y'
|
|
x=2,y=3,a=5
|
|
</pre>
|
|
</div>
|
|
<p/>
|
|
<p/>
|
|
<div class="pokipanel">
|
|
<pre>
|
|
$ echo 'x=,y=3' | mlr put '$a=$x+$y'
|
|
x=,y=3,a=
|
|
</pre>
|
|
</div>
|
|
<p/>
|
|
<p/>
|
|
<div class="pokipanel">
|
|
<pre>
|
|
$ echo 'x=,y=3' | mlr put '$a=log($x);$b=log($y)'
|
|
x=,y=3,a=,b=1.098612
|
|
</pre>
|
|
</div>
|
|
<p/>
|
|
|
|
with the exception that the <tt>min</tt> and <tt>max</tt> functions are
|
|
special: if one argument is non-null, it wins:
|
|
<p/>
|
|
<div class="pokipanel">
|
|
<pre>
|
|
$ echo 'x=,y=3' | mlr put '$a=min($x,$y);$b=max($x,$y)'
|
|
x=,y=3,a=3,b=3
|
|
</pre>
|
|
</div>
|
|
<p/>
|
|
|
|
<li/> Functions of <i>absent</i> variables (e.g. <tt>mlr put '$y =
|
|
log10($nonesuch)'</tt>) evaluate to absent, and arithmetic/bitwise/boolean
|
|
operators with both operands being absent evaluate to absent.
|
|
Arithmetic operators with one absent operand return the other operand.
|
|
More specifically, absent values act like zero for addition/subtraction, and
|
|
one for multiplication. Furthermore, <b>any expression which evaluates to
|
|
absent is not stored in the left-hand side of an assignment statement </b>:
|
|
|
|
<p/>
|
|
<div class="pokipanel">
|
|
<pre>
|
|
$ echo 'x=2,y=3' | mlr put '$a=$u+$v; $b=$u+$y; $c=$x+$y'
|
|
x=2,y=3,b=3,c=5
|
|
</pre>
|
|
</div>
|
|
<p/>
|
|
<p/>
|
|
<div class="pokipanel">
|
|
<pre>
|
|
$ echo 'x=2,y=3' | mlr put '$a=min($x,$v);$b=max($u,$y);$c=min($u,$v)'
|
|
x=2,y=3,a=2,b=3
|
|
</pre>
|
|
</div>
|
|
<p/>
|
|
|
|
<li/> Likewise, for assignment to maps, <b>absent-valued keys or values result
|
|
in a skipped assignment</b>.
|
|
|
|
</ul>
|
|
|
|
The reasoning is as follows:
|
|
|
|
<ul>
|
|
|
|
<li/> Empty values are explicit in the data so they should explicitly affect accumulations:
|
|
<tt>mlr put '@sum += $x'</tt>
|
|
should accumulate numeric <tt>x</tt> values into the sum but an empty
|
|
<tt>x</tt>, when encountered in the input data stream, should make the sum
|
|
non-numeric. To work around this you can use the
|
|
<tt>is_not_null</tt> function as follows:
|
|
<tt>mlr put 'is_not_null($x) { @sum += $x }'</tt>
|
|
|
|
<li/> Absent stream-record values should not break accumulations, since Miller
|
|
by design handles heterogeneous data: the running <tt>@sum</tt> in
|
|
<tt>mlr put '@sum += $x'</tt>
|
|
should not be invalidated for records which have no <tt>x</tt>.
|
|
|
|
<li/> Absent out-of-stream-variable values are precisely what allow you to write
|
|
<tt>mlr put '@sum += $x'</tt>. Otherwise you would have to write
|
|
<tt>mlr put 'begin{@sum = 0}; @sum += $x'</tt> —
|
|
which is tolerable — but for
|
|
<tt>mlr put 'begin{...}; @sum[$a][$b] += $x'</tt>
|
|
you’d have to pre-initialize <tt>@sum</tt> for all values of <tt>$a</tt> and <tt>$b</tt> in your
|
|
input data stream, which is intolerable.
|
|
|
|
<li/> The penalty for the absent feature is that misspelled variables can be hard to find:
|
|
e.g. in <tt>mlr put 'begin{@sumx = 10}; ...; update @sumx somehow per-record; ...; end {@something = @sum * 2}'</tt>
|
|
the accumulator is spelt <tt>@sumx</tt> in the begin-block but <tt>@sum</tt> in the end-block, where since it
|
|
is absent, <tt>@sum*2</tt> evaluates to 2. See also the section on
|
|
<a href="reference-dsl.html#Errors_and_transparency">errors and transparency</a>.
|
|
|
|
</ul>
|
|
|
|
<p/>Since absent plus absent is absent (and likewise for other operators),
|
|
accumulations such as <tt>@sum += $x</tt> work correctly on heterogeneous data,
|
|
as do within-record formulas if both operands are absent. If one operand is
|
|
present, you may get behavior you don’t desire. To work around this
|
|
— namely, to set an output field only for records which have all the
|
|
inputs present — you can use a pattern-action block with
|
|
<tt>is_present</tt>:
|
|
|
|
<p/>
|
|
<div class="pokipanel">
|
|
<pre>
|
|
$ mlr cat data/het.dkvp
|
|
resource=/path/to/file,loadsec=0.45,ok=true
|
|
record_count=100,resource=/path/to/file
|
|
resource=/path/to/second/file,loadsec=0.32,ok=true
|
|
record_count=150,resource=/path/to/second/file
|
|
resource=/some/other/path,loadsec=0.97,ok=false
|
|
</pre>
|
|
</div>
|
|
<p/>
|
|
<p/>
|
|
<div class="pokipanel">
|
|
<pre>
|
|
$ mlr put 'is_present($loadsec) { $loadmillis = $loadsec * 1000 }' data/het.dkvp
|
|
resource=/path/to/file,loadsec=0.45,ok=true,loadmillis=450.000000
|
|
record_count=100,resource=/path/to/file
|
|
resource=/path/to/second/file,loadsec=0.32,ok=true,loadmillis=320.000000
|
|
record_count=150,resource=/path/to/second/file
|
|
resource=/some/other/path,loadsec=0.97,ok=false,loadmillis=970.000000
|
|
</pre>
|
|
</div>
|
|
<p/>
|
|
<p/>
|
|
<div class="pokipanel">
|
|
<pre>
|
|
$ mlr put '$loadmillis = (is_present($loadsec) ? $loadsec : 0.0) * 1000' data/het.dkvp
|
|
resource=/path/to/file,loadsec=0.45,ok=true,loadmillis=450.000000
|
|
record_count=100,resource=/path/to/file,loadmillis=0.000000
|
|
resource=/path/to/second/file,loadsec=0.32,ok=true,loadmillis=320.000000
|
|
record_count=150,resource=/path/to/second/file,loadmillis=0.000000
|
|
resource=/some/other/path,loadsec=0.97,ok=false,loadmillis=970.000000
|
|
</pre>
|
|
</div>
|
|
<p/>
|
|
|
|
<p/> If you’re interested in a formal description of how empty and absent
|
|
fields participate in arithmetic, here’s a table for plus (other
|
|
arithmetic/boolean/bitwise operators are similar):
|
|
|
|
<p/>
|
|
<div class="pokipanel">
|
|
<pre>
|
|
$ mlr --print-type-arithmetic-info
|
|
(+) | error absent empty string int float bool
|
|
------ + ------ ------ ------ ------ ------ ------ ------
|
|
error | error error error error error error error
|
|
absent | error absent absent error int float error
|
|
empty | error absent empty error empty empty error
|
|
string | error error error error error error error
|
|
int | error int empty error int float error
|
|
float | error float empty error float float error
|
|
bool | error error error error error error error
|
|
</pre>
|
|
</div>
|
|
<p/>
|
|
|
|
<!-- ================================================================ -->
|
|
</div>
|
|
<a id="String_literals"/><h1>String literals</h1>
|
|
<button style="font-weight:bold;color:maroon;border:0" padding=0 onclick="toggle_by_name('section_toggle_string_literals');" href="javascript:;">Toggle section visibility</button>
|
|
<div id="section_toggle_string_literals" style="display: block">
|
|
|
|
<p/>
|
|
You can use the following backslash escapes within string literals (that is, between the double quotes) in contexts such as
|
|
<tt>mlr filter '$name =~ "..."'</tt>,
|
|
<tt>mlr put '$name = $othername . "..."'</tt>,
|
|
<tt>mlr put '$name = sub($name, "...", "...")'</tt>, etc.:
|
|
|
|
<ul>
|
|
<li/> <tt>\a</tt>: ASCII code 0x07 (alarm/bell)
|
|
<li/> <tt>\b</tt>: ASCII code 0x08 (backspace)
|
|
<li/> <tt>\f</tt>: ASCII code 0x0c (formfeed)
|
|
<li/> <tt>\n</tt>: ASCII code 0x0a (LF/linefeed/newline)
|
|
<li/> <tt>\r</tt>: ASCII code 0x0d (CR/carriage return)
|
|
<li/> <tt>\t</tt>: ASCII code 0x09 (tab)
|
|
<li/> <tt>\v</tt>: ASCII code 0x0b (vertical tab)
|
|
<li/> <tt>\\</tt>: backslash
|
|
<li/> <tt>\"</tt>: double quote
|
|
<li/> <tt>\123</tt>: Octal 123, etc. for <tt>\000</tt> up to <tt>\377</tt>
|
|
<li/> <tt>\x7f</tt>: Hexadecimal 7f, etc. for <tt>\x00</tt> up to <tt>\xff</tt>
|
|
</ul>
|
|
|
|
<p/>See also <a href="https://en.wikipedia.org/wiki/Escape_sequences_in_C">https://en.wikipedia.org/wiki/Escape_sequences_in_C</a>.
|
|
|
|
<p/>These replacements apply only to strings you key in for the DSL expressions for <tt>filter</tt> and <tt>put</tt>:
|
|
that is, if you type <tt>\t</tt> in a string literal for a <tt>filter</tt>/<tt>put</tt> expression, it will be turned into a tab character. If you want a backslash followed by a <tt>t</tt>, then please type <tt>\\t</tt>.
|
|
|
|
<p/>However, these replacements are not done automatically within your data stream. If you wish to make these
|
|
replacements, you can do, for example, for a field named <tt>field</tt>, <tt> mlr put '$field = gsub($field, "\\t",
|
|
"\t")'</tt>. If you need to make such a replacement for all fields in your data, you should probably simply use the
|
|
system <tt>sed</tt> command.
|
|
|
|
</div>
|
|
<a id="Regular_expressions"/><h1>Regular expressions</h1>
|
|
<button style="font-weight:bold;color:maroon;border:0" padding=0 onclick="toggle_by_name('section_toggle_regular_expressions');" href="javascript:;">Toggle section visibility</button>
|
|
<div id="section_toggle_regular_expressions" style="display: block">
|
|
|
|
<p/>Miller lets you use regular expressions (of type POSIX.2) in the following contexts:
|
|
|
|
<ul>
|
|
|
|
<li/> In <tt>mlr filter</tt> with <tt>=~</tt> or <tt>!=~</tt>, e.g. <tt>mlr
|
|
filter '$url =~ "http.*com"'</tt>
|
|
|
|
<li/> In <tt>mlr put</tt> with <tt>sub</tt> or <tt>gsub</tt>, e.g. <tt>mlr put
|
|
'$url = sub($url, "http.*com", "")'</tt>
|
|
|
|
<li/> In <tt>mlr having-fields</tt>, e.g. <tt>mlr having-fields
|
|
--any-matching '^sda[0-9]'</tt>
|
|
|
|
<li/> In <tt>mlr cut</tt>, e.g. <tt>mlr cut -r -f '^status$,^sda[0-9]'</tt>
|
|
|
|
<li/> In <tt>mlr rename</tt>, e.g. <tt>mlr rename -r '^(sda[0-9]).*$,dev/\1'</tt>
|
|
|
|
<li/> In <tt>mlr grep</tt>, e.g. <tt>mlr --csv grep 00188555487 myfiles*.csv</tt>
|
|
|
|
</ul>
|
|
|
|
<p/>Points demonstrated by the above examples:
|
|
|
|
<ul>
|
|
|
|
<li/> There are no implicit start-of-string or end-of-string anchors; please
|
|
use <tt>^</tt> and/or <tt>$</tt> explicitly.
|
|
|
|
<li/> Miller regexes are wrapped with double quotes rather than slashes.
|
|
|
|
<li/> An <tt>i</tt> after the ending double quote (e.g. <tt>"http.*com"i</tt>) indicates a case-insensitive
|
|
regex.
|
|
|
|
<li/> Capture groups are wrapped with <tt>(...)</tt> rather than
|
|
<tt>\(...\)</tt>; use <tt>\(</tt> and <tt>\)</tt> to match against parentheses.
|
|
|
|
</ul>
|
|
|
|
<p/>For <tt>filter</tt> and <tt>put</tt>, if the regular expression is a string
|
|
literal (the normal case), it is precompiled at process start and reused
|
|
thereafter, which is efficient. If the regular expression is a more complex
|
|
expression, including string concatenation using <tt>.</tt>, or a column name
|
|
(in which case you can take regular expressions from input data!), then regexes
|
|
are compiled on each record which works but is less efficient. As well, in this
|
|
case there is no way to specify case-insensitive matching.
|
|
|
|
<p/>Example:
|
|
|
|
<p/>
|
|
<div class="pokipanel">
|
|
<pre>
|
|
$ cat data/regex-in-data.dat
|
|
name=jane,regex=^j.*e$
|
|
name=bill,regex=^b[ou]ll$
|
|
name=bull,regex=^b[ou]ll$
|
|
</pre>
|
|
</div>
|
|
<p/>
|
|
<p/>
|
|
<div class="pokipanel">
|
|
<pre>
|
|
$ mlr filter '$name =~ $regex' data/regex-in-data.dat
|
|
name=jane,regex=^j.*e$
|
|
name=bull,regex=^b[ou]ll$
|
|
</pre>
|
|
</div>
|
|
<p/>
|
|
|
|
<a id="Regex_captures"/><h2>Regex captures</h2>
|
|
|
|
<p/>Regex captures of the form <tt>\0</tt> through <tt>\9</tt> are supported as
|
|
follows: <ul>
|
|
|
|
<li/> Captures have in-function context for <tt>sub</tt> and <tt>gsub</tt>.
|
|
For example, the first <tt>\1,\2</tt> pair belongs to the first <tt>sub</tt> and
|
|
the second <tt>\1,\2</tt> pair belongs to the second <tt>sub</tt>:
|
|
|
|
<p/>
|
|
<div class=pokipanel>
|
|
<pre>
|
|
mlr put '$b = sub($a, "(..)_(...)", "\2-\1"); $c = sub($a, "(..)_(.)(..)", ":\1:\2:\3")'
|
|
</pre>
|
|
</div>
|
|
|
|
<li/> Captures endure for the entirety of a <tt>put</tt> for the <tt>=~</tt>
|
|
and <tt>!=~</tt> operators. For example, here the <tt>\1,\2</tt> are set by the
|
|
<tt>=~</tt> operator and are used by both subsequent assignment statements:
|
|
|
|
<p/>
|
|
<div class=pokipanel>
|
|
<pre>
|
|
mlr put '$a =~ "(..)_(....)"; $b = "left_\1"; $c = "right_\2"'
|
|
</pre>
|
|
</div>
|
|
|
|
<li/>The captures are not retained across multiple puts. For example, here the
|
|
<tt>\1,\2</tt> won’t be expanded from the regex capture:
|
|
|
|
<p/>
|
|
<div class=pokipanel>
|
|
<pre>
|
|
mlr put '$a =~ "(..)_(....)"' then {... something else ...} then put '$b = "left_\1"; $c = "right_\2"'
|
|
</pre>
|
|
</div>
|
|
|
|
<li/> Captures are ignored in <tt>filter</tt> for the <tt>=~</tt> and
|
|
<tt>!=~</tt> operators. For example, there is no mechanism provided to refer to
|
|
the first <tt>(..)</tt> as <tt>\1</tt> or to the second <tt>(....)</tt> as
|
|
<tt>\2</tt> in the following filter statement:
|
|
|
|
<p/>
|
|
<div class=pokipanel>
|
|
<pre>
|
|
mlr filter '$a =~ "(..)_(....)"'
|
|
</pre>
|
|
</div>
|
|
|
|
<li/> Up to nine matches are supported: <tt>\1</tt> through <tt>\9</tt>, while
|
|
<tt>\0</tt> is the entire match string; <tt>\15</tt> is treated as <tt>\1</tt>
|
|
followed by an unrelated <tt>5</tt>.
|
|
</ul>
|
|
|
|
<!-- ================================================================ -->
|
|
</div>
|
|
<a id="Arithmetic"/><h1>Arithmetic</h1>
|
|
<button style="font-weight:bold;color:maroon;border:0" padding=0 onclick="toggle_by_name('section_toggle_arithmetic');" href="javascript:;">Toggle section visibility</button>
|
|
<div id="section_toggle_arithmetic" style="display: block">
|
|
|
|
<a id="Input_scanning"/><h2>Input scanning</h2>
|
|
|
|
<p/>Numbers in Miller are double-precision float or 64-bit signed integers.
|
|
Anything scannable as int, e.g. <tt>123</tt> or <tt>0xabcd</tt>, is treated as
|
|
an integer; otherwise, input scannable as float (<tt>4.56</tt> or <tt>8e9</tt>)
|
|
is treated as float; everything else is a string.
|
|
|
|
<p/>If you want all numbers to be treated as floats, then you may use
|
|
<tt>float()</tt> in your filter/put expressions (e.g. replacing <tt>$c = $a *
|
|
$b</tt> with <tt>$c = float($a) * float($b)</tt>) — or, more simply, use
|
|
<tt>mlr filter -F</tt> and <tt>mlr put -F</tt> which forces all numeric input,
|
|
whether from expression literals or field values, to float. Likewise <tt>mlr
|
|
stats1 -F</tt> and <tt>mlr step -F</tt> force integerable accumulators (such as
|
|
<tt>count</tt>) to be done in floating-point.
|
|
|
|
<a id="Conversion_by_math_routines"/><h2>Conversion by math routines</h2>
|
|
|
|
<p/>For most math functions, integers are cast to float on input, and produce
|
|
float output: e.g. <tt>exp(0) = 1.0</tt> rather than <tt>1</tt>. The
|
|
following, however, produce integer output if their inputs are integers:
|
|
<tt>+</tt> <tt>-</tt> <tt>*</tt> <tt>/</tt> <tt>//</tt> <tt>%</tt> <tt>abs</tt>
|
|
<tt>ceil</tt> <tt>floor</tt> <tt>max</tt> <tt>min</tt> <tt>round</tt>
|
|
<tt>roundm</tt> <tt>sgn</tt>. As well, <tt>stats1 -a min</tt>, <tt>stats1 -a
|
|
max</tt>, <tt>stats1 -a sum</tt>, <tt>step -a delta</tt>, and <tt>step -a
|
|
rsum</tt> produce integer output if their inputs are integers.
|
|
|
|
<a id="Conversion_by_arithmetic_operators"/><h2>Conversion by arithmetic operators</h2>
|
|
|
|
<p/>The sum, difference, and product of integers is again integer, except for
|
|
when that would overflow a 64-bit integer at which point Miller converts the
|
|
result to float.
|
|
|
|
<p/>The short of it is that Miller does this transparently for you so you
|
|
needn’t think about it.
|
|
|
|
<p/>Implementation details of this, for the interested: integer adds and
|
|
subtracts overflow by at most one bit so it suffices to check sign-changes.
|
|
Thus, Miller allows you to add and subtract arbitrary 64-bit signed integers,
|
|
converting only to float precisely when the result is less than -2<sup>63</sup>
|
|
or greater than 2<sup>63</sup>-1. Multiplies, on the other hand, can overflow
|
|
by a word size and a sign-change technique does not suffice to detect overflow.
|
|
Instead Miller tests whether the floating-point product exceeds the
|
|
representable integer range. Now, 64-bit integers have 64-bit precision while
|
|
IEEE-doubles have only 52-bit mantissas — so, there are 53 bits including
|
|
implicit leading one. The following experiment explicitly demonstrates the
|
|
resolution at this range:
|
|
|
|
<div class=pokipanel>
|
|
<pre>
|
|
64-bit integer 64-bit integer Casted to double Back to 64-bit
|
|
in hex in decimal integer
|
|
0x7ffffffffffff9ff 9223372036854774271 9223372036854773760.000000 0x7ffffffffffff800
|
|
0x7ffffffffffffa00 9223372036854774272 9223372036854773760.000000 0x7ffffffffffff800
|
|
0x7ffffffffffffbff 9223372036854774783 9223372036854774784.000000 0x7ffffffffffffc00
|
|
0x7ffffffffffffc00 9223372036854774784 9223372036854774784.000000 0x7ffffffffffffc00
|
|
0x7ffffffffffffdff 9223372036854775295 9223372036854774784.000000 0x7ffffffffffffc00
|
|
0x7ffffffffffffe00 9223372036854775296 9223372036854775808.000000 0x8000000000000000
|
|
0x7ffffffffffffffe 9223372036854775806 9223372036854775808.000000 0x8000000000000000
|
|
0x7fffffffffffffff 9223372036854775807 9223372036854775808.000000 0x8000000000000000
|
|
</pre>
|
|
</div>
|
|
|
|
<p/>That is, one cannot check an integer product to see if it is precisely
|
|
greater than 2<sup>63</sup>-1 or less than -2<sup>63</sup> using either integer
|
|
arithmetic (it may have already overflowed) or using double-precision (due to
|
|
granularity). Instead Miller checks for overflow in 64-bit integer
|
|
multiplication by seeing whether the absolute value of the double-precision
|
|
product exceeds the largest representable IEEE double less than 2<sup>63</sup>,
|
|
which we see from the listing above is 9223372036854774784. (An alternative
|
|
would be to do all integer multiplies using handcrafted multi-word 128-bit
|
|
arithmetic. This approach is not taken.)
|
|
|
|
<a id="Pythonic_division"/><h2>Pythonic division</h2>
|
|
|
|
<p/>Division and remainder are
|
|
<a href="http://python-history.blogspot.com/2010/08/why-pythons-integer-division-floors.html">
|
|
pythonic</a>:
|
|
<ul>
|
|
<li/> Quotient of integers is floating-point: <tt>7/2</tt> is <tt>3.5</tt>.
|
|
<li/> Integer division is done with <tt>//</tt>: <tt>7//2</tt> is <tt>3</tt>.
|
|
This rounds toward the negative.
|
|
<li/> Remainders are non-negative.
|
|
</ul>
|
|
|
|
</div>
|
|
<!-- ================================================================ -->
|
|
<a id="On-line_help"/><h1>On-line help</h1>
|
|
<button style="font-weight:bold;color:maroon;border:0" padding=0 onclick="toggle_by_name('section_toggle_online_help');" href="javascript:;">Toggle section visibility</button>
|
|
<div id="section_toggle_online_help" style="display: block">
|
|
|
|
<p/>Examples:<p/>
|
|
|
|
<p/>
|
|
<div class="pokipanel">
|
|
<pre>
|
|
$ mlr --help
|
|
Usage: mlr [I/O options] {verb} [verb-dependent options ...] {zero or more file names}
|
|
|
|
Command-line-syntax examples:
|
|
mlr --csv cut -f hostname,uptime mydata.csv
|
|
mlr --tsv --rs lf filter '$status != "down" && $upsec >= 10000' *.tsv
|
|
mlr --nidx put '$sum = $7 < 0.0 ? 3.5 : $7 + 2.1*$8' *.dat
|
|
grep -v '^#' /etc/group | mlr --ifs : --nidx --opprint label group,pass,gid,member then sort -f group
|
|
mlr join -j account_id -f accounts.dat then group-by account_name balances.dat
|
|
mlr --json put '$attr = sub($attr, "([0-9]+)_([0-9]+)_.*", "\1:\2")' data/*.json
|
|
mlr stats1 -a min,mean,max,p10,p50,p90 -f flag,u,v data/*
|
|
mlr stats2 -a linreg-pca -f u,v -g shape data/*
|
|
mlr put -q '@sum[$a][$b] += $x; end {emit @sum, "a", "b"}' data/*
|
|
mlr --from estimates.tbl put '
|
|
for (k,v in $*) {
|
|
if (is_numeric(v) && k =~ "^[t-z].*$") {
|
|
$sum += v; $count += 1
|
|
}
|
|
}
|
|
$mean = $sum / $count # no assignment if count unset'
|
|
mlr --from infile.dat put -f analyze.mlr
|
|
mlr --from infile.dat put 'tee > "./taps/data-".$a."-".$b, $*'
|
|
mlr --from infile.dat put 'tee | "gzip > ./taps/data-".$a."-".$b.".gz", $*'
|
|
mlr --from infile.dat put -q '@v=$*; dump | "jq .[]"'
|
|
mlr --from infile.dat put '(NR % 1000 == 0) { print > stderr, "Checkpoint ".NR}'
|
|
|
|
Data-format examples:
|
|
DKVP: delimited key-value pairs (Miller default format)
|
|
+---------------------+
|
|
| apple=1,bat=2,cog=3 | Record 1: "apple" => "1", "bat" => "2", "cog" => "3"
|
|
| dish=7,egg=8,flint | Record 2: "dish" => "7", "egg" => "8", "3" => "flint"
|
|
+---------------------+
|
|
|
|
NIDX: implicitly numerically indexed (Unix-toolkit style)
|
|
+---------------------+
|
|
| the quick brown | Record 1: "1" => "the", "2" => "quick", "3" => "brown"
|
|
| fox jumped | Record 2: "1" => "fox", "2" => "jumped"
|
|
+---------------------+
|
|
|
|
CSV/CSV-lite: comma-separated values with separate header line
|
|
+---------------------+
|
|
| apple,bat,cog |
|
|
| 1,2,3 | Record 1: "apple => "1", "bat" => "2", "cog" => "3"
|
|
| 4,5,6 | Record 2: "apple" => "4", "bat" => "5", "cog" => "6"
|
|
+---------------------+
|
|
|
|
Tabular JSON: nested objects are supported, although arrays within them are not:
|
|
+---------------------+
|
|
| { |
|
|
| "apple": 1, | Record 1: "apple" => "1", "bat" => "2", "cog" => "3"
|
|
| "bat": 2, |
|
|
| "cog": 3 |
|
|
| } |
|
|
| { |
|
|
| "dish": { | Record 2: "dish:egg" => "7", "dish:flint" => "8", "garlic" => ""
|
|
| "egg": 7, |
|
|
| "flint": 8 |
|
|
| }, |
|
|
| "garlic": "" |
|
|
| } |
|
|
+---------------------+
|
|
|
|
PPRINT: pretty-printed tabular
|
|
+---------------------+
|
|
| apple bat cog |
|
|
| 1 2 3 | Record 1: "apple => "1", "bat" => "2", "cog" => "3"
|
|
| 4 5 6 | Record 2: "apple" => "4", "bat" => "5", "cog" => "6"
|
|
+---------------------+
|
|
|
|
XTAB: pretty-printed transposed tabular
|
|
+---------------------+
|
|
| apple 1 | Record 1: "apple" => "1", "bat" => "2", "cog" => "3"
|
|
| bat 2 |
|
|
| cog 3 |
|
|
| |
|
|
| dish 7 | Record 2: "dish" => "7", "egg" => "8"
|
|
| egg 8 |
|
|
+---------------------+
|
|
|
|
Markdown tabular (supported for output only):
|
|
+-----------------------+
|
|
| | apple | bat | cog | |
|
|
| | --- | --- | --- | |
|
|
| | 1 | 2 | 3 | | Record 1: "apple => "1", "bat" => "2", "cog" => "3"
|
|
| | 4 | 5 | 6 | | Record 2: "apple" => "4", "bat" => "5", "cog" => "6"
|
|
+-----------------------+
|
|
|
|
Help options:
|
|
-h or --help Show this message.
|
|
--version Show the software version.
|
|
{verb name} --help Show verb-specific help.
|
|
--help-all-verbs Show help on all verbs.
|
|
-l or --list-all-verbs List only verb names.
|
|
-L List only verb names, one per line.
|
|
-f or --help-all-functions Show help on all built-in functions.
|
|
-F Show a bare listing of built-in functions by name.
|
|
-k or --help-all-keywords Show help on all keywords.
|
|
-K Show a bare listing of keywords by name.
|
|
|
|
Verbs:
|
|
bar bootstrap cat check count-distinct cut decimate filter grep group-by
|
|
group-like having-fields head histogram join label least-frequent
|
|
merge-fields most-frequent nest nothing put regularize rename reorder repeat
|
|
reshape sample sec2gmt sec2gmtdate seqgen shuffle sort stats1 stats2 step
|
|
tac tail tee top uniq unsparsify
|
|
|
|
Functions for the filter and put verbs:
|
|
+ + - - * / // % ** | ^ & ~ << >> == != =~ !=~ > >= < <= && || ^^ ! ? : .
|
|
gsub strlen sub substr tolower toupper abs acos acosh asin asinh atan atan2
|
|
atanh cbrt ceil cos cosh erf erfc exp expm1 floor invqnorm log log10 log1p
|
|
logifit madd max mexp min mmul msub pow qnorm round roundm sgn sin sinh sqrt
|
|
tan tanh urand urand32 urandint dhms2fsec dhms2sec fsec2dhms fsec2hms
|
|
gmt2sec hms2fsec hms2sec sec2dhms sec2gmt sec2gmtdate sec2hms strftime
|
|
strptime systime is_absent is_bool is_boolean is_empty is_empty_map is_float
|
|
is_int is_map is_nonempty_map is_not_empty is_not_map is_not_null is_null
|
|
is_numeric is_present is_string asserting_absent asserting_bool
|
|
asserting_boolean asserting_empty asserting_empty_map asserting_float
|
|
asserting_int asserting_map asserting_nonempty_map asserting_not_empty
|
|
asserting_not_map asserting_not_null asserting_null asserting_numeric
|
|
asserting_present asserting_string boolean float fmtnum hexfmt int string
|
|
typeof depth haskey joink joinkv joinv leafcount length mapdiff mapsum
|
|
splitkv splitkvx splitnv splitnvx
|
|
|
|
Please use "mlr --help-function {function name}" for function-specific help.
|
|
|
|
Data-format options, for input, output, or both:
|
|
--idkvp --odkvp --dkvp Delimited key-value pairs, e.g "a=1,b=2"
|
|
(this is Miller's default format).
|
|
|
|
--inidx --onidx --nidx Implicitly-integer-indexed fields
|
|
(Unix-toolkit style).
|
|
|
|
--icsv --ocsv --csv Comma-separated value (or tab-separated
|
|
with --fs tab, etc.)
|
|
|
|
--itsv --otsv --tsv Keystroke-savers for "--icsv --ifs tab",
|
|
"--ocsv --ofs tab", "--csv --fs tab".
|
|
|
|
--ipprint --opprint --pprint Pretty-printed tabular (produces no
|
|
output until all input is in).
|
|
--right Right-justifies all fields for PPRINT output.
|
|
--barred Prints a border around PPRINT output
|
|
(only available for output).
|
|
|
|
--omd Markdown-tabular (only available for output).
|
|
|
|
--ixtab --oxtab --xtab Pretty-printed vertical-tabular.
|
|
--xvright Right-justifies values for XTAB format.
|
|
|
|
--ijson --ojson --json JSON tabular: sequence or list of one-level
|
|
maps: {...}{...} or [{...},{...}].
|
|
--json-skip-arrays-on-input JSON arrays are unmillerable, and by default they
|
|
cause a fatal error when read. With this option,
|
|
they are ignored. Please use the jq tool for full
|
|
JSON (pre)processing.
|
|
--jvstack Put one key-value pair per line for JSON
|
|
output.
|
|
--jlistwrap Wrap JSON output in outermost [ ].
|
|
--jknquoteint Do not quote non-string map keys in JSON output.
|
|
--jvquoteall Quote map values in JSON output, even if they're
|
|
numeric.
|
|
--jflatsep {string} Separator for flattening multi-level JSON keys,
|
|
e.g. '{"a":{"b":3}}' becomes a:b => 3 for
|
|
non-JSON formats. Defaults to :.
|
|
|
|
-p is a keystroke-saver for --nidx --fs space --repifs
|
|
|
|
Examples: --csv for CSV-formatted input and output; --idkvp --opprint for
|
|
DKVP-formatted input and pretty-printed output.
|
|
|
|
Format-conversion keystroke-saver options, for input, output, or both:
|
|
As keystroke-savers for format-conversion you may use the following:
|
|
--c2t --c2d --c2n --c2j --c2x --c2p --c2m
|
|
--t2c --t2d --t2n --t2j --t2x --t2p --t2m
|
|
--d2c --d2t --d2n --d2j --d2x --d2p --d2m
|
|
--n2c --n2t --n2d --n2j --n2x --n2p --n2m
|
|
--j2c --j2t --j2d --j2n --j2x --j2p --j2m
|
|
--x2c --x2t --x2d --x2n --x2j --x2p --x2m
|
|
--p2c --p2t --p2d --p2n --p2j --p2x --p2m
|
|
The letters c t d n j x p m refer to formats CSV, TSV, DKVP, NIDX, JSON, XTAB,
|
|
PPRINT, and markdown, respectively. Note that markdown format is available for
|
|
output only.
|
|
|
|
Compressed-data options:
|
|
--prepipe {command} This allows Miller to handle compressed inputs. You can do
|
|
without this for single input files, e.g. "gunzip < myfile.csv.gz | mlr ...".
|
|
However, when multiple input files are present, between-file separations are
|
|
lost; also, the FILENAME variable doesn't iterate. Using --prepipe you can
|
|
specify an action to be taken on each input file. This pre-pipe command must
|
|
be able to read from standard input; it will be invoked with
|
|
{command} < {filename}.
|
|
Examples:
|
|
mlr --prepipe 'gunzip'
|
|
mlr --prepipe 'zcat -cf'
|
|
mlr --prepipe 'xz -cd'
|
|
mlr --prepipe cat
|
|
Note that this feature is quite general and is not limited to decompression
|
|
utilities. You can use it to apply per-file filters of your choice.
|
|
For output compression (or other) utilities, simply pipe the output:
|
|
mlr ... | {your compression command}
|
|
|
|
Separator options, for input, output, or both:
|
|
--rs --irs --ors Record separators, e.g. 'lf' or '\r\n'
|
|
--fs --ifs --ofs --repifs Field separators, e.g. comma
|
|
--ps --ips --ops Pair separators, e.g. equals sign
|
|
|
|
Notes about line endings:
|
|
* Default line endings (--irs and --ors) are "auto" which means autodetect from
|
|
the input file format, as long as the input file(s) have lines ending in either
|
|
LF (also known as linefeed, '\n', 0x0a, Unix-style) or CRLF (also known as
|
|
carriage-return/linefeed pairs, '\r\n', 0x0d 0x0a, Windows style).
|
|
* If both irs and ors are auto (which is the default) then LF input will lead to LF
|
|
output and CRLF input will lead to CRLF output, regardless of the platform you're
|
|
running on.
|
|
* The line-ending autodetector triggers on the first line ending detected in the input
|
|
stream. E.g. if you specify a CRLF-terminated file on the command line followed by an
|
|
LF-terminated file then autodetected line endings will be CRLF.
|
|
* If you use --ors {something else} with (default or explicitly specified) --irs auto
|
|
then line endings are autodetected on input and set to what you specify on output.
|
|
* If you use --irs {something else} with (default or explicitly specified) --ors auto
|
|
then the output line endings used are LF on Unix/Linux/BSD/MacOSX, and CRLF on Windows.
|
|
|
|
Notes about all other separators:
|
|
* IPS/OPS are only used for DKVP and XTAB formats, since only in these formats
|
|
do key-value pairs appear juxtaposed.
|
|
* IRS/ORS are ignored for XTAB format. Nominally IFS and OFS are newlines;
|
|
XTAB records are separated by two or more consecutive IFS/OFS -- i.e.
|
|
a blank line. Everything above about --irs/--ors/--rs auto becomes --ifs/--ofs/--fs
|
|
auto for XTAB format. (XTAB's default IFS/OFS are "auto".)
|
|
* OFS must be single-character for PPRINT format. This is because it is used
|
|
with repetition for alignment; multi-character separators would make
|
|
alignment impossible.
|
|
* OPS may be multi-character for XTAB format, in which case alignment is
|
|
disabled.
|
|
* TSV is simply CSV using tab as field separator ("--fs tab").
|
|
* FS/PS are ignored for markdown format; RS is used.
|
|
* All FS and PS options are ignored for JSON format, since they are not relevant
|
|
to the JSON format.
|
|
* You can specify separators in any of the following ways, shown by example:
|
|
- Type them out, quoting as necessary for shell escapes, e.g.
|
|
"--fs '|' --ips :"
|
|
- C-style escape sequences, e.g. "--rs '\r\n' --fs '\t'".
|
|
- To avoid backslashing, you can use any of the following names:
|
|
cr crcr newline lf lflf crlf crlfcrlf tab space comma pipe slash colon semicolon equals
|
|
* Default separators by format:
|
|
File format RS FS PS
|
|
dkvp auto , =
|
|
json auto (N/A) (N/A)
|
|
nidx auto space (N/A)
|
|
csv auto , (N/A)
|
|
csvlite auto , (N/A)
|
|
markdown auto (N/A) (N/A)
|
|
pprint auto space (N/A)
|
|
xtab (N/A) auto space
|
|
|
|
Relevant to CSV/CSV-lite input only:
|
|
--implicit-csv-header Use 1,2,3,... as field labels, rather than from line 1
|
|
of input files. Tip: combine with "label" to recreate
|
|
missing headers.
|
|
--headerless-csv-output Print only CSV data lines.
|
|
|
|
Double-quoting for CSV output:
|
|
--quote-all Wrap all fields in double quotes
|
|
--quote-none Do not wrap any fields in double quotes, even if they have
|
|
OFS or ORS in them
|
|
--quote-minimal Wrap fields in double quotes only if they have OFS or ORS
|
|
in them (default)
|
|
--quote-numeric Wrap fields in double quotes only if they have numbers
|
|
in them
|
|
--quote-original Wrap fields in double quotes if and only if they were
|
|
quoted on input. This isn't sticky for computed fields:
|
|
e.g. if fields a and b were quoted on input and you do
|
|
"put '$c = $a . $b'" then field c won't inherit a or b's
|
|
was-quoted-on-input flag.
|
|
|
|
Numerical formatting:
|
|
--ofmt {format} E.g. %.18lf, %.0lf. Please use sprintf-style codes for
|
|
double-precision. Applies to verbs which compute new
|
|
values, e.g. put, stats1, stats2. See also the fmtnum
|
|
function within mlr put (mlr --help-all-functions).
|
|
Defaults to %lf.
|
|
|
|
Other options:
|
|
--seed {n} with n of the form 12345678 or 0xcafefeed. For put/filter
|
|
urand()/urandint()/urand32().
|
|
--nr-progress-mod {m}, with m a positive integer: print filename and record
|
|
count to stderr every m input records.
|
|
--from {filename} Use this to specify an input file before the verb(s),
|
|
rather than after. May be used more than once. Example:
|
|
"mlr --from a.dat --from b.dat cat" is the same as
|
|
"mlr cat a.dat b.dat".
|
|
-n Process no input files, nor standard input either. Useful
|
|
for mlr put with begin/end statements only. (Same as --from
|
|
/dev/null.) Also useful in "mlr -n put -v '...'" for
|
|
analyzing abstract syntax trees (if that's your thing).
|
|
-I Process files in-place. For each file name on the command
|
|
line, output is written to a temp file in the same
|
|
directory, which is then renamed over the original. Each
|
|
file is processed in isolation: if the output format is
|
|
CSV, CSV headers will be present in each output file;
|
|
statistics are only over each file's own records; and so on.
|
|
|
|
Then-chaining:
|
|
Output of one verb may be chained as input to another using "then", e.g.
|
|
mlr stats1 -a min,mean,max -f flag,u,v -g color then sort -f color
|
|
|
|
For more information please see http://johnkerl.org/miller/doc and/or
|
|
http://github.com/johnkerl/miller. This is Miller version v5.0.0-dev.
|
|
</pre>
|
|
</div>
|
|
<p></p>
|
|
|
|
<p></p>
|
|
<div class="pokipanel">
|
|
<pre>
|
|
$ mlr sort --help
|
|
Usage: mlr sort {flags}
|
|
Flags:
|
|
-f {comma-separated field names} Lexical ascending
|
|
-n {comma-separated field names} Numerical ascending; nulls sort last
|
|
-nf {comma-separated field names} Numerical ascending; nulls sort last
|
|
-r {comma-separated field names} Lexical descending
|
|
-nr {comma-separated field names} Numerical descending; nulls sort first
|
|
Sorts records primarily by the first specified field, secondarily by the second
|
|
field, and so on. (Any records not having all specified sort keys will appear
|
|
at the end of the output, in the order they were encountered, regardless of the
|
|
specified sort order.) The sort is stable: records that compare equal will sort
|
|
in the order they were encountered in the input record stream.
|
|
|
|
Example:
|
|
mlr sort -f a,b -nr x,y,z
|
|
which is the same as:
|
|
mlr sort -f a -f b -nr x -nr y -nr z
|
|
</pre>
|
|
</div>
|
|
<p></p>
|
|
|
|
</div>
|
|
</div>
|
|
</td>
|
|
|
|
</table>
|
|
</body>
|
|
</html>
|