miller/doc/cookbook.html
2017-04-25 12:08:16 -07:00

1668 lines
47 KiB
HTML

<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
<html lang="en">
<!-- PAGE GENERATED FROM template.html and content-for-cookbook.html BY poki. -->
<!-- PLEASE MAKE CHANGES THERE AND THEN RE-RUN poki. -->
<head>
<meta http-equiv="Content-type" content="text/html;charset=UTF-8"/>
<meta name="description" content="Miller documentation"/>
<meta name="viewport" content="width=device-width, initial-scale=1.0"/> <!-- mobile-friendly -->
<meta name="keywords"
content="John Kerl, Kerl, Miller, miller, mlr, OLAP, data analysis software, regression, correlation, variance, data tools, " />
<title> Cookbook part 1 </title>
<link rel="stylesheet" type="text/css" href="css/miller.css"/>
<link rel="stylesheet" type="text/css" href="css/poki-callbacks.css"/>
</head>
<!-- ================================================================ -->
<script type="text/javascript">
// Pick the Google Analytics host prefix that matches how this page was loaded:
// "https://ssl." when the page protocol is "https:", "http://www." otherwise.
var gaJsHost = (("https:" == document.location.protocol) ? "https://ssl." : "http://www.");
// Write a script tag for ga.js into the document while it is being parsed.
// The tag text is percent-encoded (%3C is "<", %3E is ">") and decoded by
// unescape() before being written.
// NOTE(review): the encoding presumably keeps a literal closing script tag
// from appearing inside this script element; confirm before changing.
document.write(unescape("%3Cscript src='" + gaJsHost + "google-analytics.com/ga.js' type='text/javascript'%3E%3C/script%3E"));
</script>
<script type="text/javascript">
// Create a tracker for the account id below and record one page view.
// NOTE(review): _gat is presumably defined by the ga.js script written by the
// preceding script element; verify against the loader.
try {
var pageTracker = _gat._getTracker("UA-15651652-1");
pageTracker._trackPageview();
// Any exception thrown above is caught and ignored, so a tracking failure
// does not surface as a script error on the page.
} catch(err) {}
</script>
<!-- ================================================================ -->
<script type="text/javascript">
// Flip a collapsible section between shown ("block") and hidden ("none").
//
// div: a DOM element, or null/undefined (in which case nothing happens).
//      Only elements whose id begins with "section_toggle_" are touched;
//      any other element is left as-is.
//
// An element whose inline display is anything other than "block" (including
// the empty string) is switched to "block".
function toggle_div(div) {
  if (div == null) {
    return;
  }
  // Prefix test via lastIndexOf(prefix, 0) rather than String.prototype.startsWith:
  // startsWith is ES2015 and is missing in older browsers (e.g. Internet
  // Explorer), where calling it throws and the toggle button silently breaks.
  if (div.id.lastIndexOf("section_toggle_", 0) !== 0) {
    return;
  }
  div.style.display = (div.style.display == "block") ? "none" : "block";
}
// Show a collapsible section by setting its inline display to "block".
//
// div: a DOM element, or null/undefined (in which case nothing happens).
//      Only elements whose id begins with "section_toggle_" are touched.
function expand_div(div) {
  if (div == null) {
    return;
  }
  // lastIndexOf(prefix, 0) === 0 is an ES3-compatible prefix test; the
  // ES2015 String.prototype.startsWith is missing in older browsers
  // (e.g. Internet Explorer) and would throw there.
  if (div.id.lastIndexOf("section_toggle_", 0) === 0) {
    div.style.display = "block";
  }
}
// Hide a collapsible section by setting its inline display to "none".
//
// div: a DOM element, or null/undefined (in which case nothing happens).
//      Only elements whose id begins with "section_toggle_" are touched.
function collapse_div(div) {
  if (div == null) {
    return;
  }
  // lastIndexOf(prefix, 0) === 0 is an ES3-compatible prefix test; the
  // ES2015 String.prototype.startsWith is missing in older browsers
  // (e.g. Internet Explorer) and would throw there.
  if (div.id.lastIndexOf("section_toggle_", 0) === 0) {
    div.style.display = "none";
  }
}
// Toggle the element whose id is divName; the lookup result (possibly null)
// is handed straight to toggle_div.
function toggle_by_name(divName) {
  var target = document.getElementById(divName);
  toggle_div(target);
}
// Expand the element whose id is divName; the lookup result (possibly null)
// is handed straight to expand_div.
function expand_by_name(divName) {
  var target = document.getElementById(divName);
  expand_div(target);
}
// Collapse the element whose id is divName; the lookup result (possibly null)
// is handed straight to collapse_div.
function collapse_by_name(divName) {
  var target = document.getElementById(divName);
  collapse_div(target);
}
// Pass every div element in the document to expand_div, in document order.
function expand_all() {
  var allDivs = document.getElementsByTagName("div");
  var idx = 0;
  while (idx < allDivs.length) {
    expand_div(allDivs[idx]);
    idx++;
  }
}
// Pass every div element in the document to collapse_div, in document order.
function collapse_all() {
  var allDivs = document.getElementsByTagName("div");
  var idx = 0;
  while (idx < allDivs.length) {
    collapse_div(allDivs[idx]);
    idx++;
  }
}
</script>
<!--
The background image is from a screenshot of a Google search for "data analysis
tools", lightened and sepia-toned. Over this was placed a Mac Terminal app with
very light-grey font and translucent background, in which a few statistical
Miller commands were run with pretty-print-tabular output format.
<body background="pix/sepia-overlay.jpg">
-->
<body bgcolor="#ffffff">
<!-- ================================================================ -->
<table width="100%">
<tr>
<!-- navbar -->
<td width="15%">
<!--
<img src="pix/mlr.jpg" />
<img style="border-width:1px; color:black;" src="pix/mlr.jpg" />
-->
<div class="pokinav">
<center><titleinbody>Miller</titleinbody></center>
<!-- PAGE LIST GENERATED FROM template.html BY poki -->
<br/><b>Overview:</b>
<br/>&bull;&nbsp;<a href="index.html">About Miller</a>
<br/>&bull;&nbsp;<a href="10-min.html">Miller in 10 minutes</a>
<br/>&bull;&nbsp;<a href="file-formats.html">File formats</a>
<br/>&bull;&nbsp;<a href="feature-comparison.html">Miller features in the context of the Unix toolkit</a>
<br/>&bull;&nbsp;<a href="record-heterogeneity.html">Record-heterogeneity</a>
<br/>&bull;&nbsp;<a href="internationalization.html">Internationalization</a>
<br/><b>Using Miller:</b>
<br/>&bull;&nbsp;<a href="faq.html">FAQ</a>
<br/>&bull;&nbsp;<a href="cookbook.html"><b>Cookbook part 1</b></a>
<br/>&bull;&nbsp;<a href="cookbook2.html">Cookbook part 2</a>
<br/>&bull;&nbsp;<a href="cookbook3.html">Cookbook part 3</a>
<br/>&bull;&nbsp;<a href="data-examples.html">Data-diving examples</a>
<br/>&bull;&nbsp;<a href="manpage.html">Manpage</a>
<br/>&bull;&nbsp;<a href="reference.html">Reference</a>
<br/>&bull;&nbsp;<a href="reference-verbs.html">Reference: Verbs</a>
<br/>&bull;&nbsp;<a href="reference-dsl.html">Reference: DSL</a>
<br/>&bull;&nbsp;<a href="release-docs.html">Documents by release</a>
<br/>&bull;&nbsp;<a href="build.html">Installation, portability, dependencies, and testing</a>
<br/><b>Background:</b>
<br/>&bull;&nbsp;<a href="why.html">Why?</a>
<br/>&bull;&nbsp;<a href="whyc.html">Why C?</a>
<br/>&bull;&nbsp;<a href="etymology.html">Why call it Miller?</a>
<br/>&bull;&nbsp;<a href="originality.html">How original is Miller?</a>
<br/>&bull;&nbsp;<a href="performance.html">Performance</a>
<br/><b>Repository:</b>
<br/>&bull;&nbsp;<a href="to-do.html">Things to do</a>
<br/>&bull;&nbsp;<a href="contact.html">Contact information</a>
<br/>&bull;&nbsp;<a href="https://github.com/johnkerl/miller">GitHub repo</a>
<br/> <br/> <br/> <br/> <br/> <br/> <br/> <br/> <br/> <br/> <br/> <br/>
<br/> <br/> <br/> <br/> <br/> <br/> <br/> <br/> <br/> <br/> <br/> <br/>
<br/> <br/> <br/> <br/> <br/> <br/>
</div>
</td>
<!-- page body -->
<td>
<!--
This is a visually gorgeous feature (here & in the CSS): it allows for
independent scroll of the nav and body panels. In particular the nav
stays on-screen as you scroll the body.
However, two problems:
(1) In Firefox & Chrome both I get janky end-of-body scrolls: there is
more content but I can't scroll down to it unless I repeatedly retry the
scrolldown. Which is weird.
(2) Worse, only the first page renders in PDF (again, Firefox & Chrome).
For now I'm disabling this separate-scroll feature. A frontender, I am
not ... maybe someday I'll find a config which gets *all* the features
I want; for now, it's a tradeoff.
-->
<!-- Implementation details: one bit is right here:
div style="overflow-y:scroll;height:1500px"
and the other bit is in css/poki-callbacks.css:
.pokinav {
display: inline-block;
background: #e8d9bc;
border: 1;
box-shadow: 0px 0px 3px 3px #C9C9C9;
margin: 10px;
padding-top: 10px;
padding-bottom: 10px;
padding-left: 10px;
padding-right: 10px;
overflow-y: scroll; < - - - - - - here
height: 1500px;
}
-->
<div>
<center> <titleinbody> Cookbook part 1 </titleinbody> </center>
<p/>
<!-- BODY COPIED FROM content-for-cookbook.html BY poki -->
<p/>
<center>
<boldmaroon>Common patterns</boldmaroon>
</center>
<div class="pokitoc">
<center><b>Contents:</b></center>
&bull;&nbsp;<a href="#Headerless_CSV_on_input_or_output">Headerless CSV on input or output</a><br/>
&bull;&nbsp;<a href="#Bulk_rename_of_fields">Bulk rename of fields</a><br/>
&bull;&nbsp;<a href="#Full_field_renames_and_reassigns">Full field renames and reassigns</a><br/>
&bull;&nbsp;<a href="#Numbering_and_renumbering_records">Numbering and renumbering records</a><br/>
&bull;&nbsp;<a href="#Data-cleaning_examples">Data-cleaning examples</a><br/>
&bull;&nbsp;<a href="#Splitting_nested_fields">Splitting nested fields</a><br/>
&bull;&nbsp;<a href="#Showing_differences_between_successive_queries">Showing differences between successive queries</a><br/>
&bull;&nbsp;<a href="#Finding_missing_dates">Finding missing dates</a><br/>
&bull;&nbsp;<a href="#Two-pass_algorithms">Two-pass algorithms</a><br/>
&nbsp;&nbsp;&nbsp;&nbsp;&bull;&nbsp;<a href="#Two-pass_algorithms:_computation_of_percentages">Two-pass algorithms: computation of percentages</a><br/>
&nbsp;&nbsp;&nbsp;&nbsp;&bull;&nbsp;<a href="#Two-pass_algorithms:_line-number_ratios">Two-pass algorithms: line-number ratios</a><br/>
&nbsp;&nbsp;&nbsp;&nbsp;&bull;&nbsp;<a href="#Two-pass_algorithms:_records_having_max_value">Two-pass algorithms: records having max value</a><br/>
&bull;&nbsp;<a href="#Rectangularizing_data">Rectangularizing data</a><br/>
&bull;&nbsp;<a href="#Regularizing_ragged_CSV">Regularizing ragged CSV</a><br/>
&bull;&nbsp;<a href="#Feature-counting">Feature-counting</a><br/>
&bull;&nbsp;<a href="#Unsparsing">Unsparsing</a><br/>
&bull;&nbsp;<a href="#Parsing_log-file_output">Parsing log-file output</a><br/>
&bull;&nbsp;<a href="#Memoization_with_out-of-stream_variables">Memoization with out-of-stream variables</a><br/>
</div>
<p/>
<p/>
<button style="font-weight:bold;color:maroon;border:0" onclick="expand_all();" href="javascript:;">Expand all sections</button>
<button style="font-weight:bold;color:maroon;border:0" onclick="collapse_all();" href="javascript:;">Collapse all sections</button>
<!-- ================================================================ -->
<a id="Headerless_CSV_on_input_or_output"/><h1>Headerless CSV on input or output</h1>
<button style="font-weight:bold;color:maroon;border:0" padding=0 onclick="toggle_by_name('section_toggle_headerless_csv');" href="javascript:;">Toggle section visibility</button>
<div id="section_toggle_headerless_csv" style="display: block">
<p/>Sometimes we get CSV files which lack a header. For example:
<p/>
<div class="pokipanel">
<pre>
$ cat data/headerless.csv
John,23,present
Fred,34,present
Alice,56,missing
Carol,45,present
</pre>
</div>
<p/>
<p/> You can use Miller to add a header. The <tt>--implicit-csv-header</tt> option applies positionally indexed labels:
<p/>
<div class="pokipanel">
<pre>
$ mlr --csv --implicit-csv-header cat data/headerless.csv
1,2,3
John,23,present
Fred,34,present
Alice,56,missing
Carol,45,present
</pre>
</div>
<p/>
<p/> Following that, you can rename the positionally indexed labels to names with meaning for your context.
For example:
<p/>
<div class="pokipanel">
<pre>
$ mlr --csv --implicit-csv-header label name,age,status data/headerless.csv
name,age,status
John,23,present
Fred,34,present
Alice,56,missing
Carol,45,present
</pre>
</div>
<p/>
<p/> Likewise, if you need to produce CSV which is lacking its header, you can pipe Miller&rsquo;s output
to the system command <tt>sed 1d</tt>, or you can use Miller&rsquo;s <tt>--headerless-csv-output</tt> option:
<p/>
<div class="pokipanel">
<pre>
$ head -5 data/colored-shapes.dkvp | mlr --ocsv cat
color,shape,flag,i,u,v,w,x
yellow,triangle,1,11,0.6321695890307647,0.9887207810889004,0.4364983936735774,5.7981881667050565
red,square,1,15,0.21966833570651523,0.001257332190235938,0.7927778364718627,2.944117399716207
red,circle,1,16,0.20901671281497636,0.29005231936593445,0.13810280912907674,5.065034003400998
red,square,0,48,0.9562743938458542,0.7467203085342884,0.7755423050923582,7.117831369597269
purple,triangle,0,51,0.4355354501763202,0.8591292672156728,0.8122903963006748,5.753094629505863
</pre>
</div>
<p/>
<p/>
<div class="pokipanel">
<pre>
$ head -5 data/colored-shapes.dkvp | mlr --ocsv --headerless-csv-output cat
yellow,triangle,1,11,0.6321695890307647,0.9887207810889004,0.4364983936735774,5.7981881667050565
red,square,1,15,0.21966833570651523,0.001257332190235938,0.7927778364718627,2.944117399716207
red,circle,1,16,0.20901671281497636,0.29005231936593445,0.13810280912907674,5.065034003400998
red,square,0,48,0.9562743938458542,0.7467203085342884,0.7755423050923582,7.117831369597269
purple,triangle,0,51,0.4355354501763202,0.8591292672156728,0.8122903963006748,5.753094629505863
</pre>
</div>
<p/>
<p/> Lastly, often we say &ldquo;CSV&rdquo; or &ldquo;TSV&rdquo; when we have
positionally indexed data in columns which are separated by commas or tabs,
respectively. In this case it&rsquo;s perhaps simpler to <b>just use NIDX
format</b> which was designed for this purpose. (See also
<a href="file-formats.html">File formats</a>.) For example:
<p/>
<div class="pokipanel">
<pre>
$ mlr --inidx --ifs comma --oxtab cut -f 1,3 data/headerless.csv
1 John
3 present
1 Fred
3 present
1 Alice
3 missing
1 Carol
3 present
</pre>
</div>
<p/>
<!-- ================================================================ -->
</div>
<a id="Bulk_rename_of_fields"/><h1>Bulk rename of fields</h1>
<button style="font-weight:bold;color:maroon;border:0" padding=0 onclick="toggle_by_name('section_toggle_bulk_rename');" href="javascript:;">Toggle section visibility</button>
<div id="section_toggle_bulk_rename" style="display: block">
<p/>Suppose you want to replace spaces with underscores in your column names:
<p/>
<div class="pokipanel">
<pre>
$ cat data/spaces.csv
a b c,def,g h i
123,4567,890
2468,1357,3579
9987,3312,4543
</pre>
</div>
<p/>
<p/>The simplest way is to use <tt>mlr rename</tt> with <tt>-g</tt> (for global
replace, not just first occurrence of space within each field) and <tt>-r</tt>
for pattern-matching (rather than explicit single-column renames):
<p/>
<div class="pokipanel">
<pre>
$ mlr --csv rename -g -r ' ,_' data/spaces.csv
a_b_c,def,g_h_i
123,4567,890
2468,1357,3579
9987,3312,4543
</pre>
</div>
<p/>
<p/>
<div class="pokipanel">
<pre>
$ mlr --csv --opprint rename -g -r ' ,_' data/spaces.csv
a_b_c def g_h_i
123 4567 890
2468 1357 3579
9987 3312 4543
</pre>
</div>
<p/>
<p/>You can also do this with a for-loop:
<p/>
<div class="pokipanel">
<pre>
$ cat data/bulk-rename-for-loop.mlr
map newrec = {};
for (oldk, v in $*) {
newrec[gsub(oldk, " ", "_")] = v;
}
$* = newrec
</pre>
</div>
<p/>
<p/>
<div class="pokipanel">
<pre>
$ mlr --icsv --opprint put -f data/bulk-rename-for-loop.mlr data/spaces.csv
a_b_c def g_h_i
123 4567 890
2468 1357 3579
9987 3312 4543
</pre>
</div>
<p/>
</div>
<!-- ================================================================ -->
<a id="Full_field_renames_and_reassigns"/><h1>Full field renames and reassigns</h1>
<button style="font-weight:bold;color:maroon;border:0" padding=0 onclick="toggle_by_name('section_toggle_full_renames_reassigns');" href="javascript:;">Toggle section visibility</button>
<div id="section_toggle_full_renames_reassigns" style="display: block">
<p/>Using Miller 5.0.0&rsquo;s map literals and assigning to <tt>$*</tt>, you can fully generalize
<a href="reference-verbs.html#rename"><tt>mlr rename</tt></a>,
<a href="reference-verbs.html#reorder"><tt>mlr reorder</tt></a>,
etc.:
<p/>
<div class="pokipanel">
<pre>
$ cat data/small
a=pan,b=pan,i=1,x=0.3467901443380824,y=0.7268028627434533
a=eks,b=pan,i=2,x=0.7586799647899636,y=0.5221511083334797
a=wye,b=wye,i=3,x=0.20460330576630303,y=0.33831852551664776
a=eks,b=wye,i=4,x=0.38139939387114097,y=0.13418874328430463
a=wye,b=pan,i=5,x=0.5732889198020006,y=0.8636244699032729
</pre>
</div>
<p/>
<p/>
<div class="pokipanel">
<pre>
$ mlr put '
begin {
@i_cumu = 0;
}
@i_cumu += $i;
$* = {
"z": $x + y,
"KEYFIELD": $a,
"i": @i_cumu,
"b": $b,
"y": $x,
"x": $y,
};
' data/small
z=0.346790,KEYFIELD=pan,i=1,b=pan,y=0.346790,x=0.726803
z=0.758680,KEYFIELD=eks,i=3,b=pan,y=0.758680,x=0.522151
z=0.204603,KEYFIELD=wye,i=6,b=wye,y=0.204603,x=0.338319
z=0.381399,KEYFIELD=eks,i=10,b=wye,y=0.381399,x=0.134189
z=0.573289,KEYFIELD=wye,i=15,b=pan,y=0.573289,x=0.863624
</pre>
</div>
<p/>
<!-- ================================================================ -->
</div>
<a id="Numbering_and_renumbering_records"/><h1>Numbering and renumbering records</h1>
<button style="font-weight:bold;color:maroon;border:0" padding=0 onclick="toggle_by_name('section_toggle_renumbering_records');" href="javascript:;">Toggle section visibility</button>
<div id="section_toggle_renumbering_records" style="display: block">
<p/> The <tt>awk</tt>-like built-in variable <tt>NR</tt> is incremented for each input record:
<p/>
<div class="pokipanel">
<pre>
$ cat data/small
a=pan,b=pan,i=1,x=0.3467901443380824,y=0.7268028627434533
a=eks,b=pan,i=2,x=0.7586799647899636,y=0.5221511083334797
a=wye,b=wye,i=3,x=0.20460330576630303,y=0.33831852551664776
a=eks,b=wye,i=4,x=0.38139939387114097,y=0.13418874328430463
a=wye,b=pan,i=5,x=0.5732889198020006,y=0.8636244699032729
</pre>
</div>
<p/>
<p/>
<div class="pokipanel">
<pre>
$ mlr put '$nr = NR' data/small
a=pan,b=pan,i=1,x=0.3467901443380824,y=0.7268028627434533,nr=1
a=eks,b=pan,i=2,x=0.7586799647899636,y=0.5221511083334797,nr=2
a=wye,b=wye,i=3,x=0.20460330576630303,y=0.33831852551664776,nr=3
a=eks,b=wye,i=4,x=0.38139939387114097,y=0.13418874328430463,nr=4
a=wye,b=pan,i=5,x=0.5732889198020006,y=0.8636244699032729,nr=5
</pre>
</div>
<p/>
<p/> However, this is the record number within the original input stream
&mdash; not after any filtering you may have done:
<p/>
<div class="pokipanel">
<pre>
$ mlr filter '$a == "wye"' then put '$nr = NR' data/small
a=wye,b=wye,i=3,x=0.20460330576630303,y=0.33831852551664776,nr=3
a=wye,b=pan,i=5,x=0.5732889198020006,y=0.8636244699032729,nr=5
</pre>
</div>
<p/>
<p/> There are two good options here. One is to use the <tt>cat</tt> verb with <tt>-n</tt>:
<p/>
<div class="pokipanel">
<pre>
$ mlr filter '$a == "wye"' then cat -n data/small
n=1,a=wye,b=wye,i=3,x=0.20460330576630303,y=0.33831852551664776
n=2,a=wye,b=pan,i=5,x=0.5732889198020006,y=0.8636244699032729
</pre>
</div>
<p/>
<p/> The other is to keep your own counter within the <tt>put</tt> DSL:
<p/>
<div class="pokipanel">
<pre>
$ mlr filter '$a == "wye"' then put 'begin {@n = 1} $n = @n; @n += 1' data/small
a=wye,b=wye,i=3,x=0.20460330576630303,y=0.33831852551664776,n=1
a=wye,b=pan,i=5,x=0.5732889198020006,y=0.8636244699032729,n=2
</pre>
</div>
<p/>
<p/> The difference is a matter of taste (although <tt>mlr cat -n</tt> puts the counter first).
<!-- ================================================================ -->
</div>
<a id="Data-cleaning_examples"/><h1>Data-cleaning examples</h1>
<button style="font-weight:bold;color:maroon;border:0" padding=0 onclick="toggle_by_name('section_toggle_data_cleaning_examples');" href="javascript:;">Toggle section visibility</button>
<div id="section_toggle_data_cleaning_examples" style="display: block">
<p/> Here are some ways to use the type-checking options as described in
the <a href="reference-dsl.html#Type-test_and_type-assertion_expressions">Reference: DSL</a>.
Suppose you have the following data file, with inconsistent typing for boolean.
(Also imagine that, for the sake of discussion, we have a million-line file
rather than a four-line file, so we can&rsquo;t see it all at once and some
automation is called for.)
<p/>
<div class="pokipanel">
<pre>
$ cat data/het-bool.csv
name,reachable
barney,false
betty,true
fred,true
wilma,1
</pre>
</div>
<p/>
<p/> One option is to coerce everything to boolean, or integer:
<p/>
<div class="pokipanel">
<pre>
$ mlr --icsv --opprint put '$reachable = boolean($reachable)' data/het-bool.csv
name reachable
barney false
betty true
fred true
wilma true
</pre>
</div>
<p/>
<p/>
<div class="pokipanel">
<pre>
$ mlr --icsv --opprint put '$reachable = int(boolean($reachable))' data/het-bool.csv
name reachable
barney 0
betty 1
fred 1
wilma 1
</pre>
</div>
<p/>
<p/> A second option is to flag badly formatted data within the output stream:
<p/>
<div class="pokipanel">
<pre>
$ mlr --icsv --opprint put '$format_ok = is_string($reachable)' data/het-bool.csv
name reachable format_ok
barney false true
betty true true
fred true true
wilma 1 false
</pre>
</div>
<p/>
<p/> Or perhaps to flag badly formatted data outside the output stream:
<p/>
<div class="pokipanel">
<pre>
$ mlr --icsv --opprint put 'if (!is_string($reachable)) {eprint "Malformed at NR=".NR} ' data/het-bool.csv
Malformed at NR=4
name reachable
barney false
betty true
fred true
wilma 1
</pre>
</div>
<p/>
<p/> A third way is to abort the process on first instance of bad data:
<p/>
<div class="pokipanel">
<pre>
$ mlr --csv put '$reachable = asserting_string($reachable)' data/het-bool.csv
mlr: string type-assertion failed at NR=4 FNR=4 FILENAME=data/het-bool.csv
name,reachable
barney,false
betty,true
fred,true
</pre>
</div>
<p/>
</div>
<!-- ================================================================ -->
<a id="Splitting_nested_fields"/><h1>Splitting nested fields</h1>
<button style="font-weight:bold;color:maroon;border:0" padding=0 onclick="toggle_by_name('section_toggle_splitting_nested');" href="javascript:;">Toggle section visibility</button>
<div id="section_toggle_splitting_nested" style="display: block">
<p/> Suppose you have a TSV file like this:
<p/>
<div class="pokipanel">
<pre>
a b
x z
s u:v:w
</pre>
</div>
<p/>
<p/> The simplest option is to use <a href="reference-verbs.html#nest"><tt>mlr nest</tt></a>:
<p/>
<div class="pokipanel">
<pre>
$ mlr --tsv nest --explode --values --across-records -f b --nested-fs : data/nested.tsv
a b
x z
s u
s v
s w
</pre>
</div>
<p/>
<p/>
<div class="pokipanel">
<pre>
$ mlr --tsv nest --explode --values --across-fields -f b --nested-fs : data/nested.tsv
a b_1
x z
a b_1 b_2 b_3
s u v w
</pre>
</div>
<p/>
<p/> While <tt>mlr nest</tt> is simplest, let&rsquo;s also take a look at a few ways to do this using the
<tt>put</tt> DSL.
<p/> One option to split out the colon-delimited values in the <tt>b</tt>
column is to use <tt>splitnv</tt> to create an integer-indexed map and loop
over it, adding new fields to the current record:
<p/>
<div class="pokipanel">
<pre>
$ mlr --from data/nested.tsv --itsv --oxtab put 'o=splitnv($b, ":"); for (k,v in o) {$["p".k]=v}'
a x
b z
p1 z
a s
b u:v:w
p1 u
p2 v
p3 w
</pre>
</div>
<p/>
<p/> while another is to loop over the same map from <tt>splitnv</tt> and use
it (with <tt>put -q</tt> to suppress printing the original record) to produce
multiple records:
<p/>
<div class="pokipanel">
<pre>
$ mlr --from data/nested.tsv --itsv --oxtab put -q 'o=splitnv($b, ":"); for (k,v in o) {emit mapsum($*, {"b":v})}'
a x
b z
a s
b u
a s
b v
a s
b w
</pre>
</div>
<p/>
<p/>
<div class="pokipanel">
<pre>
$ mlr --from data/nested.tsv --tsv put -q 'o=splitnv($b, ":"); for (k,v in o) {emit mapsum($*, {"b":v})}'
a b
x z
s u
s v
s w
</pre>
</div>
<p/>
</div>
<!-- ================================================================ -->
<a id="Showing_differences_between_successive_queries"/><h1>Showing differences between successive queries</h1>
<button style="font-weight:bold;color:maroon;border:0" padding=0 onclick="toggle_by_name('section_toggle_successive_query_deltas');" href="javascript:;">Toggle section visibility</button>
<div id="section_toggle_successive_query_deltas" style="display: block">
<p/> Suppose you have a database query which you run at one point in time, producing the output on the
left, then again later producing the output on the right:
<table>
<tr>
<td>
<p/>
<div class="pokipanel">
<pre>
$ cat data/previous_counters.csv
color,count
red,3472
blue,6838
orange,694
purple,12
</pre>
</div>
<p/>
</td>
<td>
<p/>
<div class="pokipanel">
<pre>
$ cat data/current_counters.csv
color,count
red,3467
orange,670
yellow,27
blue,6944
</pre>
</div>
<p/>
</td>
</tr>
</table>
<p/> And, suppose you want to compute the differences in the counters between
adjacent keys. Since the color names aren&rsquo;t all in the same order, nor
are they all present on both sides, we can&rsquo;t just paste the two files
side-by-side and do some column-four-minus-column-two arithmetic.
<p/> First, rename counter columns to make them distinct:
<p/>
<div class="pokipanel">
<pre>
$ mlr --csv rename count,previous_count data/previous_counters.csv &gt; data/prevtemp.csv
</pre>
</div>
<p/>
<p/>
<div class="pokipanel">
<pre>
$ cat data/prevtemp.csv
color,previous_count
red,3472
blue,6838
orange,694
purple,12
</pre>
</div>
<p/>
<p/>
<div class="pokipanel">
<pre>
$ mlr --csv rename count,current_count data/current_counters.csv &gt; data/currtemp.csv
</pre>
</div>
<p/>
<p/>
<div class="pokipanel">
<pre>
$ cat data/currtemp.csv
color,current_count
red,3467
orange,670
yellow,27
blue,6944
</pre>
</div>
<p/>
<p/> Then, join on the key field(s), and use unsparsify to zero-fill counters
absent on one side but present on the other. Use <tt>--ul</tt> and
<tt>--ur</tt> to emit unpaired records (namely, purple on the left and yellow on the right):
<p/>
<div class="pokipanel">
<pre>
$ mlr --icsv --opprint \
join -j color --ul --ur -f data/prevtemp.csv \
then unsparsify --fill-with 0 \
then put '$count_delta = $current_count - $previous_count' \
data/currtemp.csv
color previous_count current_count count_delta
red 3472 3467 -5
orange 694 670 -24
yellow 0 27 27
blue 6838 6944 106
purple 12 0 -12
</pre>
</div>
<p/>
</div>
<!-- ================================================================ -->
<a id="Finding_missing_dates"/><h1>Finding missing dates</h1>
<button style="font-weight:bold;color:maroon;border:0" padding=0 onclick="toggle_by_name('section_toggle_finding_missing_dates');" href="javascript:;">Toggle section visibility</button>
<div id="section_toggle_finding_missing_dates" style="display: block">
<p/>Suppose you have some date-stamped data which may (or may not) be missing entries for one or more dates:
<p/>
<div class="pokipanel">
<pre>
$ head -n 10 data/miss-date.csv
date,qoh
2012-03-05,10055
2012-03-06,10486
2012-03-07,10430
2012-03-08,10674
2012-03-09,10880
2012-03-10,10718
2012-03-11,10795
2012-03-12,11043
2012-03-13,11177
</pre>
</div>
<p/>
<p/>
<div class="pokipanel">
<pre>
$ wc -l data/miss-date.csv
1372 data/miss-date.csv
</pre>
</div>
<p/>
<p/>Since there are 1372 lines in the data file, some automation is called for.
To find the missing dates, you can convert the dates to seconds since the epoch
using <tt>strptime</tt>, then compute adjacent differences (the <tt>cat -n</tt>
simply inserts record-counters):
<p/>
<div class="pokipanel">
<pre>
$ mlr --from data/miss-date.csv --icsv \
cat -n \
then put '$datestamp = strptime($date, "%Y-%m-%d")' \
then step -a delta -f datestamp \
| head
n=1,date=2012-03-05,qoh=10055,datestamp=1330905600.000000,datestamp_delta=0
n=2,date=2012-03-06,qoh=10486,datestamp=1330992000.000000,datestamp_delta=86400.000000
n=3,date=2012-03-07,qoh=10430,datestamp=1331078400.000000,datestamp_delta=86400.000000
n=4,date=2012-03-08,qoh=10674,datestamp=1331164800.000000,datestamp_delta=86400.000000
n=5,date=2012-03-09,qoh=10880,datestamp=1331251200.000000,datestamp_delta=86400.000000
n=6,date=2012-03-10,qoh=10718,datestamp=1331337600.000000,datestamp_delta=86400.000000
n=7,date=2012-03-11,qoh=10795,datestamp=1331424000.000000,datestamp_delta=86400.000000
n=8,date=2012-03-12,qoh=11043,datestamp=1331510400.000000,datestamp_delta=86400.000000
n=9,date=2012-03-13,qoh=11177,datestamp=1331596800.000000,datestamp_delta=86400.000000
n=10,date=2012-03-14,qoh=11498,datestamp=1331683200.000000,datestamp_delta=86400.000000
</pre>
</div>
<p/>
<p/>Then, filter for adjacent difference not being 86400 (the number of seconds in a day):
<p/>
<div class="pokipanel">
<pre>
$ mlr --from data/miss-date.csv --icsv \
cat -n \
then put '$datestamp = strptime($date, "%Y-%m-%d")' \
then step -a delta -f datestamp \
then filter '$datestamp_delta != 86400 &amp;&amp; $n != 1'
n=774,date=2014-04-19,qoh=130140,datestamp=1397865600.000000,datestamp_delta=259200.000000
n=1119,date=2015-03-31,qoh=181625,datestamp=1427760000.000000,datestamp_delta=172800.000000
</pre>
</div>
<p/>
<p/> Given this, it&rsquo;s now easy to see where the gaps are:
<p/>
<div class="pokipanel">
<pre>
$ mlr cat -n then filter '$n &gt;= 770 &amp;&amp; $n &lt;= 780' data/miss-date.csv
n=770,1=2014-04-12,2=129435
n=771,1=2014-04-13,2=129868
n=772,1=2014-04-14,2=129797
n=773,1=2014-04-15,2=129919
n=774,1=2014-04-16,2=130181
n=775,1=2014-04-19,2=130140
n=776,1=2014-04-20,2=130271
n=777,1=2014-04-21,2=130368
n=778,1=2014-04-22,2=130368
n=779,1=2014-04-23,2=130849
n=780,1=2014-04-24,2=131026
</pre>
</div>
<p/>
<p/>
<div class="pokipanel">
<pre>
$ mlr cat -n then filter '$n &gt;= 1115 &amp;&amp; $n &lt;= 1125' data/miss-date.csv
n=1115,1=2015-03-25,2=181006
n=1116,1=2015-03-26,2=180995
n=1117,1=2015-03-27,2=181043
n=1118,1=2015-03-28,2=181112
n=1119,1=2015-03-29,2=181306
n=1120,1=2015-03-31,2=181625
n=1121,1=2015-04-01,2=181494
n=1122,1=2015-04-02,2=181718
n=1123,1=2015-04-03,2=181835
n=1124,1=2015-04-04,2=182104
n=1125,1=2015-04-05,2=182528
</pre>
</div>
<p/>
</div>
<!-- ================================================================ -->
<a id="Two-pass_algorithms"/><h1>Two-pass algorithms</h1>
<button style="font-weight:bold;color:maroon;border:0" padding=0 onclick="toggle_by_name('section_toggle_two_pass_algorithms');" href="javascript:;">Toggle section visibility</button>
<div id="section_toggle_two_pass_algorithms" style="display: block">
<p/>Miller is a streaming record processor; commands are performed once per
record. This makes Miller particularly suitable for single-pass algorithms,
allowing many of its verbs to process files that are (much) larger than the
amount of RAM present in your system. (Of course, Miller verbs such as
<tt>sort</tt>, <tt>tac</tt>, etc. all must ingest and retain all input records
before emitting any output records.) You can also use out-of-stream variables
to perform multi-pass computations, at the price of retaining all input records
in memory.
<a id="Two-pass_algorithms:_computation_of_percentages"/><h2>Two-pass algorithms: computation of percentages</h2>
<p/> For example, mapping numeric values down a column to the percentage
between their min and max values is two-pass: on the first pass you find the
min and max values, then on the second, map each record&rsquo;s value to a
percentage.
<p/>
<div class="pokipanel">
<pre>
$ mlr --from data/small --opprint put -q '
# These are executed once per record, which is the first pass.
# The key is to use NR to index an out-of-stream variable to
# retain all the x-field values.
@x_min = min($x, @x_min);
@x_max = max($x, @x_max);
@x[NR] = $x;
# The second pass is in a for-loop in an end-block.
end {
for (nr, x in @x) {
@x_pct[nr] = 100 * (x - @x_min) / (@x_max - @x_min);
}
emit (@x, @x_pct), "NR"
}
'
NR x x_pct
1 0.346790 25.661943
2 0.758680 100.000000
3 0.204603 0.000000
4 0.381399 31.908236
5 0.573289 66.540542
</pre>
</div>
<p/>
<a id="Two-pass_algorithms:_line-number_ratios"/><h2>Two-pass algorithms: line-number ratios</h2>
<p/>Similarly, finding the total record count requires first reading through
all the data:
<p/>
<div class="pokipanel">
<pre>
$ mlr --opprint --from data/small put -q '
@records[NR] = $*;
end {
for((I,k),v in @records) {
@records[I]["I"] = I;
@records[I]["N"] = NR;
@records[I]["PCT"] = 100*I/NR
}
emit @records,"I"
}
' then reorder -f I,N,PCT
I N PCT a b i x y
1 5 20 pan pan 1 0.3467901443380824 0.7268028627434533
2 5 40 eks pan 2 0.7586799647899636 0.5221511083334797
3 5 60 wye wye 3 0.20460330576630303 0.33831852551664776
4 5 80 eks wye 4 0.38139939387114097 0.13418874328430463
5 5 100 wye pan 5 0.5732889198020006 0.8636244699032729
</pre>
</div>
<p/>
<a id="Two-pass_algorithms:_records_having_max_value"/><h2>Two-pass algorithms: records having max value</h2>
<p/>The idea is to retain records having the largest value of <tt>n</tt> in the
following data:
<p/>
<div class="pokipanel">
<pre>
$ mlr --itsv --opprint cat data/maxrows.tsv
a b n score
purple red 5 0.743231
blue purple 2 0.093710
red purple 2 0.802103
purple red 5 0.389055
red purple 2 0.880457
orange red 2 0.540349
purple purple 1 0.634451
orange purple 5 0.257223
orange purple 5 0.693499
red red 4 0.981355
blue purple 5 0.157052
purple purple 1 0.441784
red purple 1 0.124912
orange blue 1 0.921944
blue purple 4 0.490909
purple red 5 0.454779
green purple 4 0.198278
orange blue 5 0.705700
red red 3 0.940705
purple red 5 0.072936
orange blue 3 0.389463
orange purple 2 0.664985
blue purple 1 0.371813
red purple 4 0.984571
green purple 5 0.203577
green purple 3 0.900873
purple purple 0 0.965677
blue purple 2 0.208785
purple purple 1 0.455077
red purple 4 0.477187
blue red 4 0.007487
</pre>
</div>
<p/>
<p/>Of course, the largest value of <tt>n</tt> isn&rsquo;t known until after
all data have been read. Using an out-of-stream variable we can retain all
records as they are read, then filter them at the end:
<p/>
<div class="pokipanel">
<pre>
$ cat data/maxrows.mlr
# Retain all records
@records[NR] = $*;
# Track max value of n
@maxn = max(@maxn, $n);
# After all records have been read, loop through retained records
# and print those with the max n value.
end {
for (int nr in @records) {
map record = @records[nr];
if (record["n"] == @maxn) {
emit record;
}
}
}
</pre>
</div>
<p/>
<p/>
<div class="pokipanel">
<pre>
$ mlr --itsv --opprint put -q -f data/maxrows.mlr data/maxrows.tsv
a b n score
purple red 5 0.743231
purple red 5 0.389055
orange purple 5 0.257223
orange purple 5 0.693499
blue purple 5 0.157052
purple red 5 0.454779
orange blue 5 0.705700
purple red 5 0.072936
green purple 5 0.203577
</pre>
</div>
<p/>
<!-- ================================================================ -->
</div>
<a id="Rectangularizing_data"/><h1>Rectangularizing data</h1>
<button style="font-weight:bold;color:maroon;border:0" padding=0 onclick="toggle_by_name('section_toggle_rectangularizing_data');" href="javascript:;">Toggle section visibility</button>
<div id="section_toggle_rectangularizing_data" style="display: block">
<p/>Suppose you have a method (in whatever language) which is printing things of the form
<p/>
<div class="pokipanel">
<pre>
outer=1
outer=2
outer=3
</pre>
</div>
<p/>
and then calls another method which prints things of the form
<p/>
<div class="pokipanel">
<pre>
middle=10
middle=11
middle=12
middle=20
middle=21
middle=30
middle=31
</pre>
</div>
<p/>
and then, perhaps, that second method calls a third method which prints things of the form
<p/>
<div class="pokipanel">
<pre>
inner1=100,inner2=101
inner1=120,inner2=121
inner1=200,inner2=201
inner1=210,inner2=211
inner1=300,inner2=301
inner1=312
inner1=313,inner2=314
</pre>
</div>
<p/>
with the result that your program&rsquo;s output is
<p/>
<div class="pokipanel">
<pre>
outer=1
middle=10
inner1=100,inner2=101
middle=11
middle=12
inner1=120,inner2=121
outer=2
middle=20
inner1=200,inner2=201
middle=21
inner1=210,inner2=211
outer=3
middle=30
inner1=300,inner2=301
middle=31
inner1=312
inner1=313,inner2=314
</pre>
</div>
<p/>
The idea here is that middles starting with a 1 belong to the outer value of 1,
and so on. (For example, the outer values might be account IDs, the middle
values might be invoice IDs, and the inner values might be invoice line-items.)
If you want all the middle and inner lines to have the context of which outers
they belong to, you can modify your software to pass all those through your
methods. Alternatively, don&rsquo;t refactor your code just to handle some
ad-hoc log-data formatting &mdash; instead, use the following to rectangularize
the data. The idea is to use an out-of-stream variable to accumulate fields
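across records. Clear that variable when you see an outer ID; accumulate
fields; emit output when you see the inner IDs. Note that since accumulated
fields are cleared only when an outer ID is seen, a line lacking a field picks
up the most recently seen value for it: below, the <tt>inner1=312</tt> line has
no <tt>inner2</tt> of its own, so it is emitted with <tt>inner2=301</tt> from
the preceding inner line.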
<p/>
<div class="pokipanel">
<pre>
$ mlr --from data/rect.txt put -q '
is_present($outer) {
unset @r
}
for (k, v in $*) {
@r[k] = v
}
is_present($inner1) {
emit @r
}'
outer=1,middle=10,inner1=100,inner2=101
outer=1,middle=12,inner1=120,inner2=121
outer=2,middle=20,inner1=200,inner2=201
outer=2,middle=21,inner1=210,inner2=211
outer=3,middle=30,inner1=300,inner2=301
outer=3,middle=31,inner1=312,inner2=301
outer=3,middle=31,inner1=313,inner2=314
</pre>
</div>
<p/>
<!-- ================================================================ -->
</div>
<a id="Regularizing_ragged_CSV"/><h1>Regularizing ragged CSV</h1>
<button style="font-weight:bold;color:maroon;border:0" padding=0 onclick="toggle_by_name('section_toggle_rectangularizing_ragged_csv');" href="javascript:;">Toggle section visibility</button>
<div id="section_toggle_rectangularizing_ragged_csv" style="display: block">
<p/>Miller handles compliant CSV: in particular, it&rsquo;s an error if the
number of data fields in a given data line doesn&rsquo;t match the number of
fields in the header line. But in the event that you have a CSV file in which
some lines have fewer than the full number of fields, you can use Miller to pad
them out. The trick is to use NIDX format, for which each line stands on its
own without respect to a header line.
<p/>
<div class="pokipanel">
<pre>
$ cat data/ragged.csv
a,b,c
1,2,3
4,5
6
7,8,9
</pre>
</div>
<p/>
<p/>
<div class="pokipanel">
<pre>
$ mlr --from data/ragged.csv --fs comma --nidx put '
@maxnf = max(@maxnf, NF);
@nf = NF;
while(@nf &lt; @maxnf) {
@nf += 1;
$[@nf] = ""
}
'
a,b,c
1,2,3
4,5,
6,,
7,8,9
</pre>
</div>
<p/>
or, more simply,
<p/>
<div class="pokipanel">
<pre>
$ mlr --from data/ragged.csv --fs comma --nidx put '
@maxnf = max(@maxnf, NF);
while(NF &lt; @maxnf) {
$[NF+1] = "";
}
'
a,b,c
1,2,3
4,5,
6,,
7,8,9
</pre>
</div>
<p/>
<!-- ================================================================ -->
</div>
<a id="Feature-counting"/><h1>Feature-counting</h1>
<button style="font-weight:bold;color:maroon;border:0" padding=0 onclick="toggle_by_name('section_toggle_feature_counting');" href="javascript:;">Toggle section visibility</button>
<div id="section_toggle_feature_counting" style="display: block">
<p/>Suppose you have some heterogeneous data like this:
<p/>
<div class="pokipanel">
<pre>
{ "qoh": 29874, "rate": 1.68, "latency": 0.02 }
{ "name": "alice", "uid": 572 }
{ "qoh": 1227, "rate": 1.01, "latency": 0.07 }
{ "qoh": 13458, "rate": 1.72, "latency": 0.04 }
{ "qoh": 56782, "rate": 1.64 }
{ "qoh": 23512, "rate": 1.71, "latency": 0.03 }
{ "qoh": 9876, "rate": 1.89, "latency": 0.08 }
{ "name": "bill", "uid": 684 }
{ "name": "chuck", "uid2": 908 }
{ "name": "dottie", "uid": 440 }
{ "qoh": 0, "rate": 0.40, "latency": 0.01 }
{ "qoh": 5438, "rate": 1.56, "latency": 0.17 }
</pre>
</div>
<p/>
<p/> A reasonable question to ask is, how many occurrences of each field are
there? And, what percentage of total row count has each of them? Since the
denominator of the percentage is not known until the end, this is a two-pass
algorithm:
<p/>
<div class="pokipanel">
<pre>
for (key in $*) {
@key_counts[key] += 1;
}
@record_count += 1;
end {
for (key in @key_counts) {
@key_fraction[key] = @key_counts[key] / @record_count
}
emit @record_count;
emit @key_counts, "key";
emit @key_fraction,"key"
}
</pre>
</div>
<p/>
<p/> Then, with that DSL code saved in <tt>data/feature-count.mlr</tt>:
<p/>
<div class="pokipanel">
<pre>
$ mlr --json put -q -f data/feature-count.mlr data/features.json
{ "record_count": 12 }
{ "key": "qoh", "key_counts": 8 }
{ "key": "rate", "key_counts": 8 }
{ "key": "latency", "key_counts": 7 }
{ "key": "name", "key_counts": 4 }
{ "key": "uid", "key_counts": 3 }
{ "key": "uid2", "key_counts": 1 }
{ "key": "qoh", "key_fraction": 0.666667 }
{ "key": "rate", "key_fraction": 0.666667 }
{ "key": "latency", "key_fraction": 0.583333 }
{ "key": "name", "key_fraction": 0.333333 }
{ "key": "uid", "key_fraction": 0.250000 }
{ "key": "uid2", "key_fraction": 0.083333 }
</pre>
</div>
<p/>
<p/>
<div class="pokipanel">
<pre>
$ mlr --ijson --opprint put -q -f data/feature-count.mlr data/features.json
record_count
12
key key_counts
qoh 8
rate 8
latency 7
name 4
uid 3
uid2 1
key key_fraction
qoh 0.666667
rate 0.666667
latency 0.583333
name 0.333333
uid 0.250000
uid2 0.083333
</pre>
</div>
<p/>
<!-- ================================================================ -->
</div>
<a id="Unsparsing"/><h1>Unsparsing</h1>
<button style="font-weight:bold;color:maroon;border:0" padding=0 onclick="toggle_by_name('section_toggle_unsparsing');" href="javascript:;">Toggle section visibility</button>
<div id="section_toggle_unsparsing" style="display: block">
<p/> The <a href="#Regularizing_ragged_CSV">Regularizing ragged CSV</a> section
above discussed how to fill out missing data fields within CSV having a full
header line &mdash; so the list of all field names is present within the header
line. Next, let&rsquo;s look at a related problem: we have data where each
record has various key names but we want to produce rectangular output having
the union of all key names.
<p/> For example, suppose you have JSON input like this:
<p/>
<div class="pokipanel">
<pre>
$ cat data/sparse.json
{"a":1,"b":2,"v":3}
{"u":1,"b":2}
{"a":1,"v":2,"x":3}
{"v":1,"w":2}
</pre>
</div>
<p/>
<p/>There are field names <tt>a</tt>, <tt>b</tt>, <tt>v</tt>, <tt>u</tt>,
<tt>x</tt>, <tt>w</tt> in the data &mdash; but not all in every record. Since
we don&rsquo;t know the names of all the keys until we&rsquo;ve read them all,
this needs to be a two-pass algorithm. On the first pass, remember all the
unique key names and all the records; on the second pass, loop through the
records filling in absent values, then producing output. Use <tt>put -q</tt>
since we don&rsquo;t want to produce per-record output, only emitting output in
the <tt>end</tt> block:
<p/>
<div class="pokipanel">
<pre>
$ cat data/unsparsify.mlr
# First pass:
# Remember all unique key names:
for (k in $*) {
@all_keys[k] = 1;
}
# Remember all input records:
@records[NR] = $*;
# Second pass:
end {
for (nr in @records) {
# Get the sparsely keyed input record:
irecord = @records[nr];
# Fill in missing keys with empty string:
map orecord = {};
for (k in @all_keys) {
if (haskey(irecord, k)) {
orecord[k] = irecord[k];
} else {
orecord[k] = "";
}
}
# Produce the output:
emit orecord;
}
}
</pre>
</div>
<p/>
<p/>
<div class="pokipanel">
<pre>
$ mlr --json put -q -f data/unsparsify.mlr data/sparse.json
{ "a": 1, "b": 2, "v": 3, "u": "", "x": "", "w": "" }
{ "a": "", "b": 2, "v": "", "u": 1, "x": "", "w": "" }
{ "a": 1, "b": "", "v": 2, "u": "", "x": 3, "w": "" }
{ "a": "", "b": "", "v": 1, "u": "", "x": "", "w": 2 }
</pre>
</div>
<p/>
<p/>
<div class="pokipanel">
<pre>
$ mlr --ijson --ocsv put -q -f data/unsparsify.mlr data/sparse.json
a,b,v,u,x,w
1,2,3,,,
,2,,1,,
1,,2,,3,
,,1,,,2
</pre>
</div>
<p/>
<p/>
<div class="pokipanel">
<pre>
$ mlr --ijson --opprint put -q -f data/unsparsify.mlr data/sparse.json
a b v u x w
1 2 3 - - -
- 2 - 1 - -
1 - 2 - 3 -
- - 1 - - 2
</pre>
</div>
<p/>
<p/> There is a keystroke-saving verb for this: <a href="reference-verbs.html#unsparsify"><tt>mlr unsparsify</tt></a>.
</div>
<!-- ================================================================ -->
<a id="Parsing_log-file_output"/><h1>Parsing log-file output</h1>
<button style="font-weight:bold;color:maroon;border:0" padding=0 onclick="toggle_by_name('section_toggle_parsing_log_file_output');" href="javascript:;">Toggle section visibility</button>
<div id="section_toggle_parsing_log_file_output" style="display: block">
<p/>This, of course, depends highly on what&rsquo;s in your log files. But, as
an example, suppose you have log-file lines such as
<p/>
<div class="pokipanel">
<pre>
2015-10-08 08:29:09,445 INFO com.company.path.to.ClassName @ [sometext] various/sorts/of data {&amp; punctuation} hits=1 status=0 time=2.378
</pre>
</div>
<p/>
I prefer to pre-filter with <tt>grep</tt> and/or <tt>sed</tt> to extract the structured text, then hand that to Miller. Example:
<p/>
<div class="pokipanel">
<pre>
grep 'various/sorts' *.log | sed 's/.*} //' | mlr --fs space --repifs --oxtab stats1 -a min,p10,p50,p90,max -f time -g status
</pre>
</div>
<p/>
</div>
<!-- ================================================================ -->
<a id="Memoization_with_out-of-stream_variables"/><h1>Memoization with out-of-stream variables</h1>
<button style="font-weight:bold;color:maroon;border:0" padding=0 onclick="toggle_by_name('section_toggle_memoization_with_oosvars');" href="javascript:;">Toggle section visibility</button>
<div id="section_toggle_memoization_with_oosvars" style="display: block">
<p/> The recursive function for the Fibonacci sequence is famous for its computational complexity.
Namely, using
<i>f</i>(0)=1,
<i>f</i>(1)=1,
<i>f</i>(<i>n</i>)=<i>f</i>(<i>n</i>-1)+<i>f</i>(<i>n</i>-2) for <i>n</i>&ge;2,
the evaluation tree branches left as well as right at each non-trivial level, resulting in millions
or more paths to the leaf 0/1 nodes for larger <i>n</i>. This program
<p/>
<div class="pokipanel">
<pre>
mlr --ofmt '%.9lf' --opprint seqgen --start 1 --stop 28 then put '
func f(n) {
@fcount += 1; # count number of calls to the function
if (n &lt; 2) {
return 1
} else {
return f(n-1) + f(n-2) # recurse
}
}
@fcount = 0;
$o = f($i);
$fcount = @fcount;
' then put '$seconds=systime()' then step -a delta -f seconds then cut -x -f seconds
</pre>
</div>
<p/>
<p/> produces output like this:
<p/>
<div class="pokipanel"><pre>
i o fcount seconds_delta
1 1 1 0
2 2 3 0.000039101
3 3 5 0.000015974
4 5 9 0.000019073
5 8 15 0.000026941
6 13 25 0.000036955
7 21 41 0.000056028
8 34 67 0.000086069
9 55 109 0.000134945
10 89 177 0.000217915
11 144 287 0.000355959
12 233 465 0.000506163
13 377 753 0.000811815
14 610 1219 0.001297235
15 987 1973 0.001960993
16 1597 3193 0.003417969
17 2584 5167 0.006215811
18 4181 8361 0.008294106
19 6765 13529 0.012095928
20 10946 21891 0.019592047
21 17711 35421 0.031193972
22 28657 57313 0.057254076
23 46368 92735 0.080307961
24 75025 150049 0.129482031
25 121393 242785 0.213325977
26 196418 392835 0.334423065
27 317811 635621 0.605969906
28 514229 1028457 0.971235037
</pre></div>
<p/> Note that the time it takes to evaluate the function is blowing up exponentially as the input argument
increases. Using <tt>@</tt>-variables, which persist across records, we can cache and reuse the results
of previous computations:
<p/>
<div class="pokipanel">
<pre>
mlr --ofmt '%.9lf' --opprint seqgen --start 1 --stop 28 then put '
func f(n) {
@fcount += 1; # count number of calls to the function
if (is_present(@fcache[n])) { # cache hit
return @fcache[n]
} else { # cache miss
num rv = 1;
if (n &gt;= 2) {
rv = f(n-1) + f(n-2) # recurse
}
@fcache[n] = rv;
return rv
}
}
@fcount = 0;
$o = f($i);
$fcount = @fcount;
' then put '$seconds=systime()' then step -a delta -f seconds then cut -x -f seconds
</pre>
</div>
<p/>
<p/> with output like this:
<p/>
<div class="pokipanel"><pre>
i o fcount seconds_delta
1 1 1 0
2 2 3 0.000053883
3 3 3 0.000035048
4 5 3 0.000045061
5 8 3 0.000014067
6 13 3 0.000028849
7 21 3 0.000028133
8 34 3 0.000027895
9 55 3 0.000014067
10 89 3 0.000015020
11 144 3 0.000012875
12 233 3 0.000033140
13 377 3 0.000014067
14 610 3 0.000012875
15 987 3 0.000029087
16 1597 3 0.000013828
17 2584 3 0.000013113
18 4181 3 0.000012875
19 6765 3 0.000013113
20 10946 3 0.000012875
21 17711 3 0.000013113
22 28657 3 0.000013113
23 46368 3 0.000015974
24 75025 3 0.000012875
25 121393 3 0.000013113
26 196418 3 0.000012875
27 317811 3 0.000013113
28 514229 3 0.000012875
</pre></div>
</div>
</div>
</td>
</table>
</body>
</html>