mirror of
https://github.com/johnkerl/miller.git
synced 2026-01-24 02:36:15 +00:00
328 lines
No EOL
15 KiB
HTML
328 lines
No EOL
15 KiB
HTML
|
||
<!DOCTYPE html>
|
||
|
||
<html>
|
||
<head>
|
||
<meta charset="utf-8" />
|
||
<meta name="viewport" content="width=device-width, initial-scale=1.0" />
|
||
<title>Miscellaneous examples — Miller 6.0.0-alpha documentation</title>
|
||
|
||
<link rel="stylesheet" href="_static/scrolls.css" type="text/css" />
|
||
<link rel="stylesheet" href="_static/pygments.css" type="text/css" />
|
||
<link rel="stylesheet" href="_static/print.css" type="text/css" />
|
||
|
||
<script id="documentation_options" data-url_root="./" src="_static/documentation_options.js"></script>
|
||
<script src="_static/jquery.js"></script>
|
||
<script src="_static/underscore.js"></script>
|
||
<script src="_static/doctools.js"></script>
|
||
<script src="_static/language_data.js"></script>
|
||
<script src="_static/theme_extras.js"></script>
|
||
<link rel="index" title="Index" href="genindex.html" />
|
||
<link rel="search" title="Search" href="search.html" />
|
||
<link rel="next" title="Why?" href="why.html" />
|
||
<link rel="prev" title="Programming-language examples" href="programming-examples.html" />
|
||
</head><body>
|
||
<div id="content">
|
||
<div class="header">
|
||
<h1 class="heading"><a href="index.html"
|
||
title="back to the documentation overview"><span>Miscellaneous examples</span></a></h1>
|
||
</div>
|
||
<div class="relnav" role="navigation" aria-label="related navigation">
|
||
<a href="programming-examples.html">« Programming-language examples</a> |
|
||
<a href="#">Miscellaneous examples</a>
|
||
| <a href="why.html">Why? »</a>
|
||
</div>
|
||
<div id="contentwrapper">
|
||
<div id="toc" role="navigation" aria-label="table of contents navigation">
|
||
<h3>Table of Contents</h3>
|
||
<ul>
|
||
<li><a class="reference internal" href="#">Miscellaneous examples</a><ul>
|
||
<li><a class="reference internal" href="#program-timing">Program timing</a></li>
|
||
<li><a class="reference internal" href="#showing-differences-between-successive-queries">Showing differences between successive queries</a></li>
|
||
<li><a class="reference internal" href="#memoization-with-out-of-stream-variables">Memoization with out-of-stream variables</a></li>
|
||
</ul>
|
||
</li>
|
||
</ul>
|
||
|
||
</div>
|
||
<div role="main">
|
||
|
||
<div class="section" id="miscellaneous-examples">
|
||
<h1>Miscellaneous examples<a class="headerlink" href="#miscellaneous-examples" title="Permalink to this headline">¶</a></h1>
|
||
<p>Column select:</p>
|
||
<div class="highlight-none notranslate"><div class="highlight"><pre><span></span><span class="hll"> mlr --csv cut -f hostname,uptime mydata.csv
|
||
</span></pre></div>
|
||
</div>
|
||
<p>Add new columns as function of other columns:</p>
|
||
<div class="highlight-none notranslate"><div class="highlight"><pre><span></span><span class="hll"> mlr --nidx put '$sum = $7 &lt; 0.0 ? 3.5 : $7 + 2.1*$8' *.dat
|
||
</span></pre></div>
|
||
</div>
|
||
<p>Row filter:</p>
|
||
<div class="highlight-none notranslate"><div class="highlight"><pre><span></span><span class="hll"> mlr --csv filter '$status != "down" && $upsec >= 10000' *.csv
|
||
</span></pre></div>
|
||
</div>
|
||
<p>Apply column labels and pretty-print:</p>
|
||
<div class="highlight-none notranslate"><div class="highlight"><pre><span></span><span class="hll"> grep -v '^#' /etc/group | mlr --ifs : --nidx --opprint label group,pass,gid,member then sort -f group
|
||
</span></pre></div>
|
||
</div>
|
||
<p>Join multiple data sources on key columns:</p>
|
||
<div class="highlight-none notranslate"><div class="highlight"><pre><span></span><span class="hll"> mlr join -j account_id -f accounts.dat then group-by account_name balances.dat
|
||
</span></pre></div>
|
||
</div>
|
||
<p>Multiple formats including JSON:</p>
|
||
<div class="highlight-none notranslate"><div class="highlight"><pre><span></span><span class="hll"> mlr --json put '$attr = sub($attr, "([0-9]+)_([0-9]+)_.*", "\1:\2")' data/*.json
|
||
</span></pre></div>
|
||
</div>
|
||
<p>Aggregate per-column statistics:</p>
|
||
<div class="highlight-none notranslate"><div class="highlight"><pre><span></span><span class="hll"> mlr stats1 -a min,mean,max,p10,p50,p90 -f flag,u,v data/*
|
||
</span></pre></div>
|
||
</div>
|
||
<p>Linear regression:</p>
|
||
<div class="highlight-none notranslate"><div class="highlight"><pre><span></span><span class="hll"> mlr stats2 -a linreg-pca -f u,v -g shape data/*
|
||
</span></pre></div>
|
||
</div>
|
||
<p>Aggregate custom per-column statistics:</p>
|
||
<div class="highlight-none notranslate"><div class="highlight"><pre><span></span><span class="hll"> mlr put -q '@sum[$a][$b] += $x; end {emit @sum, "a", "b"}' data/*
|
||
</span></pre></div>
|
||
</div>
|
||
<p>Iterate over data using DSL expressions:</p>
|
||
<div class="highlight-none notranslate"><div class="highlight"><pre><span></span><span class="hll"> mlr --from estimates.tbl put '
|
||
</span> for (k,v in $*) {
|
||
if (is_numeric(v) && k =~ "^[t-z].*$") {
|
||
$sum += v; $count += 1
|
||
}
|
||
}
|
||
$mean = $sum / $count # no assignment if count unset
|
||
'
|
||
</pre></div>
|
||
</div>
|
||
<p>Run DSL expressions from a script file:</p>
|
||
<div class="highlight-none notranslate"><div class="highlight"><pre><span></span><span class="hll"> mlr --from infile.dat put -f analyze.mlr
|
||
</span></pre></div>
|
||
</div>
|
||
<p>Split/reduce output to multiple filenames:</p>
|
||
<div class="highlight-none notranslate"><div class="highlight"><pre><span></span><span class="hll"> mlr --from infile.dat put 'tee > "./taps/data-".$a."-".$b, $*'
|
||
</span></pre></div>
|
||
</div>
|
||
<p>Compressed I/O:</p>
|
||
<div class="highlight-none notranslate"><div class="highlight"><pre><span></span><span class="hll"> mlr --from infile.dat put 'tee | "gzip > ./taps/data-".$a."-".$b.".gz", $*'
|
||
</span></pre></div>
|
||
</div>
|
||
<p>Interoperate with other data-processing tools using standard pipes:</p>
|
||
<div class="highlight-none notranslate"><div class="highlight"><pre><span></span><span class="hll"> mlr --from infile.dat put -q '@v=$*; dump | "jq .[]"'
|
||
</span></pre></div>
|
||
</div>
|
||
<p>Tap/trace:</p>
|
||
<div class="highlight-none notranslate"><div class="highlight"><pre><span></span><span class="hll"> mlr --from infile.dat put '(NR % 1000 == 0) { print > stderr, "Checkpoint ".NR}'
|
||
</span></pre></div>
|
||
</div>
|
||
<div class="section" id="program-timing">
|
||
<h2>Program timing<a class="headerlink" href="#program-timing" title="Permalink to this headline">¶</a></h2>
|
||
<p>This admittedly artificial example demonstrates using Miller time and stats functions to introspectively acquire some information about Miller’s own runtime. The <code class="docutils literal notranslate"><span class="pre">delta</span></code> function computes the difference between successive timestamps.</p>
|
||
<div class="highlight-none notranslate"><div class="highlight"><pre><span></span>$ ruby -e '10000.times{|i|puts "i=#{i+1}"}' > lines.txt
|
||
|
||
$ head -n 5 lines.txt
|
||
i=1
|
||
i=2
|
||
i=3
|
||
i=4
|
||
i=5
|
||
|
||
mlr --ofmt '%.9le' --opprint put '$t=systime()' then step -a delta -f t lines.txt | head -n 7
|
||
i t t_delta
|
||
1 1430603027.018016 1.430603027e+09
|
||
2 1430603027.018043 2.694129944e-05
|
||
3 1430603027.018048 5.006790161e-06
|
||
4 1430603027.018052 4.053115845e-06
|
||
5 1430603027.018055 2.861022949e-06
|
||
6 1430603027.018058 3.099441528e-06
|
||
|
||
mlr --ofmt '%.9le' --oxtab \
|
||
put '$t=systime()' then \
|
||
step -a delta -f t then \
|
||
filter '$i>1' then \
|
||
stats1 -a min,mean,max -f t_delta \
|
||
lines.txt
|
||
t_delta_min 2.861022949e-06
|
||
t_delta_mean 4.077508505e-06
|
||
t_delta_max 5.388259888e-05
|
||
</pre></div>
|
||
</div>
|
||
</div>
|
||
<div class="section" id="showing-differences-between-successive-queries">
|
||
<h2>Showing differences between successive queries<a class="headerlink" href="#showing-differences-between-successive-queries" title="Permalink to this headline">¶</a></h2>
|
||
<p>Suppose you have a database query which you run at one point in time, producing the first output shown below, then again later, producing the second output shown below:</p>
|
||
<div class="highlight-none notranslate"><div class="highlight"><pre><span></span><span class="hll"> cat data/previous_counters.csv
|
||
</span> color,count
|
||
red,3472
|
||
blue,6838
|
||
orange,694
|
||
purple,12
|
||
</pre></div>
|
||
</div>
|
||
<div class="highlight-none notranslate"><div class="highlight"><pre><span></span><span class="hll"> cat data/current_counters.csv
|
||
</span> color,count
|
||
red,3467
|
||
orange,670
|
||
yellow,27
|
||
blue,6944
|
||
</pre></div>
|
||
</div>
|
||
<p>And, suppose you want to compute the differences in the counters between adjacent keys. Since the color names aren’t all in the same order, nor are they all present on both sides, we can’t just paste the two files side-by-side and do some column-four-minus-column-two arithmetic.</p>
|
||
<p>First, rename counter columns to make them distinct:</p>
|
||
<div class="highlight-none notranslate"><div class="highlight"><pre><span></span><span class="hll"> mlr --csv rename count,previous_count data/previous_counters.csv > data/prevtemp.csv
|
||
</span></pre></div>
|
||
</div>
|
||
<div class="highlight-none notranslate"><div class="highlight"><pre><span></span><span class="hll"> cat data/prevtemp.csv
|
||
</span> color,previous_count
|
||
red,3472
|
||
blue,6838
|
||
orange,694
|
||
purple,12
|
||
</pre></div>
|
||
</div>
|
||
<div class="highlight-none notranslate"><div class="highlight"><pre><span></span><span class="hll"> mlr --csv rename count,current_count data/current_counters.csv > data/currtemp.csv
|
||
</span></pre></div>
|
||
</div>
|
||
<div class="highlight-none notranslate"><div class="highlight"><pre><span></span><span class="hll"> cat data/currtemp.csv
|
||
</span> color,current_count
|
||
red,3467
|
||
orange,670
|
||
yellow,27
|
||
blue,6944
|
||
</pre></div>
|
||
</div>
|
||
<p>Then, join on the key field(s), and use unsparsify to zero-fill counters absent on one side but present on the other. Use <code class="docutils literal notranslate"><span class="pre">--ul</span></code> and <code class="docutils literal notranslate"><span class="pre">--ur</span></code> to emit unpaired records (namely, purple on the left and yellow on the right):</p>
|
||
<div class="highlight-none notranslate"><div class="highlight"><pre><span></span><span class="hll"> mlr --icsv --opprint \
|
||
</span><span class="hll"> join -j color --ul --ur -f data/prevtemp.csv \
|
||
</span><span class="hll"> then unsparsify --fill-with 0 \
|
||
</span><span class="hll"> then put '$count_delta = $current_count - $previous_count' \
|
||
</span><span class="hll"> data/currtemp.csv
|
||
</span> color previous_count current_count count_delta
|
||
red 3472 3467 -5
|
||
orange 694 670 -24
|
||
yellow 0 27 (error)
|
||
blue 6838 6944 106
|
||
purple 12 0 (error)
|
||
</pre></div>
|
||
</div>
|
||
</div>
|
||
<div class="section" id="memoization-with-out-of-stream-variables">
|
||
<span id="cookbook-memoization-with-oosvars"></span><h2>Memoization with out-of-stream variables<a class="headerlink" href="#memoization-with-out-of-stream-variables" title="Permalink to this headline">¶</a></h2>
|
||
<p>The recursive function for the Fibonacci sequence is famous for its computational complexity. Namely, using f(0)=1, f(1)=1, f(n)=f(n-1)+f(n-2) for n>=2, the evaluation tree branches left as well as right at each non-trivial level, resulting in millions or more paths to the root 0/1 nodes for larger n. This program</p>
|
||
<div class="highlight-none notranslate"><div class="highlight"><pre><span></span>mlr --ofmt '%.9lf' --opprint seqgen --start 1 --stop 28 then put '
|
||
func f(n) {
|
||
@fcount += 1; # count number of calls to the function
|
||
if (n &lt; 2) {
|
||
return 1
|
||
} else {
|
||
return f(n-1) + f(n-2) # recurse
|
||
}
|
||
}
|
||
|
||
@fcount = 0;
|
||
$o = f($i);
|
||
$fcount = @fcount;
|
||
|
||
' then put '$seconds=systime()' then step -a delta -f seconds then cut -x -f seconds
|
||
</pre></div>
|
||
</div>
|
||
<p>produces output like this:</p>
|
||
<div class="highlight-none notranslate"><div class="highlight"><pre><span></span>i o fcount seconds_delta
|
||
1 1 1 0
|
||
2 2 3 0.000039101
|
||
3 3 5 0.000015974
|
||
4 5 9 0.000019073
|
||
5 8 15 0.000026941
|
||
6 13 25 0.000036955
|
||
7 21 41 0.000056028
|
||
8 34 67 0.000086069
|
||
9 55 109 0.000134945
|
||
10 89 177 0.000217915
|
||
11 144 287 0.000355959
|
||
12 233 465 0.000506163
|
||
13 377 753 0.000811815
|
||
14 610 1219 0.001297235
|
||
15 987 1973 0.001960993
|
||
16 1597 3193 0.003417969
|
||
17 2584 5167 0.006215811
|
||
18 4181 8361 0.008294106
|
||
19 6765 13529 0.012095928
|
||
20 10946 21891 0.019592047
|
||
21 17711 35421 0.031193972
|
||
22 28657 57313 0.057254076
|
||
23 46368 92735 0.080307961
|
||
24 75025 150049 0.129482031
|
||
25 121393 242785 0.213325977
|
||
26 196418 392835 0.334423065
|
||
27 317811 635621 0.605969906
|
||
28 514229 1028457 0.971235037
|
||
</pre></div>
|
||
</div>
|
||
<p>Note that the time it takes to evaluate the function is blowing up exponentially as the input argument increases. Using <code class="docutils literal notranslate"><span class="pre">@</span></code>-variables, which persist across records, we can cache and reuse the results of previous computations:</p>
|
||
<div class="highlight-none notranslate"><div class="highlight"><pre><span></span>mlr --ofmt '%.9lf' --opprint seqgen --start 1 --stop 28 then put '
|
||
func f(n) {
|
||
@fcount += 1; # count number of calls to the function
|
||
if (is_present(@fcache[n])) { # cache hit
|
||
return @fcache[n]
|
||
} else { # cache miss
|
||
num rv = 1;
|
||
if (n >= 2) {
|
||
rv = f(n-1) + f(n-2) # recurse
|
||
}
|
||
@fcache[n] = rv;
|
||
return rv
|
||
}
|
||
}
|
||
@fcount = 0;
|
||
$o = f($i);
|
||
$fcount = @fcount;
|
||
' then put '$seconds=systime()' then step -a delta -f seconds then cut -x -f seconds
|
||
</pre></div>
|
||
</div>
|
||
<p>with output like this:</p>
|
||
<div class="highlight-none notranslate"><div class="highlight"><pre><span></span>i o fcount seconds_delta
|
||
1 1 1 0
|
||
2 2 3 0.000053883
|
||
3 3 3 0.000035048
|
||
4 5 3 0.000045061
|
||
5 8 3 0.000014067
|
||
6 13 3 0.000028849
|
||
7 21 3 0.000028133
|
||
8 34 3 0.000027895
|
||
9 55 3 0.000014067
|
||
10 89 3 0.000015020
|
||
11 144 3 0.000012875
|
||
12 233 3 0.000033140
|
||
13 377 3 0.000014067
|
||
14 610 3 0.000012875
|
||
15 987 3 0.000029087
|
||
16 1597 3 0.000013828
|
||
17 2584 3 0.000013113
|
||
18 4181 3 0.000012875
|
||
19 6765 3 0.000013113
|
||
20 10946 3 0.000012875
|
||
21 17711 3 0.000013113
|
||
22 28657 3 0.000013113
|
||
23 46368 3 0.000015974
|
||
24 75025 3 0.000012875
|
||
25 121393 3 0.000013113
|
||
26 196418 3 0.000012875
|
||
27 317811 3 0.000013113
|
||
28 514229 3 0.000012875
|
||
</pre></div>
|
||
</div>
|
||
</div>
|
||
</div>
|
||
|
||
|
||
</div>
|
||
</div>
|
||
</div>
|
||
|
||
<div class="footer" role="contentinfo">
|
||
© Copyright 2021, John Kerl.
|
||
Created using <a href="https://www.sphinx-doc.org/">Sphinx</a> 3.2.1.
|
||
</div>
|
||
</body>
|
||
</html> |