mirror of
https://github.com/johnkerl/miller.git
synced 2026-01-23 18:25:45 +00:00
356 lines
13 KiB
HTML
356 lines
13 KiB
HTML
<!DOCTYPE html>
|
|
<html lang="en">
|
|
|
|
<!-- PAGE GENERATED FROM template.html and content-for-data-examples.html BY poki. -->
|
|
<!-- PLEASE MAKE CHANGES THERE AND THEN RE-RUN poki. -->
|
|
<head>
|
|
<meta http-equiv="Content-type" content="text/html;charset=UTF-8"/>
|
|
<meta name="description" content="Miller documentation"/>
|
|
<meta name="viewport" content="width=device-width, initial-scale=1.0"/> <!-- mobile-friendly -->
|
|
<meta name="keywords"
|
|
content="John Kerl, Kerl, Miller, miller, mlr, OLAP, data analysis software, regression, correlation, variance, data tools, " />
|
|
|
|
<title> Data-diving examples </title>
|
|
<link rel="stylesheet" type="text/css" href="css/miller.css"/>
|
|
<link rel="stylesheet" type="text/css" href="css/poki-callbacks.css"/>
|
|
</head>
|
|
|
|
<!-- ================================================================ -->
|
|
<script type="text/javascript">
|
|
// Pick the Google Analytics host prefix that matches the protocol this page
// was loaded over.
var gaJsHost;
if (document.location.protocol == "https:") {
  gaJsHost = "https://ssl.";
} else {
  gaJsHost = "http://www.";
}
// Write a script tag for ga.js into the document. The tag text is
// percent-encoded and decoded with unescape(), presumably so that no literal
// closing-script sequence appears inside this inline script.
document.write(
  unescape("%3Cscript src='" + gaJsHost + "google-analytics.com/ga.js' type='text/javascript'%3E%3C/script%3E")
);
|
|
</script>
|
|
<script type="text/javascript">
|
|
// Create a Google Analytics tracker for this site's account id and record a
// page view for the current page.
try {
  var pageTracker = _gat._getTracker("UA-15651652-1");
  pageTracker._trackPageview();
} catch (ignoredError) {
  // Any error raised above (e.g. if _gat is not defined) is swallowed;
  // nothing is reported and the rest of the page is unaffected.
}
|
|
</script>
|
|
<!-- ================================================================ -->
|
|
|
|
<body bgcolor="#ffffff">
|
|
|
|
<!-- ================================================================ -->
|
|
|
|
<!-- navbar -->
|
|
<div class="pokinav">
|
|
<center><titleinbody>Miller</titleinbody></center>
|
|
|
|
<!-- NAVBAR GENERATED FROM template.html BY poki -->
|
|
<br/>
|
|
<a class="poki-navbar-element" href="index.html">Overview</a>
|
|
|
|
<a class="poki-navbar-element" href="faq.html"><b>Using</b></a>
|
|
|
|
<a class="poki-navbar-element" href="reference.html">Reference</a>
|
|
|
|
<a class="poki-navbar-element" href="why.html">Background</a>
|
|
|
|
<a class="poki-navbar-element" href="contact.html">Repository</a>
|
|
|
|
<br/>
|
|
<br/><a href="faq.html">FAQ</a>
|
|
<br/><a href="customization.html">Customization: .mlrrc</a>
|
|
<br/><a href="data-sharing.html">Mixing with other languages</a>
|
|
<br/><a href="cookbook.html">Cookbook part 1</a>
|
|
<br/><a href="cookbook2.html">Cookbook part 2</a>
|
|
<br/><a href="cookbook3.html">Cookbook part 3</a>
|
|
<br/><a href="data-examples.html"><b>Data-diving examples</b></a>
|
|
</div>
|
|
|
|
<!-- page body -->
|
|
<p/>
|
|
|
|
<!-- BODY COPIED FROM content-for-data-examples.html BY poki -->
|
|
<div class="pokitoc">
|
|
<center><titleinbody>Data-diving examples</titleinbody></center>
|
|
• <a href="#flins_data">flins data</a><br/>
|
|
• <a href="#Color/shape_data">Color/shape data</a><br/>
|
|
</div>
|
|
<p/>
|
|
|
|
<p/>
|
|
<button style="font-weight:bold;color:maroon;border:0" onclick="bodyToggler.expandAll();">Expand all sections</button>
|
|
<button style="font-weight:bold;color:maroon;border:0" onclick="bodyToggler.collapseAll();">Collapse all sections</button>
|
|
|
|
<a id="flins_data"></a><h1>flins data</h1>
|
|
<button style="font-weight:bold;color:maroon;border:0" onclick="bodyToggler.toggle('body_section_toggle_flins');">Toggle section visibility</button>
|
|
<div id="body_section_toggle_flins" style="display: block">
|
|
|
|
<p/> The <a href="data/flins.csv">flins.csv</a> file is some sample data
|
|
obtained from <a href="https://support.spatialkey.com/spatialkey-sample-csv-data">https://support.spatialkey.com/spatialkey-sample-csv-data</a>.
|
|
|
|
<p/>Vertical-tabular format is good for a quick look at CSV data layout — seeing what columns you have to work with:
|
|
<p/>
|
|
<div class="pokipanel">
|
|
<pre>
|
|
$ head -n 2 data/flins.csv | mlr --icsv --oxtab cat
|
|
county Seminole
|
|
tiv_2011 22890.55
|
|
tiv_2012 20848.71
|
|
line Residential
|
|
</pre>
|
|
</div>
|
|
<p/>
|
|
<p/> A few simple queries:
|
|
<p/>
|
|
<div class="pokipanel">
|
|
<pre>
|
|
$ mlr --from data/flins.csv --icsv --opprint count-distinct -f county | head
|
|
county count
|
|
Seminole 1
|
|
Miami Dade 2
|
|
Palm Beach 1
|
|
Highlands 2
|
|
Duval 1
|
|
St. Johns 1
|
|
</pre>
|
|
</div>
|
|
<p/>
|
|
<p/>
|
|
<div class="pokipanel">
|
|
<pre>
|
|
$ mlr --from data/flins.csv --icsv --opprint count-distinct -f construction,line
|
|
</pre>
|
|
</div>
|
|
<p/>
|
|
<p/> Categorization of total insured value:
|
|
<p/>
|
|
<div class="pokipanel">
|
|
<pre>
|
|
$ mlr --from data/flins.csv --icsv --opprint stats1 -a min,mean,max -f tiv_2012
|
|
tiv_2012_min tiv_2012_mean tiv_2012_max
|
|
19757.910000 1061531.463750 2785551.630000
|
|
</pre>
|
|
</div>
|
|
<p/>
|
|
<p/>
|
|
<div class="pokipanel">
|
|
<pre>
|
|
$ mlr --from data/flins.csv --icsv --opprint stats1 -a min,mean,max -f tiv_2012 -g construction,line
|
|
</pre>
|
|
</div>
|
|
<p/>
|
|
|
|
<p/>
|
|
<div class="pokipanel">
|
|
<pre>
|
|
$ mlr --from data/flins.csv --icsv --oxtab stats1 -a p0,p10,p50,p90,p95,p99,p100 -f hu_site_deductible
|
|
hu_site_deductible_p0
|
|
hu_site_deductible_p10
|
|
hu_site_deductible_p50
|
|
hu_site_deductible_p90
|
|
hu_site_deductible_p95
|
|
hu_site_deductible_p99
|
|
hu_site_deductible_p100
|
|
</pre>
|
|
</div>
|
|
<p/>
|
|
<p/>
|
|
<div class="pokipanel">
|
|
<pre>
|
|
$ mlr --from data/flins.csv --icsv --opprint stats1 -a p95,p99,p100 -f hu_site_deductible -g county then sort -f county | head
|
|
county hu_site_deductible_p95 hu_site_deductible_p99 hu_site_deductible_p100
|
|
Duval - - -
|
|
Highlands - - -
|
|
Miami Dade - - -
|
|
Palm Beach - - -
|
|
Seminole - - -
|
|
St. Johns - - -
|
|
</pre>
|
|
</div>
|
|
<p/>
|
|
<p/>
|
|
<div class="pokipanel">
|
|
<pre>
|
|
$ mlr --from data/flins.csv --icsv --oxtab stats2 -a corr,linreg-ols,r2 -f tiv_2011,tiv_2012
|
|
tiv_2011_tiv_2012_corr 0.935363
|
|
tiv_2011_tiv_2012_ols_m 1.089091
|
|
tiv_2011_tiv_2012_ols_b 103095.523356
|
|
tiv_2011_tiv_2012_ols_n 8
|
|
tiv_2011_tiv_2012_r2 0.874904
|
|
</pre>
|
|
</div>
|
|
<p/>
|
|
<p/>
|
|
<div class="pokipanel">
|
|
<pre>
|
|
$ mlr --from data/flins.csv --icsv --opprint stats2 -a corr,linreg-ols,r2 -f tiv_2011,tiv_2012 -g county
|
|
county tiv_2011_tiv_2012_corr tiv_2011_tiv_2012_ols_m tiv_2011_tiv_2012_ols_b tiv_2011_tiv_2012_ols_n tiv_2011_tiv_2012_r2
|
|
Seminole - - - 1 -
|
|
Miami Dade 1.000000 0.930643 -2311.154328 2 1.000000
|
|
Palm Beach - - - 1 -
|
|
Highlands 1.000000 1.055693 -4529.793939 2 1.000000
|
|
Duval - - - 1 -
|
|
St. Johns - - - 1 -
|
|
</pre>
|
|
</div>
|
|
<p/>
|
|
|
|
</div>
|
|
<a id="Color/shape_data"></a><h1>Color/shape data</h1>
|
|
<button style="font-weight:bold;color:maroon;border:0" onclick="bodyToggler.toggle('body_section_toggle_color_shape');">Toggle section visibility</button>
|
|
<div id="body_section_toggle_color_shape" style="display: block">
|
|
|
|
<p/> The <a href="data/colored-shapes.dkvp">colored-shapes.dkvp</a> file is some sample data produced by the
|
|
<a href="https://github.com/johnkerl/miller/blob/master/doc/datagen/mkdat2">mkdat2</a> script. The idea is:
|
|
<ul>
|
|
<li> Produce some data with known distributions and correlations, and verify that Miller recovers those properties empirically.
|
|
<li> Each record is labeled with one of a few colors and one of a few shapes.
|
|
<li> The <code>flag</code> field is 0 or 1, with probability dependent on color.
|
|
<li> The <code>u</code> field is plain uniform on the unit interval.
|
|
<li> The <code>v</code> field is the same, except tightly correlated with <code>u</code> for red circles.
|
|
<li> The <code>w</code> field is autocorrelated for each color/shape pair.
|
|
<li> The <code>x</code> field is boring Gaussian with mean 5 and standard deviation about 1.2, with no dependence on color or shape.
|
|
</ul>
|
|
|
|
<p/> Peek at the data:
|
|
<p/>
|
|
<div class="pokipanel">
|
|
<pre>
|
|
$ wc -l data/colored-shapes.dkvp
|
|
10078 data/colored-shapes.dkvp
|
|
</pre>
|
|
</div>
|
|
<p/>
|
|
<p/>
|
|
<div class="pokipanel">
|
|
<pre>
|
|
$ head -n 6 data/colored-shapes.dkvp | mlr --opprint cat
|
|
color shape flag i u v w x
|
|
yellow triangle 1 11 0.6321695890307647 0.9887207810889004 0.4364983936735774 5.7981881667050565
|
|
red square 1 15 0.21966833570651523 0.001257332190235938 0.7927778364718627 2.944117399716207
|
|
red circle 1 16 0.20901671281497636 0.29005231936593445 0.13810280912907674 5.065034003400998
|
|
red square 0 48 0.9562743938458542 0.7467203085342884 0.7755423050923582 7.117831369597269
|
|
purple triangle 0 51 0.4355354501763202 0.8591292672156728 0.8122903963006748 5.753094629505863
|
|
red square 0 64 0.2015510269821953 0.9531098083420033 0.7719912015786777 5.612050466474166
|
|
</pre>
|
|
</div>
|
|
<p/>
|
|
|
|
<p/> Look at uncategorized stats (using <a href="https://github.com/johnkerl/scripts/blob/master/fundam/creach"><code>creach</code></a> for spacing).
|
|
Here it looks reasonable that <code>u</code> is unit-uniform; something’s up with <code>v</code> but we can’t yet see what:
|
|
<p/>
|
|
<div class="pokipanel">
|
|
<pre>
|
|
$ mlr --oxtab stats1 -a min,mean,max -f flag,u,v data/colored-shapes.dkvp | creach 3
|
|
flag_min 0
|
|
flag_mean 0.398889
|
|
flag_max 1
|
|
|
|
u_min 0.000044
|
|
u_mean 0.498326
|
|
u_max 0.999969
|
|
|
|
v_min -0.092709
|
|
v_mean 0.497787
|
|
v_max 1.072500
|
|
</pre>
|
|
</div>
|
|
<p/>
|
|
<p/>The histogram shows the different distribution of 0/1 flags:
|
|
<p/>
|
|
<div class="pokipanel">
|
|
<pre>
|
|
$ mlr --opprint histogram -f flag,u,v --lo -0.1 --hi 1.1 --nbins 12 data/colored-shapes.dkvp
|
|
bin_lo bin_hi flag_count u_count v_count
|
|
-0.100000 0.000000 6058 0 36
|
|
0.000000 0.100000 0 1062 988
|
|
0.100000 0.200000 0 985 1003
|
|
0.200000 0.300000 0 1024 1014
|
|
0.300000 0.400000 0 1002 991
|
|
0.400000 0.500000 0 989 1041
|
|
0.500000 0.600000 0 1001 1016
|
|
0.600000 0.700000 0 972 962
|
|
0.700000 0.800000 0 1035 1070
|
|
0.800000 0.900000 0 995 993
|
|
0.900000 1.000000 4020 1013 939
|
|
1.000000 1.100000 0 0 25
|
|
</pre>
|
|
</div>
|
|
<p/>
|
|
|
|
<p/> Look at univariate stats by color and shape. In particular,
|
|
color-dependent flag probabilities pop out, aligning with their original
|
|
Bernoulli probabilities from the data-generator script:
|
|
|
|
<p/>
|
|
<div class="pokipanel">
|
|
<pre>
|
|
$ mlr --opprint stats1 -a min,mean,max -f flag,u,v -g color then sort -f color data/colored-shapes.dkvp
|
|
color flag_min flag_mean flag_max u_min u_mean u_max v_min v_mean v_max
|
|
blue 0 0.584354 1 0.000044 0.517717 0.999969 0.001489 0.491056 0.999576
|
|
green 0 0.209197 1 0.000488 0.504861 0.999936 0.000501 0.499085 0.999676
|
|
orange 0 0.521452 1 0.001235 0.490532 0.998885 0.002449 0.487764 0.998475
|
|
purple 0 0.090193 1 0.000266 0.494005 0.999647 0.000364 0.497051 0.999975
|
|
red 0 0.303167 1 0.000671 0.492560 0.999882 -0.092709 0.496535 1.072500
|
|
yellow 0 0.892427 1 0.001300 0.497129 0.999923 0.000711 0.510627 0.999919
|
|
</pre>
|
|
</div>
|
|
<p/>
|
|
<p/>
|
|
<div class="pokipanel">
|
|
<pre>
|
|
$ mlr --opprint stats1 -a min,mean,max -f flag,u,v -g shape then sort -f shape data/colored-shapes.dkvp
|
|
shape flag_min flag_mean flag_max u_min u_mean u_max v_min v_mean v_max
|
|
circle 0 0.399846 1 0.000044 0.498555 0.999923 -0.092709 0.495524 1.072500
|
|
square 0 0.396112 1 0.000188 0.499385 0.999969 0.000089 0.496538 0.999975
|
|
triangle 0 0.401542 1 0.000881 0.496859 0.999661 0.000717 0.501050 0.999995
|
|
</pre>
|
|
</div>
|
|
<p/>
|
|
|
|
<p/> Look at bivariate stats by color and shape. In particular, <code>u,v</code> pairwise correlation for red circles pops out:
|
|
<p/>
|
|
<div class="pokipanel">
|
|
<pre>
|
|
$ mlr --opprint --right stats2 -a corr -f u,v,w,x data/colored-shapes.dkvp
|
|
u_v_corr w_x_corr
|
|
0.133418 -0.011320
|
|
</pre>
|
|
</div>
|
|
<p/>
|
|
<p/>
|
|
<div class="pokipanel">
|
|
<pre>
|
|
$ mlr --opprint --right stats2 -a corr -f u,v,w,x -g color,shape then sort -nr u_v_corr data/colored-shapes.dkvp
|
|
color shape u_v_corr w_x_corr
|
|
red circle 0.980798 -0.018565
|
|
orange square 0.176858 -0.071044
|
|
green circle 0.057644 0.011795
|
|
red square 0.055745 -0.000680
|
|
yellow triangle 0.044573 0.024605
|
|
yellow square 0.043792 -0.044623
|
|
purple circle 0.035874 0.134112
|
|
blue square 0.032412 -0.053508
|
|
blue triangle 0.015356 -0.000608
|
|
orange circle 0.010519 -0.162795
|
|
red triangle 0.008098 0.012486
|
|
purple triangle 0.005155 -0.045058
|
|
purple square -0.025680 0.057694
|
|
green square -0.025776 -0.003265
|
|
orange triangle -0.030457 -0.131870
|
|
yellow circle -0.064773 0.073695
|
|
blue circle -0.102348 -0.030529
|
|
green triangle -0.109018 -0.048488
|
|
</pre>
|
|
</div>
|
|
<p/>
|
|
|
|
</div>
|
|
|
|
<!-- ================================================================ -->
|
|
<script type="text/javascript" src="js/miller-doc-toggler.js"></script>
|
|
<!-- wtf -->
|
|
<script type="text/javascript">
|
|
// Keep this script at the bottom of the page: the MillerDocToggler
// constructor scans the document's div tags to find the toggleable sections,
// so it has to run after those divs are present.
//
// The first argument looks like the id prefix shared by the toggleable divs
// (e.g. "body_section_toggle_flins"). The two 'maroon' arguments are
// presumably colors used by the toggler -- confirm against
// js/miller-doc-toggler.js.
const bodyToggler = new MillerDocToggler("body_section_toggle_", "maroon", "maroon");
|
|
</script>
|
|
|
|
</body>
|
|
</html>
|