miller/doc/index.html

<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
<html lang="en">

<!-- PAGE GENERATED FROM template.html and content-for-index.html BY poki. -->
<!-- PLEASE MAKE CHANGES THERE AND THEN RE-RUN poki. -->
<head>
  <meta http-equiv="Content-type" content="text/html;charset=UTF-8"/>
  <meta name="description" content="Miller documentation"/>
  <meta name="viewport" content="width=device-width, initial-scale=1.0"/> <!-- mobile-friendly -->
  <meta name="keywords"
  content="John Kerl, Kerl, Miller, miller, mlr, OLAP, data analysis software, regression, correlation, variance, data tools" />

  <title> About Miller </title>
  <link rel="stylesheet" type="text/css" href="css/miller.css"/>
  <link rel="stylesheet" type="text/css" href="css/poki-callbacks.css"/>
</head>

<!-- ================================================================ -->
<script type="text/javascript">
// Pick the Google Analytics host prefix that matches how this page was loaded:
// "https://ssl." when the page protocol is "https:", otherwise "http://www.".
var gaJsHost = (("https:" == document.location.protocol) ? "https://ssl." : "http://www.");
// Write a script tag that loads ga.js from that host into the document.
// The tag's angle brackets are percent-encoded (%3C / %3E) in the string
// literal and are decoded by unescape() before being written.
document.write(unescape("%3Cscript src='" + gaJsHost + "google-analytics.com/ga.js' type='text/javascript'%3E%3C/script%3E"));
</script>
<script type="text/javascript">
// Obtain a tracker for the "UA-15651652-1" account from the _gat global and
// record a page view for this page.
// Best-effort: any exception thrown here (for example if _gat is not defined
// when this runs) is caught and ignored, so the rest of the page is unaffected.
try {
var pageTracker = _gat._getTracker("UA-15651652-1");
pageTracker._trackPageview();
} catch(err) {}
</script>

<!-- ================================================================ -->
<script type="text/javascript">
  // Flip the inline display of a collapsible section.
  // Only non-null elements whose id begins with "section_toggle_" are touched:
  // an inline display of "block" becomes "none"; any other value becomes "block".
  function toggle_div(div) {
    if (div == null || !div.id.startsWith("section_toggle_")) {
      return;
    }
    div.style.display = (div.style.display == "block") ? "none" : "block";
  }
  // Force a collapsible section to be shown by setting its inline display to "block".
  // Elements that are null, or whose id does not begin with "section_toggle_",
  // are left untouched.
  function expand_div(div) {
    var isSection = div != null && div.id.startsWith("section_toggle_");
    if (isSection) {
      div.style.display = "block";
    }
  }
  // Force a collapsible section to be hidden by setting its inline display to "none".
  // Elements that are null, or whose id does not begin with "section_toggle_",
  // are left untouched.
  function collapse_div(div) {
    if (div == null) {
      return;
    }
    if (!div.id.startsWith("section_toggle_")) {
      return;
    }
    div.style.display = "none";
  }

  // Look up an element by id and toggle it via toggle_div.
  // If no element has that id, getElementById yields null, which toggle_div ignores.
  function toggle_by_name(divName) {
    var target = document.getElementById(divName);
    toggle_div(target);
  }
  // Look up an element by id and show it via expand_div.
  // If no element has that id, getElementById yields null, which expand_div ignores.
  function expand_by_name(divName) {
    var target = document.getElementById(divName);
    expand_div(target);
  }
  // Look up an element by id and hide it via collapse_div.
  // If no element has that id, getElementById yields null, which collapse_div ignores.
  function collapse_by_name(divName) {
    var target = document.getElementById(divName);
    collapse_div(target);
  }

  // Show every collapsible section on the page.
  // Every div in the document is passed to expand_div, which itself skips
  // any div whose id does not begin with "section_toggle_".
  function expand_all() {
    var allDivs = document.getElementsByTagName("div");
    Array.prototype.forEach.call(allDivs, function (el) {
      expand_div(el);
    });
  }
  // Hide every collapsible section on the page.
  // Every div in the document is passed to collapse_div, which itself skips
  // any div whose id does not begin with "section_toggle_".
  function collapse_all() {
    var allDivs = document.getElementsByTagName("div");
    Array.prototype.forEach.call(allDivs, function (el) {
      collapse_div(el);
    });
  }
</script>

<!--
The background image is from a screenshot of a Google search for "data analysis
tools", lightened and sepia-toned. Over this was placed a Mac Terminal app with
very light-grey font and translucent background, in which a few statistical
Miller commands were run with pretty-print-tabular output format.
<body background="pix/sepia-overlay.jpg">
-->
<body bgcolor="#ffffff">

<!-- ================================================================ -->
<table width="100%">
<tr>

  <!-- navbar -->
  <td width="15%">
    <!--
    <img src="pix/mlr.jpg" />
    <img style="border-width:1px; color:black;" src="pix/mlr.jpg" />
    -->

    <div class="pokinav">
      <center><titleinbody>Miller</titleinbody></center>

<!-- PAGE LIST GENERATED FROM template.html BY poki -->
<br/><b>Overview:</b>
<br/>&bull;&nbsp;<a href="index.html"><b>About Miller</b></a>
<br/>&bull;&nbsp;<a href="10-min.html">Miller in 10 minutes</a>
<br/>&bull;&nbsp;<a href="file-formats.html">File formats</a>
<br/>&bull;&nbsp;<a href="feature-comparison.html">Miller features in the context of the Unix toolkit</a>
<br/>&bull;&nbsp;<a href="record-heterogeneity.html">Record-heterogeneity</a>
<br/>&bull;&nbsp;<a href="internationalization.html">Internationalization</a>
<br/><b>Using Miller:</b>
<br/>&bull;&nbsp;<a href="faq.html">FAQ</a>
<br/>&bull;&nbsp;<a href="cookbook.html">Cookbook part 1</a>
<br/>&bull;&nbsp;<a href="cookbook2.html">Cookbook part 2</a>
<br/>&bull;&nbsp;<a href="cookbook3.html">Cookbook part 3</a>
<br/>&bull;&nbsp;<a href="data-examples.html">Data-diving examples</a>
<br/>&bull;&nbsp;<a href="manpage.html">Manpage</a>
<br/>&bull;&nbsp;<a href="reference.html">Reference</a>
<br/>&bull;&nbsp;<a href="reference-verbs.html">Reference: Verbs</a>
<br/>&bull;&nbsp;<a href="reference-dsl.html">Reference: DSL</a>
<br/>&bull;&nbsp;<a href="release-docs.html">Documents by release</a>
<br/>&bull;&nbsp;<a href="build.html">Installation, portability, dependencies, and testing</a>
<br/><b>Background:</b>
<br/>&bull;&nbsp;<a href="why.html">Why?</a>
<br/>&bull;&nbsp;<a href="whyc.html">Why C?</a>
<br/>&bull;&nbsp;<a href="etymology.html">Why call it Miller?</a>
<br/>&bull;&nbsp;<a href="originality.html">How original is Miller?</a>
<br/>&bull;&nbsp;<a href="performance.html">Performance</a>
<br/><b>Repository:</b>
<br/>&bull;&nbsp;<a href="to-do.html">Things to do</a>
<br/>&bull;&nbsp;<a href="contact.html">Contact information</a>
<br/>&bull;&nbsp;<a href="https://github.com/johnkerl/miller">GitHub repo</a>
      <br/> <br/> <br/> <br/> <br/> <br/> <br/> <br/> <br/> <br/> <br/> <br/>
      <br/> <br/> <br/> <br/> <br/> <br/> <br/> <br/> <br/> <br/> <br/> <br/>
      <br/> <br/> <br/> <br/> <br/> <br/>
    </div>
  </td>

  <!-- page body -->
  <td>
    <!--
      This is a visually gorgeous feature (here & in the CSS): it allows for
      independent scroll of the nav and body panels. In particular the nav
      stays on-screen as you scroll the body.

      However, two problems:

      (1) In Firefox & Chrome both I get janky end-of-body scrolls: there is
      more content but I can't scroll down to it unless I repeatedly retry the
      scrolldown. Which is weird.

      (2) Worse, only the first page renders in PDF (again, Firefox & Chrome).

      For now I'm disabling this separate-scroll feature. A frontender, I am
      not ... maybe someday I'll find a config which gets *all* the features
      I want; for now, it's a tradeoff.
    -->

    <!-- Implementation details: one bit is right here:

    div style="overflow-y:scroll;height:1500px"

    and the other bit is in css/poki-callbacks.css:

    .pokinav {
      display:        inline-block;
      background:     #e8d9bc;
      border:         1;
      box-shadow:     0px 0px 3px 3px #C9C9C9;
      margin:         10px;
      padding-top:    10px;
      padding-bottom: 10px;
      padding-left:   10px;
      padding-right:  10px;
      overflow-y:     scroll;  < - - - - - - here
      height: 1500px;
    }

    -->
    <div>
    <center> <titleinbody> About Miller </titleinbody> </center>
    <p/>

<!-- BODY COPIED FROM content-for-index.html BY poki -->
<p/>Miller is like awk, sed, cut, join, and sort for <b>name-indexed data such as
CSV, TSV, and tabular JSON</b>. You get to work with your data using named
fields, without needing to count positional column indices.

<p/>This is something the Unix toolkit always could have done, and arguably
always should have done.  Miller operates on key-value-pair data while the familiar
Unix tools operate on integer-indexed fields: if the natural data structure for
the latter is the array, then Miller&rsquo;s natural data structure is the
insertion-ordered hash map.  This encompasses a <b>variety of data formats</b>,
including but not limited to the familiar CSV, TSV, and JSON.  (Miller can handle
<b>positionally-indexed data</b> as a special case.)

<p/> Features:
<ul>

<li/> Miller is <b>multi-purpose</b>: it&rsquo;s useful for <b>data
cleaning</b>, <b>data reduction</b>, <b>statistical reporting</b>,
<b>devops</b>, <b>system administration</b>, <b>log-file processing</b>,
<b>format conversion</b>, and <b>database-query post-processing</b>.

<li/> You can use Miller to snarf and munge <b>log-file data</b>, including
selecting out relevant substreams, then produce CSV format and load that into
all-in-memory/data-frame utilities for further statistical and/or graphical
processing.

<li/> Miller complements <b>data-analysis tools</b> such as <b>R</b>,
<b>pandas</b>, etc.: you can use Miller to <b>clean</b> and <b>prepare</b> your
data. While you can do <b>basic statistics</b> entirely in Miller, its
streaming-data feature and single-pass algorithms enable you to <b>reduce very
large data sets</b>.

<li/> Miller complements SQL <b>databases</b>: you can slice, dice, and
reformat data on the client side on its way into or out of a database. You can
also reap some of the benefits of databases for quick, setup-free one-off tasks
when you just need to query some data in disk files in a hurry.

<li/> Miller also goes beyond the classic Unix tools by stepping fully into our
modern, <b>no-SQL</b> world: its essential record-heterogeneity property allows
Miller to operate on data where records with different schema (field names) are
interleaved.

<li/> Miller is <b>streaming</b>: most operations need only a single record in
memory at a time, rather than ingesting all input before producing any output.
For those operations which require deeper retention (<tt>sort</tt>,
<tt>tac</tt>, <tt>stats1</tt>), Miller retains only as much data as needed.
This means that whenever functionally possible, you can operate on files which
are larger than your system&rsquo;s available RAM, and you can use Miller in
<b>tail -f</b> contexts.

<li/> Miller is <b>pipe-friendly</b> and interoperates with the Unix toolkit

<li/> Miller&rsquo;s I/O formats include <b>tabular pretty-printing</b>,
<b>positionally indexed</b> (Unix-toolkit style), CSV, JSON, and others

<li/> Miller does <b>conversion</b> between formats

<li/> Miller&rsquo;s <b>processing is format-aware</b>: e.g. CSV <tt>sort</tt>
and <tt>tac</tt> keep header lines first

<li/> Miller has high-throughput <b>performance</b> on par with the Unix toolkit

<li/> Not unlike <a href="http://stedolan.github.io/jq/">jq</a> (for JSON),
Miller is written in portable, modern C, with <b>zero runtime dependencies</b>.
You can download or compile a single binary, <tt>scp</tt> it to a faraway
machine, and expect it to work.

</ul>

<p>Releases and release notes:
<a href="https://github.com/johnkerl/miller/releases">https://github.com/johnkerl/miller/releases</a>.

<p/> Examples:

<div class="pokipanel">
<pre>
# Column select
% mlr --csv cut -f hostname,uptime mydata.csv
</pre>
</div>

<div class="pokipanel">
<pre>
# Add new columns as function of other columns
% mlr --nidx put '$sum = $7 &lt; 0.0 ? 3.5 : $7 + 2.1*$8' *.dat
</pre>
</div>

<div class="pokipanel">
<pre>
# Row filter
% mlr --csv filter '$status != "down" &amp;&amp; $upsec >= 10000' *.csv
</pre>
</div>

<div class="pokipanel">
<pre>
# Apply column labels and pretty-print
% grep -v '^#' /etc/group | mlr --ifs : --nidx --opprint label group,pass,gid,member then sort -f group
</pre>
</div>

<div class="pokipanel">
<pre>
# Join multiple data sources on key columns
% mlr join -j account_id -f accounts.dat then group-by account_name balances.dat
</pre>
</div>

<div class="pokipanel">
<pre>
# Multiple formats including JSON
% mlr --json put '$attr = sub($attr, "([0-9]+)_([0-9]+)_.*", "\1:\2")' data/*.json
</pre>
</div>

<div class="pokipanel">
<pre>
# Aggregate per-column statistics
% mlr stats1 -a min,mean,max,p10,p50,p90 -f flag,u,v data/*
</pre>
</div>

<div class="pokipanel">
<pre>
# Linear regression
% mlr stats2 -a linreg-pca -f u,v -g shape data/*
</pre>
</div>

<div class="pokipanel">
<pre>
# Aggregate custom per-column statistics
% mlr put -q '@sum[$a][$b] += $x; end {emit @sum, "a", "b"}' data/*
</pre>
</div>

<div class="pokipanel">
<pre>
# Iterate over data using DSL expressions
% mlr --from estimates.tbl put '
  for (k,v in $*) {
    if (is_numeric(v) &amp;&amp; k =~ "^[t-z].*$") {
      $sum += v; $count += 1
    }
  }
  $mean = $sum / $count # no assignment if count unset
'
</pre>
</div>

<div class="pokipanel">
<pre>
# Run DSL expressions from a script file
% mlr --from infile.dat put -f analyze.mlr
</pre>
</div>

<div class="pokipanel">
<pre>
# Split/reduce output to multiple filenames
% mlr --from infile.dat put 'tee > "./taps/data-".$a."-".$b, $*'
</pre>
</div>

<div class="pokipanel">
<pre>
# Compressed I/O
% mlr --from infile.dat put 'tee | "gzip > ./taps/data-".$a."-".$b.".gz", $*'
</pre>
</div>

<div class="pokipanel">
<pre>
# Interoperate with other data-processing tools using standard pipes
% mlr --from infile.dat put -q '@v=$*; dump | "jq .[]"'
</pre>
</div>

<div class="pokipanel">
<pre>
# Tap/trace
% mlr --from infile.dat put  '(NR % 1000 == 0) { print > stderr, "Checkpoint ".NR}'
</pre>
</div>
    </div>
  </td>

</table>
</body>
</html>