miller/doc/whyc.html
2017-04-14 21:56:56 -04:00

456 lines
18 KiB
HTML

<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
<html lang="en">
<!-- PAGE GENERATED FROM template.html and content-for-whyc.html BY poki. -->
<!-- PLEASE MAKE CHANGES THERE AND THEN RE-RUN poki. -->
<head>
<meta http-equiv="Content-type" content="text/html;charset=UTF-8"/>
<meta name="description" content="Miller documentation"/>
<meta name="viewport" content="width=device-width, initial-scale=1.0"/> <!-- mobile-friendly -->
<meta name="keywords"
content="John Kerl, Kerl, Miller, miller, mlr, OLAP, data analysis software, regression, correlation, variance, data tools, " />
<title> Why C? </title>
<link rel="stylesheet" type="text/css" href="css/miller.css"/>
<link rel="stylesheet" type="text/css" href="css/poki-callbacks.css"/>
</head>
<!-- ================================================================ -->
<script type="text/javascript">
// Pick the Google Analytics host prefix that matches the protocol this page
// was loaded over: the "ssl." host for https pages, the "www." host otherwise.
var gaJsHost = (("https:" == document.location.protocol) ? "https://ssl." : "http://www.");
// Write a script element into the document that loads ga.js from that host.
// The tag text is percent-encoded and decoded with unescape() at run time --
// presumably so that no literal closing script tag appears inside this block.
document.write(unescape("%3Cscript src='" + gaJsHost + "google-analytics.com/ga.js' type='text/javascript'%3E%3C/script%3E"));
</script>
<script type="text/javascript">
// Record a page view against the Google Analytics property below. Any
// exception raised here (for example if the tracker object is unavailable)
// is deliberately swallowed so that analytics can never break the page.
try {
var pageTracker = _gat._getTracker("UA-15651652-1");
pageTracker._trackPageview();
} catch(err) {}
</script>
<!-- ================================================================ -->
<script type="text/javascript">
// Flips the visibility of a collapsible section.
// Only elements whose id begins with "section_toggle_" are affected; null or
// undefined is ignored. A section whose inline display is "block" becomes
// hidden ("none"); any other inline display value becomes "block".
function toggle_div(div) {
  if (div == null || !div.id.startsWith("section_toggle_")) {
    return;
  }
  div.style.display = (div.style.display == "block") ? "none" : "block";
}
// Makes a collapsible section visible by setting its inline display to
// "block". Elements whose id does not begin with "section_toggle_", and
// null/undefined arguments, are left alone.
function expand_div(div) {
  var isSection = div != null && div.id.startsWith("section_toggle_");
  if (isSection) {
    div.style.display = "block";
  }
}
// Hides a collapsible section by setting its inline display to "none".
// Elements whose id does not begin with "section_toggle_", and
// null/undefined arguments, are left alone.
function collapse_div(div) {
  if (div == null) {
    return;
  }
  if (!div.id.startsWith("section_toggle_")) {
    return;
  }
  div.style.display = "none";
}
// Looks up the element with the given id and toggles its visibility via
// toggle_div (which ignores a failed lookup and non-section elements).
function toggle_by_name(divName) {
  var target = document.getElementById(divName);
  toggle_div(target);
}
// Looks up the element with the given id and shows it via expand_div
// (which ignores a failed lookup and non-section elements).
function expand_by_name(divName) {
  var target = document.getElementById(divName);
  expand_div(target);
}
// Looks up the element with the given id and hides it via collapse_div
// (which ignores a failed lookup and non-section elements).
function collapse_by_name(divName) {
  var target = document.getElementById(divName);
  collapse_div(target);
}
// Shows every collapsible section on the page: each div in the document is
// passed to expand_div, which acts only on "section_toggle_" elements.
function expand_all() {
  var sections = document.getElementsByTagName("div");
  var idx = 0;
  while (idx < sections.length) {
    expand_div(sections[idx]);
    idx += 1;
  }
}
// Hides every collapsible section on the page: each div in the document is
// passed to collapse_div, which acts only on "section_toggle_" elements.
function collapse_all() {
  var sections = document.getElementsByTagName("div");
  var idx = 0;
  while (idx < sections.length) {
    collapse_div(sections[idx]);
    idx += 1;
  }
}
</script>
<!--
The background image is from a screenshot of a Google search for "data analysis
tools", lightened and sepia-toned. Over this was placed a Mac Terminal app with
very light-grey font and translucent background, in which a few statistical
Miller commands were run with pretty-print-tabular output format.
<body background="pix/sepia-overlay.jpg">
-->
<body bgcolor="#ffffff">
<!-- ================================================================ -->
<table width="100%">
<tr>
<!-- navbar -->
<td width="15%">
<!--
<img src="pix/mlr.jpg" />
<img style="border-width:1px; color:black;" src="pix/mlr.jpg" />
-->
<div class="pokinav">
<center><titleinbody>Miller</titleinbody></center>
<!-- PAGE LIST GENERATED FROM template.html BY poki -->
<br/><b>Overview:</b>
<br/>&bull;&nbsp;<a href="index.html">About Miller</a>
<br/>&bull;&nbsp;<a href="10-min.html">Miller in 10 minutes</a>
<br/>&bull;&nbsp;<a href="file-formats.html">File formats</a>
<br/>&bull;&nbsp;<a href="feature-comparison.html">Miller features in the context of the Unix toolkit</a>
<br/>&bull;&nbsp;<a href="record-heterogeneity.html">Record-heterogeneity</a>
<br/>&bull;&nbsp;<a href="internationalization.html">Internationalization</a>
<br/><b>Using Miller:</b>
<br/>&bull;&nbsp;<a href="faq.html">FAQ</a>
<br/>&bull;&nbsp;<a href="cookbook.html">Cookbook part 1</a>
<br/>&bull;&nbsp;<a href="cookbook2.html">Cookbook part 2</a>
<br/>&bull;&nbsp;<a href="cookbook3.html">Cookbook part 3</a>
<br/>&bull;&nbsp;<a href="data-examples.html">Data-diving examples</a>
<br/>&bull;&nbsp;<a href="manpage.html">Manpage</a>
<br/>&bull;&nbsp;<a href="reference.html">Reference</a>
<br/>&bull;&nbsp;<a href="reference-verbs.html">Reference: Verbs</a>
<br/>&bull;&nbsp;<a href="reference-dsl.html">Reference: DSL</a>
<br/>&bull;&nbsp;<a href="release-docs.html">Documents by release</a>
<br/>&bull;&nbsp;<a href="build.html">Installation, portability, dependencies, and testing</a>
<br/><b>Background:</b>
<br/>&bull;&nbsp;<a href="why.html">Why?</a>
<br/>&bull;&nbsp;<a href="whyc.html"><b>Why C?</b></a>
<br/>&bull;&nbsp;<a href="etymology.html">Why call it Miller?</a>
<br/>&bull;&nbsp;<a href="originality.html">How original is Miller?</a>
<br/>&bull;&nbsp;<a href="performance.html">Performance</a>
<br/><b>Repository:</b>
<br/>&bull;&nbsp;<a href="to-do.html">Things to do</a>
<br/>&bull;&nbsp;<a href="contact.html">Contact information</a>
<br/>&bull;&nbsp;<a href="https://github.com/johnkerl/miller">GitHub repo</a>
<br/> <br/> <br/> <br/> <br/> <br/> <br/> <br/> <br/> <br/> <br/> <br/>
<br/> <br/> <br/> <br/> <br/> <br/> <br/> <br/> <br/> <br/> <br/> <br/>
<br/> <br/> <br/> <br/> <br/> <br/>
</div>
</td>
<!-- page body -->
<td>
<!--
This is a visually gorgeous feature (here & in the CSS): it allows for
independent scroll of the nav and body panels. In particular the nav
stays on-screen as you scroll the body.
However, two problems:
(1) In Firefox & Chrome both I get janky end-of-body scrolls: there is
more content but I can't scroll down to it unless I repeatedly retry the
scrolldown. Which is weird.
(2) Worse, only the first page renders in PDF (again, Firefox & Chrome).
For now I'm disabling this separate-scroll feature. A frontender, I am
not ... maybe someday I'll find a config which gets *all* the features
I want; for now, it's a tradeoff.
-->
<!-- Implementation details: one bit is right here:
div style="overflow-y:scroll;height:1500px"
and the other bit is in css/poki-callbacks.css:
.pokinav {
display: inline-block;
background: #e8d9bc;
border: 1;
box-shadow: 0px 0px 3px 3px #C9C9C9;
margin: 10px;
padding-top: 10px;
padding-bottom: 10px;
padding-left: 10px;
padding-right: 10px;
overflow-y: scroll; < - - - - - - here
height: 1500px;
}
-->
<div>
<center> <titleinbody> Why C? </titleinbody> </center>
<p/>
<!-- BODY COPIED FROM content-for-whyc.html BY poki -->
<div class="pokitoc">
<center><b>Contents:</b></center>
&bull;&nbsp;<a href="#Why_not_C?">Why not C?</a><br/>
&bull;&nbsp;<a href="#C_vs._Go,_D,_Rust,_etc.;_C_is_fast">C vs. Go, D, Rust, etc.; C is fast</a><br/>
&bull;&nbsp;<a href="#C_is_ubiquitous">C is ubiquitous</a><br/>
&bull;&nbsp;<a href="#C_is_old-school">C is old-school</a><br/>
&bull;&nbsp;<a href="#C_vs._C++">C vs. C++</a><br/>
&nbsp;&nbsp;&nbsp;&nbsp;&bull;&nbsp;<a href="#<tt>this</tt>_pointers_and_attributes"><tt>this</tt> pointers and attributes</a><br/>
&nbsp;&nbsp;&nbsp;&nbsp;&bull;&nbsp;<a href="#Interfaces_and_virtual-function_pointers">Interfaces and virtual-function pointers</a><br/>
</div>
<p/>
<p/>
<button style="font-weight:bold;color:maroon;border:0" onclick="expand_all();" href="javascript:;">Expand all sections</button>
<button style="font-weight:bold;color:maroon;border:0" onclick="collapse_all();" href="javascript:;">Collapse all sections</button>
<a id="Why_not_C?"/><h1>Why not C?</h1>
<button style="font-weight:bold;color:maroon;border:0" padding=0 onclick="toggle_by_name('section_toggle_why_not_c');" href="javascript:;">Toggle section visibility</button>
<div id="section_toggle_why_not_c" style="display: block">
<p/>C lacks many of the features found in modern, high-level languages such as
Java or Go: garbage collection, collections libraries, generics/near-generics,
hash-map/linked-list literals built into the language (e.g.
<tt>mymap={"a"=&gt;1,"b"=&gt;2}</tt> or <tt>mylist=[3,4,5]</tt>), autodoc (e.g.
Javadoc), and so on. Yet, while memory management is indeed Miller&rsquo;s
trickiest aspect, its garbage-collection needs are well-delineated and so the
absence of GC is no great loss. Miller&rsquo;s performance relies on
the principles of <i>touching each byte as few times as possible</i>, and
<i>copying bytes only when necessary</i>. This results in a baton-passing,
free-on-last-use memory-management pattern which works well enough. (See also
<a href="https://github.com/johnkerl/miller/blob/master/c/README.md">
https://github.com/johnkerl/miller/blob/master/c/README.md</a>.)
Miller doesn&rsquo;t require a complex collections library: mostly simple hash
maps, hash sets, and linked lists which aren&rsquo;t difficult to code.
Moreover, Miller&rsquo;s primary data structure, the
<a href="https://github.com/johnkerl/miller/blob/master/c/containers/lrec.h"><tt>lrec_t</tt></a>,
is hand-tuned to Miller&rsquo;s use case and would have required hand-coding in
any case.
</div>
<a id="C_vs._Go,_D,_Rust,_etc.;_C_is_fast"/><h1>C vs. Go, D, Rust, etc.; C is fast</h1>
<button style="font-weight:bold;color:maroon;border:0" padding=0 onclick="toggle_by_name('section_toggle_c_is_fast');" href="javascript:;">Toggle section visibility</button>
<div id="section_toggle_c_is_fast" style="display: block">
<p/>I love Go (<a href="https://golang.org">https://golang.org</a>): I think
it&rsquo;s one of the best things ever to happen to our craft, and I use it
often. The D language (<a href="http://dlang.org">http://dlang.org</a>) is an
exciting and elegant successor to C++ (more about which below) &mdash; D has
many of Go&rsquo;s strengths, with a tighter stylistic similarity to C. And initial
experiments with Rust are intriguing. Yet with none of them could I obtain the
throughput I get in C.
<p/>Specifically, I did simple experiments in several languages &mdash; Ruby,
Python, Lua, Rust, Go, D. In one I just read lines and printed them back out
&mdash; a line-oriented <tt>cat</tt>. In another I consumed input lines like
<tt>x=1,y=2,z=3</tt> one at a time, split them on commas and equals signs to
populate hash maps, transformed them (e.g. remove the <tt>y</tt> field), and
emitted them. Basically <tt>mlr cut -x -f y</tt> with DKVP format. I
didn&rsquo;t do anything fancy &mdash; just using each language&rsquo;s
<tt>getline</tt>, string-split, hashmap-put, etc. And nothing was as fast as
C, so I used C. Here are the experiments I kept (I failed to keep the
Lua code, for example):
<a href="../perf/catc.c.txt">C cat</a>,
<a href="../perf/catc0.c.txt">another C cat</a>,
<a href="../perf/catd.d.txt">D cat</a>,
<a href="../perf/catgo.go.txt">Go cat</a>,
<a href="../perf/catgo2.go.txt">another Go cat</a>,
<a href="../perf/catrust.rs.txt">Rust cat</a>,
<a href="../perf/nimcat.nim.txt">Nim cat</a>,
<a href="../perf/cutd.d.txt">D cut</a>,
<a href="../perf/cutgo.go.txt">Go cut</a>,
<a href="../perf/nimcut.nim.txt">Nim cut</a>.
<p/>One of Go&rsquo;s most powerful features is the ease with which it allows
quick-to-code, error-free concurrency. Yet Miller, like most high-volume
text-processing tools, spends most of its time obtaining and parsing input
strings and negligible time doing all subsequent processing. Thus the absence
of in-process multiprocessing is only a slight penalty in this particular
application domain &mdash; parallelism here is more easily achieved by running
multiple single-threaded processes, each handling its own input files, either
on a single host or split across multiple hosts.
</div>
<a id="C_is_ubiquitous"/><h1>C is ubiquitous</h1>
<button style="font-weight:bold;color:maroon;border:0" padding=0 onclick="toggle_by_name('section_toggle_c_is_ubiquitous');" href="javascript:;">Toggle section visibility</button>
<div id="section_toggle_c_is_ubiquitous" style="display: block">
<p/>Every Unix-like system has a C compiler (or is an <tt>apt-get</tt> or
<tt>yum install</tt> away from it). This, I hope, bodes well for uptake
of Miller.
</div>
<a id="C_is_old-school"/><h1>C is old-school</h1>
<button style="font-weight:bold;color:maroon;border:0" padding=0 onclick="toggle_by_name('section_toggle_c_is_old_school');" href="javascript:;">Toggle section visibility</button>
<div id="section_toggle_c_is_old_school" style="display: block">
<p/>This alone is not enough reason to program in C, but since I find myself
coding in C due to the other reasons on this page, it&rsquo;s happy enough to
use a throwback language for a throwback tool (see
<a href="etymology.html">Why call it Miller?</a>). That said, Miller is coded in GNU
C99, it uses getopt-style command-line parsing, and for development work I make
use of modern tools such as <a href="http://valgrind.org">valgrind</a>.
K&amp;R was a long, long time ago. (I&rsquo;m writing plain C with <tt>//</tt>
comments; enough said.)
</div>
<a id="C_vs._C++"/><h1>C vs. C++</h1>
<button style="font-weight:bold;color:maroon;border:0" padding=0 onclick="toggle_by_name('section_toggle_c_vs_cpp');" href="javascript:;">Toggle section visibility</button>
<div id="section_toggle_c_vs_cpp" style="display: block">
I have a strong personal distaste for C++: its syntax is an ugly layer over the
simplicity of C; templates and STL are even more awkward and even less
elegant. (Meanwhile I find Java, Go, and D to be both elegant and modern; I
ruled them out not for aesthetics but for performance as described above.)
Meanwhile all the positive features I would want from C++ are easily
implementable in C as follows:
<a id="<tt>this</tt>_pointers_and_attributes"/><h2><tt>this</tt> pointers and attributes</h2>
The C++ compiler implicitly inserts <tt>this</tt> pointers into method calls:
for example
<pre>
class MyClass {
private:
char* a;
public:
MyClass(char* a) {
this-&gt;a = strdup(a);
}
~MyClass() {
free(a);
}
int myMethod(char* b) {
return strlen(a) + strlen(b);
}
};
...
MyClass* myObj = new MyClass("hello");
int x = myObj-&gt;myMethod("world");
</pre>
results in something like
<pre>
void MyClass$constructorcharptr(MyClass* this, char* a) {
this-&gt;a = strdup(a);
}
void MyClass$destructor(MyClass* this) {
free(this-&gt;a);
}
int MyClass$myMethod(MyClass* this, char* b) {
return strlen(this-&gt;a) + strlen(b);
}
MyClass* myObj = malloc(sizeof(MyClass));
MyClass$constructorcharptr(myObj, "hello");
int x = MyClass$myMethod(myObj, "world");
</pre>
It&rsquo;s easy enough to imitate this: simply use the coding convention of
prepending the class name to all methods, and placing this-pointers as the first arguments to methods.
Miller uses precisely this approach. For example:
<pre>
typedef struct _lrec_t {
...
} lrec_t;
// Constructors
lrec_t* lrec_csv_alloc(...) {
lrec_t* prec = malloc(sizeof(lrec_t));
...
prec-&gt;attribute = ...;
return prec;
}
lrec_t* lrec_dkvp_alloc(...) {
...
}
// Destructor
void lrec_free(lrec_t* prec) {
...
free(prec-&gt;attribute);
...
free(prec);
}
// Methods
int lrec_foo(lrec_t* prec, ...) {
return prec-&gt;...;
}
void lrec_bar(lrec_t* prec, ...) {
prec-&gt;...;
}
</pre>
<p/> This implements the object-oriented principle of <b>encapsulation</b>.
<a id="Interfaces_and_virtual-function_pointers"/><h2>Interfaces and virtual-function pointers</h2>
Coding conventions again do most of the work, here accompanied by typedeffed function pointers.
For example, here is Miller&rsquo;s record-reader interface:
<pre>
#include &lt;stdio.h&gt;
#include &lt;containers/lrec.h&gt;
typedef lrec_t* reader_func_t(FILE* fp, void* pvstate, context_t* pctx);
typedef void reset_func_t(void* pvstate);
typedef void reader_free_func_t(void* pvstate);
typedef struct _reader_t {
void* pvstate;
reader_func_t* preader_func; // Interface method
reset_func_t* preset_func; // Interface method
reader_free_func_t* pfree_func; // Interface method
} reader_t;
</pre>
<p/>A class implementing this interface might look like
<pre>
// Attributes are private to this file
typedef struct _reader_csv_state_t {
...
} reader_csv_state_t;
// Implementation of interface methods. Marked static (file-scope) to not
// pollute the global namespace; exposed only via function pointers.
static lrec_t* reader_csv_func(FILE* input_stream, void* pvstate, context_t* pctx) {
reader_csv_state_t* pstate = pvstate;
... use various pstate-&gt;attributes ...
}
static void reset_csv_func(void* pvstate) {
reader_csv_state_t* pstate = pvstate;
... use various pstate-&gt;attributes ...
}
static void reader_csv_free(void* pvstate) {
reader_csv_state_t* pstate = pvstate;
... use various pstate-&gt;attributes ...
}
// Constructor
reader_t* reader_csv_alloc(...) {
reader_t* preader = mlr_malloc_or_die(sizeof(reader_t));
reader_csv_state_t* pstate = mlr_malloc_or_die(sizeof(reader_csv_state_t));
... set various pstate-&gt;attributes ...
preader-&gt;pvstate = (void*)pstate;
preader-&gt;preader_func = &amp;reader_csv_func;
preader-&gt;preset_func = &amp;reset_csv_func;
preader-&gt;pfree_func = &amp;reader_csv_free;
return preader;
}
// Factory method
...
reader_t* preader = reader_csv_alloc(...);
...
// Method call
...
lrec_t* pinrec = preader-&gt;preader_func(input_stream, preader-&gt;pvstate, pctx);
...
</pre>
<p/> This implements the object-oriented principles of <b>polymorphism</b> and
<b>runtime binding</b>.
<p/>More details are at
<a href="https://github.com/johnkerl/miller/tree/master/c/containers">https://github.com/johnkerl/miller/tree/master/c/containers</a>.
</div>
</div>
</td>
</table>
</body>
</html>