mirror of
https://github.com/johnkerl/miller.git
synced 2026-01-23 18:25:45 +00:00
456 lines
18 KiB
HTML
456 lines
18 KiB
HTML
<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
|
|
<html lang="en">
|
|
|
|
<!-- PAGE GENERATED FROM template.html and content-for-whyc.html BY poki. -->
|
|
<!-- PLEASE MAKE CHANGES THERE AND THEN RE-RUN poki. -->
|
|
<head>
|
|
<meta http-equiv="Content-type" content="text/html;charset=UTF-8"/>
|
|
<meta name="description" content="Miller documentation"/>
|
|
<meta name="viewport" content="width=device-width, initial-scale=1.0"/> <!-- mobile-friendly -->
|
|
<meta name="keywords"
|
|
content="John Kerl, Kerl, Miller, miller, mlr, OLAP, data analysis software, regression, correlation, variance, data tools, " />
|
|
|
|
<title> Why C? </title>
|
|
<link rel="stylesheet" type="text/css" href="css/miller.css"/>
|
|
<link rel="stylesheet" type="text/css" href="css/poki-callbacks.css"/>
|
|
</head>
|
|
|
|
<!-- ================================================================ -->
|
|
<script type="text/javascript">
|
|
// Legacy Google Analytics (ga.js) loader.
// Choose the host prefix that matches the protocol this page was served over:
// "https://ssl." for https pages, "http://www." otherwise.
var gaJsHost = (("https:" == document.location.protocol) ? "https://ssl." : "http://www.");
|
|
// Write a script tag for ga.js into the document while it is being parsed.
// The tag is percent-encoded and decoded with unescape() -- presumably so the
// raw closing-tag text never appears inside this script block (TODO confirm).
document.write(unescape("%3Cscript src='" + gaJsHost + "google-analytics.com/ga.js' type='text/javascript'%3E%3C/script%3E"));
|
|
</script>
|
|
<script type="text/javascript">
|
|
// Obtain a tracker for property "UA-15651652-1" from the _gat global and
// record one page view. _gat is not defined in this block -- presumably it is
// provided by the ga.js script requested above (TODO confirm).
// Every exception is swallowed by the empty catch, so a failure here is
// silent and does not stop the rest of the page's scripts.
try {
|
|
var pageTracker = _gat._getTracker("UA-15651652-1");
|
|
pageTracker._trackPageview();
|
|
} catch(err) {}
|
|
</script>
|
|
|
|
<!-- ================================================================ -->
|
|
<script type="text/javascript">
|
|
// Flip the visibility of one collapsible section.
//
// div: a DOM element, or null/undefined (ignored). Only elements whose id
//      begins with "section_toggle_" are modified, so any element may be
//      passed in safely.
//
// The decision uses the element's inline style only: an inline display of
// "block" becomes "none"; any other value (including "none" or an unset
// value) becomes "block".
function toggle_div(div) {
	if (div == null) {
		return;
	}
	// lastIndexOf(prefix, 0) === 0 is a prefix test that also works on
	// pre-ES2015 browsers, which do not have String.prototype.startsWith.
	if (div.id.lastIndexOf("section_toggle_", 0) !== 0) {
		return;
	}
	div.style.display = (div.style.display == "block") ? "none" : "block";
}
|
|
// Make one collapsible section visible.
//
// div: a DOM element, or null/undefined (ignored). Only elements whose id
//      begins with "section_toggle_" are modified; their inline display is
//      set to "block". Every other element is left untouched.
function expand_div(div) {
	if (div == null) {
		return;
	}
	// lastIndexOf(prefix, 0) === 0 is a prefix test that also works on
	// pre-ES2015 browsers, which do not have String.prototype.startsWith.
	if (div.id.lastIndexOf("section_toggle_", 0) === 0) {
		div.style.display = "block";
	}
}
|
|
// Hide one collapsible section.
//
// div: a DOM element, or null/undefined (ignored). Only elements whose id
//      begins with "section_toggle_" are modified; their inline display is
//      set to "none". Every other element is left untouched.
function collapse_div(div) {
	if (div == null) {
		return;
	}
	// lastIndexOf(prefix, 0) === 0 is a prefix test that also works on
	// pre-ES2015 browsers, which do not have String.prototype.startsWith.
	if (div.id.lastIndexOf("section_toggle_", 0) === 0) {
		div.style.display = "none";
	}
}
|
|
|
|
// Flip the visibility of the section whose element id is divName.
// The element is looked up in the current document and handed to toggle_div,
// which decides whether anything is changed.
function toggle_by_name(divName) {
	var target = document.getElementById(divName);
	toggle_div(target);
}
|
|
// Show the section whose element id is divName.
// The element is looked up in the current document and handed to expand_div,
// which decides whether anything is changed.
function expand_by_name(divName) {
	var target = document.getElementById(divName);
	expand_div(target);
}
|
|
// Hide the section whose element id is divName.
// The element is looked up in the current document and handed to
// collapse_div, which decides whether anything is changed.
function collapse_by_name(divName) {
	var target = document.getElementById(divName);
	collapse_div(target);
}
|
|
|
|
// Show every collapsible section on the page.
// Each <div> in the document is passed, in document order, to expand_div,
// which decides per element whether anything is changed.
function expand_all() {
	var candidates = document.getElementsByTagName("div");
	var idx = 0;
	while (idx < candidates.length) {
		expand_div(candidates[idx]);
		idx += 1;
	}
}
|
|
// Hide every collapsible section on the page.
// Each <div> in the document is passed, in document order, to collapse_div,
// which decides per element whether anything is changed.
function collapse_all() {
	var candidates = document.getElementsByTagName("div");
	var idx = 0;
	while (idx < candidates.length) {
		collapse_div(candidates[idx]);
		idx += 1;
	}
}
|
|
</script>
|
|
|
|
<!--
|
|
The background image is from a screenshot of a Google search for "data analysis
|
|
tools", lightened and sepia-toned. Over this was placed a Mac Terminal app with
|
|
very light-grey font and translucent background, in which a few statistical
|
|
Miller commands were run with pretty-print-tabular output format.
|
|
<body background="pix/sepia-overlay.jpg">
|
|
-->
|
|
<body bgcolor="#ffffff">
|
|
|
|
<!-- ================================================================ -->
|
|
<table width="100%">
|
|
<tr>
|
|
|
|
<!-- navbar -->
|
|
<td width="15%">
|
|
<!--
|
|
<img src="pix/mlr.jpg" />
|
|
<img style="border-width:1px; color:black;" src="pix/mlr.jpg" />
|
|
-->
|
|
|
|
<div class="pokinav">
|
|
<center><titleinbody>Miller</titleinbody></center>
|
|
|
|
<!-- PAGE LIST GENERATED FROM template.html BY poki -->
|
|
<br/><b>Overview:</b>
|
|
<br/>• <a href="index.html">About Miller</a>
|
|
<br/>• <a href="10-min.html">Miller in 10 minutes</a>
|
|
<br/>• <a href="file-formats.html">File formats</a>
|
|
<br/>• <a href="feature-comparison.html">Miller features in the context of the Unix toolkit</a>
|
|
<br/>• <a href="record-heterogeneity.html">Record-heterogeneity</a>
|
|
<br/>• <a href="internationalization.html">Internationalization</a>
|
|
<br/><b>Using Miller:</b>
|
|
<br/>• <a href="faq.html">FAQ</a>
|
|
<br/>• <a href="cookbook.html">Cookbook part 1</a>
|
|
<br/>• <a href="cookbook2.html">Cookbook part 2</a>
|
|
<br/>• <a href="cookbook3.html">Cookbook part 3</a>
|
|
<br/>• <a href="data-examples.html">Data-diving examples</a>
|
|
<br/>• <a href="manpage.html">Manpage</a>
|
|
<br/>• <a href="reference.html">Reference</a>
|
|
<br/>• <a href="reference-verbs.html">Reference: Verbs</a>
|
|
<br/>• <a href="reference-dsl.html">Reference: DSL</a>
|
|
<br/>• <a href="release-docs.html">Documents by release</a>
|
|
<br/>• <a href="build.html">Installation, portability, dependencies, and testing</a>
|
|
<br/><b>Background:</b>
|
|
<br/>• <a href="why.html">Why?</a>
|
|
<br/>• <a href="whyc.html"><b>Why C?</b></a>
|
|
<br/>• <a href="etymology.html">Why call it Miller?</a>
|
|
<br/>• <a href="originality.html">How original is Miller?</a>
|
|
<br/>• <a href="performance.html">Performance</a>
|
|
<br/><b>Repository:</b>
|
|
<br/>• <a href="to-do.html">Things to do</a>
|
|
<br/>• <a href="contact.html">Contact information</a>
|
|
<br/>• <a href="https://github.com/johnkerl/miller">GitHub repo</a>
|
|
<br/> <br/> <br/> <br/> <br/> <br/> <br/> <br/> <br/> <br/> <br/> <br/>
|
|
<br/> <br/> <br/> <br/> <br/> <br/> <br/> <br/> <br/> <br/> <br/> <br/>
|
|
<br/> <br/> <br/> <br/> <br/> <br/>
|
|
</div>
|
|
</td>
|
|
|
|
<!-- page body -->
|
|
<td>
|
|
<!--
|
|
This is a visually gorgeous feature (here & in the CSS): it allows for
|
|
independent scroll of the nav and body panels. In particular the nav
|
|
stays on-screen as you scroll the body.
|
|
|
|
However, two problems:
|
|
|
|
(1) In Firefox & Chrome both I get janky end-of-body scrolls: there is
|
|
more content but I can't scroll down to it unless I repeatedly retry the
|
|
scrolldown. Which is weird.
|
|
|
|
(2) Worse, only the first page renders in PDF (again, Firefox & Chrome).
|
|
|
|
For now I'm disabling this separate-scroll feature. A frontender, I am
|
|
not ... maybe someday I'll find a config which gets *all* the features
|
|
I want; for now, it's a tradeoff.
|
|
-->
|
|
|
|
<!-- Implementation details: one bit is right here:
|
|
|
|
div style="overflow-y:scroll;height:1500px"
|
|
|
|
and the other bit is in css/poki-callbacks.css:
|
|
|
|
.pokinav {
|
|
display: inline-block;
|
|
background: #e8d9bc;
|
|
border: 1;
|
|
box-shadow: 0px 0px 3px 3px #C9C9C9;
|
|
margin: 10px;
|
|
padding-top: 10px;
|
|
padding-bottom: 10px;
|
|
padding-left: 10px;
|
|
padding-right: 10px;
|
|
overflow-y: scroll; < - - - - - - here
|
|
height: 1500px;
|
|
}
|
|
|
|
-->
|
|
<div>
|
|
<center> <titleinbody> Why C? </titleinbody> </center>
|
|
<p/>
|
|
|
|
<!-- BODY COPIED FROM content-for-whyc.html BY poki -->
|
|
<div class="pokitoc">
|
|
<center><b>Contents:</b></center>
|
|
• <a href="#Why_not_C?">Why not C?</a><br/>
|
|
• <a href="#C_vs._Go,_D,_Rust,_etc.;_C_is_fast">C vs. Go, D, Rust, etc.; C is fast</a><br/>
|
|
• <a href="#C_is_ubiquitous">C is ubiquitous</a><br/>
|
|
• <a href="#C_is_old-school">C is old-school</a><br/>
|
|
• <a href="#C_vs._C++">C vs. C++</a><br/>
|
|
• <a href="#<tt>this</tt>_pointers_and_attributes"><tt>this</tt> pointers and attributes</a><br/>
|
|
• <a href="#Interfaces_and_virtual-function_pointers">Interfaces and virtual-function pointers</a><br/>
|
|
</div>
|
|
<p/>
|
|
|
|
<p/>
|
|
<button style="font-weight:bold;color:maroon;border:0" onclick="expand_all();" href="javascript:;">Expand all sections</button>
|
|
<button style="font-weight:bold;color:maroon;border:0" onclick="collapse_all();" href="javascript:;">Collapse all sections</button>
|
|
|
|
<a id="Why_not_C?"/><h1>Why not C?</h1>
|
|
<button style="font-weight:bold;color:maroon;border:0" padding=0 onclick="toggle_by_name('section_toggle_why_not_c');" href="javascript:;">Toggle section visibility</button>
|
|
<div id="section_toggle_why_not_c" style="display: block">
|
|
|
|
<p/>C lacks many of the features found in modern, high-level languages such as
|
|
Java or Go: garbage collection, collections libraries, generics/near-generics,
|
|
hash-map/linked-list literals built into the language (e.g.
|
|
<tt>mymap={"a"=>1,"b"=>2}</tt> or <tt>mylist=[3,4,5]</tt>), autodoc (e.g.
|
|
Javadoc), and so on. Yet, while memory management is indeed Miller’s
|
|
trickiest aspect, its garbage-collection needs are well-delineated and so the
|
|
absence of GC is no great loss. Miller’s performance relies on
|
|
the principles of <i>touching each byte as few times as possible</i>, and
|
|
<i>copying bytes only when necessary</i>. This results in a baton-passing,
|
|
free-on-last-use memory-management pattern which works well enough. (See also
|
|
<a href="https://github.com/johnkerl/miller/blob/master/c/README.md">
|
|
https://github.com/johnkerl/miller/blob/master/c/README.md</a>.)
|
|
Miller doesn’t require a complex collections library: mostly simple hash
|
|
maps, hash sets, and linked lists which aren’t difficult to code.
|
|
Moreover, Miller’s primary data structure, the
|
|
<a href="https://github.com/johnkerl/miller/blob/master/c/containers/lrec.h"><tt>lrec_t</tt></a>,
|
|
is hand-tuned to Miller’s use case and would have required hand-coding in
|
|
any case.
|
|
|
|
</div>
|
|
<a id="C_vs._Go,_D,_Rust,_etc.;_C_is_fast"/><h1>C vs. Go, D, Rust, etc.; C is fast</h1>
|
|
<button style="font-weight:bold;color:maroon;border:0" padding=0 onclick="toggle_by_name('section_toggle_c_is_fast');" href="javascript:;">Toggle section visibility</button>
|
|
<div id="section_toggle_c_is_fast" style="display: block">
|
|
|
|
<p/>I love Go (<a href="https://golang.org">https://golang.org</a>): I think
|
|
it’s one of the best things ever to happen to our craft, and I use it
|
|
often. The D language (<a href="http://dlang.org">http://dlang.org</a>) is an
|
|
exciting and elegant successor to C++ (more about which below) — D has
|
|
many of Go’s strengths, with a tighter stylistic similarity to C. And initial
|
|
experiments with Rust are intriguing. Yet with none of them could I obtain the
|
|
throughput I get in C.
|
|
|
|
<p/>Specifically, I did simple experiments in several languages — Ruby,
|
|
Python, Lua, Rust, Go, D. In one I just read lines and printed them back out
|
|
— a line-oriented <tt>cat</tt>. In another I consumed input lines like
|
|
<tt>x=1,y=2,z=3</tt> one at a time, split them on commas and equals signs to
|
|
populate hash maps, transformed them (e.g. remove the <tt>y</tt> field), and
|
|
emitted them. Basically <tt>mlr cut -x -f y</tt> with DKVP format. I
|
|
didn’t do anything fancy — just using each language’s
|
|
<tt>getline</tt>, string-split, hashmap-put, etc. And nothing was as fast as
|
|
C, so I used C. Here are the experiments I kept (I failed to keep the
|
|
Lua code, for example):
|
|
<a href="../perf/catc.c.txt">C cat</a>,
|
|
<a href="../perf/catc0.c.txt">another C cat</a>,
|
|
<a href="../perf/catd.d.txt">D cat</a>,
|
|
<a href="../perf/catgo.go.txt">Go cat</a>,
|
|
<a href="../perf/catgo2.go.txt">another Go cat</a>,
|
|
<a href="../perf/catrust.rs.txt">Rust cat</a>,
|
|
<a href="../perf/nimcat.nim.txt">Nim cat</a>,
|
|
<a href="../perf/cutd.d.txt">D cut</a>,
|
|
<a href="../perf/cutgo.go.txt">Go cut</a>,
|
|
<a href="../perf/nimcut.nim.txt">Nim cut</a>.
|
|
|
|
<p/>One of Go’s most powerful features is the ease with which it allows
|
|
quick-to-code, error-free concurrency. Yet Miller, like most high-volume
|
|
text-processing tools, spends most of its time obtaining and parsing input
|
|
strings and negligible time doing all subsequent processing. Thus the absence
|
|
of in-process multiprocessing is only a slight penalty in this particular
|
|
application domain — parallelism here is more easily achieved by running
|
|
multiple single-threaded processes, each handling its own input files, either
|
|
on a single host or split across multiple hosts.
|
|
|
|
</div>
|
|
<a id="C_is_ubiquitous"/><h1>C is ubiquitous</h1>
|
|
<button style="font-weight:bold;color:maroon;border:0" padding=0 onclick="toggle_by_name('section_toggle_c_is_ubiquitous');" href="javascript:;">Toggle section visibility</button>
|
|
<div id="section_toggle_c_is_ubiquitous" style="display: block">
|
|
|
|
<p/>Every Unix-like system has a C compiler (or is an <tt>apt-get</tt> or
|
|
<tt>yum install</tt> away from it). This, I hope, bodes well for uptake
|
|
of Miller.
|
|
|
|
</div>
|
|
<a id="C_is_old-school"/><h1>C is old-school</h1>
|
|
<button style="font-weight:bold;color:maroon;border:0" padding=0 onclick="toggle_by_name('section_toggle_c_is_old_school');" href="javascript:;">Toggle section visibility</button>
|
|
<div id="section_toggle_c_is_old_school" style="display: block">
|
|
|
|
<p/>This alone is not enough reason to program in C, but since I find myself
|
|
coding in C due to the other reasons on this page, it’s happy enough to
|
|
use a throwback language for a throwback tool (see
|
|
<a href="etymology.html">Why call it Miller?</a>). That said, Miller is coded in GNU
|
|
C99, it uses getopt-style command-line parsing, and for development work I make
|
|
use of modern tools such as <a href="http://valgrind.org">valgrind</a>.
|
|
K&amp;R was a long, long time ago. (I’m writing plain C with <tt>//</tt>
|
|
comments; enough said.)
|
|
|
|
</div>
|
|
<a id="C_vs._C++"/><h1>C vs. C++</h1>
|
|
<button style="font-weight:bold;color:maroon;border:0" padding=0 onclick="toggle_by_name('section_toggle_c_vs_cpp');" href="javascript:;">Toggle section visibility</button>
|
|
<div id="section_toggle_c_vs_cpp" style="display: block">
|
|
|
|
I have a strong personal distaste for C++: its syntax is an ugly layer over the
|
|
simplicity of C; templates and STL are even more awkward and even less
|
|
elegant. (Meanwhile I find Java, Go, and D to be both elegant and modern; I
|
|
ruled them out not for aesthetics but for performance as described above.)
|
|
Meanwhile all the positive features I would want from C++ are easily
|
|
implementable in C as follows:
|
|
|
|
<a id="<tt>this</tt>_pointers_and_attributes"/><h2><tt>this</tt> pointers and attributes</h2>
|
|
The C++ compiler implicitly inserts <tt>this</tt> pointers into method calls:
|
|
for example
|
|
|
|
<pre>
|
|
class MyClass {
|
|
private:
|
|
char* a;
|
|
public:
|
|
MyClass(char* a) {
|
|
this->a = strdup(a);
|
|
}
|
|
~MyClass() {
|
|
free(a);
|
|
}
|
|
int myMethod(char* b) {
|
|
return strlen(a) + strlen(b);
|
|
}
|
|
};
|
|
...
|
|
MyClass* myObj = new MyClass("hello");
|
|
int x = myObj->myMethod("world");
|
|
</pre>
|
|
results in something like
|
|
<pre>
|
|
void MyClass$constructorcharptr(MyClass* this, char* a) {
|
|
this->a = strdup(a);
|
|
}
|
|
void MyClass$destructor(MyClass* this) {
|
|
free(this->a);
|
|
}
|
|
int MyClass$myMethod(MyClass* this, char* b) {
|
|
return strlen(this->a) + strlen(b);
|
|
}
|
|
MyClass* myObj = malloc(sizeof(MyClass));
MyClass$constructorcharptr(myObj, "hello");
|
|
int x = MyClass$myMethod(myObj, "world");
|
|
</pre>
|
|
|
|
It’s easy enough to imitate this: simply use the coding convention of
|
|
prepending the class name to all methods, and placing this-pointers as the first arguments to methods.
|
|
Miller uses precisely this approach. For example:
|
|
<pre>
|
|
typedef struct _lrec_t {
|
|
...
|
|
} lrec_t;
|
|
// Constructors
|
|
lrec_t* lrec_csv_alloc(...) {
|
|
lrec_t* prec = malloc(sizeof(lrec_t));
|
|
...
|
|
prec->attribute = ...;
|
|
return prec;
|
|
}
|
|
lrec_t* lrec_dkvp_alloc(...) {
|
|
...
|
|
}
|
|
// Destructor
|
|
void lrec_free(lrec_t* prec) {
|
|
...
|
|
free(prec->attribute);
|
|
...
|
|
free(prec);
|
|
}
|
|
// Methods
|
|
int lrec_foo(lrec_t* prec, ...) {
|
|
return prec->...;
|
|
}
|
|
void lrec_bar(lrec_t* prec, ...) {
|
|
prec->...;
|
|
}
|
|
</pre>
|
|
|
|
<p/> This implements the object-oriented principle of <b>encapsulation</b>.
|
|
|
|
|
|
<a id="Interfaces_and_virtual-function_pointers"/><h2>Interfaces and virtual-function pointers</h2>
|
|
|
|
Coding conventions again do most of the work, here accompanied by typedef'd function pointers.
|
|
For example, here is Miller’s record-reader interface:
|
|
<pre>
|
|
#include &lt;stdio.h&gt;
|
|
#include &lt;containers/lrec.h&gt;
|
|
typedef lrec_t* reader_func_t(FILE* fp, void* pvstate, context_t* pctx);
|
|
typedef void reset_func_t(void* pvstate);
|
|
typedef void reader_free_func_t(void* pvstate);
|
|
|
|
typedef struct _reader_t {
|
|
void* pvstate;
|
|
reader_func_t* preader_func; // Interface method
|
|
reset_func_t* preset_func; // Interface method
|
|
reader_free_func_t* pfree_func; // Interface method
|
|
} reader_t;
|
|
</pre>
|
|
|
|
<p/>A class implementing this interface might look like
|
|
<pre>
|
|
// Attributes are private to this file
|
|
typedef struct _reader_csv_state_t {
|
|
...
|
|
} reader_csv_state_t;
|
|
|
|
// Implementation of interface methods. Marked static (file-scope) to not
|
|
// pollute the global namespace; exposed only via function pointers.
|
|
static lrec_t* reader_csv_func(FILE* input_stream, void* pvstate, context_t* pctx) {
|
|
reader_csv_state_t* pstate = pvstate;
|
|
... use various pstate->attributes ...
|
|
}
|
|
static void reset_csv_func(void* pvstate) {
|
|
reader_csv_state_t* pstate = pvstate;
|
|
... use various pstate->attributes ...
|
|
}
|
|
static void reader_csv_free(void* pvstate) {
|
|
... use various pstate->attributes ...
|
|
}
|
|
|
|
// Constructor
|
|
reader_t* reader_csv_alloc(...) {
|
|
reader_t* preader = mlr_malloc_or_die(sizeof(reader_t));
|
|
|
|
reader_csv_state_t* pstate = mlr_malloc_or_die(sizeof(reader_csv_state_t));
|
|
... set various pstate->attributes ...
|
|
|
|
preader->pvstate = (void*)pstate;
|
|
preader->preader_func = &reader_csv_func;
|
|
preader->preset_func = &reset_csv_func;
|
|
preader->pfree_func = &reader_csv_free;
|
|
|
|
return preader;
|
|
}
|
|
|
|
// Factory method
|
|
...
|
|
reader_t* preader = reader_csv_alloc(...);
|
|
...
|
|
// Method call
|
|
...
|
|
lrec_t* pinrec = preader->preader_func(input_stream, preader->pvstate, pctx);
|
|
...
|
|
</pre>
|
|
|
|
<p/> This implements the object-oriented principles of <b>polymorphism</b> and
|
|
<b>runtime binding</b>.
|
|
|
|
<p/>More details are at
|
|
<a href="https://github.com/johnkerl/miller/tree/master/c/containers">https://github.com/johnkerl/miller/tree/master/c/containers</a>.
|
|
|
|
</div>
|
|
</div>
|
|
</td>
|
|
|
|
</table>
|
|
</body>
|
|
</html>
|