mirror of
https://github.com/johnkerl/miller.git
synced 2026-01-23 10:15:36 +00:00
322 lines
13 KiB
HTML
322 lines
13 KiB
HTML
<!DOCTYPE html>
|
|
<html lang="en">
|
|
|
|
<!-- PAGE GENERATED FROM template.html and content-for-whyc.html BY poki. -->
|
|
<!-- PLEASE MAKE CHANGES THERE AND THEN RE-RUN poki. -->
|
|
<head>
|
|
<meta http-equiv="Content-type" content="text/html;charset=UTF-8"/>
|
|
<meta name="description" content="Miller documentation"/>
|
|
<meta name="viewport" content="width=device-width, initial-scale=1.0"/> <!-- mobile-friendly -->
|
|
<meta name="keywords"
|
|
content="John Kerl, Kerl, Miller, miller, mlr, OLAP, data analysis software, regression, correlation, variance, data tools, " />
|
|
|
|
<title> Why C? </title>
|
|
<link rel="stylesheet" type="text/css" href="css/miller.css"/>
|
|
<link rel="stylesheet" type="text/css" href="css/poki-callbacks.css"/>
|
|
</head>
|
|
|
|
<!-- ================================================================ -->
|
|
<!-- Legacy Google Analytics (ga.js) loader. Chooses the "https://ssl." or
     "http://www." host prefix from document.location.protocol, then uses
     document.write to emit a script tag whose src is that prefix followed by
     "google-analytics.com/ga.js". The tag text is percent-encoded and decoded
     with unescape(), presumably so that no literal closing script tag appears
     inside this inline script (confirm before changing). -->
<script type="text/javascript">
|
|
var gaJsHost = (("https:" == document.location.protocol) ? "https://ssl." : "http://www.");
|
|
document.write(unescape("%3Cscript src='" + gaJsHost + "google-analytics.com/ga.js' type='text/javascript'%3E%3C/script%3E"));
|
|
</script>
|
|
<!-- Page-view tracking: asks _gat for a tracker bound to the property id
     "UA-15651652-1" and calls _trackPageview() on it. The empty catch
     silently ignores any error raised here (presumably, for example, _gat
     being undefined when ga.js did not load), so analytics failures cannot
     break the page. -->
<script type="text/javascript">
|
|
try {
|
|
var pageTracker = _gat._getTracker("UA-15651652-1");
|
|
pageTracker._trackPageview();
|
|
} catch(err) {}
|
|
</script>
|
|
<!-- ================================================================ -->
|
|
|
|
<body bgcolor="#ffffff">
|
|
|
|
<!-- ================================================================ -->
|
|
|
|
<!-- navbar -->
|
|
<div class="pokinav">
|
|
<center><titleinbody>Miller</titleinbody></center>
|
|
|
|
<!-- NAVBAR GENERATED FROM template.html BY poki -->
|
|
<br/>
|
|
<a class="poki-navbar-element" href="index.html">Overview</a>
|
|
|
|
<a class="poki-navbar-element" href="faq.html">Using</a>
|
|
|
|
<a class="poki-navbar-element" href="reference.html">Reference</a>
|
|
|
|
<a class="poki-navbar-element" href="why.html"><b>Background</b></a>
|
|
|
|
<a class="poki-navbar-element" href="contact.html">Repository</a>
|
|
|
|
<br/>
|
|
<br/><a href="why.html">Why?</a>
|
|
<br/><a href="whyc.html"><b>Why C?</b></a>
|
|
<br/><a href="whyc-details.html">Why C: details</a>
|
|
<br/><a href="etymology.html">Why call it Miller?</a>
|
|
<br/><a href="originality.html">How original is Miller?</a>
|
|
<br/><a href="performance.html">Performance</a>
|
|
</div>
|
|
|
|
<!-- page body -->
|
|
<p/>
|
|
|
|
<!-- BODY COPIED FROM content-for-whyc.html BY poki -->
|
|
<div class="pokitoc">
|
|
<center><titleinbody>Why C?</titleinbody></center>
|
|
• <a href="#Why_not_C?">Why not C?</a><br/>
|
|
• <a href="#C_vs._Go,_D,_Rust,_etc.;_C_is_fast">C vs. Go, D, Rust, etc.; C is fast</a><br/>
|
|
• <a href="#C_is_ubiquitous">C is ubiquitous</a><br/>
|
|
• <a href="#C_is_old-school">C is old-school</a><br/>
|
|
• <a href="#C_vs._C++">C vs. C++</a><br/>
|
|
• <a href="#<code>this</code>_pointers_and_attributes"><code>this</code> pointers and attributes</a><br/>
|
|
• <a href="#Interfaces_and_virtual-function_pointers">Interfaces and virtual-function pointers</a><br/>
|
|
</div>
|
|
<p/>
|
|
|
|
<p/>
|
|
<button style="font-weight:bold;color:maroon;border:0" onclick="bodyToggler.expandAll();" href="javascript:;">Expand all sections</button>
|
|
<button style="font-weight:bold;color:maroon;border:0" onclick="bodyToggler.collapseAll();" href="javascript:;">Collapse all sections</button>
|
|
|
|
<a id="Why_not_C?"/><h1>Why not C?</h1>
|
|
<button style="font-weight:bold;color:maroon;border:0" padding=0 onclick="bodyToggler.toggle('body_section_toggle_why_not_c');" href="javascript:;">Toggle section visibility</button>
|
|
<div id="body_section_toggle_why_not_c" style="display: block">
|
|
|
|
<p/>C lacks many of the features found in modern, high-level languages such as
|
|
Java or Go: garbage collection, collections libraries, generics/near-generics,
|
|
hash-map/linked-list literals built into the language (e.g.
|
|
<code>mymap={"a"=>1,"b"=>2}</code> or <code>mylist=[3,4,5]</code>), autodoc (e.g.
|
|
Javadoc), and so on. Yet, while memory management is indeed Miller’s
|
|
trickiest aspect, its garbage-collection needs are well-delineated and so the
|
|
absence of GC is no great loss. Miller’s performance relies on
|
|
the principles of <i>touching each byte as few times as possible</i>, and
|
|
<i>copying bytes only when necessary</i>. This results in a baton-passing,
|
|
free-on-last-use memory-management pattern which works well enough. (See also
|
|
<a href="https://github.com/johnkerl/miller/blob/master/c/README.md">miller/c/README.md</a>.)
|
|
Miller doesn’t require a complex collections library: mostly simple hash
|
|
maps, hash sets, and linked lists which aren’t difficult to code.
|
|
Moreover, Miller’s primary data structure, the
|
|
<a href="https://github.com/johnkerl/miller/blob/master/c/containers/lrec.h"><code>lrec_t</code></a>,
|
|
is hand-tuned to Miller’s use case and would have required hand-coding in
|
|
any case.
|
|
|
|
</div>
|
|
<a id="C_vs._Go,_D,_Rust,_etc.;_C_is_fast"/><h1>C vs. Go, D, Rust, etc.; C is fast</h1>
|
|
<button style="font-weight:bold;color:maroon;border:0" padding=0 onclick="bodyToggler.toggle('body_section_toggle_c_is_fast');" href="javascript:;">Toggle section visibility</button>
|
|
<div id="body_section_toggle_c_is_fast" style="display: block">
|
|
|
|
<p/>I love Go (<a href="https://golang.org">https://golang.org</a>): I think
|
|
it’s one of the best things ever to happen to our craft, and I use it
|
|
often. The D language (<a href="http://dlang.org">http://dlang.org</a>) is an
|
|
exciting and elegant successor to C++ (more about which below) — D has
|
|
many of Go’s strengths, with a tighter stylistic similarity to C. And initial
|
|
experiments with Rust are intriguing. Yet with none of them could I obtain the
|
|
throughput I get in C.
|
|
|
|
<p/>Details are written up at <a href="whyc-details.html">Why C: details</a>.
|
|
|
|
<p/>One of Go’s most powerful features is the ease with which it allows
|
|
quick-to-code, error-free concurrency. Yet Miller, like most high-volume
|
|
text-processing tools, spends most of its time obtaining and parsing input
|
|
strings and negligible time doing all subsequent processing. Thus the absence
|
|
of in-process multiprocessing is only a slight penalty in this particular
|
|
application domain — parallelism here is more easily achieved by running
|
|
multiple single-threaded processes, each handling its own input files, either
|
|
on a single host or split across multiple hosts.
|
|
|
|
</div>
|
|
<a id="C_is_ubiquitous"/><h1>C is ubiquitous</h1>
|
|
<button style="font-weight:bold;color:maroon;border:0" padding=0 onclick="bodyToggler.toggle('body_section_toggle_c_is_ubiquitous');" href="javascript:;">Toggle section visibility</button>
|
|
<div id="body_section_toggle_c_is_ubiquitous" style="display: block">
|
|
|
|
<p/>Every Unix-like system has a C compiler (or is an <code>apt-get</code> or
|
|
<code>yum install</code> away from it). This, I hope, bodes well for uptake
|
|
of Miller.
|
|
|
|
</div>
|
|
<a id="C_is_old-school"/><h1>C is old-school</h1>
|
|
<button style="font-weight:bold;color:maroon;border:0" padding=0 onclick="bodyToggler.toggle('body_section_toggle_c_is_old_school');" href="javascript:;">Toggle section visibility</button>
|
|
<div id="body_section_toggle_c_is_old_school" style="display: block">
|
|
|
|
<p/>This alone is not enough reason to program in C, but since I find myself
|
|
coding in C due to the other reasons on this page, it’s happy enough to
|
|
use a throwback language for a throwback tool (see
|
|
<a href="etymology.html">Why call it Miller?</a>). That said, Miller is coded in GNU
|
|
C99, it uses getopt-style command-line parsing, and for development work I make
|
|
use of modern tools such as <a href="http://valgrind.org">valgrind</a>.
|
|
K&amp;R was a long, long time ago. (I’m writing plain C with <code>//</code>
|
|
comments; enough said.)
|
|
|
|
</div>
|
|
<a id="C_vs._C++"/><h1>C vs. C++</h1>
|
|
<button style="font-weight:bold;color:maroon;border:0" padding=0 onclick="bodyToggler.toggle('body_section_toggle_c_vs_cpp');" href="javascript:;">Toggle section visibility</button>
|
|
<div id="body_section_toggle_c_vs_cpp" style="display: block">
|
|
|
|
I have a strong personal distaste for C++: its syntax is an ugly layer over the
|
|
simplicity of C; templates and STL are even more awkward and even less
|
|
elegant. (Meanwhile I find Java, Go, and D to be both elegant and modern; I
|
|
ruled them out not for aesthetics but for performance as described above.)
|
|
Meanwhile all the positive features I would want from C++ are easily
|
|
implementable in C as follows:
|
|
|
|
<a id="<code>this</code>_pointers_and_attributes"/><h2><code>this</code> pointers and attributes</h2>
|
|
The C++ compiler implicitly inserts <code>this</code> pointers into method calls:
|
|
for example
|
|
|
|
<pre>
|
|
class MyClass {
|
|
private:
|
|
char* a;
|
|
public:
|
|
MyClass(char* a) {
|
|
this->a = strdup(a);
|
|
}
|
|
~MyClass() {
|
|
free(a);
|
|
}
|
|
int myMethod(char* b) {
|
|
return strlen(a) + strlen(b);
|
|
}
|
|
};
|
|
...
|
|
MyClass* myObj = new MyClass("hello");
|
|
int x = myObj->myMethod("world");
|
|
</pre>
|
|
results in something like
|
|
<pre>
|
|
void MyClass$constructorcharptr(MyClass* this, char* a) {
|
|
this->a = strdup(a);
|
|
}
|
|
void MyClass$destructor(MyClass* this) {
|
|
free(this->a);
|
|
}
|
|
int MyClass$myMethod(MyClass* this, char* b) {
|
|
return strlen(this->a) + strlen(b);
|
|
}
|
|
MyClass* myObj = malloc(sizeof(MyClass));
MyClass$constructorcharptr(myObj, "hello");
|
|
int x = MyClass$myMethod(myObj, "world");
|
|
</pre>
|
|
|
|
It’s easy enough to imitate this: simply use the coding convention of
|
|
prepending the class name to all methods, and placing this-pointers as the first arguments to methods.
|
|
Miller uses precisely this approach. For example:
|
|
<pre>
|
|
typedef struct _lrec_t {
|
|
...
|
|
} lrec_t;
|
|
// Constructors
|
|
lrec_t* lrec_csv_alloc(...) {
|
|
lrec_t* prec = malloc(sizeof(lrec_t));
|
|
...
|
|
prec->attribute = ...;
|
|
return prec;
|
|
}
|
|
lrec_t* lrec_dkvp_alloc(...) {
|
|
...
|
|
}
|
|
// Destructor
|
|
void lrec_free(lrec_t* prec) {
|
|
...
|
|
free(prec->attribute);
|
|
...
|
|
free(prec);
|
|
}
|
|
// Methods
|
|
int lrec_foo(lrec_t* prec, ...) {
|
|
return prec->...;
|
|
}
|
|
void lrec_bar(lrec_t* prec, ...) {
|
|
prec->...;
|
|
}
|
|
</pre>
|
|
|
|
<p/> This implements the object-oriented principle of <b>encapsulation</b>.
|
|
|
|
|
|
<a id="Interfaces_and_virtual-function_pointers"/><h2>Interfaces and virtual-function pointers</h2>
|
|
|
|
Coding conventions again do most of the work, here accompanied by typedef'd function pointers.
|
|
For example, here is Miller’s record-reader interface:
|
|
<pre>
|
|
#include &lt;stdio.h&gt;
|
|
#include &lt;containers/lrec.h&gt;
|
|
typedef lrec_t* reader_func_t(FILE* fp, void* pvstate, context_t* pctx);
|
|
typedef void reset_func_t(void* pvstate);
|
|
typedef void reader_free_func_t(void* pvstate);
|
|
|
|
typedef struct _reader_t {
|
|
void* pvstate;
|
|
reader_func_t* preader_func; // Interface method
|
|
reset_func_t* preset_func; // Interface method
|
|
reader_free_func_t* pfree_func; // Interface method
|
|
} reader_t;
|
|
</pre>
|
|
|
|
<p/>A class implementing this interface might look like
|
|
<pre>
|
|
// Attributes are private to this file
|
|
typedef struct _reader_csv_state_t {
|
|
...
|
|
} reader_csv_state_t;
|
|
|
|
// Implementation of interface methods. Marked static (file-scope) to not
|
|
// pollute the global namespace; exposed only via function pointers.
|
|
static lrec_t* reader_csv_func(FILE* input_stream, void* pvstate, context_t* pctx) {
|
|
reader_csv_state_t* pstate = pvstate;
|
|
... use various pstate->attributes ...
|
|
}
|
|
static void reset_csv_func(void* pvstate) {
|
|
reader_csv_state_t* pstate = pvstate;
|
|
... use various pstate->attributes ...
|
|
}
|
|
static void reader_csv_free(void* pvstate) {
|
|
... use various pstate->attributes ...
|
|
}
|
|
|
|
// Constructor
|
|
reader_t* reader_csv_alloc(...) {
|
|
reader_t* preader = mlr_malloc_or_die(sizeof(reader_t));
|
|
|
|
reader_csv_state_t* pstate = mlr_malloc_or_die(sizeof(reader_csv_state_t));
|
|
... set various pstate->attributes ...
|
|
|
|
preader->pvstate = (void*)pstate;
|
|
preader->preader_func = &reader_csv_func;
|
|
preader->preset_func = &reset_csv_func;
|
|
preader->pfree_func = &reader_csv_free;
|
|
|
|
return preader;
|
|
}
|
|
|
|
// Factory method
|
|
...
|
|
reader_t* preader = reader_csv_alloc(...);
|
|
...
|
|
// Method call
|
|
...
|
|
lrec_t* pinrec = preader->preader_func(input_stream, preader->pvstate, pctx);
|
|
...
|
|
</pre>
|
|
|
|
<p/> This implements the object-oriented principles of <b>polymorphism</b> and
|
|
<b>runtime binding</b>.
|
|
|
|
<p/>More details are at
|
|
<a href="https://github.com/johnkerl/miller/tree/master/c/containers">https://github.com/johnkerl/miller/tree/master/c/containers</a>.
|
|
|
|
</div>
|
|
|
|
<!-- ================================================================ -->
|
|
<script type="text/javascript" src="js/miller-doc-toggler.js"></script>
|
|
<!-- wtf -->
|
|
<!-- Creates the global bodyToggler object that the section buttons' onclick
     handlers call (expandAll, collapseAll, toggle). The first argument is the
     shared prefix of the toggleable divs' ids ("body_section_toggle_..."); the
     two 'maroon' arguments are presumably colors used by MillerDocToggler
     (confirm against js/miller-doc-toggler.js, which is loaded just above). -->
<script type="text/javascript">
|
|
// Put this at the bottom of the page since its constructor scans the
|
|
// document's div tags to find the toggleables.
|
|
const bodyToggler = new MillerDocToggler(
|
|
"body_section_toggle_",
|
|
'maroon',
|
|
'maroon',
|
|
);
|
|
</script>
|
|
|
|
</body>
|
|
</html>
|