From 00692557ea50bf4c6ff0d1afce8d27d230a00fe8 Mon Sep 17 00:00:00 2001
From: John Kerl <kerl.john.r@gmail.com>
Date: Thu, 14 May 2015 14:05:45 -0400
Subject: [PATCH] doc neatens

---
 c/cli/mlrcli.c                          |  3 +-
 c/todo.txt                              | 24 ++--------
 doc/content-for-feature-comparison.html |  7 +++
 doc/content-for-performance.html        |  4 +-
 doc/feature-comparison.html             |  7 +++
 doc/performance.html                    |  4 +-
 python/sketch.py                        | 63 +++++++++++++------------
 7 files changed, 59 insertions(+), 53 deletions(-)

diff --git a/c/cli/mlrcli.c b/c/cli/mlrcli.c
index cfabab37f..edcc31a49 100644
--- a/c/cli/mlrcli.c
+++ b/c/cli/mlrcli.c
@@ -59,10 +59,11 @@ static void main_usage(char* argv0, int exit_code) {
 		}
 		if ((i > 0) && (linelen > 0))
 			fprintf(o, " ");
+		else
+			fprintf(o, "   ");
 		fprintf(o, "%s", mapper_lookup_table[i]->verb);
 	}
 	fprintf(o, "\n");
-	fprintf(o, "\n");
 	fprintf(o, "Please use \"%s {verb name} --help\" for verb-specific help.\n", argv0);
 	fprintf(o, "Please use \"%s --help-all-verbs\" for help on all verbs.\n", argv0);
 
diff --git a/c/todo.txt b/c/todo.txt
index 49c548d48..7f3940714 100644
--- a/c/todo.txt
+++ b/c/todo.txt
@@ -21,6 +21,8 @@ FEATURES
 
    > definitely put nidx before dkvp in the mlrwik/formats page
 
+   > "index-numbered" -> "implicitly index-numbered" in mlrwik
+
 !! use 1-(|l2|/|l1|)^2 as pca quality metric? verify against r2 in munch plots.
   -> after pgr legend fix
 
@@ -28,8 +30,9 @@ FEATURES
 
 ! ordered cut (a la reorder). either a new command (yeck) or cut option (e.g. cut -o)
 
-* stats1 mode: what about "1"=="1.0"? doc this, or impl option
-  w/ temporary sscanf & reformat @ maxlen
+! rip through filenames @ start & abend unless -f each: fail fast.
+
+! mlr sort CLI opt for choice of heap/merge/quick -- ?
 
 * mod op (either c-like, or sane) and put into wikidoc if so.
 
@@ -46,16 +49,12 @@ NEATEN
 
 !! xxx's in the code
 * source hygiene: top-of-header comments, readme re memory management, etc.
-* check all usage messages
 
 * prune pix/ dir
 * makefile go/d/rs stuff -> language-comparisons/ dir
 * catc.c/catc0.c -> language-comparisons/ dir
 * remove/coalesce/gzip the large data files
 * mk perfcomp dir
-* "index-numbered" -> "implicitly index-numbered" in mlrwik
-* rip through filenames @ start & abend unless -f each: fail fast.
-* play with python sketch.py (& rename -- mlr.py??) & make sure it's at least usable for something
 
 * trawl around sysadmin docs etc. looking for more use-cases. not just data analysis or devops/logdive.
 
@@ -65,7 +64,6 @@ NEATEN
 ================================================================
 ONLINE HELP
 
-* then-chaining note into mlr online help
 * put/filter: have a categorized function lister -- by string/math or arity, or some such ...
 
 ================================================================
@@ -73,10 +71,6 @@ IMPROVEMENTS
 * pprint reader: read using field widths?!? with strip ... that would solve the embedded-whitespace problem.
   lmhsi: column name -> char index??
 * null-handling everywhere!! :/
-* 80-column-wrap slls printer (for mlr --help)
-* char -> char* for RS/FS/PS. then, NEWLINENEWLINE + repifs -> xtab is a more or less special case of dkvp.
-  BUT this makes mlr_get_line harder :/
-* mlr sort CLI opt for choice of heap/merge/quick -- ?
 
 ================================================================
 TESTING
@@ -91,11 +85,7 @@ DOC
   dkvp cat completely stateless. stats1/2 retain only agg state. tac/sort/etc. obvious retain all lines.
   you can snarf a 20GB file with 4GB RAM no problem.
 * performance: Go/D/Rust #'s ... also GH link from mlrwik
-* intro note about more structuring: grep/sed/ruby/perl/sed etc. for some structuring, then pipe to miller?
 * maybe restore mem-mgmt page for dev-info? or a readme?!?
-* somewhere in mlrwik put some realistic log-dive data -- not abixy & not just resource/ok.
-* at the top link to data/small/etc. make small.txt hardlinks for browser download.
-* emph all variables are stream variables and all functions are stream functions
 * meaneb: assumes uncorr. code/doc link to appxb :)
 
 ================================================================
@@ -140,10 +130,6 @@ UT/REG
 * multi-csv I/O: include --icsv --odkvp and --idkvp --ocsv, as well as --csv cases
 * het-xtab out
 
-================================================================
-I/O
-? right-align only some in --opprint?
-
 ================================================================
 INTERNAL DOCS (e.g. README)
 * sllv==NULL vs. sllv_single(NULL); and mem-mgmt cmts thruout
diff --git a/doc/content-for-feature-comparison.html b/doc/content-for-feature-comparison.html
index a3f48eeda..5bdfb908e 100644
--- a/doc/content-for-feature-comparison.html
+++ b/doc/content-for-feature-comparison.html
@@ -42,6 +42,13 @@ Its domain-specific languages are limited to the <tt>filter</tt> and
 <tt>put</tt> syntax. Futher programmability comes from chaining with
 <tt>then</tt>.
 
+<li/> Unlike with <tt>awk</tt>, all variables are stream variables and all
+functions are stream functions.  This means <tt>NF</tt>, <tt>NR</tt>, etc.
+change from one record to the next, <tt>$x</tt> is a label for field <tt>x</tt>
+in the current record, and the input to <tt>sqrt($x)</tt> changes from one
+record to the next.  Miller doesn&rsquo;t let you set, say, <tt>sum=0</tt> and
+then update that on each record.
+
 <li/> Miller is faster than <tt>awk</tt>, <tt>cut</tt>, and so on (depending on
 platform; see also POKI_PUT_LINK_FOR_PAGE(performance.html)HERE). In
 particular, Miller&rsquo;s DSL syntax is parsed into C control structures at
diff --git a/doc/content-for-performance.html b/doc/content-for-performance.html
index ceaa9d42a..23740030a 100644
--- a/doc/content-for-performance.html
+++ b/doc/content-for-performance.html
@@ -52,4 +52,6 @@ rather, I attempted only to show that Miller&rsquo;s processing time here is com
 
 <h1>Conclusion</h1>
 
-For record-oriented data transformations, Miller is worth consideration. Field renames are worth doing as a pre-pipe or post-pipe using <tt>sed</tt>.
+For record-oriented data transformations, Miller meets or beats the Unix
+toolkit in many contexts. Field renames in particular are worth doing as a
+pre-pipe or post-pipe using <tt>sed</tt>.
diff --git a/doc/feature-comparison.html b/doc/feature-comparison.html
index 219f50a68..e4e49a747 100644
--- a/doc/feature-comparison.html
+++ b/doc/feature-comparison.html
@@ -166,6 +166,13 @@ Its domain-specific languages are limited to the <tt>filter</tt> and
 <tt>put</tt> syntax. Futher programmability comes from chaining with
 <tt>then</tt>.
 
+<li/> Unlike with <tt>awk</tt>, all variables are stream variables and all
+functions are stream functions.  This means <tt>NF</tt>, <tt>NR</tt>, etc.
+change from one record to the next, <tt>$x</tt> is a label for field <tt>x</tt>
+in the current record, and the input to <tt>sqrt($x)</tt> changes from one
+record to the next.  Miller doesn&rsquo;t let you set, say, <tt>sum=0</tt> and
+then update that on each record.
+
 <li/> Miller is faster than <tt>awk</tt>, <tt>cut</tt>, and so on (depending on
 platform; see also <a href="performance.html">Performance</a>). In
 particular, Miller&rsquo;s DSL syntax is parsed into C control structures at
diff --git a/doc/performance.html b/doc/performance.html
index a14b3855e..9ca7f62f9 100644
--- a/doc/performance.html
+++ b/doc/performance.html
@@ -206,7 +206,9 @@ rather, I attempted only to show that Miller&rsquo;s processing time here is com
 
 <a id="Conclusion"/><h1>Conclusion</h1>
 
-For record-oriented data transformations, Miller is worth consideration. Field renames are worth doing as a pre-pipe or post-pipe using <tt>sed</tt>.
+For record-oriented data transformations, Miller meets or beats the Unix
+toolkit in many contexts. Field renames in particular are worth doing as a
+pre-pipe or post-pipe using <tt>sed</tt>.
     </div>
   </td>
 
diff --git a/python/sketch.py b/python/sketch.py
index e99e6f199..81a5621ef 100755
--- a/python/sketch.py
+++ b/python/sketch.py
@@ -34,17 +34,23 @@ def usage():
    print >> sys.stderr, "  -P {ps}   Input/output key-value-pair separator"
    print >> sys.stderr, "  -v {name=value} xxx needs more doc"
    print >> sys.stderr, ""
-   print >> sys.stderr, "  --idfl      Input  format is delimited by IRS,IFS,IPS"
-   print >> sys.stderr, "  --odfl      Output format is delimited by IRS,IFS,IPS"
-   print >> sys.stderr, "  --ihdrdata  Input  format is delimited by IRS,IFS,IPS, with header line followed by data lines (e.g. CSV)"
-   print >> sys.stderr, "  --ohdrdata  Output format is delimited by IRS,IFS,IPS, with header line followed by data lines (e.g. CSV)"
-   print >> sys.stderr, "  --iidx      Input  format is implicitly integer-indexed (awk-style)"
-   print >> sys.stderr, "  --oidx      Output format is implicitly integer-indexed (awk-style)"
-   print >> sys.stderr, "  --itbl      Input  format is tabular-pretty-print"
-   print >> sys.stderr, "  --otbl      Output format is tabular-pretty-print"
-   print >> sys.stderr, "  --ixtbl     Input  format is transposed-tabular-pretty-print"
-   print >> sys.stderr, "  --oxtbl     Output format is transposed-tabular-pretty-print"
-   print >> sys.stderr, "Modulator-spec help is TBD."
+   print >> sys.stderr, "  --idkvp  Input  format is delimited by IRS,IFS,IPS"
+   print >> sys.stderr, "  --odkvp  Output format is delimited by IRS,IFS,IPS"
+   print >> sys.stderr, "  --icsv   Input  format is delimited by IRS,IFS,IPS, with header line followed by data lines (e.g. CSV)"
+   print >> sys.stderr, "  --ocsv   Output format is delimited by IRS,IFS,IPS, with header line followed by data lines (e.g. CSV)"
+   print >> sys.stderr, "  --inidx  Input  format is implicitly integer-indexed (awk-style)"
+   print >> sys.stderr, "  --onidx  Output format is implicitly integer-indexed (awk-style)"
+   print >> sys.stderr, "  --ixtab  Input  format is transposed-tabular-pretty-print"
+   print >> sys.stderr, "  --oxtab  Output format is transposed-tabular-pretty-print"
+   print >> sys.stderr, "Modulator specs:"
+   print >> sys.stderr, '--cat'
+   print >> sys.stderr, '--tac'
+   print >> sys.stderr, '--cut'
+   print >> sys.stderr, '--cutx'
+   print >> sys.stderr, '--sortfields'
+   print >> sys.stderr, '--sortfieldsup'
+   print >> sys.stderr, '--sortfieldsdown'
+
    sys.exit(1)
 
 # ----------------------------------------------------------------
@@ -56,8 +62,8 @@ def parse_command_line():
 
    try:
       optargs, non_option_args = getopt.getopt(sys.argv[1:], "R:F:P:v:h", [
-		  'help', 'idfl', 'odfl', 'ihdrdata', 'ohdrdata', 'iidx', 'oidx', 'itbl', 'otbl', 'ixtbl',
-		  'oxtbl', 'cat', 'tac', 'inclflds=', 'exclflds=', 'sortfields', 'sortfieldsup', 'sortfieldsdown'])
+		  'help', 'idkvp', 'odkvp', 'icsv', 'ocsv', 'inidx', 'onidx', 'ixtab', 'oxtab',
+		  'cat', 'tac', 'cut=', 'cutx=', 'sortfields', 'sortfieldsup', 'sortfieldsdown'])
 
    except getopt.GetoptError, err:
       print str(err)
@@ -78,35 +84,35 @@ def parse_command_line():
          kv = string.split(arg, "=", 1)
          namespace.put(kv[0], kv[1])
 
-      elif opt == '--idfl':
+      elif opt == '--idkvp':
          rreader = RecordReaderDefault(istream=sys.stdin, namespace=namespace, irs=namespace.get("IRS"), ifs=namespace.get("IFS"), ips=namespace.get("IPS"))
-      elif opt == '--odfl':
+      elif opt == '--odkvp':
          rwriter = RecordWriterDefault(ostream=sys.stdout, ors=namespace.get("ORS"), ofs=namespace.get("OFS"), ops=namespace.get("OPS"))
 
-      elif opt == '--ihdrdata':
+      elif opt == '--icsv':
          rreader = RecordReaderHeaderFirst(istream=sys.stdin, namespace=namespace, irs=namespace.get("IRS"), ifs=namespace.get("IFS"))
-      elif opt == '--ohdrdata':
+      elif opt == '--ocsv':
          rwriter = RecordWriterHeaderFirst(ostream=sys.stdout, ors=namespace.get("ORS"), ofs=namespace.get("OFS"))
 
-      elif opt == '--iidx':
+      elif opt == '--inidx':
          rreader = RecordReaderIntegerIndexed(istream=sys.stdin, namespace=namespace, irs=namespace.get("IRS"), ifs=namespace.get("IFS"))
-      elif opt == '--oidx':
+      elif opt == '--onidx':
          rwriter = RecordWriterIntegerIndexed(ostream=sys.stdout, ors=namespace.get("ORS"), ofs=namespace.get("OFS"))
 
-      #elif opt == '--ixtbl':
+      #elif opt == '--ixtab':
       #   pass
-      elif opt == '--oxtbl':
+      elif opt == '--oxtab':
          rwriter = RecordWriterVerticallyTabulated(ostream=sys.stdout) # xxx args w/r/t/ RS/FS/PS?!?
 
       elif opt == '--cat':
          rmodulator = CatModulator()
       elif opt == '--tac':
          rmodulator = TacModulator()
-      elif opt == '--inclflds':
+      elif opt == '--cut':
          rmodulator = SelectFieldsModulator(string.split(arg, namespace.get("IFS")))
-      elif opt == '--exclflds':
+      elif opt == '--cutx':
          rmodulator = DeselectFieldsModulator(string.split(arg, namespace.get("IFS")))
-      elif opt == '--exclflds':
+      elif opt == '--cutx':
          rmodulator = DeselectFieldsModulator(string.split(arg, namespace.get("IFS")))
       elif opt == '--sortfields':
          rmodulator = SortFieldsInRecordModulator(True)
@@ -114,8 +120,6 @@ def parse_command_line():
          rmodulator = SortFieldsInRecordModulator(True)
       elif opt == '--sortfieldsdown':
          rmodulator = SortFieldsInRecordModulator(False)
-      #--mean i,x,y@a,b ... *NOT* the @-sign!
-      #rmodulator = MeanModulator(["i","x","y"],["a","b"])
 
       elif opt == '--help':
          usage()
@@ -138,8 +142,8 @@ def main():
    options = parse_command_line()
 
    # parse ARGV:
-   # * --ifmt: dfl,hdr1st,iidxed,align,xposealign
-   # * --ofmt: dfl,hdr1st,iidxed,align,xposealign
+   # * --ifmt: dkvp,hdr1st,iidxed,align,xposealign
+   # * --ofmt: dkvp,hdr1st,iidxed,align,xposealign
    # * which-control-language spec?!?
    # * modulators/script ... this is the key decision area for language(s) design.
    # * filenames
@@ -466,9 +470,6 @@ class MeanModulator:
 class StreamModulator:
    def __init__(self):
       pass
-   # xxx clearly define duck-ops for istream & ostream.
-   # * sys.stdin, sys.stdout, file ops need to impl it (maybe need to decorate them to do so).
-   # * likewise need to be able to compose one stream modulator inside another. e.g. sort(sum(inclflds(...)...)...).
    def modulate(self, rreader, rmodulator, rwriter):
       while True:
          in_record = rreader.read()