doc neatens

2026-01-23 02:14:13 +00:00 · 2015-05-14 14:05:45 -04:00 · 2015-05-14 14:05:45 -04:00 · 00692557ea
commit 00692557ea
parent ad3ca6c04b
7 changed files with 59 additions and 53 deletions
--- a/c/cli/mlrcli.c
+++ b/c/cli/mlrcli.c
@ -59,10 +59,11 @@ static void main_usage(char* argv0, int exit_code) {
 		}
 		if ((i > 0) && (linelen > 0))
 			fprintf(o, " ");
+		else
+			fprintf(o, "   ");
 		fprintf(o, "%s", mapper_lookup_table[i]->verb);
 	}
 	fprintf(o, "\n");
-	fprintf(o, "\n");
 	fprintf(o, "Please use \"%s {verb name} --help\" for verb-specific help.\n", argv0);
 	fprintf(o, "Please use \"%s --help-all-verbs\" for help on all verbs.\n", argv0);

--- a/c/todo.txt
+++ b/c/todo.txt
@ -21,6 +21,8 @@ FEATURES

   > definitely put nidx before dkvp in the mlrwik/formats page

+   > "index-numbered" -> "implicitly index-numbered" in mlrwik
+
 !! use 1-(|l2|/|l1|)^2 as pca quality metric? verify against r2 in munch plots.
  -> after pgr legend fix

@ -28,8 +30,9 @@ FEATURES

 ! ordered cut (a la reorder). either a new command (yeck) or cut option (e.g. cut -o)

-* stats1 mode: what about "1"=="1.0"? doc this, or impl option
-  w/ temporary sscanf & reformat @ maxlen
+! rip through filenames @ start & abend unless -f each: fail fast.
+
+! mlr sort CLI opt for choice of heap/merge/quick -- ?

 * mod op (either c-like, or sane) and put into wikidoc if so.

@ -46,16 +49,12 @@ NEATEN

 !! xxx's in the code
 * source hygiene: top-of-header comments, readme re memory management, etc.
-* check all usage messages

 * prune pix/ dir
 * makefile go/d/rs stuff -> language-comparisons/ dir
 * catc.c/catc0.c -> language-comparisons/ dir
 * remove/coalesce/gzip the large data files
 * mk perfcomp dir
-* "index-numbered" -> "implicitly index-numbered" in mlrwik
-* rip through filenames @ start & abend unless -f each: fail fast.
-* play with python sketch.py (& rename -- mlr.py??) & make sure it's at least usable for something

 * trawl around sysadmin docs etc. looking for more use-cases. not just data analysis or devops/logdive.

@ -65,7 +64,6 @@ NEATEN
 ================================================================
 ONLINE HELP

-* then-chaining note into mlr online help
 * put/filter: have a categorized function lister -- by string/math or arity, or some such ...

 ================================================================
@ -73,10 +71,6 @@ IMPROVEMENTS
 * pprint reader: read using field widths?!? with strip ... that would solve the embedded-whitespace problem.
  lmhsi: column name -> char index??
 * null-handling everywhere!! :/
-* 80-column-wrap slls printer (for mlr --help)
-* char -> char* for RS/FS/PS. then, NEWLINENEWLINE + repifs -> xtab is a more or less special case of dkvp.
-  BUT this makes mlr_get_line harder :/
-* mlr sort CLI opt for choice of heap/merge/quick -- ?

 ================================================================
 TESTING
@ -91,11 +85,7 @@ DOC
  dkvp cat completely stateless. stats1/2 retain only agg state. tac/sort/etc. obvious retain all lines.
  you can snarf a 20GB file with 4GB RAM no problem.
 * performance: Go/D/Rust #'s ... also GH link from mlrwik
-* intro note about more structuring: grep/sed/ruby/perl/sed etc. for some structuring, then pipe to miller?
 * maybe restore mem-mgmt page for dev-info? or a readme?!?
-* somewhere in mlrwik put some realistic log-dive data -- not abixy & not just resource/ok.
-* at the top link to data/small/etc. make small.txt hardlinks for browser download.
-* emph all variables are stream variables and all functions are stream functions
 * meaneb: assumes uncorr. code/doc link to appxb :)

 ================================================================
@ -140,10 +130,6 @@ UT/REG
 * multi-csv I/O: include --icsv --odkvp and --idkvp --ocsv, as well as --csv cases
 * het-xtab out

-================================================================
-I/O
-? right-align only some in --opprint?
-
 ================================================================
 INTERNAL DOCS (e.g. README)
 * sllv==NULL vs. sllv_single(NULL); and mem-mgmt cmts thruout
--- a/doc/content-for-feature-comparison.html
+++ b/doc/content-for-feature-comparison.html
@ -42,6 +42,13 @@ Its domain-specific languages are limited to the <tt>filter</tt> and
 <tt>put</tt> syntax. Further programmability comes from chaining with
 <tt>then</tt>.

+<li/> Unlike with <tt>awk</tt>, all variables are stream variables and all
+functions are stream functions.  This means <tt>NF</tt>, <tt>NR</tt>, etc.
+change from one line to another, <tt>$x</tt> is a label for field <tt>x</tt> in
+the current record, and the input to <tt>sqrt($x)</tt> changes from one record
+to the next.  Miller doesn&rsquo;t let you set, say, <tt>sum=0</tt> and then
+update that on each record.
+
 <li/> Miller is faster than <tt>awk</tt>, <tt>cut</tt>, and so on (depending on
 platform; see also POKI_PUT_LINK_FOR_PAGE(performance.html)HERE). In
 particular, Miller&rsquo;s DSL syntax is parsed into C control structures at
--- a/doc/content-for-performance.html
+++ b/doc/content-for-performance.html
@ -52,4 +52,6 @@ rather, I attempted only to show that Miller&rsquo;s processing time here is com

 <h1>Conclusion</h1>

-For record-oriented data transformations, Miller is worth consideration. Field renames are worth doing as a pre-pipe or post-pipe using <tt>sed</tt>.
+For record-oriented data transformations, Miller meets or beats the Unix
+toolkit in many contexts. Field renames in particular are worth doing as a
+pre-pipe or post-pipe using <tt>sed</tt>.
--- a/doc/feature-comparison.html
+++ b/doc/feature-comparison.html
@ -166,6 +166,13 @@ Its domain-specific languages are limited to the <tt>filter</tt> and
 <tt>put</tt> syntax. Further programmability comes from chaining with
 <tt>then</tt>.

+<li/> Unlike with <tt>awk</tt>, all variables are stream variables and all
+functions are stream functions.  This means <tt>NF</tt>, <tt>NR</tt>, etc.
+change from one line to another, <tt>$x</tt> is a label for field <tt>x</tt> in
+the current record, and the input to <tt>sqrt($x)</tt> changes from one record
+to the next.  Miller doesn&rsquo;t let you set, say, <tt>sum=0</tt> and then
+update that on each record.
+
 <li/> Miller is faster than <tt>awk</tt>, <tt>cut</tt>, and so on (depending on
 platform; see also <a href="performance.html">Performance</a>). In
 particular, Miller&rsquo;s DSL syntax is parsed into C control structures at
--- a/doc/performance.html
+++ b/doc/performance.html
@ -206,7 +206,9 @@ rather, I attempted only to show that Miller&rsquo;s processing time here is com

 <a id="Conclusion"/><h1>Conclusion</h1>

-For record-oriented data transformations, Miller is worth consideration. Field renames are worth doing as a pre-pipe or post-pipe using <tt>sed</tt>.
+For record-oriented data transformations, Miller meets or beats the Unix
+toolkit in many contexts. Field renames in particular are worth doing as a
+pre-pipe or post-pipe using <tt>sed</tt>.
    </div>
  </td>

--- a/python/sketch.py
+++ b/python/sketch.py
@ -34,17 +34,23 @@ def usage():
   print >> sys.stderr, "  -P {ps}   Input/output key-value-pair separator"
   print >> sys.stderr, "  -v {name=value} xxx needs more doc"
   print >> sys.stderr, ""
-   print >> sys.stderr, "  --idfl      Input  format is delimited by IRS,IFS,IPS"
-   print >> sys.stderr, "  --odfl      Output format is delimited by IRS,IFS,IPS"
-   print >> sys.stderr, "  --ihdrdata  Input  format is delimited by IRS,IFS,IPS, with header line followed by data lines (e.g. CSV)"
-   print >> sys.stderr, "  --ohdrdata  Output format is delimited by IRS,IFS,IPS, with header line followed by data lines (e.g. CSV)"
-   print >> sys.stderr, "  --iidx      Input  format is implicitly integer-indexed (awk-style)"
-   print >> sys.stderr, "  --oidx      Output format is implicitly integer-indexed (awk-style)"
-   print >> sys.stderr, "  --itbl      Input  format is tabular-pretty-print"
-   print >> sys.stderr, "  --otbl      Output format is tabular-pretty-print"
-   print >> sys.stderr, "  --ixtbl     Input  format is transposed-tabular-pretty-print"
-   print >> sys.stderr, "  --oxtbl     Output format is transposed-tabular-pretty-print"
-   print >> sys.stderr, "Modulator-spec help is TBD."
+   print >> sys.stderr, "  --idkvp  Input  format is delimited by IRS,IFS,IPS"
+   print >> sys.stderr, "  --odkvp  Output format is delimited by IRS,IFS,IPS"
+   print >> sys.stderr, "  --icsv   Input  format is delimited by IRS,IFS,IPS, with header line followed by data lines (e.g. CSV)"
+   print >> sys.stderr, "  --ocsv   Output format is delimited by IRS,IFS,IPS, with header line followed by data lines (e.g. CSV)"
+   print >> sys.stderr, "  --inidx  Input  format is implicitly integer-indexed (awk-style)"
+   print >> sys.stderr, "  --onidx  Output format is implicitly integer-indexed (awk-style)"
+   print >> sys.stderr, "  --ixtab  Input  format is transposed-tabular-pretty-print"
+   print >> sys.stderr, "  --oxtab  Output format is transposed-tabular-pretty-print"
+   print >> sys.stderr, "Modulator specs:"
+   print >> sys.stderr, '--cat'
+   print >> sys.stderr, '--tac'
+   print >> sys.stderr, '--cut'
+   print >> sys.stderr, '--cutx'
+   print >> sys.stderr, '--sortfields'
+   print >> sys.stderr, '--sortfieldsup'
+   print >> sys.stderr, '--sortfieldsdown'
+
   sys.exit(1)

 # ----------------------------------------------------------------
@ -56,8 +62,8 @@ def parse_command_line():

   try:
      optargs, non_option_args = getopt.getopt(sys.argv[1:], "R:F:P:v:h", [
-		  'help', 'idfl', 'odfl', 'ihdrdata', 'ohdrdata', 'iidx', 'oidx', 'itbl', 'otbl', 'ixtbl',
-		  'oxtbl', 'cat', 'tac', 'inclflds=', 'exclflds=', 'sortfields', 'sortfieldsup', 'sortfieldsdown'])
+		  'help', 'idkvp', 'odkvp', 'icsv', 'ocsv', 'inidx', 'onidx', 'ixtab', 'oxtab',
+		  'cat', 'tac', 'cut=', 'cutx=', 'sortfields', 'sortfieldsup', 'sortfieldsdown'])

   except getopt.GetoptError, err:
      print str(err)
@ -78,35 +84,35 @@ def parse_command_line():
         kv = string.split(arg, "=", 1)
         namespace.put(kv[0], kv[1])

-      elif opt == '--idfl':
+      elif opt == '--idkvp':
         rreader = RecordReaderDefault(istream=sys.stdin, namespace=namespace, irs=namespace.get("IRS"), ifs=namespace.get("IFS"), ips=namespace.get("IPS"))
-      elif opt == '--odfl':
+      elif opt == '--odkvp':
         rwriter = RecordWriterDefault(ostream=sys.stdout, ors=namespace.get("ORS"), ofs=namespace.get("OFS"), ops=namespace.get("OPS"))

-      elif opt == '--ihdrdata':
+      elif opt == '--icsv':
         rreader = RecordReaderHeaderFirst(istream=sys.stdin, namespace=namespace, irs=namespace.get("IRS"), ifs=namespace.get("IFS"))
-      elif opt == '--ohdrdata':
+      elif opt == '--ocsv':
         rwriter = RecordWriterHeaderFirst(ostream=sys.stdout, ors=namespace.get("ORS"), ofs=namespace.get("OFS"))

-      elif opt == '--iidx':
+      elif opt == '--inidx':
         rreader = RecordReaderIntegerIndexed(istream=sys.stdin, namespace=namespace, irs=namespace.get("IRS"), ifs=namespace.get("IFS"))
-      elif opt == '--oidx':
+      elif opt == '--onidx':
         rwriter = RecordWriterIntegerIndexed(ostream=sys.stdout, ors=namespace.get("ORS"), ofs=namespace.get("OFS"))

-      #elif opt == '--ixtbl':
+      #elif opt == '--ixtab':
      #   pass
-      elif opt == '--oxtbl':
+      elif opt == '--oxtab':
         rwriter = RecordWriterVerticallyTabulated(ostream=sys.stdout) # xxx args w/r/t/ RS/FS/PS?!?

      elif opt == '--cat':
         rmodulator = CatModulator()
      elif opt == '--tac':
         rmodulator = TacModulator()
-      elif opt == '--inclflds':
+      elif opt == '--cut':
         rmodulator = SelectFieldsModulator(string.split(arg, namespace.get("IFS")))
-      elif opt == '--exclflds':
+      elif opt == '--cutx':
         rmodulator = DeselectFieldsModulator(string.split(arg, namespace.get("IFS")))
-      elif opt == '--exclflds':
+      elif opt == '--cutx':
         rmodulator = DeselectFieldsModulator(string.split(arg, namespace.get("IFS")))
      elif opt == '--sortfields':
         rmodulator = SortFieldsInRecordModulator(True)
@ -114,8 +120,6 @@ def parse_command_line():
         rmodulator = SortFieldsInRecordModulator(True)
      elif opt == '--sortfieldsdown':
         rmodulator = SortFieldsInRecordModulator(False)
-      #--mean i,x,y@a,b ... *NOT* the @-sign!
-      #rmodulator = MeanModulator(["i","x","y"],["a","b"])

      elif opt == '--help':
         usage()
@ -138,8 +142,8 @@ def main():
   options = parse_command_line()

   # parse ARGV:
-   # * --ifmt: dfl,hdr1st,iidxed,align,xposealign
-   # * --ofmt: dfl,hdr1st,iidxed,align,xposealign
+   # * --ifmt: dkvp,hdr1st,iidxed,align,xposealign
+   # * --ofmt: dkvp,hdr1st,iidxed,align,xposealign
   # * which-control-language spec?!?
   # * modulators/script ... this is the key decision area for language(s) design.
   # * filenames
@ -466,9 +470,6 @@ class MeanModulator:
 class StreamModulator:
   def __init__(self):
      pass
-   # xxx clearly define duck-ops for istream & ostream.
-   # * sys.stdin, sys.stdout, file ops need to impl it (maybe need to decorate them to do so).
-   # * likewise need to be able to compose one stream modulator inside another. e.g. sort(sum(inclflds(...)...)...).
   def modulate(self, rreader, rmodulator, rwriter):
      while True:
         in_record = rreader.read()