From 00692557ea50bf4c6ff0d1afce8d27d230a00fe8 Mon Sep 17 00:00:00 2001 From: John Kerl Date: Thu, 14 May 2015 14:05:45 -0400 Subject: [PATCH] doc neatens --- c/cli/mlrcli.c | 3 +- c/todo.txt | 24 ++-------- doc/content-for-feature-comparison.html | 7 +++ doc/content-for-performance.html | 4 +- doc/feature-comparison.html | 7 +++ doc/performance.html | 4 +- python/sketch.py | 63 +++++++++++++------------ 7 files changed, 59 insertions(+), 53 deletions(-) diff --git a/c/cli/mlrcli.c b/c/cli/mlrcli.c index cfabab37f..edcc31a49 100644 --- a/c/cli/mlrcli.c +++ b/c/cli/mlrcli.c @@ -59,10 +59,11 @@ static void main_usage(char* argv0, int exit_code) { } if ((i > 0) && (linelen > 0)) fprintf(o, " "); + else + fprintf(o, " "); fprintf(o, "%s", mapper_lookup_table[i]->verb); } fprintf(o, "\n"); - fprintf(o, "\n"); fprintf(o, "Please use \"%s {verb name} --help\" for verb-specific help.\n", argv0); fprintf(o, "Please use \"%s --help-all-verbs\" for help on all verbs.\n", argv0); diff --git a/c/todo.txt b/c/todo.txt index 49c548d48..7f3940714 100644 --- a/c/todo.txt +++ b/c/todo.txt @@ -21,6 +21,8 @@ FEATURES > definitely put nidx before dkvp in the mlrwik/formats page + > "index-numbered" -> "implicitly index-numbered" in mlrwik + !! use 1-(|l2|/|l1|)^2 as pca quality metric? verify against r2 in munch plots. -> after pgr legend fix @@ -28,8 +30,9 @@ FEATURES ! ordered cut (a la reorder). either a new command (yeck) or cut option (e.g. cut -o) -* stats1 mode: what about "1"=="1.0"? doc this, or impl option - w/ temporary sscanf & reformat @ maxlen +! rip through filenames @ start & abend unless -f each: fail fast. + +! mlr sort CLI opt for choice of heap/merge/quick -- ? * mod op (either c-like, or sane) and put into wikidoc if so. @@ -46,16 +49,12 @@ NEATEN !! xxx's in the code * source hygiene: top-of-header comments, readme re memory management, etc. -* check all usage messages * prune pix/ dir * makefile go/d/rs stuff -> language-comparisons/ dir * catc.c/catc0.c -> language-comparisons/ dir * remove/coalesce/gzip the large data files * mk perfcomp dir -* "index-numbered" -> "implicitly index-numbered" in mlrwik -* rip through filenames @ start & abend unless -f each: fail fast. -* play with python sketch.py (& rename -- mlr.py??) & make sure it's at least usable for something * trawl around sysadmin docs etc. looking for more use-cases. not just data analysis or devops/logdive. @@ -65,7 +64,6 @@ NEATEN ================================================================ ONLINE HELP -* then-chaining note into mlr online help * put/filter: have a categorized function lister -- by string/math or arity, or some such ... ================================================================ @@ -73,10 +71,6 @@ IMPROVEMENTS * pprint reader: read using field widths?!? with strip ... that would solve the embedded-whitespace problem. lmhsi: column name -> char index?? * null-handling everywhere!! :/ -* 80-column-wrap slls printer (for mlr --help) -* char -> char* for RS/FS/PS. then, NEWLINENEWLINE + repifs -> xtab is a more or less special case of dkvp. - BUT this makes mlr_get_line harder :/ -* mlr sort CLI opt for choice of heap/merge/quick -- ? ================================================================ TESTING @@ -91,11 +85,7 @@ DOC dkvp cat completely stateless. stats1/2 retain only agg state. tac/sort/etc. obvious retain all lines. you can snarf a 20GB file with 4GB RAM no problem. * performance: Go/D/Rust #'s ... also GH link from mlrwik -* intro note about more structuring: grep/sed/ruby/perl/sed etc. for some structuring, then pipe to miller? * maybe restore mem-mgmt page for dev-info? or a readme?!? -* somewhere in mlrwik put some realistic log-dive data -- not abixy & not just resource/ok. -* at the top link to data/small/etc. make small.txt hardlinks for browser download. -* emph all variables are stream variables and all functions are stream functions * meaneb: assumes uncorr. code/doc link to appxb :) ================================================================ @@ -140,10 +130,6 @@ UT/REG * multi-csv I/O: include --icsv --odkvp and --idkvp --ocsv, as well as --csv cases * het-xtab out -================================================================ -I/O -? right-align only some in --opprint? - ================================================================ INTERNAL DOCS (e.g. README) * sllv==NULL vs. sllv_single(NULL); and mem-mgmt cmts thruout diff --git a/doc/content-for-feature-comparison.html b/doc/content-for-feature-comparison.html index a3f48eeda..5bdfb908e 100644 --- a/doc/content-for-feature-comparison.html +++ b/doc/content-for-feature-comparison.html @@ -42,6 +42,13 @@ Its domain-specific languages are limited to the filter and put syntax. Futher programmability comes from chaining with then. +
  • Unlike with awk, all variables are stream variables and all +functions are stream functions. This means NF, NR, etc. +change from one line to another, $x is a label for field x in +the current record, and the input to sqrt($x) changes from one record +to the next. Miller doesn’t let you set, say, sum=0 and then +update that on each record. +
  • Miller is faster than awk, cut, and so on (depending on platform; see also POKI_PUT_LINK_FOR_PAGE(performance.html)HERE). In particular, Miller’s DSL syntax is parsed into C control structures at diff --git a/doc/content-for-performance.html b/doc/content-for-performance.html index ceaa9d42a..23740030a 100644 --- a/doc/content-for-performance.html +++ b/doc/content-for-performance.html @@ -52,4 +52,6 @@ rather, I attempted only to show that Miller’s processing time here is com

    Conclusion

    -For record-oriented data transformations, Miller is worth consideration. Field renames are worth doing as a pre-pipe or post-pipe using sed. +For record-oriented data transformations, Miller meets or beats the Unix +toolkit in many contexts. Field renames in particular are worth doing as a +pre-pipe or post-pipe using sed. diff --git a/doc/feature-comparison.html b/doc/feature-comparison.html index 219f50a68..e4e49a747 100644 --- a/doc/feature-comparison.html +++ b/doc/feature-comparison.html @@ -166,6 +166,13 @@ Its domain-specific languages are limited to the filter and put syntax. Futher programmability comes from chaining with then. +
  • Unlike with awk, all variables are stream variables and all +functions are stream functions. This means NF, NR, etc. +change from one line to another, $x is a label for field x in +the current record, and the input to sqrt($x) changes from one record +to the next. Miller doesn’t let you set, say, sum=0 and then +update that on each record. +
  • Miller is faster than awk, cut, and so on (depending on platform; see also Performance). In particular, Miller’s DSL syntax is parsed into C control structures at diff --git a/doc/performance.html b/doc/performance.html index a14b3855e..9ca7f62f9 100644 --- a/doc/performance.html +++ b/doc/performance.html @@ -206,7 +206,9 @@ rather, I attempted only to show that Miller’s processing time here is com

    Conclusion

    -For record-oriented data transformations, Miller is worth consideration. Field renames are worth doing as a pre-pipe or post-pipe using sed. +For record-oriented data transformations, Miller meets or beats the Unix +toolkit in many contexts. Field renames in particular are worth doing as a +pre-pipe or post-pipe using sed. diff --git a/python/sketch.py b/python/sketch.py index e99e6f199..81a5621ef 100755 --- a/python/sketch.py +++ b/python/sketch.py @@ -34,17 +34,23 @@ def usage(): print >> sys.stderr, " -P {ps} Input/output key-value-pair separator" print >> sys.stderr, " -v {name=value} xxx needs more doc" print >> sys.stderr, "" - print >> sys.stderr, " --idfl Input format is delimited by IRS,IFS,IPS" - print >> sys.stderr, " --odfl Output format is delimited by IRS,IFS,IPS" - print >> sys.stderr, " --ihdrdata Input format is delimited by IRS,IFS,IPS, with header line followed by data lines (e.g. CSV)" - print >> sys.stderr, " --ohdrdata Output format is delimited by IRS,IFS,IPS, with header line followed by data lines (e.g. CSV)" - print >> sys.stderr, " --iidx Input format is implicitly integer-indexed (awk-style)" - print >> sys.stderr, " --oidx Output format is implicitly integer-indexed (awk-style)" - print >> sys.stderr, " --itbl Input format is tabular-pretty-print" - print >> sys.stderr, " --otbl Output format is tabular-pretty-print" - print >> sys.stderr, " --ixtbl Input format is transposed-tabular-pretty-print" - print >> sys.stderr, " --oxtbl Output format is transposed-tabular-pretty-print" - print >> sys.stderr, "Modulator-spec help is TBD." + print >> sys.stderr, " --idkvp Input format is delimited by IRS,IFS,IPS" + print >> sys.stderr, " --odkvp Output format is delimited by IRS,IFS,IPS" + print >> sys.stderr, " --icsv Input format is delimited by IRS,IFS,IPS, with header line followed by data lines (e.g. CSV)" + print >> sys.stderr, " --ocsv Output format is delimited by IRS,IFS,IPS, with header line followed by data lines (e.g. CSV)" + print >> sys.stderr, " --inidx Input format is implicitly integer-indexed (awk-style)" + print >> sys.stderr, " --onidx Output format is implicitly integer-indexed (awk-style)" + print >> sys.stderr, " --ixtab Input format is transposed-tabular-pretty-print" + print >> sys.stderr, " --oxtab Output format is transposed-tabular-pretty-print" + print >> sys.stderr, "Modulator specs:" + print >> sys.stderr, '--cat' + print >> sys.stderr, '--tac' + print >> sys.stderr, '--cut' + print >> sys.stderr, '--cutx' + print >> sys.stderr, '--sortfields' + print >> sys.stderr, '--sortfieldsup' + print >> sys.stderr, '--sortfieldsdown' + sys.exit(1) # ---------------------------------------------------------------- @@ -56,8 +62,8 @@ def parse_command_line(): try: optargs, non_option_args = getopt.getopt(sys.argv[1:], "R:F:P:v:h", [ - 'help', 'idfl', 'odfl', 'ihdrdata', 'ohdrdata', 'iidx', 'oidx', 'itbl', 'otbl', 'ixtbl', - 'oxtbl', 'cat', 'tac', 'inclflds=', 'exclflds=', 'sortfields', 'sortfieldsup', 'sortfieldsdown']) + 'help', 'idkvp', 'odkvp', 'icsv', 'ocsv', 'inidx', 'onidx', 'ixtab', 'oxtab', + 'cat', 'tac', 'cut=', 'cutx=', 'sortfields', 'sortfieldsup', 'sortfieldsdown']) except getopt.GetoptError, err: print str(err) @@ -78,35 +84,35 @@ def parse_command_line(): kv = string.split(arg, "=", 1) namespace.put(kv[0], kv[1]) - elif opt == '--idfl': + elif opt == '--idkvp': rreader = RecordReaderDefault(istream=sys.stdin, namespace=namespace, irs=namespace.get("IRS"), ifs=namespace.get("IFS"), ips=namespace.get("IPS")) - elif opt == '--odfl': + elif opt == '--odkvp': rwriter = RecordWriterDefault(ostream=sys.stdout, ors=namespace.get("ORS"), ofs=namespace.get("OFS"), ops=namespace.get("OPS")) - elif opt == '--ihdrdata': + elif opt == '--icsv': rreader = RecordReaderHeaderFirst(istream=sys.stdin, namespace=namespace, irs=namespace.get("IRS"), ifs=namespace.get("IFS")) - elif opt == '--ohdrdata': + elif opt == '--ocsv': rwriter = RecordWriterHeaderFirst(ostream=sys.stdout, ors=namespace.get("ORS"), ofs=namespace.get("OFS")) - elif opt == '--iidx': + elif opt == '--inidx': rreader = RecordReaderIntegerIndexed(istream=sys.stdin, namespace=namespace, irs=namespace.get("IRS"), ifs=namespace.get("IFS")) - elif opt == '--oidx': + elif opt == '--onidx': rwriter = RecordWriterIntegerIndexed(ostream=sys.stdout, ors=namespace.get("ORS"), ofs=namespace.get("OFS")) - #elif opt == '--ixtbl': + #elif opt == '--ixtab': # pass - elif opt == '--oxtbl': + elif opt == '--oxtab': rwriter = RecordWriterVerticallyTabulated(ostream=sys.stdout) # xxx args w/r/t/ RS/FS/PS?!? elif opt == '--cat': rmodulator = CatModulator() elif opt == '--tac': rmodulator = TacModulator() - elif opt == '--inclflds': + elif opt == '--cut': rmodulator = SelectFieldsModulator(string.split(arg, namespace.get("IFS"))) - elif opt == '--exclflds': + elif opt == '--cutx': rmodulator = DeselectFieldsModulator(string.split(arg, namespace.get("IFS"))) - elif opt == '--exclflds': + elif opt == '--cutx': rmodulator = DeselectFieldsModulator(string.split(arg, namespace.get("IFS"))) elif opt == '--sortfields': rmodulator = SortFieldsInRecordModulator(True) @@ -114,8 +120,6 @@ def parse_command_line(): rmodulator = SortFieldsInRecordModulator(True) elif opt == '--sortfieldsdown': rmodulator = SortFieldsInRecordModulator(False) - #--mean i,x,y@a,b ... *NOT* the @-sign! - #rmodulator = MeanModulator(["i","x","y"],["a","b"]) elif opt == '--help': usage() @@ -138,8 +142,8 @@ def main(): options = parse_command_line() # parse ARGV: - # * --ifmt: dfl,hdr1st,iidxed,align,xposealign - # * --ofmt: dfl,hdr1st,iidxed,align,xposealign + # * --ifmt: dkvp,hdr1st,iidxed,align,xposealign + # * --ofmt: dkvp,hdr1st,iidxed,align,xposealign # * which-control-language spec?!? # * modulators/script ... this is the key decision area for language(s) design. # * filenames @@ -466,9 +470,6 @@ class MeanModulator: class StreamModulator: def __init__(self): pass - # xxx clearly define duck-ops for istream & ostream. - # * sys.stdin, sys.stdout, file ops need to impl it (maybe need to decorate them to do so). - # * likewise need to be able to compose one stream modulator inside another. e.g. sort(sum(inclflds(...)...)...). def modulate(self, rreader, rmodulator, rwriter): while True: in_record = rreader.read()