diff --git a/c/cli/mlrcli.c b/c/cli/mlrcli.c
index cfabab37f..edcc31a49 100644
--- a/c/cli/mlrcli.c
+++ b/c/cli/mlrcli.c
@@ -59,10 +59,11 @@ static void main_usage(char* argv0, int exit_code) {
}
if ((i > 0) && (linelen > 0))
fprintf(o, " ");
+ else
+ fprintf(o, " ");
fprintf(o, "%s", mapper_lookup_table[i]->verb);
}
fprintf(o, "\n");
- fprintf(o, "\n");
fprintf(o, "Please use \"%s {verb name} --help\" for verb-specific help.\n", argv0);
fprintf(o, "Please use \"%s --help-all-verbs\" for help on all verbs.\n", argv0);
diff --git a/c/todo.txt b/c/todo.txt
index 49c548d48..7f3940714 100644
--- a/c/todo.txt
+++ b/c/todo.txt
@@ -21,6 +21,8 @@ FEATURES
> definitely put nidx before dkvp in the mlrwik/formats page
+ > "index-numbered" -> "implicitly index-numbered" in mlrwik
+
!! use 1-(|l2|/|l1|)^2 as pca quality metric? verify against r2 in munch plots.
-> after pgr legend fix
@@ -28,8 +30,9 @@ FEATURES
! ordered cut (a la reorder). either a new command (yeck) or cut option (e.g. cut -o)
-* stats1 mode: what about "1"=="1.0"? doc this, or impl option
- w/ temporary sscanf & reformat @ maxlen
+! rip through filenames @ start & abend unless -f each: fail fast.
+
+! mlr sort CLI opt for choice of heap/merge/quick -- ?
* mod op (either c-like, or sane) and put into wikidoc if so.
@@ -46,16 +49,12 @@ NEATEN
!! xxx's in the code
* source hygiene: top-of-header comments, readme re memory management, etc.
-* check all usage messages
* prune pix/ dir
* makefile go/d/rs stuff -> language-comparisons/ dir
* catc.c/catc0.c -> language-comparisons/ dir
* remove/coalesce/gzip the large data files
* mk perfcomp dir
-* "index-numbered" -> "implicitly index-numbered" in mlrwik
-* rip through filenames @ start & abend unless -f each: fail fast.
-* play with python sketch.py (& rename -- mlr.py??) & make sure it's at least usable for something
* trawl around sysadmin docs etc. looking for more use-cases. not just data analysis or devops/logdive.
@@ -65,7 +64,6 @@ NEATEN
================================================================
ONLINE HELP
-* then-chaining note into mlr online help
* put/filter: have a categorized function lister -- by string/math or arity, or some such ...
================================================================
@@ -73,10 +71,6 @@ IMPROVEMENTS
* pprint reader: read using field widths?!? with strip ... that would solve the embedded-whitespace problem.
lmhsi: column name -> char index??
* null-handling everywhere!! :/
-* 80-column-wrap slls printer (for mlr --help)
-* char -> char* for RS/FS/PS. then, NEWLINENEWLINE + repifs -> xtab is a more or less special case of dkvp.
- BUT this makes mlr_get_line harder :/
-* mlr sort CLI opt for choice of heap/merge/quick -- ?
================================================================
TESTING
@@ -91,11 +85,7 @@ DOC
dkvp cat completely stateless. stats1/2 retain only agg state. tac/sort/etc. obvious retain all lines.
you can snarf a 20GB file with 4GB RAM no problem.
* performance: Go/D/Rust #'s ... also GH link from mlrwik
-* intro note about more structuring: grep/sed/ruby/perl/sed etc. for some structuring, then pipe to miller?
* maybe restore mem-mgmt page for dev-info? or a readme?!?
-* somewhere in mlrwik put some realistic log-dive data -- not abixy & not just resource/ok.
-* at the top link to data/small/etc. make small.txt hardlinks for browser download.
-* emph all variables are stream variables and all functions are stream functions
* meaneb: assumes uncorr. code/doc link to appxb :)
================================================================
@@ -140,10 +130,6 @@ UT/REG
* multi-csv I/O: include --icsv --odkvp and --idkvp --ocsv, as well as --csv cases
* het-xtab out
-================================================================
-I/O
-? right-align only some in --opprint?
-
================================================================
INTERNAL DOCS (e.g. README)
* sllv==NULL vs. sllv_single(NULL); and mem-mgmt cmts thruout
diff --git a/doc/content-for-feature-comparison.html b/doc/content-for-feature-comparison.html
index a3f48eeda..5bdfb908e 100644
--- a/doc/content-for-feature-comparison.html
+++ b/doc/content-for-feature-comparison.html
@@ -42,6 +42,13 @@ Its domain-specific languages are limited to the filter and
put syntax. Further programmability comes from chaining with
then.
+
Unlike with awk, all variables are stream variables and all
+functions are stream functions. This means NF, NR, etc.
+change from one line to another, $x is a label for field x in
+the current record, and the input to sqrt($x) changes from one record
+to the next. Miller doesn’t let you set, say, sum=0 and then
+update that on each record.
+
Miller is faster than awk, cut, and so on (depending on
platform; see also POKI_PUT_LINK_FOR_PAGE(performance.html)HERE). In
particular, Miller’s DSL syntax is parsed into C control structures at
diff --git a/doc/content-for-performance.html b/doc/content-for-performance.html
index ceaa9d42a..23740030a 100644
--- a/doc/content-for-performance.html
+++ b/doc/content-for-performance.html
@@ -52,4 +52,6 @@ rather, I attempted only to show that Miller’s processing time here is com
Conclusion
-For record-oriented data transformations, Miller is worth consideration. Field renames are worth doing as a pre-pipe or post-pipe using sed.
+For record-oriented data transformations, Miller meets or beats the Unix
+toolkit in many contexts. Field renames in particular are worth doing as a
+pre-pipe or post-pipe using sed.
diff --git a/doc/feature-comparison.html b/doc/feature-comparison.html
index 219f50a68..e4e49a747 100644
--- a/doc/feature-comparison.html
+++ b/doc/feature-comparison.html
@@ -166,6 +166,13 @@ Its domain-specific languages are limited to the filter and
put syntax. Further programmability comes from chaining with
then.
+ Unlike with awk, all variables are stream variables and all
+functions are stream functions. This means NF, NR, etc.
+change from one line to another, $x is a label for field x in
+the current record, and the input to sqrt($x) changes from one record
+to the next. Miller doesn’t let you set, say, sum=0 and then
+update that on each record.
+
Miller is faster than awk, cut, and so on (depending on
platform; see also Performance). In
particular, Miller’s DSL syntax is parsed into C control structures at
diff --git a/doc/performance.html b/doc/performance.html
index a14b3855e..9ca7f62f9 100644
--- a/doc/performance.html
+++ b/doc/performance.html
@@ -206,7 +206,9 @@ rather, I attempted only to show that Miller’s processing time here is com
Conclusion
-For record-oriented data transformations, Miller is worth consideration. Field renames are worth doing as a pre-pipe or post-pipe using sed.
+For record-oriented data transformations, Miller meets or beats the Unix
+toolkit in many contexts. Field renames in particular are worth doing as a
+pre-pipe or post-pipe using sed.
diff --git a/python/sketch.py b/python/sketch.py
index e99e6f199..81a5621ef 100755
--- a/python/sketch.py
+++ b/python/sketch.py
@@ -34,17 +34,23 @@ def usage():
print >> sys.stderr, " -P {ps} Input/output key-value-pair separator"
print >> sys.stderr, " -v {name=value} xxx needs more doc"
print >> sys.stderr, ""
- print >> sys.stderr, " --idfl Input format is delimited by IRS,IFS,IPS"
- print >> sys.stderr, " --odfl Output format is delimited by IRS,IFS,IPS"
- print >> sys.stderr, " --ihdrdata Input format is delimited by IRS,IFS,IPS, with header line followed by data lines (e.g. CSV)"
- print >> sys.stderr, " --ohdrdata Output format is delimited by IRS,IFS,IPS, with header line followed by data lines (e.g. CSV)"
- print >> sys.stderr, " --iidx Input format is implicitly integer-indexed (awk-style)"
- print >> sys.stderr, " --oidx Output format is implicitly integer-indexed (awk-style)"
- print >> sys.stderr, " --itbl Input format is tabular-pretty-print"
- print >> sys.stderr, " --otbl Output format is tabular-pretty-print"
- print >> sys.stderr, " --ixtbl Input format is transposed-tabular-pretty-print"
- print >> sys.stderr, " --oxtbl Output format is transposed-tabular-pretty-print"
- print >> sys.stderr, "Modulator-spec help is TBD."
+ print >> sys.stderr, " --idkvp Input format is delimited by IRS,IFS,IPS"
+ print >> sys.stderr, " --odkvp Output format is delimited by IRS,IFS,IPS"
+ print >> sys.stderr, " --icsv Input format is delimited by IRS,IFS,IPS, with header line followed by data lines (e.g. CSV)"
+ print >> sys.stderr, " --ocsv Output format is delimited by IRS,IFS,IPS, with header line followed by data lines (e.g. CSV)"
+ print >> sys.stderr, " --inidx Input format is implicitly integer-indexed (awk-style)"
+ print >> sys.stderr, " --onidx Output format is implicitly integer-indexed (awk-style)"
+ print >> sys.stderr, " --ixtab Input format is transposed-tabular-pretty-print"
+ print >> sys.stderr, " --oxtab Output format is transposed-tabular-pretty-print"
+ print >> sys.stderr, "Modulator specs:"
+ print >> sys.stderr, '--cat'
+ print >> sys.stderr, '--tac'
+ print >> sys.stderr, '--cut'
+ print >> sys.stderr, '--cutx'
+ print >> sys.stderr, '--sortfields'
+ print >> sys.stderr, '--sortfieldsup'
+ print >> sys.stderr, '--sortfieldsdown'
+
sys.exit(1)
# ----------------------------------------------------------------
@@ -56,8 +62,8 @@ def parse_command_line():
try:
optargs, non_option_args = getopt.getopt(sys.argv[1:], "R:F:P:v:h", [
- 'help', 'idfl', 'odfl', 'ihdrdata', 'ohdrdata', 'iidx', 'oidx', 'itbl', 'otbl', 'ixtbl',
- 'oxtbl', 'cat', 'tac', 'inclflds=', 'exclflds=', 'sortfields', 'sortfieldsup', 'sortfieldsdown'])
+ 'help', 'idkvp', 'odkvp', 'icsv', 'ocsv', 'inidx', 'onidx', 'ixtab', 'oxtab',
+ 'cat', 'tac', 'cut=', 'cutx=', 'sortfields', 'sortfieldsup', 'sortfieldsdown'])
except getopt.GetoptError, err:
print str(err)
@@ -78,35 +84,35 @@ def parse_command_line():
kv = string.split(arg, "=", 1)
namespace.put(kv[0], kv[1])
- elif opt == '--idfl':
+ elif opt == '--idkvp':
rreader = RecordReaderDefault(istream=sys.stdin, namespace=namespace, irs=namespace.get("IRS"), ifs=namespace.get("IFS"), ips=namespace.get("IPS"))
- elif opt == '--odfl':
+ elif opt == '--odkvp':
rwriter = RecordWriterDefault(ostream=sys.stdout, ors=namespace.get("ORS"), ofs=namespace.get("OFS"), ops=namespace.get("OPS"))
- elif opt == '--ihdrdata':
+ elif opt == '--icsv':
rreader = RecordReaderHeaderFirst(istream=sys.stdin, namespace=namespace, irs=namespace.get("IRS"), ifs=namespace.get("IFS"))
- elif opt == '--ohdrdata':
+ elif opt == '--ocsv':
rwriter = RecordWriterHeaderFirst(ostream=sys.stdout, ors=namespace.get("ORS"), ofs=namespace.get("OFS"))
- elif opt == '--iidx':
+ elif opt == '--inidx':
rreader = RecordReaderIntegerIndexed(istream=sys.stdin, namespace=namespace, irs=namespace.get("IRS"), ifs=namespace.get("IFS"))
- elif opt == '--oidx':
+ elif opt == '--onidx':
rwriter = RecordWriterIntegerIndexed(ostream=sys.stdout, ors=namespace.get("ORS"), ofs=namespace.get("OFS"))
- #elif opt == '--ixtbl':
+ #elif opt == '--ixtab':
# pass
- elif opt == '--oxtbl':
+ elif opt == '--oxtab':
rwriter = RecordWriterVerticallyTabulated(ostream=sys.stdout) # xxx args w/r/t/ RS/FS/PS?!?
elif opt == '--cat':
rmodulator = CatModulator()
elif opt == '--tac':
rmodulator = TacModulator()
- elif opt == '--inclflds':
+ elif opt == '--cut':
rmodulator = SelectFieldsModulator(string.split(arg, namespace.get("IFS")))
- elif opt == '--exclflds':
+ elif opt == '--cutx':
rmodulator = DeselectFieldsModulator(string.split(arg, namespace.get("IFS")))
- elif opt == '--exclflds':
+ elif opt == '--cutx':
rmodulator = DeselectFieldsModulator(string.split(arg, namespace.get("IFS")))
elif opt == '--sortfields':
rmodulator = SortFieldsInRecordModulator(True)
@@ -114,8 +120,6 @@ def parse_command_line():
rmodulator = SortFieldsInRecordModulator(True)
elif opt == '--sortfieldsdown':
rmodulator = SortFieldsInRecordModulator(False)
- #--mean i,x,y@a,b ... *NOT* the @-sign!
- #rmodulator = MeanModulator(["i","x","y"],["a","b"])
elif opt == '--help':
usage()
@@ -138,8 +142,8 @@ def main():
options = parse_command_line()
# parse ARGV:
- # * --ifmt: dfl,hdr1st,iidxed,align,xposealign
- # * --ofmt: dfl,hdr1st,iidxed,align,xposealign
+ # * --ifmt: dkvp,hdr1st,iidxed,align,xposealign
+ # * --ofmt: dkvp,hdr1st,iidxed,align,xposealign
# * which-control-language spec?!?
# * modulators/script ... this is the key decision area for language(s) design.
# * filenames
@@ -466,9 +470,6 @@ class MeanModulator:
class StreamModulator:
def __init__(self):
pass
- # xxx clearly define duck-ops for istream & ostream.
- # * sys.stdin, sys.stdout, file ops need to impl it (maybe need to decorate them to do so).
- # * likewise need to be able to compose one stream modulator inside another. e.g. sort(sum(inclflds(...)...)...).
def modulate(self, rreader, rmodulator, rwriter):
while True:
in_record = rreader.read()