mirror of
https://github.com/johnkerl/miller.git
synced 2026-01-23 02:14:13 +00:00
doc neatens
This commit is contained in:
parent
ad3ca6c04b
commit
00692557ea
7 changed files with 59 additions and 53 deletions
|
|
@ -59,10 +59,11 @@ static void main_usage(char* argv0, int exit_code) {
|
|||
}
|
||||
if ((i > 0) && (linelen > 0))
|
||||
fprintf(o, " ");
|
||||
else
|
||||
fprintf(o, " ");
|
||||
fprintf(o, "%s", mapper_lookup_table[i]->verb);
|
||||
}
|
||||
fprintf(o, "\n");
|
||||
fprintf(o, "\n");
|
||||
fprintf(o, "Please use \"%s {verb name} --help\" for verb-specific help.\n", argv0);
|
||||
fprintf(o, "Please use \"%s --help-all-verbs\" for help on all verbs.\n", argv0);
|
||||
|
||||
|
|
|
|||
24
c/todo.txt
24
c/todo.txt
|
|
@ -21,6 +21,8 @@ FEATURES
|
|||
|
||||
> definitely put nidx before dkvp in the mlrwik/formats page
|
||||
|
||||
> "index-numbered" -> "implicitly index-numbered" in mlrwik
|
||||
|
||||
!! use 1-(|l2|/|l1|)^2 as pca quality metric? verify against r2 in munch plots.
|
||||
-> after pgr legend fix
|
||||
|
||||
|
|
@ -28,8 +30,9 @@ FEATURES
|
|||
|
||||
! ordered cut (a la reorder). either a new command (yeck) or cut option (e.g. cut -o)
|
||||
|
||||
* stats1 mode: what about "1"=="1.0"? doc this, or impl option
|
||||
w/ temporary sscanf & reformat @ maxlen
|
||||
! rip through filenames @ start & abend unless -f each: fail fast.
|
||||
|
||||
! mlr sort CLI opt for choice of heap/merge/quick -- ?
|
||||
|
||||
* mod op (either c-like, or sane) and put into wikidoc if so.
|
||||
|
||||
|
|
@ -46,16 +49,12 @@ NEATEN
|
|||
|
||||
!! xxx's in the code
|
||||
* source hygiene: top-of-header comments, readme re memory management, etc.
|
||||
* check all usage messages
|
||||
|
||||
* prune pix/ dir
|
||||
* makefile go/d/rs stuff -> language-comparisons/ dir
|
||||
* catc.c/catc0.c -> language-comparisons/ dir
|
||||
* remove/coalesce/gzip the large data files
|
||||
* mk perfcomp dir
|
||||
* "index-numbered" -> "implicitly index-numbered" in mlrwik
|
||||
* rip through filenames @ start & abend unless -f each: fail fast.
|
||||
* play with python sketch.py (& rename -- mlr.py??) & make sure it's at least usable for something
|
||||
|
||||
* trawl around sysadmin docs etc. looking for more use-cases. not just data analysis or devops/logdive.
|
||||
|
||||
|
|
@ -65,7 +64,6 @@ NEATEN
|
|||
================================================================
|
||||
ONLINE HELP
|
||||
|
||||
* then-chaining note into mlr online help
|
||||
* put/filter: have a categorized function lister -- by string/math or arity, or some such ...
|
||||
|
||||
================================================================
|
||||
|
|
@ -73,10 +71,6 @@ IMPROVEMENTS
|
|||
* pprint reader: read using field widths?!? with strip ... that would solve the embedded-whitespace problem.
|
||||
lmhsi: column name -> char index??
|
||||
* null-handling everywhere!! :/
|
||||
* 80-column-wrap slls printer (for mlr --help)
|
||||
* char -> char* for RS/FS/PS. then, NEWLINENEWLINE + repifs -> xtab is a more or less special case of dkvp.
|
||||
BUT this makes mlr_get_line harder :/
|
||||
* mlr sort CLI opt for choice of heap/merge/quick -- ?
|
||||
|
||||
================================================================
|
||||
TESTING
|
||||
|
|
@ -91,11 +85,7 @@ DOC
|
|||
dkvp cat completely stateless. stats1/2 retain only agg state. tac/sort/etc. obvious retain all lines.
|
||||
you can snarf a 20GB file with 4GB RAM no problem.
|
||||
* performance: Go/D/Rust #'s ... also GH link from mlrwik
|
||||
* intro note about more structuring: grep/sed/ruby/perl/sed etc. for some structuring, then pipe to miller?
|
||||
* maybe restore mem-mgmt page for dev-info? or a readme?!?
|
||||
* somewhere in mlrwik put some realistic log-dive data -- not abixy & not just resource/ok.
|
||||
* at the top link to data/small/etc. make small.txt hardlinks for browser download.
|
||||
* emph all variables are stream variables and all functions are stream functions
|
||||
* meaneb: assumes uncorr. code/doc link to appxb :)
|
||||
|
||||
================================================================
|
||||
|
|
@ -140,10 +130,6 @@ UT/REG
|
|||
* multi-csv I/O: include --icsv --odkvp and --idkvp --ocsv, as well as --csv cases
|
||||
* het-xtab out
|
||||
|
||||
================================================================
|
||||
I/O
|
||||
? right-align only some in --opprint?
|
||||
|
||||
================================================================
|
||||
INTERNAL DOCS (e.g. README)
|
||||
* sllv==NULL vs. sllv_single(NULL); and mem-mgmt cmts thruout
|
||||
|
|
|
|||
|
|
@ -42,6 +42,13 @@ Its domain-specific languages are limited to the <tt>filter</tt> and
|
|||
<tt>put</tt> syntax. Further programmability comes from chaining with
|
||||
<tt>then</tt>.
|
||||
|
||||
<li/> Unlike with <tt>awk</tt>, all variables are stream variables and all
|
||||
functions are stream functions. This means <tt>NF</tt>, <tt>NR</tt>, etc.
|
||||
change from one line to another, <tt>$x</tt> is a label for field <tt>x</tt> in
|
||||
the current record, and the input to <tt>sqrt($x)</tt> changes from one record
|
||||
to the next. Miller doesn’t let you set, say, <tt>sum=0</tt> and then
|
||||
update that on each record.
|
||||
|
||||
<li/> Miller is faster than <tt>awk</tt>, <tt>cut</tt>, and so on (depending on
|
||||
platform; see also POKI_PUT_LINK_FOR_PAGE(performance.html)HERE). In
|
||||
particular, Miller’s DSL syntax is parsed into C control structures at
|
||||
|
|
|
|||
|
|
@ -52,4 +52,6 @@ rather, I attempted only to show that Miller’s processing time here is com
|
|||
|
||||
<h1>Conclusion</h1>
|
||||
|
||||
For record-oriented data transformations, Miller is worth consideration. Field renames are worth doing as a pre-pipe or post-pipe using <tt>sed</tt>.
|
||||
For record-oriented data transformations, Miller meets or beats the Unix
|
||||
toolkit in many contexts. Field renames in particular are worth doing as a
|
||||
pre-pipe or post-pipe using <tt>sed</tt>.
|
||||
|
|
|
|||
|
|
@ -166,6 +166,13 @@ Its domain-specific languages are limited to the <tt>filter</tt> and
|
|||
<tt>put</tt> syntax. Further programmability comes from chaining with
|
||||
<tt>then</tt>.
|
||||
|
||||
<li/> Unlike with <tt>awk</tt>, all variables are stream variables and all
|
||||
functions are stream functions. This means <tt>NF</tt>, <tt>NR</tt>, etc.
|
||||
change from one line to another, <tt>$x</tt> is a label for field <tt>x</tt> in
|
||||
the current record, and the input to <tt>sqrt($x)</tt> changes from one record
|
||||
to the next. Miller doesn’t let you set, say, <tt>sum=0</tt> and then
|
||||
update that on each record.
|
||||
|
||||
<li/> Miller is faster than <tt>awk</tt>, <tt>cut</tt>, and so on (depending on
|
||||
platform; see also <a href="performance.html">Performance</a>). In
|
||||
particular, Miller’s DSL syntax is parsed into C control structures at
|
||||
|
|
|
|||
|
|
@ -206,7 +206,9 @@ rather, I attempted only to show that Miller’s processing time here is com
|
|||
|
||||
<a id="Conclusion"/><h1>Conclusion</h1>
|
||||
|
||||
For record-oriented data transformations, Miller is worth consideration. Field renames are worth doing as a pre-pipe or post-pipe using <tt>sed</tt>.
|
||||
For record-oriented data transformations, Miller meets or beats the Unix
|
||||
toolkit in many contexts. Field renames in particular are worth doing as a
|
||||
pre-pipe or post-pipe using <tt>sed</tt>.
|
||||
</div>
|
||||
</td>
|
||||
|
||||
|
|
|
|||
|
|
@ -34,17 +34,23 @@ def usage():
|
|||
print >> sys.stderr, " -P {ps} Input/output key-value-pair separator"
|
||||
print >> sys.stderr, " -v {name=value} xxx needs more doc"
|
||||
print >> sys.stderr, ""
|
||||
print >> sys.stderr, " --idfl Input format is delimited by IRS,IFS,IPS"
|
||||
print >> sys.stderr, " --odfl Output format is delimited by IRS,IFS,IPS"
|
||||
print >> sys.stderr, " --ihdrdata Input format is delimited by IRS,IFS,IPS, with header line followed by data lines (e.g. CSV)"
|
||||
print >> sys.stderr, " --ohdrdata Output format is delimited by IRS,IFS,IPS, with header line followed by data lines (e.g. CSV)"
|
||||
print >> sys.stderr, " --iidx Input format is implicitly integer-indexed (awk-style)"
|
||||
print >> sys.stderr, " --oidx Output format is implicitly integer-indexed (awk-style)"
|
||||
print >> sys.stderr, " --itbl Input format is tabular-pretty-print"
|
||||
print >> sys.stderr, " --otbl Output format is tabular-pretty-print"
|
||||
print >> sys.stderr, " --ixtbl Input format is transposed-tabular-pretty-print"
|
||||
print >> sys.stderr, " --oxtbl Output format is transposed-tabular-pretty-print"
|
||||
print >> sys.stderr, "Modulator-spec help is TBD."
|
||||
print >> sys.stderr, " --idkvp Input format is delimited by IRS,IFS,IPS"
|
||||
print >> sys.stderr, " --odkvp Output format is delimited by IRS,IFS,IPS"
|
||||
print >> sys.stderr, " --icsv Input format is delimited by IRS,IFS,IPS, with header line followed by data lines (e.g. CSV)"
|
||||
print >> sys.stderr, " --ocsv Output format is delimited by IRS,IFS,IPS, with header line followed by data lines (e.g. CSV)"
|
||||
print >> sys.stderr, " --inidx Input format is implicitly integer-indexed (awk-style)"
|
||||
print >> sys.stderr, " --onidx Output format is implicitly integer-indexed (awk-style)"
|
||||
print >> sys.stderr, " --ixtab Input format is transposed-tabular-pretty-print"
|
||||
print >> sys.stderr, " --oxtab Output format is transposed-tabular-pretty-print"
|
||||
print >> sys.stderr, "Modulator specs:"
|
||||
print >> sys.stderr, '--cat'
|
||||
print >> sys.stderr, '--tac'
|
||||
print >> sys.stderr, '--cut'
|
||||
print >> sys.stderr, '--cutx'
|
||||
print >> sys.stderr, '--sortfields'
|
||||
print >> sys.stderr, '--sortfieldsup'
|
||||
print >> sys.stderr, '--sortfieldsdown'
|
||||
|
||||
sys.exit(1)
|
||||
|
||||
# ----------------------------------------------------------------
|
||||
|
|
@ -56,8 +62,8 @@ def parse_command_line():
|
|||
|
||||
try:
|
||||
optargs, non_option_args = getopt.getopt(sys.argv[1:], "R:F:P:v:h", [
|
||||
'help', 'idfl', 'odfl', 'ihdrdata', 'ohdrdata', 'iidx', 'oidx', 'itbl', 'otbl', 'ixtbl',
|
||||
'oxtbl', 'cat', 'tac', 'inclflds=', 'exclflds=', 'sortfields', 'sortfieldsup', 'sortfieldsdown'])
|
||||
'help', 'idkvp', 'odkvp', 'icsv', 'ocsv', 'inidx', 'onidx', 'ixtab', 'oxtab',
|
||||
'cat', 'tac', 'cut=', 'cutx=', 'sortfields', 'sortfieldsup', 'sortfieldsdown'])
|
||||
|
||||
except getopt.GetoptError, err:
|
||||
print str(err)
|
||||
|
|
@ -78,35 +84,35 @@ def parse_command_line():
|
|||
kv = string.split(arg, "=", 1)
|
||||
namespace.put(kv[0], kv[1])
|
||||
|
||||
elif opt == '--idfl':
|
||||
elif opt == '--idkvp':
|
||||
rreader = RecordReaderDefault(istream=sys.stdin, namespace=namespace, irs=namespace.get("IRS"), ifs=namespace.get("IFS"), ips=namespace.get("IPS"))
|
||||
elif opt == '--odfl':
|
||||
elif opt == '--odkvp':
|
||||
rwriter = RecordWriterDefault(ostream=sys.stdout, ors=namespace.get("ORS"), ofs=namespace.get("OFS"), ops=namespace.get("OPS"))
|
||||
|
||||
elif opt == '--ihdrdata':
|
||||
elif opt == '--icsv':
|
||||
rreader = RecordReaderHeaderFirst(istream=sys.stdin, namespace=namespace, irs=namespace.get("IRS"), ifs=namespace.get("IFS"))
|
||||
elif opt == '--ohdrdata':
|
||||
elif opt == '--ocsv':
|
||||
rwriter = RecordWriterHeaderFirst(ostream=sys.stdout, ors=namespace.get("ORS"), ofs=namespace.get("OFS"))
|
||||
|
||||
elif opt == '--iidx':
|
||||
elif opt == '--inidx':
|
||||
rreader = RecordReaderIntegerIndexed(istream=sys.stdin, namespace=namespace, irs=namespace.get("IRS"), ifs=namespace.get("IFS"))
|
||||
elif opt == '--oidx':
|
||||
elif opt == '--onidx':
|
||||
rwriter = RecordWriterIntegerIndexed(ostream=sys.stdout, ors=namespace.get("ORS"), ofs=namespace.get("OFS"))
|
||||
|
||||
#elif opt == '--ixtbl':
|
||||
#elif opt == '--ixtab':
|
||||
# pass
|
||||
elif opt == '--oxtbl':
|
||||
elif opt == '--oxtab':
|
||||
rwriter = RecordWriterVerticallyTabulated(ostream=sys.stdout) # xxx args w/r/t/ RS/FS/PS?!?
|
||||
|
||||
elif opt == '--cat':
|
||||
rmodulator = CatModulator()
|
||||
elif opt == '--tac':
|
||||
rmodulator = TacModulator()
|
||||
elif opt == '--inclflds':
|
||||
elif opt == '--cut':
|
||||
rmodulator = SelectFieldsModulator(string.split(arg, namespace.get("IFS")))
|
||||
elif opt == '--exclflds':
|
||||
elif opt == '--cutx':
|
||||
rmodulator = DeselectFieldsModulator(string.split(arg, namespace.get("IFS")))
|
||||
elif opt == '--exclflds':
|
||||
elif opt == '--cutx':
|
||||
rmodulator = DeselectFieldsModulator(string.split(arg, namespace.get("IFS")))
|
||||
elif opt == '--sortfields':
|
||||
rmodulator = SortFieldsInRecordModulator(True)
|
||||
|
|
@ -114,8 +120,6 @@ def parse_command_line():
|
|||
rmodulator = SortFieldsInRecordModulator(True)
|
||||
elif opt == '--sortfieldsdown':
|
||||
rmodulator = SortFieldsInRecordModulator(False)
|
||||
#--mean i,x,y@a,b ... *NOT* the @-sign!
|
||||
#rmodulator = MeanModulator(["i","x","y"],["a","b"])
|
||||
|
||||
elif opt == '--help':
|
||||
usage()
|
||||
|
|
@ -138,8 +142,8 @@ def main():
|
|||
options = parse_command_line()
|
||||
|
||||
# parse ARGV:
|
||||
# * --ifmt: dfl,hdr1st,iidxed,align,xposealign
|
||||
# * --ofmt: dfl,hdr1st,iidxed,align,xposealign
|
||||
# * --ifmt: dkvp,hdr1st,iidxed,align,xposealign
|
||||
# * --ofmt: dkvp,hdr1st,iidxed,align,xposealign
|
||||
# * which-control-language spec?!?
|
||||
# * modulators/script ... this is the key decision area for language(s) design.
|
||||
# * filenames
|
||||
|
|
@ -466,9 +470,6 @@ class MeanModulator:
|
|||
class StreamModulator:
|
||||
def __init__(self):
|
||||
pass
|
||||
# xxx clearly define duck-ops for istream & ostream.
|
||||
# * sys.stdin, sys.stdout, file ops need to impl it (maybe need to decorate them to do so).
|
||||
# * likewise need to be able to compose one stream modulator inside another. e.g. sort(sum(inclflds(...)...)...).
|
||||
def modulate(self, rreader, rmodulator, rwriter):
|
||||
while True:
|
||||
in_record = rreader.read()
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue