mirror of
https://github.com/johnkerl/miller.git
synced 2026-01-23 02:14:13 +00:00
Initial commit
This commit is contained in:
parent
63cf4c4262
commit
892e69978a
244 changed files with 8699594 additions and 2 deletions
501
python/sketch.py
Executable file
501
python/sketch.py
Executable file
|
|
@ -0,0 +1,501 @@
|
|||
#!/usr/bin/python
|
||||
|
||||
import os, sys
|
||||
import getopt
|
||||
import string, re
|
||||
import collections
|
||||
|
||||
# ================================================================
|
||||
# o inclflds a,x,b
|
||||
# o newflds '{$y:$x*$x, $z:$x/2, $n:-$z}'
|
||||
# o greprecs '$x <= 2 && $y eq "zebra"'
|
||||
#
|
||||
# o tabular pretty-print
|
||||
# o mean
|
||||
# o sort
|
||||
|
||||
# absolute essentials:
|
||||
# * RECORD-LEVEL:
|
||||
# k include/exclude fields
|
||||
# o new field as function of old
|
||||
# o vertical pretty-print
|
||||
# * STREAM-LEVEL:
|
||||
# o include/exclude records
|
||||
# o sort
|
||||
# o summarizations: min, max, mean, count, sum, first, last
|
||||
# o tabular pretty-print
|
||||
|
||||
# ================================================================
|
||||
def usage():
    """Write the command-line help text to stderr and exit with status 1."""
    program_name = os.path.basename(sys.argv[0])
    help_lines = (
        "Usage: %s [options] {modulator-spec} {zero or more filenames}" % program_name,
        "Options:",
        " -R {rs} Input/output record separator",
        " -F {fs} Input/output field separator",
        " -P {ps} Input/output key-value-pair separator",
        " -v {name=value} xxx needs more doc",
        "",
        " --idfl Input format is delimited by IRS,IFS,IPS",
        " --odfl Output format is delimited by IRS,IFS,IPS",
        " --ihdrdata Input format is delimited by IRS,IFS,IPS, with header line followed by data lines (e.g. CSV)",
        " --ohdrdata Output format is delimited by IRS,IFS,IPS, with header line followed by data lines (e.g. CSV)",
        " --iidx Input format is implicitly integer-indexed (awk-style)",
        " --oidx Output format is implicitly integer-indexed (awk-style)",
        " --itbl Input format is tabular-pretty-print",
        " --otbl Output format is tabular-pretty-print",
        " --ixtbl Input format is transposed-tabular-pretty-print",
        " --oxtbl Output format is transposed-tabular-pretty-print",
        "Modulator-spec help is TBD.",
    )
    # One write per line, each newline-terminated.
    for help_line in help_lines:
        sys.stderr.write(help_line + "\n")
    sys.exit(1)
|
||||
|
||||
# ----------------------------------------------------------------
|
||||
def parse_command_line():
    """Parse sys.argv into the namespace, reader, writer and record modulator.

    The separator options (-R, -F, -P) may appear anywhere on the command
    line: the reader and writer are constructed only after every option has
    been seen, so they always use the final separator values.  The field
    lists given to --inclflds/--exclflds are split on the IFS in effect at
    the point where the option appears.  When several format or modulator
    options are given, the last one wins.

    Returns:
        A dict with the keys 'namespace', 'rreader', 'rwriter' and
        'rmodulator'.  The defaults are the delimited reader and writer and
        CatModulator.

    Exits with status 1 (directly or through usage()) on malformed options,
    on -h/--help, and on options that getopt accepts but that are not
    implemented yet (--itbl, --otbl, --ixtbl).
    """
    namespace = set_up_namespace()
    reader_opt = '--idfl'
    writer_opt = '--odfl'
    rmodulator = None

    try:
        optargs, non_option_args = getopt.getopt(sys.argv[1:], "R:F:P:v:h", [
            'help', 'idfl', 'odfl', 'ihdrdata', 'ohdrdata', 'iidx', 'oidx', 'itbl', 'otbl', 'ixtbl',
            'oxtbl', 'cat', 'tac', 'inclflds=', 'exclflds=', 'sortfields', 'sortfieldsup', 'sortfieldsdown'])
    except getopt.GetoptError as err:
        # Diagnostics belong on stderr; usage() exits and does not return.
        sys.stderr.write("%s\n" % err)
        usage()

    for opt, arg in optargs:
        if opt == '-R':
            namespace.put("ORS", namespace.put("IRS", arg))
        elif opt == '-F':
            namespace.put("OFS", namespace.put("IFS", arg))
        elif opt == '-P':
            namespace.put("OPS", namespace.put("IPS", arg))
        elif opt == '-v':
            name, equals_sign, value = arg.partition("=")
            if equals_sign == "":
                sys.stderr.write("Option -v requires name=value; got \"%s\".\n" % arg)
                usage()
            namespace.put(name, value)

        # Format options are only recorded here; see the construction below.
        elif opt in ('--idfl', '--ihdrdata', '--iidx'):
            reader_opt = opt
        #elif opt == '--ixtbl':
        #    pass
        elif opt in ('--odfl', '--ohdrdata', '--oidx', '--oxtbl'):
            writer_opt = opt

        elif opt == '--cat':
            rmodulator = CatModulator()
        elif opt == '--tac':
            rmodulator = TacModulator()
        elif opt == '--inclflds':
            rmodulator = SelectFieldsModulator(arg.split(namespace.get("IFS")))
        elif opt == '--exclflds':
            rmodulator = DeselectFieldsModulator(arg.split(namespace.get("IFS")))
        elif opt in ('--sortfields', '--sortfieldsup'):
            rmodulator = SortFieldsInRecordModulator(True)
        elif opt == '--sortfieldsdown':
            rmodulator = SortFieldsInRecordModulator(False)
        #--mean i,x,y@a,b ... *NOT* the @-sign!
        #rmodulator = MeanModulator(["i","x","y"],["a","b"])

        elif opt in ('-h', '--help'):
            usage()
        else:
            sys.stderr.write("Unhandled option \"%s\".\n" % opt)
            sys.exit(1)

    # xxx non_option_args (the filenames) are not consumed yet: input is
    # always read from stdin.

    irs = namespace.get("IRS")
    ifs = namespace.get("IFS")
    ips = namespace.get("IPS")
    if reader_opt == '--ihdrdata':
        rreader = RecordReaderHeaderFirst(istream=sys.stdin, namespace=namespace, irs=irs, ifs=ifs)
    elif reader_opt == '--iidx':
        rreader = RecordReaderIntegerIndexed(istream=sys.stdin, namespace=namespace, irs=irs, ifs=ifs)
    else:
        rreader = RecordReaderDefault(istream=sys.stdin, namespace=namespace, irs=irs, ifs=ifs, ips=ips)

    ors = namespace.get("ORS")
    ofs = namespace.get("OFS")
    ops = namespace.get("OPS")
    if writer_opt == '--ohdrdata':
        rwriter = RecordWriterHeaderFirst(ostream=sys.stdout, ors=ors, ofs=ofs)
    elif writer_opt == '--oidx':
        rwriter = RecordWriterIntegerIndexed(ostream=sys.stdout, ors=ors, ofs=ofs)
    elif writer_opt == '--oxtbl':
        rwriter = RecordWriterVerticallyTabulated(ostream=sys.stdout) # xxx args w/r/t/ RS/FS/PS?!?
    else:
        rwriter = RecordWriterDefault(ostream=sys.stdout, ors=ors, ofs=ofs, ops=ops)

    if rmodulator is None:
        rmodulator = CatModulator()

    return {'namespace':namespace, 'rreader':rreader, 'rwriter':rwriter, 'rmodulator':rmodulator}
|
||||
|
||||
def main():
    """Build the pipeline from the command line and run it to completion."""
    # parse ARGV:
    # * --ifmt: dfl,hdr1st,iidxed,align,xposealign
    # * --ofmt: dfl,hdr1st,iidxed,align,xposealign
    # * which-control-language spec?!?
    # * modulators/script ... this is the key decision area for language(s) design.
    # * filenames
    pipeline = parse_command_line()

    StreamModulator().modulate(
        pipeline['rreader'],
        pipeline['rmodulator'],
        pipeline['rwriter'])
|
||||
|
||||
# ================================================================
|
||||
class MillerNamespace:
    """Holds two independent name-to-value tables.

    `mapping` is accessed through get/put and `imapping` through iget/iput.
    Presumably the second table is for integer-valued variables such as
    NR/NF/FNR -- nothing here enforces that.
    """

    def __init__(self):
        self.mapping, self.imapping = {}, {}

    @staticmethod
    def _store(table, name, value):
        # Shared by put/iput: assign, then hand the value back so calls can be nested.
        table[name] = value
        return value

    def get(self, name):
        """Return the value stored under name; KeyError if absent."""
        return self.mapping[name]

    def iget(self, name):
        """Return the value stored under name in the second table; KeyError if absent."""
        return self.imapping[name]

    def put(self, name, value):
        """Store value under name and return value."""
        return self._store(self.mapping, name, value)

    def iput(self, name, ivalue):
        """Store ivalue under name in the second table and return ivalue."""
        return self._store(self.imapping, name, ivalue)
|
||||
|
||||
# ================================================================
|
||||
class Record:
    """An insertion-ordered collection of key-value pairs (one data record)."""

    def __init__(self, kvs=None):
        """Create a record.

        kvs is an optional iterable of two-element pairs (lists or tuples
        both work); later duplicates of a key overwrite its value but keep
        its original position.
        """
        self.fields = collections.OrderedDict()
        # None rather than [] as the default: avoids a shared mutable default.
        if kvs is not None:
            self.mput(kvs)

    def put(self, k, v):
        """Set field k to v."""
        self.fields[k] = v

    def mput(self, kvs):
        """Set several fields from an iterable of (key, value) pairs."""
        for k, v in kvs:
            self.fields[k] = v

    def get(self, k):
        """Return the value of field k; KeyError if absent."""
        return self.fields[k]

    def has_key(self, k):
        """Return True if the record has a field named k."""
        # 'in' works on Python 2 and 3; dict.has_key exists only on Python 2.
        return k in self.fields

    def get_field_names(self):
        """Return the field names as a list, in record order."""
        return list(self.fields.keys())

    def get_pairs(self):
        """Return the (key, value) pairs as a list, in record order."""
        return list(self.fields.items())

    def num_pairs(self):
        """Return the number of fields."""
        return len(self.fields)

    # xxx xref to record-formatter classes
    def __str__(self):
        # Must return a string: the bound method self.fields.__repr__ made
        # str(record) raise TypeError.
        return repr(self.fields)

    def __repr__(self):
        return repr(self.fields)
|
||||
|
||||
# ================================================================
|
||||
# Each record is a sequence of fields delimited by FS, each of which is a
|
||||
# key-value pair separated by PS.
|
||||
|
||||
class RecordReader:
    """Common state for record readers: the input stream, the namespace, and
    the input record/field/pair separators."""

    def __init__(self, istream, namespace, irs, ifs, ips):
        self.istream, self.namespace = istream, namespace
        self.irs, self.ifs, self.ips = irs, ifs, ips
|
||||
|
||||
class RecordReaderDefault(RecordReader):
    """Reads one record per line; each line is IFS-separated fields, each
    field a key and a value joined by IPS (e.g. "a=1,b=2")."""

    def __init__(self, istream, namespace, irs, ifs, ips):
        RecordReader.__init__(self, istream, namespace, irs, ifs, ips)

    def read(self):
        """Return the next Record, or None at end of input.

        A blank line yields an empty record.  A field that does not contain
        IPS is keyed by its 1-up position in the line (an int, as in
        RecordReaderIntegerIndexed) instead of raising.  Also updates NF, NR,
        FNR and FILENAME in the namespace.
        """
        line = self.istream.readline() # xxx use self.irs
        if line == '':
            return None

        line = line.strip() # Remove leading/trailing whitespace including carriage return from readline().
        kvs = []
        if line != '':
            for position, field in enumerate(line.split(self.ifs), 1):
                pair = field.split(self.ips, 1)
                if len(pair) == 2:
                    kvs.append(pair)
                else:
                    # No pair separator in this field: fall back to a positional key.
                    kvs.append([position, field])
        record = Record(kvs)

        # num_pairs is a method: call it so NF holds the count, not the bound method.
        self.namespace.iput("NF", record.num_pairs())
        self.namespace.iput("NR", self.namespace.iget("NR") + 1)

        # xxx stub
        self.namespace.put("FILENAME", None)
        self.namespace.iput("FNR", self.namespace.iget("FNR") + 1)

        return record
|
||||
|
||||
# ----------------------------------------------------------------
|
||||
# awk-style
|
||||
class RecordReaderIntegerIndexed(RecordReader):
    """Reads one record per line with fields keyed by the integers 1, 2, 3, ...

    Note that IFS is handed to re.split, so it is interpreted as a regular
    expression here.
    """
    # xxx ctor with istream context?!? or independent of that?!? for cskv, no matter.
    # csv reader of course needs context.
    def __init__(self, istream, namespace, irs, ifs):
        RecordReader.__init__(self, istream, namespace, irs, ifs, None)

    def read(self):
        """Return the next Record, or None at end of input."""
        # xxx use self.irs
        raw_line = self.istream.readline()
        if raw_line == '':
            return None
        # strip() removes leading/trailing whitespace, including the line ending from readline().
        values = re.split(self.ifs, raw_line.strip())
        return Record([[position, value] for position, value in enumerate(values, 1)])
|
||||
|
||||
# ----------------------------------------------------------------
|
||||
# csv-style
|
||||
class RecordReaderHeaderFirst(RecordReader):
    """Reads CSV-style input: the first line holds the field names and every
    following line holds one record's values, all separated by IFS."""

    def __init__(self, istream, namespace, irs, ifs):
        RecordReader.__init__(self, istream, namespace, irs, ifs, None)
        self.field_names = None   # filled in from the header line on first read()
        self.header_line = None   # kept only for error messages

    def read(self):
        """Return the next Record, or None at end of input.

        Raises:
            Exception: if a data line does not have as many fields as the header.
        """
        if self.field_names is None:
            header_line = self.istream.readline()
            if header_line == '':
                return None
            # Remove leading/trailing whitespace including carriage return from readline().
            header_line = header_line.strip()
            self.field_names = header_line.split(self.ifs)
            self.header_line = header_line

        data_line = self.istream.readline()
        if data_line == '':
            return None
        # Remove leading/trailing whitespace including carriage return from readline().
        data_line = data_line.strip()
        field_values = data_line.split(self.ifs)
        if len(self.field_names) != len(field_values):
            # self.field_names, not the bare name field_names (which was undefined here).
            raise Exception("Header/data length mismatch: %d != %d in \"%s\" and \"%s\"" % \
                (len(self.field_names), len(field_values), self.header_line, data_line))

        return Record(zip(self.field_names, field_values))
|
||||
|
||||
# ================================================================
|
||||
# xxx ostream at ctor?? needs drain-at-end logic for prettyprint.
|
||||
|
||||
class RecordWriter:
    """Common state for record writers: the output stream and the output
    record/field/pair separators."""

    def __init__(self, ostream, ors, ofs, ops):
        self.ostream = ostream
        self.ors, self.ofs, self.ops = ors, ofs, ops
|
||||
|
||||
class RecordWriterDefault(RecordWriter):
    """Writes each record as key-value pairs: key OPS value, joined by OFS and
    terminated by ORS (e.g. "a=1,b=2\\n")."""

    def __init__(self, ostream, ors, ofs, ops):
        RecordWriter.__init__(self, ostream, ors, ofs, ops)

    def write(self, record):
        """Write one record to the output stream."""
        pairs = [str(k) + self.ops + str(v) for k, v in record.get_pairs()]
        self.ostream.write(self.ofs.join(pairs))
        # Terminate with the configured ORS (default "\n") rather than a hard-coded newline.
        self.ostream.write(self.ors)
|
||||
|
||||
# ----------------------------------------------------------------
|
||||
class RecordWriterHeaderFirst(RecordWriter):
    """Writes CSV-style output: one header line of field names, then one line
    of values per record, separated by OFS and terminated by ORS.

    NOTE(review): the header is taken from the first record only; later
    records with different field names are written under the same header.
    """

    def __init__(self, ostream, ors, ofs):
        RecordWriter.__init__(self, ostream, ors, ofs, None)
        self.field_names = None   # set when the header has been written

    def write(self, record):
        """Write one record, preceded by the header if this is the first."""
        data_string = self.ofs.join([str(v) for k, v in record.get_pairs()])
        if self.field_names is None:
            self.field_names = record.get_field_names()
            header_string = self.ofs.join([str(k) for k in self.field_names])
            self.ostream.write(header_string)
            # Configured ORS (default "\n") rather than a hard-coded newline.
            self.ostream.write(self.ors)
        self.ostream.write(data_string)
        self.ostream.write(self.ors)
|
||||
|
||||
# ----------------------------------------------------------------
|
||||
# xxx rename
|
||||
|
||||
class RecordWriterVerticallyTabulated(RecordWriter):
    """Writes each record as a block of "name value" lines, with names padded
    to a common width, followed by a blank line."""

    def __init__(self, ostream):
        RecordWriter.__init__(self, ostream, None, None, None)

    def write(self, record):
        """Write one record as a vertically aligned block."""
        field_names = record.get_field_names()
        # Measure str(name): keys may be ints (the integer-indexed reader
        # produces them), and len() of an int raises TypeError.  The minimum
        # column width is 1.
        max_field_name_width = max([1] + [len(str(name)) for name in field_names])
        lines = ["%-*s %s" % (max_field_name_width, name, record.get(name))
                 for name in field_names]
        self.ostream.write("\n".join(lines))
        self.ostream.write("\n\n")
|
||||
|
||||
# ----------------------------------------------------------------
|
||||
class RecordWriterIntegerIndexed:
    """Writes only the values of each record (keys are dropped), joined by OFS
    and terminated by ORS."""

    def __init__(self, ostream, ors, ofs):
        self.ostream = ostream
        self.ors = ors
        self.ofs = ofs

    def write(self, record):
        """Write one record's values to the output stream."""
        self.ostream.write(self.ofs.join([str(v) for k, v in record.get_pairs()]))
        # Configured ORS (default "\n") rather than a hard-coded newline.
        self.ostream.write(self.ors)
|
||||
|
||||
# ================================================================
|
||||
class CatModulator:
    """Pass-through modulator: every record is emitted unchanged."""

    def __init__(self):
        pass

    def modulate(self, record):
        """Return [record], or [] for the end-of-stream marker None."""
        # None signals drain-at-end; there is nothing buffered to flush.
        return [] if record is None else [record]
|
||||
|
||||
class TacModulator:
    """Buffers every record and emits them all, in reverse order, at end of
    stream."""

    def __init__(self):
        self.records = []

    def modulate(self, record):
        """Buffer record and return []; on None return the reversed buffer."""
        if record is not None:
            self.records.append(record)
            return []
        # drain at end: hand back the buffered list and start a fresh one
        drained, self.records = self.records, []
        drained.reverse()
        return drained
|
||||
|
||||
class SelectFieldsModulator:
    """Keeps only the named fields of each record, in the order the names
    were given; names absent from a record are skipped."""

    def __init__(self, field_names):
        self.field_names = field_names

    def modulate(self, record):
        """Return a one-element list holding the projected record ([] on None)."""
        if record is None: # drain at end
            return []
        projected = Record()
        projected.mput([(name, record.get(name))
                        for name in self.field_names
                        if record.has_key(name)])
        return [projected]
|
||||
|
||||
# The field_names argument may be a list or hash-set -- as long as it supports
# the "in" operator as in "name in field_names".
# A hash-set copy is used internally for the membership tests.
class DeselectFieldsModulator:
    """Drops the named fields from each record, keeping the remaining fields
    in their original order."""

    def __init__(self, field_names):
        self.field_names = field_names
        # Snapshot into a frozenset so each lookup is O(1) instead of a list
        # scan.  If the argument cannot be turned into a set (not iterable, or
        # unhashable members), fall back to using it directly with "in".
        try:
            self._excluded = frozenset(field_names)
        except TypeError:
            self._excluded = field_names

    def modulate(self, record):
        """Return a one-element list holding the reduced record ([] on None)."""
        if record is None: # drain at end
            return []
        kvs = [(field_name, record.get(field_name))
               for field_name in record.get_field_names()
               if field_name not in self._excluded]
        new_record = Record()
        new_record.mput(kvs)
        return [new_record]
|
||||
|
||||
class SortFieldsInRecordModulator:
    """Reorders the fields of each record by field name, ascending or
    descending."""

    def __init__(self, do_ascending_sort=True):
        self.do_ascending_sort = do_ascending_sort

    def modulate(self, record):
        """Return a one-element list holding the reordered record ([] on None)."""
        if record is None: # drain at end
            return []
        # Field names within a record are unique, so sorting with reverse=True
        # gives the same order as sorting ascending and then reversing.
        ordered_names = sorted(record.get_field_names(),
                               reverse=not self.do_ascending_sort)
        reordered = Record()
        reordered.mput([(name, record.get(name)) for name in ordered_names])
        return [reordered]
|
||||
|
||||
class MeanKeeper:
    """Accumulates a running sum and count so a mean can be read at any time."""

    def __init__(self):
        self.sum = 0.0
        self.count = 0

    # Each method takes self; without it they could not be called on an instance.
    def put(self, x):
        """Add one sample x to the accumulator."""
        self.sum += x
        self.count += 1

    def get_sum(self):
        """Return the sum of the samples seen so far."""
        return self.sum

    def get_count(self):
        """Return the number of samples seen so far."""
        return self.count

    def get_mean(self):
        """Return the mean of the samples, or None if there are none."""
        # In IEEE-standard floating-point this would give NaN in the empty case.
        # But Python throws an exception on divide by zero instead.
        if self.count == 0:
            return None
        return self.sum / self.count
|
||||
|
||||
class MeanModulator:
    """Unfinished: intended to compute means of the collate fields, grouped by
    the key fields.  Currently it validates and parses each record but
    accumulates nothing, and emits a placeholder record at end of stream."""

    def __init__(self, collate_field_names, key_field_names=None):
        self.collate_field_names = collate_field_names
        # None rather than [] as the default: a list default would be shared
        # by every instance created without this argument.
        self.key_field_names = [] if key_field_names is None else key_field_names
        # map from key-field values to (map from collate-field names to MSCKeeper objects).
        self.collate_outputs = {}

    def modulate(self, record):
        """Consume one record and return []; on None return the stub output.

        Records lacking any collate or key field are ignored.  float() raises
        ValueError if a collate field is not numeric.
        """
        if record is None: # drain at end
            # xxx stub
            output_record = Record()
            output_record.put("foo", "bar")
            return [output_record]

        # xxx optimize
        for value_field_name in self.collate_field_names:
            if not record.has_key(value_field_name):
                return []
        for key_field_name in self.key_field_names:
            if not record.has_key(key_field_name):
                return []

        # xxx stub: these are computed but not yet folded into collate_outputs.
        collate_field_values = [float(record.get(k)) for k in self.collate_field_names]
        key_string = ",".join([record.get(k) for k in self.key_field_names])

        return []
|
||||
|
||||
# ================================================================
|
||||
class StreamModulator:
    """Drives the pipeline: pulls records from a reader, passes each through
    a record modulator, and sends whatever comes out to a writer."""

    def __init__(self):
        pass

    # xxx clearly define duck-ops for istream & ostream.
    # * sys.stdin, sys.stdout, file ops need to impl it (maybe need to decorate them to do so).
    # * likewise need to be able to compose one stream modulator inside another. e.g. sort(sum(inclflds(...)...)...).
    def modulate(self, rreader, rmodulator, rwriter):
        """Run until rreader.read() returns None.

        The final None is also passed to rmodulator.modulate so that
        buffering modulators can flush their output.
        """
        # iter(callable, sentinel) stops at the first read() result equal to None.
        for in_record in iter(rreader.read, None):
            for out_record in rmodulator.modulate(in_record):
                rwriter.write(out_record)

        # End of input: give the modulator its drain call and write the result.
        for out_record in rmodulator.modulate(None):
            rwriter.write(out_record)
|
||||
|
||||
# ================================================================
|
||||
def set_up_namespace():
    """Return a MillerNamespace preloaded with the default separators
    (newline, comma, equals sign) and zeroed record counters."""
    namespace = MillerNamespace()

    # Each input separator is stored first and its value reused for the
    # matching output separator.
    separator_defaults = (
        ("IRS", "ORS", "\n"),
        ("IFS", "OFS", ","),
        ("IPS", "OPS", "="),
    )
    for input_name, output_name, separator in separator_defaults:
        namespace.put(output_name, namespace.put(input_name, separator))

    # xxx CONVFMT

    namespace.put("FILENAME", None)
    for counter_name, initial_value in (("NF", None), ("NR", 0), ("FNR", 0)):
        namespace.iput(counter_name, initial_value)

    return namespace
|
||||
|
||||
# ================================================================
|
||||
# Run the pipeline only when executed as a script, so that importing this
# module (e.g. for testing) does not start reading stdin.
if __name__ == "__main__":
    main()
|
||||
Loading…
Add table
Add a link
Reference in a new issue