Initial commit

This commit is contained in:
John Kerl 2015-05-03 16:11:45 -07:00
parent 63cf4c4262
commit 892e69978a
244 changed files with 8699594 additions and 2 deletions

501
python/sketch.py Executable file
View file

@ -0,0 +1,501 @@
#!/usr/bin/python
import os, sys
import getopt
import string, re
import collections
# ================================================================
# o inclflds a,x,b
# o newflds '{$y:$x*$x, $z:$x/2, $n:-$z}'
# o greprecs '$x <= 2 && $y eq "zebra"'
#
# o tabular pretty-print
# o mean
# o sort
# absolute essentials:
# * RECORD-LEVEL:
# k include/exclude fields
# o new field as function of old
# o vertical pretty-print
# * STREAM-LEVEL:
# o include/exclude records
# o sort
# o summarizations: min, max, mean, count, sum, first, last
# o tabular pretty-print
# ================================================================
def usage():
    """Write the command-line synopsis to stderr and exit with status 1.

    This function never returns: it always ends the process through
    sys.exit(1), including when it is reached via --help.

    The --odfl/--ohdrdata lines name the output separators (ORS,OFS,OPS),
    which are what the writers are configured with.
    """
    program_name = os.path.basename(sys.argv[0])
    help_lines = [
        "Usage: %s [options] {modulator-spec} {zero or more filenames}" % program_name,
        "Options:",
        " -R {rs} Input/output record separator",
        " -F {fs} Input/output field separator",
        " -P {ps} Input/output key-value-pair separator",
        " -v {name=value} xxx needs more doc",
        "",
        " --idfl Input format is delimited by IRS,IFS,IPS",
        " --odfl Output format is delimited by ORS,OFS,OPS",
        " --ihdrdata Input format is delimited by IRS,IFS,IPS, with header line followed by data lines (e.g. CSV)",
        " --ohdrdata Output format is delimited by ORS,OFS,OPS, with header line followed by data lines (e.g. CSV)",
        " --iidx Input format is implicitly integer-indexed (awk-style)",
        " --oidx Output format is implicitly integer-indexed (awk-style)",
        " --itbl Input format is tabular-pretty-print",
        " --otbl Output format is tabular-pretty-print",
        " --ixtbl Input format is transposed-tabular-pretty-print",
        " --oxtbl Output format is transposed-tabular-pretty-print",
        "Modulator-spec help is TBD.",
    ]
    # sys.stderr.write works on both Python 2 and Python 3, unlike "print >>".
    sys.stderr.write("\n".join(help_lines) + "\n")
    sys.exit(1)
# ----------------------------------------------------------------
def _build_reader(format_flag, namespace):
    """Construct the record reader selected by format_flag (None means default)."""
    irs = namespace.get("IRS")
    ifs = namespace.get("IFS")
    if format_flag == '--ihdrdata':
        return RecordReaderHeaderFirst(istream=sys.stdin, namespace=namespace, irs=irs, ifs=ifs)
    if format_flag == '--iidx':
        return RecordReaderIntegerIndexed(istream=sys.stdin, namespace=namespace, irs=irs, ifs=ifs)
    return RecordReaderDefault(istream=sys.stdin, namespace=namespace, irs=irs, ifs=ifs,
                               ips=namespace.get("IPS"))


def _build_writer(format_flag, namespace):
    """Construct the record writer selected by format_flag (None means default)."""
    ors = namespace.get("ORS")
    ofs = namespace.get("OFS")
    if format_flag == '--ohdrdata':
        return RecordWriterHeaderFirst(ostream=sys.stdout, ors=ors, ofs=ofs)
    if format_flag == '--oidx':
        return RecordWriterIntegerIndexed(ostream=sys.stdout, ors=ors, ofs=ofs)
    if format_flag == '--oxtbl':
        return RecordWriterVerticallyTabulated(ostream=sys.stdout)  # xxx args w/r/t/ RS/FS/PS?!?
    return RecordWriterDefault(ostream=sys.stdout, ors=ors, ofs=ofs, ops=namespace.get("OPS"))


def _build_modulator(opt, arg, namespace):
    """Construct the record modulator selected by opt (None means pass-through)."""
    if opt == '--tac':
        return TacModulator()
    if opt == '--inclflds':
        return SelectFieldsModulator(arg.split(namespace.get("IFS")))
    if opt == '--exclflds':
        return DeselectFieldsModulator(arg.split(namespace.get("IFS")))
    if opt in ('--sortfields', '--sortfieldsup'):
        return SortFieldsInRecordModulator(True)
    if opt == '--sortfieldsdown':
        return SortFieldsInRecordModulator(False)
    #--mean i,x,y@a,b ... *NOT* the @-sign!
    #rmodulator = MeanModulator(["i","x","y"],["a","b"])
    return CatModulator()


def parse_command_line():
    """Parse sys.argv and build the processing pipeline.

    Returns a dict with keys 'namespace', 'rreader', 'rwriter' and
    'rmodulator'.  When several input-format, output-format or modulator
    flags are given, the last one of each kind wins.

    The reader, writer and modulator are constructed only after every option
    has been seen, so -R/-F/-P take effect regardless of where they appear
    relative to the format and modulator flags.

    Exits the process with status 1 (via usage() or directly) on an
    unparseable command line, a malformed -v argument, -h/--help, or an
    option that is accepted by getopt but not implemented here.
    """
    namespace = set_up_namespace()

    try:
        optargs, non_option_args = getopt.getopt(sys.argv[1:], "R:F:P:v:h", [
            'help', 'idfl', 'odfl', 'ihdrdata', 'ohdrdata', 'iidx', 'oidx', 'itbl', 'otbl', 'ixtbl',
            'oxtbl', 'cat', 'tac', 'inclflds=', 'exclflds=', 'sortfields', 'sortfieldsup', 'sortfieldsdown'])
    except getopt.GetoptError as err:
        # Diagnostics belong on stderr so they never mix with record output.
        sys.stderr.write("%s\n" % err)
        usage()
        sys.exit(1)

    reader_flag = None
    writer_flag = None
    modulator_opt = None
    modulator_arg = None

    for opt, arg in optargs:
        if opt == '-R':
            namespace.put("ORS", namespace.put("IRS", arg))
        elif opt == '-F':
            namespace.put("OFS", namespace.put("IFS", arg))
        elif opt == '-P':
            namespace.put("OPS", namespace.put("IPS", arg))
        elif opt == '-v':
            name, separator, value = arg.partition("=")
            if not separator:
                sys.stderr.write("Option -v requires {name=value}; got \"%s\".\n" % arg)
                usage()
                sys.exit(1)
            namespace.put(name, value)
        elif opt in ('--idfl', '--ihdrdata', '--iidx'):
            reader_flag = opt
        elif opt in ('--odfl', '--ohdrdata', '--oidx', '--oxtbl'):
            writer_flag = opt
        elif opt in ('--cat', '--tac', '--inclflds', '--exclflds',
                     '--sortfields', '--sortfieldsup', '--sortfieldsdown'):
            modulator_opt = opt
            modulator_arg = arg
        elif opt in ('-h', '--help'):
            usage()
        else:
            # Reached for flags getopt accepts but which have no
            # implementation yet (--itbl, --otbl, --ixtbl).
            sys.stderr.write("Unhandled option \"%s\".\n" % opt)
            sys.exit(1)

    #xxx non_option_arg_count = len(non_option_args)
    # xxx filenames in non_option_args are not consumed yet; input is always stdin.

    return {
        'namespace': namespace,
        'rreader': _build_reader(reader_flag, namespace),
        'rwriter': _build_writer(writer_flag, namespace),
        'rmodulator': _build_modulator(modulator_opt, modulator_arg, namespace),
    }
def main():
    """Program entry point: parse the command line, then run the stream.

    Records flow from the reader through the record modulator to the writer,
    driven by a StreamModulator.
    """
    # parse ARGV:
    # * --ifmt: dfl,hdr1st,iidxed,align,xposealign
    # * --ofmt: dfl,hdr1st,iidxed,align,xposealign
    # * which-control-language spec?!?
    # * modulators/script ... this is the key decision area for language(s) design.
    # * filenames
    pipeline = parse_command_line()
    StreamModulator().modulate(
        pipeline['rreader'],
        pipeline['rmodulator'],
        pipeline['rwriter'])
# ================================================================
class MillerNamespace:
    """Holder for named variables, kept in two independent tables.

    put/get operate on `mapping`; iput/iget operate on `imapping`.  A name
    stored in one table is not visible through the other.  Lookups of unknown
    names raise KeyError.
    """

    def __init__(self):
        self.mapping, self.imapping = {}, {}

    # ---- `mapping` table
    def put(self, name, value):
        """Store value under name and hand the value back (allows chaining)."""
        self.mapping.update({name: value})
        return value

    def get(self, name):
        """Return the value stored under name by put()."""
        return self.mapping[name]

    # ---- `imapping` table
    def iput(self, name, ivalue):
        """Store ivalue under name and hand the value back (allows chaining)."""
        self.imapping.update({name: ivalue})
        return ivalue

    def iget(self, name):
        """Return the value stored under name by iput()."""
        return self.imapping[name]
# ================================================================
class Record:
    """One data record: key-value pairs kept in insertion order."""

    def __init__(self, kvs=None):
        """Create a record, optionally pre-populated.

        kvs is an iterable of two-element sequences (lists or tuples both
        work); None, the default, yields an empty record.
        """
        self.fields = collections.OrderedDict()
        if kvs is not None:
            self.mput(kvs)

    def put(self, k, v):
        """Set field k to v; a new key is appended, an existing key keeps its position."""
        self.fields[k] = v

    def mput(self, kvs):
        """Set several fields from an iterable of (key, value) pairs, in order."""
        for k, v in kvs:
            self.fields[k] = v

    def get(self, k):
        """Return the value of field k; raises KeyError when absent."""
        return self.fields[k]

    def has_key(self, k):
        """Return True when the record has a field named k."""
        # The "in" operator works on Python 2 and 3; dict.has_key is Python 2 only.
        return k in self.fields

    def get_field_names(self):
        """Return the field names as a list, in insertion order."""
        return list(self.fields)

    def get_pairs(self):
        """Return the (key, value) pairs as a list, in insertion order."""
        return list(self.fields.items())

    def num_pairs(self):
        """Return the number of fields."""
        return len(self.fields)

    # xxx xref to record-formatter classes
    def __str__(self):
        # Call repr(); returning the bound method itself makes str() raise TypeError.
        return repr(self.fields)

    def __repr__(self):
        return repr(self.fields)
# ================================================================
# Each record is a sequence of fields delimited by FS, each of which is a
# key-value pair separated by PS.
class RecordReader:
    """Common state for record readers.

    Stores the input stream, the shared namespace, and the input record,
    field and pair separators (irs, ifs, ips) for use by subclasses.
    """

    def __init__(self, istream, namespace, irs, ifs, ips):
        self.istream, self.namespace = istream, namespace
        self.irs, self.ifs, self.ips = irs, ifs, ips
class RecordReaderDefault(RecordReader):
    """Reads one record per line: fields split on IFS, each a key/value pair split on IPS."""

    def __init__(self, istream, namespace, irs, ifs, ips):
        RecordReader.__init__(self, istream, namespace, irs, ifs, ips)

    def read(self):
        """Return the next Record, or None at end of input.

        Updates NF, NR and FNR (and the stub FILENAME) in the namespace for
        every record returned.  A blank line yields an empty record.

        Raises ValueError when a field does not contain the pair separator.
        """
        line = self.istream.readline()  # xxx use self.irs
        if line == '':
            return None
        line = line.strip()  # Remove leading/trailing whitespace including carriage return from readline().

        kvs = []
        if line != '':
            for field in line.split(self.ifs):
                pair = field.split(self.ips, 1)
                if len(pair) != 2:
                    raise ValueError("Field \"%s\" lacks pair separator \"%s\" in line \"%s\""
                                     % (field, self.ips, line))
                kvs.append(pair)
        record = Record(kvs)

        # num_pairs must be called: storing the bound method would make NF useless.
        self.namespace.iput("NF", record.num_pairs())
        self.namespace.iput("NR", self.namespace.iget("NR") + 1)
        # xxx stub
        self.namespace.put("FILENAME", None)
        self.namespace.iput("FNR", self.namespace.iget("FNR") + 1)
        return record
# ----------------------------------------------------------------
# awk-style
class RecordReaderIntegerIndexed(RecordReader):
    """Reads one record per line, keying each field by its 1-based position.

    The field separator is applied with re.split, i.e. as a regular
    expression.  No pair separator is used.
    """
    # xxx ctor with istream context?!? or independent of that?!? for cskv, no matter.
    # csv reader of course needs context.

    def __init__(self, istream, namespace, irs, ifs):
        RecordReader.__init__(self, istream, namespace, irs, ifs, None)

    def read(self):
        """Return the next Record (integer keys 1..N), or None at end of input."""
        # xxx use self.irs
        raw_line = self.istream.readline()
        if raw_line == '':
            return None
        # strip() removes leading/trailing whitespace, including the line
        # terminator left by readline().
        pieces = re.split(self.ifs, raw_line.strip())
        return Record([[position, piece] for position, piece in enumerate(pieces, 1)])
# ----------------------------------------------------------------
# csv-style
class RecordReaderHeaderFirst(RecordReader):
    """Reads a header line of field names, then one record per data line.

    The header is consumed lazily on the first call to read().  No pair
    separator is used.
    """

    def __init__(self, istream, namespace, irs, ifs):
        RecordReader.__init__(self, istream, namespace, irs, ifs, None)
        self.field_names = None
        self.header_line = None

    def read(self):
        """Return the next Record, or None at end of input.

        Raises Exception when a data line has a different number of fields
        than the header line.
        """
        if self.field_names is None:
            header_line = self.istream.readline()
            if header_line == '':
                return None
            # Remove leading/trailing whitespace including carriage return from readline().
            header_line = header_line.strip()
            self.field_names = header_line.split(self.ifs)
            self.header_line = header_line

        data_line = self.istream.readline()
        if data_line == '':
            return None
        # Remove leading/trailing whitespace including carriage return from readline().
        data_line = data_line.strip()
        field_values = data_line.split(self.ifs)

        if len(self.field_names) != len(field_values):
            # Use self.field_names here: a bare "field_names" is undefined and
            # would turn this diagnostic into a NameError.
            raise Exception("Header/data length mismatch: %d != %d in \"%s\" and \"%s\"" %
                            (len(self.field_names), len(field_values), self.header_line, data_line))
        return Record(zip(self.field_names, field_values))
# ================================================================
# xxx ostream at ctor?? needs drain-at-end logic for prettyprint.
class RecordWriter:
    """Common state for record writers.

    Stores the output stream and the output record, field and pair
    separators (ors, ofs, ops) for use by subclasses.
    """

    def __init__(self, ostream, ors, ofs, ops):
        self.ostream = ostream
        self.ors, self.ofs, self.ops = ors, ofs, ops
class RecordWriterDefault(RecordWriter):
    """Writes each record as key/value pairs: pairs joined by OFS, key and value by OPS."""

    def __init__(self, ostream, ors, ofs, ops):
        RecordWriter.__init__(self, ostream, ors, ofs, ops)

    def write(self, record):
        """Write one record followed by the output record separator."""
        pairs = [str(k) + self.ops + str(v) for k, v in record.get_pairs()]
        self.ostream.write(self.ofs.join(pairs))
        # Honor the configured ORS (set by -R) rather than a hard-coded newline;
        # the default ORS is "\n", so default output is unchanged.
        self.ostream.write(self.ors)
# ----------------------------------------------------------------
class RecordWriterHeaderFirst(RecordWriter):
    """Writes a header line of field names once, then one line of values per record.

    The header is taken from the first record written; later records are
    emitted as values only, whatever their field names are.
    """

    def __init__(self, ostream, ors, ofs):
        RecordWriter.__init__(self, ostream, ors, ofs, None)
        self.field_names = None

    def write(self, record):
        """Write one record (preceded by the header on the first call)."""
        pairs = record.get_pairs()
        if self.field_names is None:
            self.field_names = record.get_field_names()
            self.ostream.write(self.ofs.join([str(k) for k, _ in pairs]))
            self.ostream.write(self.ors)
        self.ostream.write(self.ofs.join([str(v) for _, v in pairs]))
        # Honor the configured ORS (set by -R) rather than a hard-coded newline;
        # the default ORS is "\n", so default output is unchanged.
        self.ostream.write(self.ors)
# ----------------------------------------------------------------
# xxx rename
class RecordWriterVerticallyTabulated(RecordWriter):
    """Writes each record as one "name value" line per field, followed by a blank line.

    Field names are left-justified and padded to the longest name in the record.
    """

    def __init__(self, ostream):
        RecordWriter.__init__(self, ostream, None, None, None)

    def write(self, record):
        """Write one record in vertical layout."""
        field_names = record.get_field_names()
        # Measure str(name): keys are integers when records come from the
        # integer-indexed reader, and len() of an int raises TypeError.
        # The leading 1 keeps the minimum column width at one character.
        max_field_name_width = max([1] + [len(str(name)) for name in field_names])
        lines = ["%-*s %s" % (max_field_name_width, name, record.get(name))
                 for name in field_names]
        self.ostream.write("\n".join(lines))
        self.ostream.write("\n\n")
# ----------------------------------------------------------------
class RecordWriterIntegerIndexed:
    """Writes only the values of each record, joined by OFS (awk-style output)."""

    def __init__(self, ostream, ors, ofs):
        self.ostream = ostream
        self.ors = ors
        self.ofs = ofs

    def write(self, record):
        """Write the record's values followed by the output record separator."""
        self.ostream.write(self.ofs.join([str(v) for _, v in record.get_pairs()]))
        # Honor the configured ORS (set by -R) rather than a hard-coded newline;
        # the default ORS is "\n", so default output is unchanged.
        self.ostream.write(self.ors)
# ================================================================
class CatModulator:
    """Pass-through modulator: every input record is emitted unchanged."""

    def __init__(self):
        pass

    def modulate(self, record):
        """Return [record], or [] for the end-of-stream marker (None)."""
        # None signals drain-at-end; there is nothing buffered to flush.
        return [] if record is None else [record]
class TacModulator:
    """Reversing modulator: buffers all records and emits them last-first at end of stream."""

    def __init__(self):
        self.records = []

    def modulate(self, record):
        """Buffer record and return []; on None, return the buffered records reversed.

        The buffer is emptied when drained.
        """
        if record is not None:
            self.records.append(record)
            return []
        # Drain at end: hand back what was held and start over with a fresh buffer.
        held, self.records = self.records, []
        held.reverse()
        return held
class SelectFieldsModulator:
    """Keeps only the named fields of each record, in the order the names were given."""

    def __init__(self, field_names):
        self.field_names = field_names

    def modulate(self, record):
        """Return a one-element list holding the projected record; [] at end of stream.

        Requested names that the record lacks are skipped.
        """
        if record is None:  # drain at end
            return []
        projected = Record()
        projected.mput([(name, record.get(name))
                        for name in self.field_names
                        if record.has_key(name)])
        return [projected]
# The field_names argument may be any iterable of hashable names (list,
# tuple, set, ...).  A frozenset copy is kept internally so each membership
# test is a hash lookup rather than a scan of the list.
class DeselectFieldsModulator:
    """Drops the named fields from each record, preserving the order of the rest."""

    def __init__(self, field_names):
        self.field_names = field_names
        # Snapshot taken at construction; later mutation of field_names is not tracked.
        self._excluded = frozenset(field_names)

    def modulate(self, record):
        """Return a one-element list holding the reduced record; [] at end of stream."""
        if record is None:  # drain at end
            return []
        excluded = self._excluded
        new_record = Record()
        new_record.mput([(name, record.get(name))
                         for name in record.get_field_names()
                         if name not in excluded])
        return [new_record]
class SortFieldsInRecordModulator:
    """Reorders the fields of each record by field name, ascending or descending."""

    def __init__(self, do_ascending_sort=True):
        self.do_ascending_sort = do_ascending_sort

    def modulate(self, record):
        """Return a one-element list holding the reordered record; [] at end of stream."""
        if record is None:  # drain at end
            return []
        # Field names are unique within a record, so a descending sort gives
        # the same order as sorting ascending and then reversing.
        ordered_names = sorted(record.get_field_names(),
                               reverse=not self.do_ascending_sort)
        reordered = Record()
        reordered.mput([(name, record.get(name)) for name in ordered_names])
        return [reordered]
class MeanKeeper:
    """Running accumulator of sum and count, from which a mean is derived."""

    def __init__(self):
        self.sum = 0.0
        self.count = 0

    # Every method takes self: without it each call raises TypeError.
    def put(self, x):
        """Add one sample x to the accumulator."""
        self.sum += x
        self.count += 1

    def get_sum(self):
        """Return the sum of all samples so far (0.0 when empty)."""
        return self.sum

    def get_count(self):
        """Return the number of samples so far."""
        return self.count

    def get_mean(self):
        """Return sum/count, or None when no samples have been added."""
        # In IEEE-standard floating-point this would give NaN in the empty case.
        # But Python throws an exception on divide by zero instead.
        if self.count == 0:
            return None
        return self.sum / self.count
class MeanModulator:
    """Stub for a mean-by-key summarizer; accumulation is not implemented yet.

    Records are validated and consumed, nothing is accumulated, and a
    placeholder record (foo=bar) is emitted at end of stream.
    """

    def __init__(self, collate_field_names, key_field_names=None):
        """collate_field_names: fields to average; key_field_names: grouping fields.

        key_field_names defaults to a fresh empty list per instance, so
        instances never share (and cannot corrupt) one default list.
        """
        self.collate_field_names = collate_field_names
        self.key_field_names = [] if key_field_names is None else key_field_names
        # map from key-field values to (map from collate-field names to MSCKeeper objects).
        self.collate_outputs = {}

    def modulate(self, record):
        """Consume record and return []; on None (end of stream) return the stub output.

        Records lacking any collate or key field are ignored.  Raises
        ValueError (from float()) when a collate field is not numeric.
        """
        if record is None:  # drain at end
            # xxx stub
            output_record = Record()
            output_record.put("foo", "bar")
            return [output_record]

        # xxx optimize
        for value_field_name in self.collate_field_names:
            if not record.has_key(value_field_name):
                return []
        for key_field_name in self.key_field_names:
            if not record.has_key(key_field_name):
                return []

        # Computed but not yet accumulated into self.collate_outputs (stub).
        collate_field_values = [float(record.get(k)) for k in self.collate_field_names]
        key_string = ",".join([record.get(k) for k in self.key_field_names])
        return []
# ================================================================
class StreamModulator:
    """Drives a reader -> record-modulator -> writer pipeline until end of input."""

    def __init__(self):
        pass

    # xxx clearly define duck-ops for istream & ostream.
    # * sys.stdin, sys.stdout, file ops need to impl it (maybe need to decorate them to do so).
    # * likewise need to be able to compose one stream modulator inside another. e.g. sort(sum(inclflds(...)...)...).
    def modulate(self, rreader, rmodulator, rwriter):
        """Pump every record from rreader through rmodulator into rwriter.

        The end-of-input marker (None) is also passed to the modulator, once,
        so that buffering modulators can flush; the loop ends after that.
        """
        at_end = False
        while not at_end:
            incoming = rreader.read()
            at_end = incoming is None
            for outgoing in rmodulator.modulate(incoming):
                rwriter.write(outgoing)
# ================================================================
def set_up_namespace():
    """Return a MillerNamespace pre-loaded with default separators and counters.

    Defaults: record separator "\\n", field separator ",", pair separator "="
    (each stored under both its input and output name); FILENAME is None,
    NF is None, and NR and FNR start at 0.
    """
    namespace = MillerNamespace()
    separator_defaults = (
        ("IRS", "ORS", "\n"),
        ("IFS", "OFS", ","),
        ("IPS", "OPS", "="),
    )
    for input_name, output_name, separator in separator_defaults:
        # put() returns the stored value, so the output name gets the same one.
        namespace.put(output_name, namespace.put(input_name, separator))
    # xxx CONVFMT
    namespace.put("FILENAME", None)
    for counter_name, initial in (("NF", None), ("NR", 0), ("FNR", 0)):
        namespace.iput(counter_name, initial)
    return namespace
# ================================================================
# Run the pipeline only when executed as a script, so that importing this
# module (e.g. for testing its classes) does not start reading stdin.
if __name__ == "__main__":
    main()