mirror of
https://github.com/johnkerl/miller.git
synced 2026-01-23 02:14:13 +00:00
Update 2015-era Python sketch to Python 3 (#1372)
This commit is contained in:
parent
71171bc04c
commit
5b29169b08
1 changed files with 504 additions and 364 deletions
868
python/sketch.py
868
python/sketch.py
|
|
@ -1,8 +1,9 @@
|
|||
#!/usr/bin/python
|
||||
|
||||
import os, sys
|
||||
import os
|
||||
import sys
|
||||
import getopt
|
||||
import string, re
|
||||
import re
|
||||
import collections
|
||||
|
||||
# ================================================================
|
||||
|
|
@ -25,478 +26,617 @@ import collections
|
|||
# o summarizations: min, max, mean, count, sum, first, last
|
||||
# o tabular pretty-print
|
||||
|
||||
|
||||
# ================================================================
|
||||
def usage():
    """Print the command-line help text to stderr and exit with status 1.

    This function never returns: it always ends with sys.exit(1), including
    when it is called for an explicit --help request.
    """
    program_name = os.path.basename(sys.argv[0])
    print(
        "Usage: %s [options] {modulator-spec} {zero or more filenames}"
        % program_name,
        file=sys.stderr,
    )
    msg = """
Options:
  -R {rs}          Input/output record separator
  -F {fs}          Input/output field separator
  -P {ps}          Input/output key-value-pair separator
  -v {name=value}  Set the namespace variable {name} to {value}
  --help           Print this message

  --idkvp   Input format is delimited by IRS,IFS,IPS
  --odkvp   Output format is delimited by IRS,IFS,IPS
  --icsv    Input format is delimited by IRS,IFS,IPS, with header line followed by data lines (e.g. CSV)
  --ocsv    Output format is delimited by IRS,IFS,IPS, with header line followed by data lines (e.g. CSV)
  --inidx   Input format is implicitly integer-indexed (awk-style)
  --onidx   Output format is implicitly integer-indexed (awk-style)
  --ixtab   Input format is transposed-tabular-pretty-print
  --oxtab   Output format is transposed-tabular-pretty-print
Modulator specs:
  --cat
  --tac
  --cut {field names joined by the field separator}
  --cutx {field names joined by the field separator}
  --sortfields
  --sortfieldsup
  --sortfieldsdown
"""
    print(msg, file=sys.stderr)
    sys.exit(1)
|
||||
|
||||
# ----------------------------------------------------------------
|
||||
def parse_command_line():
    """Parse sys.argv into the objects that drive one run.

    Separator options (-R, -F, -P) and -v assignments are stored in the
    namespace. Format options select the record reader and writer, and
    modulator options select the record modulator. When several options of
    the same kind are given, the last one wins.

    The reader, writer and modulator are constructed only after every option
    has been seen, so the separators they receive do not depend on where
    -R/-F/-P appear relative to the format and modulator options.

    Returns:
        A dict with the keys "namespace", "rreader", "rwriter" and
        "rmodulator".

    Exits with status 1, after printing a message to stderr, on an option
    getopt rejects, an option this function does not handle (currently
    --ixtab), a -v argument without "=", or a help request.
    """
    namespace = set_up_namespace()

    try:
        # Filenames (the non-option arguments) are not consumed yet: input is
        # always read from stdin.
        optargs, non_option_args = getopt.getopt(
            sys.argv[1:],
            "R:F:P:v:h",
            [
                "help",
                "idkvp",
                "odkvp",
                "icsv",
                "ocsv",
                "inidx",
                "onidx",
                "ixtab",
                "oxtab",
                "cat",
                "tac",
                "cut=",
                "cutx=",
                "sortfields",
                "sortfieldsup",
                "sortfieldsdown",
            ],
        )
    except getopt.GetoptError as e:
        print(str(e), file=sys.stderr)
        usage()
        sys.exit(1)  # Defensive: usage() already exits.

    # Defaults when no format or modulator option is given.
    reader_format = "dkvp"
    writer_format = "dkvp"
    modulator_name = "cat"
    modulator_arg = None

    for opt, arg in optargs:
        if opt == "-R":
            namespace.put("ORS", namespace.put("IRS", arg))
        elif opt == "-F":
            namespace.put("OFS", namespace.put("IFS", arg))
        elif opt == "-P":
            namespace.put("OPS", namespace.put("IPS", arg))
        elif opt == "-v":
            name, separator, value = arg.partition("=")
            if not separator:
                print(
                    'Option -v requires name=value but got "%s".' % arg,
                    file=sys.stderr,
                )
                sys.exit(1)
            namespace.put(name, value)

        elif opt in ("--idkvp", "--icsv", "--inidx"):
            reader_format = opt[3:]  # "--icsv" -> "csv"
        # elif opt == '--ixtab':
        #     pass
        elif opt in ("--odkvp", "--ocsv", "--onidx", "--oxtab"):
            writer_format = opt[3:]  # "--oxtab" -> "xtab"

        elif opt in (
            "--cat",
            "--tac",
            "--cut",
            "--cutx",
            "--sortfields",
            "--sortfieldsup",
            "--sortfieldsdown",
        ):
            modulator_name = opt[2:]
            modulator_arg = arg

        elif opt in ("-h", "--help"):
            usage()
        else:
            print('Unhandled option "%s".' % opt, file=sys.stderr)
            sys.exit(1)

    # Every option has been seen, so the separators are final from here on.
    irs = namespace.get("IRS")
    ifs = namespace.get("IFS")
    ips = namespace.get("IPS")
    ors = namespace.get("ORS")
    ofs = namespace.get("OFS")
    ops = namespace.get("OPS")

    if reader_format == "csv":
        rreader = RecordReaderHeaderFirst(
            istream=sys.stdin, namespace=namespace, irs=irs, ifs=ifs
        )
    elif reader_format == "nidx":
        rreader = RecordReaderIntegerIndexed(
            istream=sys.stdin, namespace=namespace, irs=irs, ifs=ifs
        )
    else:
        rreader = RecordReaderDefault(
            istream=sys.stdin, namespace=namespace, irs=irs, ifs=ifs, ips=ips
        )

    if writer_format == "csv":
        rwriter = RecordWriterHeaderFirst(ostream=sys.stdout, ors=ors, ofs=ofs)
    elif writer_format == "nidx":
        rwriter = RecordWriterIntegerIndexed(ostream=sys.stdout, ors=ors, ofs=ofs)
    elif writer_format == "xtab":
        # xxx args w/r/t/ RS/FS/PS?!?
        rwriter = RecordWriterVerticallyTabulated(ostream=sys.stdout)
    else:
        rwriter = RecordWriterDefault(ostream=sys.stdout, ors=ors, ofs=ofs, ops=ops)

    if modulator_name == "tac":
        rmodulator = TacModulator()
    elif modulator_name == "cut":
        rmodulator = SelectFieldsModulator(modulator_arg.split(ifs))
    elif modulator_name == "cutx":
        rmodulator = DeselectFieldsModulator(modulator_arg.split(ifs))
    elif modulator_name in ("sortfields", "sortfieldsup"):
        rmodulator = SortFieldsInRecordModulator(True)
    elif modulator_name == "sortfieldsdown":
        rmodulator = SortFieldsInRecordModulator(False)
    else:
        rmodulator = CatModulator()

    return {
        "namespace": namespace,
        "rreader": rreader,
        "rwriter": rwriter,
        "rmodulator": rmodulator,
    }
|
||||
|
||||
def main():
    """Parse the command line, then stream records reader -> modulator -> writer."""
    options = parse_command_line()

    # parse ARGV:
    # * --ifmt: dkvp,hdr1st,iidxed,align,xposealign
    # * --ofmt: dkvp,hdr1st,iidxed,align,xposealign
    # * which-control-language spec?!?
    # * modulators/script ... this is the key decision area for language(s) design.
    # * filenames

    rreader = options["rreader"]
    rmodulator = options["rmodulator"]
    rwriter = options["rwriter"]

    smodulator = StreamModulator()
    smodulator.modulate(rreader, rmodulator, rwriter)
|
||||
|
||||
# ================================================================
|
||||
class MillerNamespace:
    """Holds named variables in two independent tables.

    Values stored with put() are read back with get(); values stored with
    iput() are read back with iget(). The two tables do not share names.
    """

    def __init__(self):
        self.mapping = {}
        self.imapping = {}

    def get(self, name):
        """Return the value stored under name by put(); KeyError if absent."""
        return self.mapping[name]

    def iget(self, name):
        """Return the value stored under name by iput(); KeyError if absent."""
        return self.imapping[name]

    def put(self, name, value):
        """Store value under name and return it, so calls can be chained."""
        self.mapping[name] = value
        return value

    def iput(self, name, ivalue):
        """Store ivalue under name in the second table and return it."""
        self.imapping[name] = ivalue
        return ivalue
|
||||
|
||||
|
||||
# ================================================================
|
||||
class Record:
    """An insertion-ordered mapping from field name to field value."""

    def __init__(self, kvs=None):
        """Create a record, optionally filled from kvs.

        kvs is an iterable of two-element pairs (lists or tuples both work);
        None means an empty record.
        """
        self.fields = collections.OrderedDict()
        if kvs is not None:
            self.mput(kvs)

    def put(self, k, v):
        """Set field k to v; a new field is appended after the existing ones."""
        self.fields[k] = v

    def mput(self, kvs):
        """Set several fields from an iterable of (key, value) pairs."""
        for k, v in kvs:
            self.fields[k] = v

    def get(self, k):
        """Return the value of field k; KeyError if the field is absent."""
        return self.fields[k]

    def has_key(self, k):
        """Return True if the record has a field named k."""
        return k in self.fields

    def get_field_names(self):
        """Return the field names, in field order, as a new list."""
        return list(self.fields.keys())

    def get_pairs(self):
        """Return the (name, value) pairs, in field order, as a new list."""
        return list(self.fields.items())

    def num_pairs(self):
        """Return the number of fields."""
        return len(self.fields)

    # xxx xref to record-formatter classes
    def __str__(self):
        return repr(self.fields)

    def __repr__(self):
        return repr(self.fields)
|
||||
|
||||
|
||||
# ================================================================
|
||||
# Each record is a sequence of fields delimited by FS, each of which is a
|
||||
# key-value pair separated by PS.
|
||||
|
||||
|
||||
class RecordReader:
    """Base class for record readers: stores the input stream and separators."""

    def __init__(self, istream, namespace, irs, ifs, ips):
        """Remember where to read from and how the input is delimited.

        istream is the object records are read from, namespace is the
        variable store the reader may update, and irs/ifs/ips are the input
        record, field and key-value-pair separators.
        """
        self.istream = istream
        self.namespace = namespace
        self.irs = irs
        self.ifs = ifs
        self.ips = ips
|
||||
|
||||
|
||||
class RecordReaderDefault(RecordReader):
    """Reads one record per line: fields split on IFS, key/value split on IPS."""

    def __init__(self, istream, namespace, irs, ifs, ips):
        RecordReader.__init__(self, istream, namespace, irs, ifs, ips)

    def read(self):
        """Read the next line and return it as a Record, or None at end of input.

        A field that does not contain the pair separator is stored under its
        1-up position in the line, the same keying the integer-indexed reader
        uses. A line that is empty after stripping yields a record with no
        fields.

        Updates NF, NR and FNR in the namespace's integer table and sets
        FILENAME (still a stub) to None.
        """
        line = self.istream.readline()  # xxx use self.irs
        if line == "":
            return None

        # Remove leading/trailing whitespace including carriage return from readline().
        line = line.strip()

        record = Record()
        if line:
            for position, field in enumerate(line.split(self.ifs), start=1):
                key, separator, value = field.partition(self.ips)
                if separator:
                    record.put(key, value)
                else:
                    record.put(position, field)

        self.namespace.iput("NF", record.num_pairs())
        self.namespace.iput("NR", self.namespace.iget("NR") + 1)

        # xxx stub
        self.namespace.put("FILENAME", None)
        self.namespace.iput("FNR", self.namespace.iget("FNR") + 1)

        return record
|
||||
|
||||
# ----------------------------------------------------------------
|
||||
# awk-style
|
||||
class RecordReaderIntegerIndexed(RecordReader):
    """Reads one record per line, keying each field by its 1-up position."""

    # xxx ctor with istream context?!? or independent of that?!? for cskv, no matter.
    # csv reader of course needs context.
    def __init__(self, istream, namespace, irs, ifs):
        # There are no key-value pairs in this format, so no pair separator.
        RecordReader.__init__(self, istream, namespace, irs, ifs, None)

    def read(self):
        """Read the next line and return it as a Record, or None at end of input.

        The line is split with re.split, so the field separator is treated as
        a regular expression. Keys are the integers 1, 2, 3, ...
        """
        # xxx use self.irs
        line = self.istream.readline()
        if line == "":
            return None
        # Remove leading/trailing whitespace including carriage return from readline().
        line = line.strip()
        fields = re.split(self.ifs, line)
        return Record([[i, field] for i, field in enumerate(fields, start=1)])
|
||||
|
||||
# ----------------------------------------------------------------
|
||||
# csv-style
|
||||
class RecordReaderHeaderFirst(RecordReader):
    """Reads a header line of field names, then one record per data line."""

    def __init__(self, istream, namespace, irs, ifs):
        # There are no key-value pairs in this format, so no pair separator.
        RecordReader.__init__(self, istream, namespace, irs, ifs, None)
        self.field_names = None
        self.header_line = None

    def read(self):
        """Return the next data line as a Record, or None at end of input.

        The first call also consumes the header line. Each data line must
        have as many fields as the header.

        Raises:
            ValueError: if a data line's field count differs from the header's.
        """
        if self.field_names is None:
            header_line = self.istream.readline()
            if header_line == "":
                return None
            # Remove leading/trailing whitespace including carriage return from readline().
            header_line = header_line.strip()
            self.field_names = header_line.split(self.ifs)
            self.header_line = header_line

        data_line = self.istream.readline()
        if data_line == "":
            return None
        # Remove leading/trailing whitespace including carriage return from readline().
        data_line = data_line.strip()
        field_values = data_line.split(self.ifs)
        if len(self.field_names) != len(field_values):
            raise ValueError(
                'Header/data length mismatch: %d != %d in "%s" and "%s"'
                % (
                    len(self.field_names),
                    len(field_values),
                    self.header_line,
                    data_line,
                )
            )

        return Record(zip(self.field_names, field_values))
|
||||
|
||||
# ================================================================
|
||||
# xxx ostream at ctor?? needs drain-at-end logic for prettyprint.
|
||||
|
||||
|
||||
class RecordWriter:
    """Base class for record writers: stores the output stream and separators."""

    def __init__(self, ostream, ors, ofs, ops):
        """Remember where to write and how the output is delimited.

        ostream is the object records are written to, and ors/ofs/ops are the
        output record, field and key-value-pair separators.
        """
        self.ostream = ostream
        self.ors = ors
        self.ofs = ofs
        self.ops = ops
|
||||
|
||||
|
||||
class RecordWriterDefault(RecordWriter):
    """Writes each record as key/value pairs joined by OPS, OFS and ORS."""

    def __init__(self, ostream, ors, ofs, ops):
        RecordWriter.__init__(self, ostream, ors, ofs, ops)

    def write(self, record):
        """Write record as key{OPS}value fields joined by OFS, ended by ORS."""
        pairs = [str(k) + self.ops + str(v) for k, v in record.get_pairs()]
        # End with the configured record separator rather than a fixed "\n".
        self.ostream.write(self.ofs.join(pairs) + self.ors)
|
||||
|
||||
# ----------------------------------------------------------------
|
||||
class RecordWriterHeaderFirst(RecordWriter):
    """Writes a header line of field names once, then one line of values per record."""

    def __init__(self, ostream, ors, ofs):
        # There are no key-value pairs in this format, so no pair separator.
        RecordWriter.__init__(self, ostream, ors, ofs, None)
        self.field_names = None

    def write(self, record):
        """Write record's values joined by OFS; precede the first record with its keys.

        NOTE(review): the header comes from the first record only; later
        records with different field names are written under that same header.
        """
        pairs = list(record.get_pairs())
        if self.field_names is None:
            # Copy the names so the header does not alias the record's storage.
            self.field_names = list(record.get_field_names())
            header_string = self.ofs.join([str(k) for k, _ in pairs])
            self.ostream.write(header_string + self.ors)
        data_string = self.ofs.join([str(v) for _, v in pairs])
        self.ostream.write(data_string + self.ors)
|
||||
|
||||
|
||||
# ----------------------------------------------------------------
|
||||
# xxx rename
|
||||
|
||||
class RecordWriterVerticallyTabulated(RecordWriter):
    """Writes each record as one "name value" line per field, then a blank line."""

    def __init__(self, ostream):
        # This format uses fixed newlines and a space, so no separators are kept.
        RecordWriter.__init__(self, ostream, None, None, None)

    def write(self, record):
        """Write record with field names left-justified to a common width."""
        field_names = list(record.get_field_names())
        # str() because field names may be integers (integer-indexed input).
        # The width is never less than 1.
        max_field_name_width = max([1] + [len(str(name)) for name in field_names])
        lines = [
            "%-*s %s" % (max_field_name_width, name, record.get(name))
            for name in field_names
        ]
        self.ostream.write("\n".join(lines))
        self.ostream.write("\n\n")
|
||||
|
||||
|
||||
# ----------------------------------------------------------------
|
||||
class RecordWriterIntegerIndexed:
    """Writes only the values of each record, joined by OFS and ended by ORS."""

    def __init__(self, ostream, ors, ofs):
        """Remember the output stream and the record and field separators."""
        self.ostream = ostream
        self.ors = ors
        self.ofs = ofs

    def write(self, record):
        """Write record's values in field order; the field names are dropped."""
        values = [str(v) for _, v in record.get_pairs()]
        # End with the configured record separator rather than a fixed "\n".
        self.ostream.write(self.ofs.join(values) + self.ors)
|
||||
|
||||
|
||||
# ================================================================
|
||||
class CatModulator:
    """Passes every record through unchanged."""

    def modulate(self, record):
        """Return [record], or [] for the end-of-stream marker None."""
        if record is None:  # drain at end
            return []
        return [record]
|
||||
|
||||
|
||||
class TacModulator:
    """Holds every record and emits them in reverse order at end of stream."""

    def __init__(self):
        self.records = []

    def modulate(self, record):
        """Buffer record and return []; on None return the buffer reversed.

        The buffer is emptied when it is returned, so a second None yields [].
        """
        if record is None:  # drain at end
            self.records.reverse()
            rv = self.records
            self.records = []
            return rv
        self.records.append(record)
        return []
|
||||
|
||||
|
||||
class SelectFieldsModulator:
    """Keeps only the named fields of each record, in the order the names were given."""

    def __init__(self, field_names):
        self.field_names = field_names

    def modulate(self, record):
        """Return a one-record list holding the selected fields; [] for None.

        Names that the record does not have are skipped.
        """
        if record is None:  # drain at end
            return []
        kvs = [
            (field_name, record.get(field_name))
            for field_name in self.field_names
            if record.has_key(field_name)
        ]
        return [Record(kvs)]
|
||||
|
||||
|
||||
# The field_names argument may be a list or hash-set -- as long as it supports
# the "in" operator as in "name in field_names". When its elements are
# hashable, membership is tested against a frozenset built once.
class DeselectFieldsModulator:
    """Drops the named fields from each record, keeping the rest in order."""

    def __init__(self, field_names):
        self.field_names = field_names
        try:
            self._excluded = frozenset(field_names)
        except TypeError:
            # Not iterable or not hashable: test membership on the argument itself.
            self._excluded = field_names

    def modulate(self, record):
        """Return a one-record list without the excluded fields; [] for None."""
        if record is None:  # drain at end
            return []
        kvs = [
            (field_name, record.get(field_name))
            for field_name in record.get_field_names()
            if field_name not in self._excluded
        ]
        return [Record(kvs)]
|
||||
|
||||
|
||||
class SortFieldsInRecordModulator:
    """Reorders the fields of each record by field name."""

    def __init__(self, do_ascending_sort=True):
        self.do_ascending_sort = do_ascending_sort

    def modulate(self, record):
        """Return a one-record list with fields sorted by name; [] for None.

        Names sort ascending when do_ascending_sort is true, else descending.
        """
        if record is None:  # drain at end
            return []
        sorted_field_names = sorted(
            record.get_field_names(), reverse=not self.do_ascending_sort
        )
        kvs = [(name, record.get(name)) for name in sorted_field_names]
        return [Record(kvs)]
|
||||
|
||||
|
||||
class MeanKeeper:
    """Accumulates a running sum and count and reports their mean."""

    def __init__(self):
        self.sum = 0.0
        self.count = 0

    def put(self, x):
        """Add x to the sum and increase the count by one."""
        self.sum += x
        self.count += 1

    def get_sum(self):
        """Return the sum of the values put so far."""
        return self.sum

    def get_count(self):
        """Return how many values have been put so far."""
        return self.count

    def get_mean(self):
        """Return sum / count, or None when nothing has been put."""
        # In IEEE-standard floating-point this would give NaN in the empty case.
        # But Python throws an exception on divide by zero instead.
        if self.count == 0:
            return None
        return self.sum / self.count
|
||||
|
||||
|
||||
class MeanModulator:
    """Work-in-progress modulator intended to compute per-key means.

    At present it only validates incoming records and, at end of stream,
    emits a placeholder record; no accumulation is done yet.
    """

    def __init__(self, collate_field_names, key_field_names=None):
        self.collate_field_names = collate_field_names
        # None stands for "no key fields"; a fresh list avoids sharing one
        # default list between instances.
        self.key_field_names = [] if key_field_names is None else key_field_names
        # map from key-field values to (map from collate-field names to MSCKeeper objects).
        self.collate_outputs = {}

    def modulate(self, record):
        """Return [] for every record; return a placeholder record for None.

        A record missing any collate field or key field is skipped. For other
        records the collate values are converted with float(), which raises
        ValueError on a value that does not parse as a number.
        """
        if record is not None:
            # xxx optimize
            for value_field_name in self.collate_field_names:
                if not record.has_key(value_field_name):
                    return []
            for key_field_name in self.key_field_names:
                if not record.has_key(key_field_name):
                    return []

            # xxx wip: computed but not yet accumulated into collate_outputs.
            collate_field_values = [
                float(record.get(k)) for k in self.collate_field_names
            ]
            key_string = ",".join([record.get(k) for k in self.key_field_names])

            return []
        else:  # drain at end
            # xxx stub
            output_record = Record()
            output_record.put("foo", "bar")
            return [output_record]
|
||||
|
||||
# ================================================================
|
||||
class StreamModulator:
    """Drives the read -> modulate -> write loop over a whole input stream."""

    def modulate(self, rreader, rmodulator, rwriter):
        """Pump records from rreader through rmodulator into rwriter.

        Each value returned by rreader.read() is passed to
        rmodulator.modulate(), and every record that call returns is written
        with rwriter.write(). The end-of-input marker None is passed to the
        modulator as well, so it can emit anything it has buffered, and then
        the loop stops.
        """
        while True:
            in_record = rreader.read()

            for out_record in rmodulator.modulate(in_record):
                rwriter.write(out_record)

            if in_record is None:
                break
|
||||
|
||||
# ================================================================
|
||||
def set_up_namespace():
    """Return a MillerNamespace holding the default separators and counters.

    Record separators default to "\\n", field separators to "," and pair
    separators to "="; each input separator and its output counterpart start
    out equal. FILENAME and NF start as None, NR and FNR as 0.
    """
    namespace = MillerNamespace()

    # Each output separator starts as a copy of its input counterpart.
    namespace.put("ORS", namespace.put("IRS", "\n"))
    namespace.put("OFS", namespace.put("IFS", ","))
    namespace.put("OPS", namespace.put("IPS", "="))

    # xxx CONVFMT

    namespace.put("FILENAME", None)
    namespace.iput("NF", None)
    namespace.iput("NR", 0)
    namespace.iput("FNR", 0)

    return namespace
|
||||
|
||||
# ================================================================
|
||||
# Run only when executed as a script, so importing this module has no side effects.
if __name__ == "__main__":
    main()
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue