#!/usr/bin/python

import os, sys
import getopt
import string, re
import collections

# ================================================================
# o inclflds a,x,b
# o newflds '{$y:$x*$x, $z:$x/2, $n:-$z}'
# o greprecs '$x <= 2 && $y eq "zebra"'
#
# o tabular pretty-print
# o mean
# o sort

# absolute essentials:
# * RECORD-LEVEL:
#   k include/exclude fields
#   o new field as function of old
#   o vertical pretty-print
# * STREAM-LEVEL:
#   o include/exclude records
#   o sort
#   o summarizations: min, max, mean, count, sum, first, last
#   o tabular pretty-print
# ================================================================

def usage():
    """Write the command-line help text to stderr and exit with status 1."""
    lines = [
        "Usage: %s [options] {modulator-spec} {zero or more filenames}" % os.path.basename(sys.argv[0]),
        "Options:",
        " -R {rs} Input/output record separator",
        " -F {fs} Input/output field separator",
        " -P {ps} Input/output key-value-pair separator",
        " -v {name=value} xxx needs more doc",
        "",
        " --idkvp Input format is delimited by IRS,IFS,IPS",
        " --odkvp Output format is delimited by IRS,IFS,IPS",
        " --icsv Input format is delimited by IRS,IFS,IPS, with header line followed by data lines (e.g. CSV)",
        " --ocsv Output format is delimited by IRS,IFS,IPS, with header line followed by data lines (e.g. CSV)",
        " --inidx Input format is implicitly integer-indexed (awk-style)",
        " --onidx Output format is implicitly integer-indexed (awk-style)",
        " --ixtab Input format is transposed-tabular-pretty-print",
        " --oxtab Output format is transposed-tabular-pretty-print",
        "Modulator specs:",
        '--cat',
        '--tac',
        '--cut',
        '--cutx',
        '--sortfields',
        '--sortfieldsup',
        '--sortfieldsdown',
    ]
    sys.stderr.write("\n".join(lines) + "\n")
    sys.exit(1)

# ----------------------------------------------------------------
def parse_command_line():
    """Parse sys.argv into a namespace, record reader, writer and modulator.

    Returns a dict with keys 'namespace', 'rreader', 'rwriter' and
    'rmodulator'.  Any of reader/writer/modulator not selected on the
    command line defaults to RecordReaderDefault / RecordWriterDefault /
    CatModulator.

    NOTE: readers, writers and the --cut/--cutx field lists are built as
    their options are encountered, so they see only the -R/-F/-P values
    that appeared EARLIER on the command line.

    Exits the process (via usage() or sys.exit) on malformed options.
    """
    namespace = set_up_namespace()
    rreader = None
    rwriter = None
    rmodulator = None

    try:
        optargs, non_option_args = getopt.getopt(sys.argv[1:], "R:F:P:v:h", [
            'help', 'idkvp', 'odkvp', 'icsv', 'ocsv', 'inidx', 'onidx',
            'ixtab', 'oxtab', 'cat', 'tac', 'cut=', 'cutx=',
            'sortfields', 'sortfieldsup', 'sortfieldsdown'])
    except getopt.GetoptError as err:
        # Diagnostics belong on stderr, alongside the usage text.
        sys.stderr.write("%s\n" % err)
        usage()  # does not return

    for opt, arg in optargs:
        if opt == '-R':
            namespace.put("ORS", namespace.put("IRS", arg))
        elif opt == '-F':
            namespace.put("OFS", namespace.put("IFS", arg))
        elif opt == '-P':
            namespace.put("OPS", namespace.put("IPS", arg))
        elif opt == '-v':
            name, sep, value = arg.partition("=")
            if not sep:
                # Without '=' there is no value to assign.
                sys.stderr.write("Option -v requires {name=value}; got \"%s\".\n" % arg)
                usage()  # does not return
            namespace.put(name, value)

        elif opt == '--idkvp':
            rreader = RecordReaderDefault(istream=sys.stdin, namespace=namespace,
                irs=namespace.get("IRS"), ifs=namespace.get("IFS"), ips=namespace.get("IPS"))
        elif opt == '--odkvp':
            rwriter = RecordWriterDefault(ostream=sys.stdout,
                ors=namespace.get("ORS"), ofs=namespace.get("OFS"), ops=namespace.get("OPS"))

        elif opt == '--icsv':
            rreader = RecordReaderHeaderFirst(istream=sys.stdin, namespace=namespace,
                irs=namespace.get("IRS"), ifs=namespace.get("IFS"))
        elif opt == '--ocsv':
            rwriter = RecordWriterHeaderFirst(ostream=sys.stdout,
                ors=namespace.get("ORS"), ofs=namespace.get("OFS"))

        elif opt == '--inidx':
            rreader = RecordReaderIntegerIndexed(istream=sys.stdin, namespace=namespace,
                irs=namespace.get("IRS"), ifs=namespace.get("IFS"))
        elif opt == '--onidx':
            rwriter = RecordWriterIntegerIndexed(ostream=sys.stdout,
                ors=namespace.get("ORS"), ofs=namespace.get("OFS"))

        # '--ixtab' is accepted by getopt but has no reader yet, so it
        # falls through to the "Unhandled option" branch below.
        #elif opt == '--ixtab':
        #    pass
        elif opt == '--oxtab':
            rwriter = RecordWriterVerticallyTabulated(ostream=sys.stdout) # xxx args w/r/t/ RS/FS/PS?!?

        elif opt == '--cat':
            rmodulator = CatModulator()
        elif opt == '--tac':
            rmodulator = TacModulator()
        elif opt == '--cut':
            rmodulator = SelectFieldsModulator(arg.split(namespace.get("IFS")))
        elif opt == '--cutx':
            rmodulator = DeselectFieldsModulator(arg.split(namespace.get("IFS")))
        elif opt == '--sortfields':
            rmodulator = SortFieldsInRecordModulator(True)
        elif opt == '--sortfieldsup':
            rmodulator = SortFieldsInRecordModulator(True)
        elif opt == '--sortfieldsdown':
            rmodulator = SortFieldsInRecordModulator(False)

        elif opt in ('-h', '--help'):
            usage()
        else:
            sys.stderr.write("Unhandled option \"%s\".\n" % opt)
            sys.exit(1)

    #xxx non_option_arg_count = len(non_option_args)

    if rreader is None:
        rreader = RecordReaderDefault(istream=sys.stdin, namespace=namespace,
            irs=namespace.get("IRS"), ifs=namespace.get("IFS"), ips=namespace.get("IPS"))
    if rwriter is None:
        rwriter = RecordWriterDefault(ostream=sys.stdout,
            ors=namespace.get("ORS"), ofs=namespace.get("OFS"), ops=namespace.get("OPS"))
    if rmodulator is None:
        rmodulator = CatModulator()

    return {'namespace':namespace, 'rreader':rreader, 'rwriter':rwriter, 'rmodulator':rmodulator}

def main():
    """Parse the command line, then stream records reader -> modulator -> writer."""
    options = parse_command_line()

    # parse ARGV:
    # * --ifmt: dkvp,hdr1st,iidxed,align,xposealign
    # * --ofmt: dkvp,hdr1st,iidxed,align,xposealign
    # * which-control-language spec?!?
    # * modulators/script ... this is the key decision area for language(s) design.
    # * filenames

    namespace = options['namespace']
    rreader = options['rreader']
    rmodulator = options['rmodulator']
    rwriter = options['rwriter']

    smodulator = StreamModulator()
    smodulator.modulate(rreader, rmodulator, rwriter)

# ================================================================
class MillerNamespace:
    """Two independent name->value tables: 'mapping' and 'imapping'.

    In this file, 'mapping' holds separators, FILENAME and -v variables
    (get/put), while 'imapping' holds the counters NF, NR and FNR
    (iget/iput).
    """

    def __init__(self):
        self.mapping = {}
        self.imapping = {}

    def get(self, name):
        """Return the value stored under name; raises KeyError if absent."""
        return self.mapping[name]

    def iget(self, name):
        """Return the imapping value stored under name; raises KeyError if absent."""
        return self.imapping[name]

    def put(self, name, value):
        """Store value under name and return it (so puts can be chained)."""
        self.mapping[name] = value
        return value

    def iput(self, name, ivalue):
        """Store ivalue under name in imapping and return it."""
        self.imapping[name] = ivalue
        return ivalue

# ================================================================
class Record:
    """An insertion-ordered collection of key-value fields."""

    # kvs is an iterable of two-element pairs (lists or tuples both work).
    def __init__(self, kvs=None):
        self.fields = collections.OrderedDict()
        # None instead of a shared mutable default list.
        self.mput(kvs if kvs is not None else [])

    def put(self, k, v):
        """Set field k to v, appending it if k is new."""
        self.fields[k] = v

    def mput(self, kvs):
        """Set several fields from an iterable of (key, value) pairs."""
        for k, v in kvs:
            self.fields[k] = v

    def get(self, k):
        """Return the value of field k; raises KeyError if absent."""
        return self.fields[k]

    def has_key(self, k):
        """Return True if the record has a field named k."""
        return k in self.fields

    def get_field_names(self):
        """Return the field names as a list, in insertion order."""
        return list(self.fields.keys())

    def get_pairs(self):
        """Return the (name, value) tuples as a list, in insertion order."""
        return list(self.fields.items())

    def num_pairs(self):
        """Return the number of fields."""
        return len(self.fields)

    # xxx xref to record-formatter classes
    def __str__(self):
        # Must return a string, not the bound __repr__ method.
        return repr(self.fields)

    def __repr__(self):
        return repr(self.fields)

# ================================================================
# Each record is a sequence of fields delimited by FS, each of which is a
# key-value pair separated by PS.

class RecordReader:
    """Base class holding the input stream, namespace and input separators."""

    def __init__(self, istream, namespace, irs, ifs, ips):
        self.istream = istream
        self.namespace = namespace
        self.irs = irs
        self.ifs = ifs
        self.ips = ips

class RecordReaderDefault(RecordReader):
    """Reads one record per line; fields split on ifs, key/value split on ips."""

    def __init__(self, istream, namespace, irs, ifs, ips):
        RecordReader.__init__(self, istream, namespace, irs, ifs, ips)

    def read(self):
        """Return the next Record, or None at end of stream.

        Updates NF, NR, FNR and FILENAME in the namespace.  Raises
        ValueError (from Record.mput) if a field lacks the ips separator.
        """
        line = self.istream.readline() # xxx use self.irs
        if line == '':
            return None
        line = line.strip() # Remove leading/trailing whitespace including carriage return from readline().
        kvs = [field.split(self.ips, 1) for field in line.split(self.ifs)]
        record = Record(kvs)

        # Store the field count itself, not the bound method.
        self.namespace.iput("NF", record.num_pairs())
        self.namespace.iput("NR", self.namespace.iget("NR") + 1)
        # xxx stub
        self.namespace.put("FILENAME", None)
        self.namespace.iput("FNR", self.namespace.iget("FNR") + 1)

        return record

# ----------------------------------------------------------------
# awk-style
class RecordReaderIntegerIndexed(RecordReader):
    """Reads one record per line, keying fields by 1-based integer position."""

    # xxx ctor with istream context?!? or independent of that?!? for cskv, no matter.
    # csv reader of course needs context.
    def __init__(self, istream, namespace, irs, ifs):
        RecordReader.__init__(self, istream, namespace, irs, ifs, None)

    def read(self):
        """Return the next Record, or None at end of stream.

        Unlike the other readers, ifs is treated as a regular expression
        here (the line is split with re.split).
        """
        # xxx use self.irs
        line = self.istream.readline()
        if line == '':
            return None
        line = line.strip() # Remove leading/trailing whitespace including carriage return from readline().
        fields = re.split(self.ifs, line)
        return Record([[i, field] for i, field in enumerate(fields, 1)])

# ----------------------------------------------------------------
# csv-style
class RecordReaderHeaderFirst(RecordReader):
    """Reads a header line of field names, then one record per data line."""

    def __init__(self, istream, namespace, irs, ifs):
        RecordReader.__init__(self, istream, namespace, irs, ifs, None)
        self.field_names = None
        self.header_line = None

    def read(self):
        """Return the next Record, or None at end of stream.

        The first call consumes the header line.  Raises Exception when a
        data line's field count differs from the header's.
        """
        if self.field_names is None:
            header_line = self.istream.readline()
            if header_line == '':
                return None
            # Remove leading/trailing whitespace including carriage return from readline().
            header_line = header_line.strip()
            self.field_names = header_line.split(self.ifs, -1)
            self.header_line = header_line

        data_line = self.istream.readline()
        if data_line == '':
            return None
        # Remove leading/trailing whitespace including carriage return from readline().
        data_line = data_line.strip()
        field_values = data_line.split(self.ifs, -1)
        if len(self.field_names) != len(field_values):
            raise Exception("Header/data length mismatch: %d != %d in \"%s\" and \"%s\"" % \
                (len(self.field_names), len(field_values), self.header_line, data_line))

        return Record(zip(self.field_names, field_values))

# ================================================================
# xxx ostream at ctor?? needs drain-at-end logic for prettyprint.

class RecordWriter:
    """Base class holding the output stream and output separators."""

    def __init__(self, ostream, ors, ofs, ops):
        self.ostream = ostream
        self.ors = ors
        self.ofs = ofs
        self.ops = ops

class RecordWriterDefault(RecordWriter):
    """Writes each record as one line of key/ops/value pairs joined by ofs."""

    def __init__(self, ostream, ors, ofs, ops):
        RecordWriter.__init__(self, ostream, ors, ofs, ops)

    def write(self, record):
        """Write record followed by a newline (xxx: self.ors is not used yet)."""
        pairs = [str(k) + self.ops + str(v) for k, v in record.get_pairs()]
        self.ostream.write(self.ofs.join(pairs))
        self.ostream.write("\n")

# ----------------------------------------------------------------
class RecordWriterHeaderFirst(RecordWriter):
    """Writes a header line from the first record, then one value line per record."""

    def __init__(self, ostream, ors, ofs):
        RecordWriter.__init__(self, ostream, ors, ofs, None)
        self.field_names = None

    def write(self, record):
        """Write record's values; precede them with the header on the first call.

        The header is taken from the first record only; later records are
        not checked against it.
        """
        data_string = self.ofs.join([str(v) for k, v in record.get_pairs()])
        if self.field_names is None:
            self.field_names = record.get_field_names()
            header_string = self.ofs.join([str(k) for k in self.field_names])
            self.ostream.write(header_string)
            self.ostream.write("\n")
        self.ostream.write(data_string)
        self.ostream.write("\n")

# ----------------------------------------------------------------
# xxx rename
class RecordWriterVerticallyTabulated(RecordWriter):
    """Writes each record as one 'name value' line per field, then a blank line."""

    def __init__(self, ostream):
        RecordWriter.__init__(self, ostream, None, None, None)

    def write(self, record):
        """Write record with field names left-justified to a common width."""
        field_names = record.get_field_names()
        # str() so that integer keys (from the integer-indexed reader) have
        # a width; the minimum width stays 1.
        max_field_name_width = max([1] + [len(str(name)) for name in field_names])
        lines = ["%-*s %s" % (max_field_name_width, name, record.get(name))
                 for name in field_names]
        self.ostream.write("\n".join(lines))
        self.ostream.write("\n\n")

# ----------------------------------------------------------------
class RecordWriterIntegerIndexed(RecordWriter):
    """Writes only each record's values, joined by ofs, one record per line."""

    def __init__(self, ostream, ors, ofs):
        RecordWriter.__init__(self, ostream, ors, ofs, None)

    def write(self, record):
        """Write record's values followed by a newline (xxx: self.ors is not used yet)."""
        self.ostream.write(self.ofs.join([str(v) for k, v in record.get_pairs()]))
        self.ostream.write("\n")

# ================================================================
class CatModulator:
    """Passes every record through unchanged."""

    def __init__(self):
        pass

    def modulate(self, record):
        """Return [record], or [] for the end-of-stream marker (None)."""
        if record is None: # drain at end
            return []
        return [record]

class TacModulator:
    """Buffers all records and emits them in reverse order at end of stream."""

    def __init__(self):
        self.records = []

    def modulate(self, record):
        """Buffer record and return []; on None, return the reversed buffer."""
        if record is None: # drain at end
            self.records.reverse()
            rv = self.records
            self.records = []
            return rv
        else:
            self.records.append(record)
            return []

class SelectFieldsModulator:
    """Keeps only the named fields, in the order the names were given."""

    def __init__(self, field_names):
        self.field_names = field_names

    def modulate(self, record):
        """Return a one-element list holding the projected record ([] on None).

        Names absent from the input record are skipped.
        """
        if record is None: # drain at end
            return []
        kvs = [(name, record.get(name))
               for name in self.field_names if record.has_key(name)]
        new_record = Record()
        new_record.mput(kvs)
        return [new_record]

# The field_names argument may be a list or hash-set -- as long as it supports
# the "in" operator as in "name in field_names".
# xxx to do: use a hash-set internally.
class DeselectFieldsModulator:
    """Drops the named fields, keeping the rest in their original order."""

    def __init__(self, field_names):
        # Any container supporting the "in" operator works here.
        self.field_names = field_names

    def modulate(self, record):
        """Return a one-element list holding the reduced record ([] on None)."""
        if record is None: # drain at end
            return []
        kvs = [(name, record.get(name))
               for name in record.get_field_names()
               if name not in self.field_names]
        new_record = Record()
        new_record.mput(kvs)
        return [new_record]

class SortFieldsInRecordModulator:
    """Reorders each record's fields by field name."""

    def __init__(self, do_ascending_sort=True):
        self.do_ascending_sort = do_ascending_sort

    def modulate(self, record):
        """Return a one-element list holding the re-ordered record ([] on None)."""
        if record is None: # drain at end
            return []
        # Field names are unique, so reverse=True equals sort-then-reverse.
        sorted_field_names = sorted(record.get_field_names(),
                                    reverse=not self.do_ascending_sort)
        kvs = [(name, record.get(name)) for name in sorted_field_names]
        new_record = Record()
        new_record.mput(kvs)
        return [new_record]

class MeanKeeper:
    """Accumulates a running sum and count, from which the mean is derived."""

    def __init__(self):
        self.sum = 0.0
        self.count = 0

    def put(self, x):
        """Add x to the running sum and bump the count."""
        self.sum += x
        self.count += 1

    def get_sum(self):
        """Return the sum of all values put so far."""
        return self.sum

    def get_count(self):
        """Return how many values have been put so far."""
        return self.count

    def get_mean(self):
        """Return sum/count, or None if nothing has been put."""
        # In IEEE-standard floating-point this would give NaN in the empty case.
        # But Python throws an exception on divide by zero instead.
        if self.count == 0:
            return None
        else:
            return self.sum / self.count

class MeanModulator:
    """Stub for per-key means of selected fields.

    xxx stub: input records are only validated, nothing is accumulated,
    and the end-of-stream output is a placeholder record foo=bar.
    """

    def __init__(self, collate_field_names, key_field_names=None):
        self.collate_field_names = collate_field_names
        # None instead of a shared mutable default list.
        self.key_field_names = [] if key_field_names is None else key_field_names
        # map from key-field values to (map from collate-field names to MSCKeeper objects).
        self.collate_outputs = {}

    def modulate(self, record):
        """Return [] for each input record and a placeholder record on None.

        Records missing any collate or key field are skipped.  Raises
        ValueError if a collate field's value does not convert to float.
        """
        if record is not None:
            # xxx optimize
            for value_field_name in self.collate_field_names:
                if not record.has_key(value_field_name):
                    return []
            for key_field_name in self.key_field_names:
                if not record.has_key(key_field_name):
                    return []
            # Computed but not yet accumulated into self.collate_outputs
            # (xxx stub); the float() conversion still validates the input.
            collate_field_values = [float(record.get(k)) for k in self.collate_field_names]
            key_string = ",".join([record.get(k) for k in self.key_field_names])
            return []
        else: # drain at end
            # xxx stub
            output_record = Record()
            output_record.put("foo", "bar")
            return [output_record]

# ================================================================
class StreamModulator:
    """Drives the read -> modulate -> write loop over a whole stream."""

    def __init__(self):
        pass

    def modulate(self, rreader, rmodulator, rwriter):
        """Pump records from rreader through rmodulator into rwriter.

        The end-of-stream marker (None) is also passed to the modulator so
        that it can drain buffered output before the loop ends.
        """
        while True:
            in_record = rreader.read()

            for out_record in rmodulator.modulate(in_record):
                rwriter.write(out_record)

            if in_record is None:
                break

# ================================================================
def set_up_namespace():
    """Return a MillerNamespace populated with default separators and counters."""
    namespace = MillerNamespace()
    namespace.put("ORS", namespace.put("IRS", "\n"))
    namespace.put("OFS", namespace.put("IFS", ","))
    namespace.put("OPS", namespace.put("IPS", "="))
    # xxx CONVFMT

    namespace.put("FILENAME", None)
    namespace.iput("NF", None)
    namespace.iput("NR", 0)
    namespace.iput("FNR", 0)

    return namespace

# ================================================================
# Guarded so that importing this module does not start reading stdin.
if __name__ == "__main__":
    main()