diff --git a/go/src/miller/cli/mlrcli_mappers.go b/go/src/miller/cli/mlrcli_mappers.go index 40817b753..5fd63266f 100644 --- a/go/src/miller/cli/mlrcli_mappers.go +++ b/go/src/miller/cli/mlrcli_mappers.go @@ -16,6 +16,7 @@ var MAPPER_LOOKUP_TABLE = []mapping.MapperSetup{ mappers.CountSetup, mappers.CutSetup, mappers.FilterSetup, + mappers.GrepSetup, mappers.GroupBySetup, mappers.GroupLikeSetup, mappers.HeadSetup, diff --git a/go/src/miller/mappers/grep.go b/go/src/miller/mappers/grep.go new file mode 100644 index 000000000..70bdebbd6 --- /dev/null +++ b/go/src/miller/mappers/grep.go @@ -0,0 +1,164 @@ +package mappers + +import ( + "flag" + "fmt" + "os" + "regexp" + + "miller/clitypes" + "miller/mapping" + "miller/types" +) + +// ---------------------------------------------------------------- +var GrepSetup = mapping.MapperSetup{ + Verb: "grep", + ParseCLIFunc: mapperGrepParseCLI, + IgnoresInput: false, +} + +func mapperGrepParseCLI( + pargi *int, + argc int, + args []string, + errorHandling flag.ErrorHandling, // ContinueOnError or ExitOnError + _ *clitypes.TReaderOptions, + __ *clitypes.TWriterOptions, +) mapping.IRecordMapper { + + // Get the verb name from the current spot in the mlr command line + argi := *pargi + verb := args[argi] + argi++ + + // Parse local flags + flagSet := flag.NewFlagSet(verb, errorHandling) + + pIgnoreCase := flagSet.Bool( + "i", + false, + `Use case-insensitive search`, + ) + + pInvert := flagSet.Bool( + "v", + false, + `Invert: pass through records which do not match the regex.`, + ) + + flagSet.Usage = func() { + ostream := os.Stderr + if errorHandling == flag.ContinueOnError { // help intentionally requested + ostream = os.Stdout + } + mapperGrepUsage(ostream, args[0], verb, flagSet) + } + flagSet.Parse(args[argi:]) + if errorHandling == flag.ContinueOnError { // help intentionally requested + return nil + } + + // Find out how many flags were consumed by this verb and advance for the + // next verb + argi = len(args) - len(flagSet.Args()) + + // Get the regex from the command line + if argi >= argc { + flagSet.Usage() + os.Exit(1) + } + pattern := args[argi] + argi += 1 + + // TODO: maybe CompilePOSIX + regexp, err := regexp.Compile(pattern) + if err != nil { + fmt.Fprintf(os.Stderr, "%s %s: couldn't compile regex \"%s\"\n", + args[0], verb, pattern) + os.Exit(1) + } + + mapper, _ := NewMapperGrep( + regexp, + *pIgnoreCase, + *pInvert, + ) + + *pargi = argi + return mapper +} + +func mapperGrepUsage( + o *os.File, + argv0 string, + verb string, + flagSet *flag.FlagSet, +) { + fmt.Fprintf(o, "Usage: %s %s [options] {regular expression}\n", argv0, verb) + fmt.Fprintf(o, "Passes through records which match the regular expression.\n") + + // flagSet.PrintDefaults() doesn't let us control stdout vs stderr + fmt.Fprint(o, "Options:\n") + flagSet.VisitAll(func(f *flag.Flag) { + fmt.Fprintf(o, " -%v (default %v) %v\n", f.Name, f.Value, f.Usage) // f.Name, f.Value + }) + + fmt.Fprint(o, `Note that "mlr filter" is more powerful, but requires you to know field names. +By contrast, "mlr grep" allows you to regex-match the entire record. It does +this by formatting each record in memory as DKVP, using command-line-specified +ORS/OFS/OPS, and matching the resulting line against the regex specified +here. In particular, the regex is not applied to the input stream: if you +have CSV with header line "x,y,z" and data line "1,2,3" then the regex will +be matched, not against either of these lines, but against the DKVP line +"x=1,y=2,z=3". Furthermore, not all the options to system grep are supported, +and this command is intended to be merely a keystroke-saver. To get all the +features of system grep, you can do + "mlr --odkvp ... | grep ... | mlr --idkvp ..." +`) + +} + +// ---------------------------------------------------------------- +type MapperGrep struct { + regexp *regexp.Regexp + ignoreCase bool + invert bool +} + +func NewMapperGrep( + regexp *regexp.Regexp, + ignoreCase bool, + invert bool, +) (*MapperGrep, error) { + this := &MapperGrep{ + regexp: regexp, + ignoreCase: ignoreCase, + invert: invert, + } + return this, nil +} + +// ---------------------------------------------------------------- +func (this *MapperGrep) Map( + inrecAndContext *types.RecordAndContext, + outputChannel chan<- *types.RecordAndContext, +) { + inrec := inrecAndContext.Record + if inrec != nil { // not end of record stream + inrecAsString := inrec.ToDKVPString() + // TODO: ignore case + matches := this.regexp.Match([]byte(inrecAsString)) + if this.invert { + if !matches { + outputChannel <- inrecAndContext + } + } else { + if matches { + outputChannel <- inrecAndContext + } + } + } else { + outputChannel <- inrecAndContext + } +} diff --git a/go/todo.txt b/go/todo.txt index b366f7bd3..df6d3efc4 100644 --- a/go/todo.txt +++ b/go/todo.txt @@ -4,8 +4,10 @@ TOP OF LIST: * UDFs * local vars w/ typing +* grep -i + * some more verbs - o altkv decimate grep having-fields remove-empty-columns reorder + o altkv decimate having-fields remove-empty-columns reorder o stats1 stats2 * double-check rand-seeding diff --git a/go/u/try-help.out b/go/u/try-help.out index 6b83947a5..4a9a2893f 100644 --- a/go/u/try-help.out +++ b/go/u/try-help.out @@ -44,6 +44,24 @@ TODO: put detailed on-line help here. full transparency on the precedence and associativity rules of Miller's grammar, to stdout. +================================================================ +Usage: ./mlr grep [options] {regular expression} +Passes through records which match the regular expression. +Options: + -i (default false) Use case-insensitive search + -v (default false) Invert: pass through records which do not match the regex. +Note that "mlr filter" is more powerful, but requires you to know field names. +By contrast, "mlr grep" allows you to regex-match the entire record. It does +this by formatting each record in memory as DKVP, using command-line-specified +ORS/OFS/OPS, and matching the resulting line against the regex specified +here. In particular, the regex is not applied to the input stream: if you +have CSV with header line "x,y,z" and data line "1,2,3" then the regex will +be matched, not against either of these lines, but against the DKVP line +"x=1,y=2,z=3". Furthermore, not all the options to system grep are supported, +and this command is intended to be merely a keystroke-saver. To get all the +features of system grep, you can do + "mlr --odkvp ... | grep ... | mlr --idkvp ..." + ================================================================ Usage: ./mlr group-by [options] Outputs records in batches having identical values at specified field names. diff --git a/go/u/try-verbs b/go/u/try-verbs index cdbceb12d..0d49baa6a 100755 --- a/go/u/try-verbs +++ b/go/u/try-verbs @@ -66,3 +66,6 @@ run_mlr --opprint --from u/medium.dkvp count -g a -n run_mlr --opprint --from u/medium.dkvp count -g a,b -n run_mlr --opprint --from u/medium.dkvp count -o NAME run_mlr --opprint --from u/medium.dkvp count -g a -o NAME + +run_mlr --opprint --from u/s.dkvp grep pan +run_mlr --opprint --from u/s.dkvp grep -v pan diff --git a/go/u/try-verbs.out b/go/u/try-verbs.out index 5d973884b..824665b71 100644 --- a/go/u/try-verbs.out +++ b/go/u/try-verbs.out @@ -484,3 +484,15 @@ eks 10 wye 7 zee 8 hat 7 + +---------------------------------------------------------------- +mlr --opprint --from u/s.dkvp grep pan +a b i x y +pan pan 1 0.3467901443380824 0.7268028627434533 +eks pan 2 0.7586799647899636 0.5221511083334797 + +---------------------------------------------------------------- +mlr --opprint --from u/s.dkvp grep -v pan +a b i x y +wye wye 3 0.20460330576630303 0.33831852551664776 +eks wye 4 0.38139939387114097 0.13418874328430463