From d9298fd26b16f5dbb3800aab3fd3c2d4fa88551f Mon Sep 17 00:00:00 2001 From: John Kerl Date: Wed, 26 Jan 2022 23:06:41 -0500 Subject: [PATCH 1/3] mlr split --- internal/pkg/output/file_output_handlers.go | 23 + .../pkg/transformers/aaa_transformer_table.go | 1 + internal/pkg/transformers/split.go | 437 ++++++++++++++++++ test/input/example.csv | 11 + todo.txt | 2 + 5 files changed, 474 insertions(+) create mode 100644 internal/pkg/transformers/split.go create mode 100644 test/input/example.csv diff --git a/internal/pkg/output/file_output_handlers.go b/internal/pkg/output/file_output_handlers.go index b5e1df510..cd7c3f896 100644 --- a/internal/pkg/output/file_output_handlers.go +++ b/internal/pkg/output/file_output_handlers.go @@ -56,6 +56,17 @@ type MultiOutputHandlerManager struct { } // ---------------------------------------------------------------- +func NewFileOutputHandlerManager( + recordWriterOptions *cli.TWriterOptions, + doAppend bool, +) *MultiOutputHandlerManager { + if doAppend { + return NewFileAppendHandlerManager(recordWriterOptions) + } else { + return NewFileWritetHandlerManager(recordWriterOptions) + } +} + func NewFileWritetHandlerManager( recordWriterOptions *cli.TWriterOptions, ) *MultiOutputHandlerManager { @@ -228,6 +239,18 @@ func newOutputHandlerCommon( } // ---------------------------------------------------------------- +func NewFileOutputHandler( + filename string, + recordWriterOptions *cli.TWriterOptions, + doAppend bool, +) (*FileOutputHandler, error) { + if doAppend { + return NewFileAppendOutputHandler(filename, recordWriterOptions) + } else { + return NewFileWriteOutputHandler(filename, recordWriterOptions) + } +} + func NewFileWriteOutputHandler( filename string, recordWriterOptions *cli.TWriterOptions, diff --git a/internal/pkg/transformers/aaa_transformer_table.go b/internal/pkg/transformers/aaa_transformer_table.go index ed6c0a84d..463b745a4 100644 --- a/internal/pkg/transformers/aaa_transformer_table.go +++ 
b/internal/pkg/transformers/aaa_transformer_table.go @@ -59,6 +59,7 @@ var TRANSFORMER_LOOKUP_TABLE = []TransformerSetup{ SkipTrivialRecordsSetup, SortSetup, SortWithinRecordsSetup, + SplitSetup, Stats1Setup, Stats2Setup, StepSetup, diff --git a/internal/pkg/transformers/split.go b/internal/pkg/transformers/split.go new file mode 100644 index 000000000..287b42768 --- /dev/null +++ b/internal/pkg/transformers/split.go @@ -0,0 +1,437 @@ +package transformers + +import ( + "bytes" + "container/list" + "fmt" + "net/url" + "os" + "strings" + + "github.com/johnkerl/miller/internal/pkg/cli" + "github.com/johnkerl/miller/internal/pkg/mlrval" + "github.com/johnkerl/miller/internal/pkg/output" + "github.com/johnkerl/miller/internal/pkg/types" +) + +// ---------------------------------------------------------------- +const verbNameSplit = "split" +const splitDefaultOutputFileNamePrefix = "split" + +var SplitSetup = TransformerSetup{ + Verb: verbNameSplit, + UsageFunc: transformerSplitUsage, + ParseCLIFunc: transformerSplitParseCLI, + IgnoresInput: false, +} + +func transformerSplitUsage( + o *os.File, + doExit bool, + exitCode int, +) { + fmt.Fprintf(o, "Usage: %s %s [options] {filename}\n", "mlr", verbNameSplit) + fmt.Fprintf(o, + `Options: +-n {n}: Cap file sizes at N records. +-m {m}: Produce M files, round-robining records among them. +-g {a,b,c}: Write separate files with records having distinct values for fields named a,b,c. +Exactly one of -m, -n, or -g must be supplied. +--prefix {p} Specify filename prefix; default "`+splitDefaultOutputFileNamePrefix+`". +--suffix {s} Specify filename suffix; default is from mlr output format, e.g. "csv". +-a Append to existing file(s), if any, rather than overwriting. +-v Send records along to downstream verbs as well as splitting to files. +-h|--help Show this message. +Any of the output-format command-line flags (see mlr -h). 
For example, using + mlr --icsv --from myfile.csv split --ojson -n 1000 +the input is CSV, but the output files are JSON. + +Examples: Suppose myfile.csv has 1,000,000 records. + +100 output files, 10,000 records each. First 10,000 records in split_1.csv, next in split_2.csv, etc. + mlr --csv --from myfile.csv split -n 10000 + +10 output files, 100,000 records each. Records 1,11,21,etc in split_1.csv, records 2,12,22, etc in split_2.csv, etc. + mlr --csv --from myfile.csv split -m 10 +Same, but with JSON output. + mlr --csv --from myfile.csv split -m 10 -o json + +Same but instead of split_1.csv, split_2.csv, etc. there are test_1.dat, test_2.dat, etc. + mlr --csv --from myfile.csv split -m 10 --prefix test --suffix dat +Same, but written to the /tmp/ directory. + mlr --csv --from myfile.csv split -m 10 --prefix /tmp/test --suffix dat + +If the shape field has values triangle and square, then there will be split_triangle.csv and split_square.csv. + mlr --csv --from myfile.csv split -g shape + +If the color field has values yellow and green, and the shape field has values triangle and square, +then there will be split_yellow_triangle.csv, split_yellow_square.csv, etc. + mlr --csv --from myfile.csv split -g color,shape + +See also the "tee" DSL function which lets you do more ad-hoc customization. 
+`) + if doExit { + os.Exit(exitCode) + } +} + +func transformerSplitParseCLI( + pargi *int, + argc int, + args []string, + mainOptions *cli.TOptions, + doConstruct bool, // false for first pass of CLI-parse, true for second pass +) IRecordTransformer { + + // Skip the verb name from the current spot in the mlr command line + argi := *pargi + verb := args[argi] + argi++ + + var n int = 0 + var doMod bool = false + var doSize bool = false + var groupByFieldNames []string = nil + var emitDownstream bool = false + var doAppend bool = false + var outputFileNamePrefix string = splitDefaultOutputFileNamePrefix + var outputFileNameSuffix string = "uninit" + haveOutputFileNameSuffix := false + + var localOptions *cli.TOptions = nil + if mainOptions != nil { + copyThereof := *mainOptions // struct copy + localOptions = &copyThereof + } + + // Parse local flags. + for argi < argc /* variable increment: 1 or 2 depending on flag */ { + opt := args[argi] + if !strings.HasPrefix(opt, "-") { + break // No more flag options to process + } + if args[argi] == "--" { + break // All transformers must do this so main-flags can follow verb-flags + } + argi++ + + if opt == "-h" || opt == "--help" { + transformerSplitUsage(os.Stdout, true, 0) + + } else if opt == "-n" { + n = cli.VerbGetIntArgOrDie(verb, opt, args, &argi, argc) + doSize = true + + } else if opt == "-m" { + n = cli.VerbGetIntArgOrDie(verb, opt, args, &argi, argc) + doMod = true + + } else if opt == "-g" { + groupByFieldNames = cli.VerbGetStringArrayArgOrDie(verb, opt, args, &argi, argc) + + } else if opt == "--prefix" { + outputFileNamePrefix = cli.VerbGetStringArgOrDie(verb, opt, args, &argi, argc) + + } else if opt == "--suffix" { + outputFileNameSuffix = cli.VerbGetStringArgOrDie(verb, opt, args, &argi, argc) + haveOutputFileNameSuffix = true + + } else if opt == "-a" { + doAppend = true + + } else if opt == "-v" { + emitDownstream = true + + } else { + // This is inelegant. 
For error-proofing we advance argi already in our + // loop (so individual if-statements don't need to). However, + // ParseWriterOptions expects it unadvanced. + largi := argi - 1 + if cli.FLAG_TABLE.Parse(args, argc, &largi, localOptions) { + // This lets mlr main and mlr split have different output formats. + // Nothing else to handle here. + argi = largi + } else { + transformerSplitUsage(os.Stderr, true, 1) + } + } + } + + doGroup := groupByFieldNames != nil + if !doMod && !doSize && !doGroup { + fmt.Fprintf(os.Stderr, "mlr %s: At least one of -m, -n, or -g is required.\n", verb) + os.Exit(1) + } + if (doMod && doSize) || (doMod && doGroup) || (doSize && doGroup) { + fmt.Fprintf(os.Stderr, "mlr %s: Only one of -m, -n, or -g is required.\n", verb) + os.Exit(1) + } + + cli.FinalizeWriterOptions(&localOptions.WriterOptions) + if !haveOutputFileNameSuffix { + outputFileNameSuffix = localOptions.WriterOptions.OutputFileFormat + } + + *pargi = argi + if !doConstruct { // All transformers must do this for main command-line parsing + return nil + } + + transformer, err := NewTransformerSplit( + n, + doMod, + doSize, + groupByFieldNames, + emitDownstream, + doAppend, + outputFileNamePrefix, + outputFileNameSuffix, + &localOptions.WriterOptions, + ) + if err != nil { + // Error message already printed out + os.Exit(1) + } + + return transformer +} + +// ---------------------------------------------------------------- +type TransformerSplit struct { + n int + outputFileNamePrefix string + outputFileNameSuffix string + emitDownstream bool + ungroupedCounter int + groupByFieldNames []string + recordWriterOptions *cli.TWriterOptions + doAppend bool + + // For doSize ungrouped: only one file open at a time + outputHandler output.OutputHandler + previousQuotient int + + // For all other cases: multiple files open at a time + outputHandlerManager output.OutputHandlerManager + + recordTransformerFunc RecordTransformerFunc +} + +func NewTransformerSplit( + n int, + doMod bool, + 
doSize bool, + groupByFieldNames []string, + emitDownstream bool, + doAppend bool, + outputFileNamePrefix string, + outputFileNameSuffix string, + recordWriterOptions *cli.TWriterOptions, +) (*TransformerSplit, error) { + + tr := &TransformerSplit{ + n: n, + outputFileNamePrefix: outputFileNamePrefix, + outputFileNameSuffix: outputFileNameSuffix, + emitDownstream: emitDownstream, + ungroupedCounter: 0, + groupByFieldNames: groupByFieldNames, + recordWriterOptions: recordWriterOptions, + doAppend: doAppend, + + outputHandler: nil, + previousQuotient: -1, + } + + tr.outputHandlerManager = output.NewFileOutputHandlerManager(recordWriterOptions, doAppend) + + if groupByFieldNames != nil { + tr.recordTransformerFunc = tr.splitGrouped + } else if doMod { + tr.recordTransformerFunc = tr.splitModUngrouped + } else { + tr.recordTransformerFunc = tr.splitSizeUngrouped + } + + return tr, nil +} + +func (tr *TransformerSplit) Transform( + inrecAndContext *types.RecordAndContext, + outputRecordsAndContexts *list.List, // list of *types.RecordAndContext + inputDownstreamDoneChannel <-chan bool, + outputDownstreamDoneChannel chan<- bool, +) { + HandleDefaultDownstreamDone(inputDownstreamDoneChannel, outputDownstreamDoneChannel) + tr.recordTransformerFunc(inrecAndContext, outputRecordsAndContexts, inputDownstreamDoneChannel, + outputDownstreamDoneChannel) +} + +func (tr *TransformerSplit) splitModUngrouped( + inrecAndContext *types.RecordAndContext, + outputRecordsAndContexts *list.List, // list of *types.RecordAndContext + inputDownstreamDoneChannel <-chan bool, + outputDownstreamDoneChannel chan<- bool, +) { + if !inrecAndContext.EndOfStream { + remainder := 1 + (tr.ungroupedCounter % tr.n) + filename := tr.makeUngroupedOutputFileName(remainder) + + err := tr.outputHandlerManager.WriteRecordAndContext(inrecAndContext, filename) + if err != nil { + fmt.Fprintf(os.Stderr, "mlr: file-write error: %v\n", err) + os.Exit(1) + } + + if tr.emitDownstream { + 
outputRecordsAndContexts.PushBack(inrecAndContext) + } + + tr.ungroupedCounter++ + + } else { + outputRecordsAndContexts.PushBack(inrecAndContext) // end-of-stream marker + errs := tr.outputHandlerManager.Close() + if len(errs) > 0 { + for _, err := range errs { + fmt.Fprintf(os.Stderr, "mlr: file-close error: %v\n", err) + } + os.Exit(1) + } + } +} + +func (tr *TransformerSplit) splitSizeUngrouped( + inrecAndContext *types.RecordAndContext, + outputRecordsAndContexts *list.List, // list of *types.RecordAndContext + inputDownstreamDoneChannel <-chan bool, + outputDownstreamDoneChannel chan<- bool, +) { + var err error + if !inrecAndContext.EndOfStream { + quotient := 1 + (tr.ungroupedCounter / tr.n) + + if quotient != tr.previousQuotient { + if tr.outputHandler != nil { + err = tr.outputHandler.Close() + if err != nil { + fmt.Fprintf(os.Stderr, "mlr: file-close error: %v\n", err) + os.Exit(1) + } + } + + filename := tr.makeUngroupedOutputFileName(quotient) + tr.outputHandler, err = output.NewFileOutputHandler( + filename, + tr.recordWriterOptions, + tr.doAppend, + ) + if err != nil { + fmt.Fprintf(os.Stderr, "mlr: file-open error: %v\n", err) + os.Exit(1) + } + + tr.previousQuotient = quotient + } + + err = tr.outputHandler.WriteRecordAndContext(inrecAndContext) + if err != nil { + fmt.Fprintf(os.Stderr, "mlr: file-write error: %v\n", err) + os.Exit(1) + } + + if tr.emitDownstream { + outputRecordsAndContexts.PushBack(inrecAndContext) + } + + tr.ungroupedCounter++ + + } else { + outputRecordsAndContexts.PushBack(inrecAndContext) // end-of-stream marker + + if tr.outputHandler != nil { + err := tr.outputHandler.Close() + if err != nil { + fmt.Fprintf(os.Stderr, "mlr: file-close error: %v\n", err) + os.Exit(1) + } + } + } +} + +func (tr *TransformerSplit) splitGrouped( + inrecAndContext *types.RecordAndContext, + outputRecordsAndContexts *list.List, // list of *types.RecordAndContext + inputDownstreamDoneChannel <-chan bool, + outputDownstreamDoneChannel chan<- bool, 
+) { + if !inrecAndContext.EndOfStream { + var filename string + groupByFieldValues, ok := inrecAndContext.Record.GetSelectedValues(tr.groupByFieldNames) + if !ok { + filename = fmt.Sprintf("%s_ungrouped.%s", tr.outputFileNamePrefix, tr.outputFileNameSuffix) + } else { + filename = tr.makeGroupedOutputFileName(groupByFieldValues) + } + err := tr.outputHandlerManager.WriteRecordAndContext(inrecAndContext, filename) + if err != nil { + fmt.Fprintf(os.Stderr, "mlr: %v\n", err) + os.Exit(1) + } + + if tr.emitDownstream { + outputRecordsAndContexts.PushBack(inrecAndContext) + } + + } else { + outputRecordsAndContexts.PushBack(inrecAndContext) // emit end-of-stream marker + + errs := tr.outputHandlerManager.Close() + if len(errs) > 0 { + for _, err := range errs { + fmt.Fprintf(os.Stderr, "mlr: file-close error: %v\n", err) + } + os.Exit(1) + } + } +} + +// makeUngroupedOutputFileName example: "split_53.csv" +func (tr *TransformerSplit) makeUngroupedOutputFileName(k int) string { + return fmt.Sprintf("%s_%d.%s", tr.outputFileNamePrefix, k, tr.outputFileNameSuffix) +} + +// makeGroupedOutputFileName example: "split_orange.csv" +func (tr *TransformerSplit) makeGroupedOutputFileName( + groupByFieldValues []*mlrval.Mlrval, +) string { + var buffer bytes.Buffer + buffer.WriteString(tr.outputFileNamePrefix) + for _, groupByFieldValue := range groupByFieldValues { + buffer.WriteString("_") + buffer.WriteString(url.QueryEscape(groupByFieldValue.String())) + } + buffer.WriteString(".") + buffer.WriteString(tr.outputFileNameSuffix) + return buffer.String() +} + +// makeGroupedIndexedOutputFileName example: "split_yellow_53.csv" +func (tr *TransformerSplit) makeGroupedIndexedOutputFileName( + groupByFieldValues []*mlrval.Mlrval, + index int, +) string { + // URL-escape the fields which come from data and which may have '/' + // etc within. Don't URL-escape the prefix since people may want to + // use prefixes like '/tmp/split' to write to the /tmp directory, etc. 
+ var buffer bytes.Buffer + buffer.WriteString(tr.outputFileNamePrefix) + for _, groupByFieldValue := range groupByFieldValues { + buffer.WriteString("_") + buffer.WriteString(url.QueryEscape(groupByFieldValue.String())) + } + buffer.WriteString(fmt.Sprintf("_%d", index)) + buffer.WriteString(".") + buffer.WriteString(tr.outputFileNameSuffix) + return buffer.String() +} diff --git a/test/input/example.csv b/test/input/example.csv new file mode 100644 index 000000000..bf79dd5f7 --- /dev/null +++ b/test/input/example.csv @@ -0,0 +1,11 @@ +color,shape,flag,k,index,quantity,rate +yellow,triangle,true,1,11,43.6498,9.8870 +red,square,true,2,15,79.2778,0.0130 +red,circle,true,3,16,13.8103,2.9010 +red,square,false,4,48,77.5542,7.4670 +purple,triangle,false,5,51,81.2290,8.5910 +red,square,false,6,64,77.1991,9.5310 +purple,triangle,false,7,65,80.1405,5.8240 +yellow,circle,true,8,73,63.9785,4.2370 +yellow,circle,true,9,87,63.5058,8.3350 +purple,square,false,10,91,72.3735,8.2430 diff --git a/todo.txt b/todo.txt index 92fda4e5f..a6e7a7439 100644 --- a/todo.txt +++ b/todo.txt @@ -26,6 +26,7 @@ FEATURES o format/unformat o strmatch o =~ +* separate examples from FAQs ---------------------------------------------------------------- k better print-interpolate with {} etc @@ -42,6 +43,7 @@ mlr split ... -n, -g -- ? ---------------------------------------------------------------- * new example entry, with ccump and pgr + o slwin --prune (or somesuch) to only emit averages over full windows -- ? 
* make a lag-by-n and lead-by-n ---------------------------------------------------------------- From 494671ba42536beeb6f5bfa8465939d7bd7afc8b Mon Sep 17 00:00:00 2001 From: John Kerl Date: Wed, 26 Jan 2022 23:07:53 -0500 Subject: [PATCH 2/3] regression-test cases --- test/cases/cli-help/0001/expout | 41 ++++++++++++++++ test/cases/verb-split/0001/cmd | 1 + test/cases/verb-split/0001/experr | 0 test/cases/verb-split/0001/expout | 0 test/cases/verb-split/0001/postcmp | 3 ++ test/cases/verb-split/0001/split_1.csv.expect | 6 +++ test/cases/verb-split/0001/split_2.csv.expect | 6 +++ test/cases/verb-split/0002/cmd | 1 + test/cases/verb-split/0002/experr | 0 test/cases/verb-split/0002/expout | 0 test/cases/verb-split/0002/postcmp | 6 +++ test/cases/verb-split/0002/split_1.csv.expect | 3 ++ test/cases/verb-split/0002/split_2.csv.expect | 3 ++ test/cases/verb-split/0002/split_3.csv.expect | 3 ++ test/cases/verb-split/0002/split_4.csv.expect | 3 ++ test/cases/verb-split/0002/split_5.csv.expect | 3 ++ test/cases/verb-split/0003/cmd | 1 + test/cases/verb-split/0003/experr | 0 test/cases/verb-split/0003/expout | 0 test/cases/verb-split/0003/postcmp | 3 ++ .../verb-split/0003/split_circle.csv.expect | 4 ++ .../verb-split/0003/split_square.csv.expect | 5 ++ .../verb-split/0003/split_triangle.csv.expect | 4 ++ test/cases/verb-split/0004/cmd | 1 + test/cases/verb-split/0004/experr | 0 test/cases/verb-split/0004/expout | 0 test/cases/verb-split/0004/postcmp | 7 +++ .../0004/split_purple_square.csv.expect | 2 + .../0004/split_purple_triangle.csv.expect | 3 ++ .../0004/split_red_circle.csv.expect | 2 + .../0004/split_red_square.csv.expect | 4 ++ .../0004/split_yellow_circle.csv.expect | 3 ++ .../0004/split_yellow_triangle.csv.expect | 2 + test/cases/verb-split/0005/cmd | 1 + test/cases/verb-split/0005/experr | 0 test/cases/verb-split/0005/expout | 0 test/cases/verb-split/0005/postcmp | 3 ++ test/cases/verb-split/0005/split_1.dat.expect | 6 +++ 
test/cases/verb-split/0005/split_2.dat.expect | 6 +++ test/cases/verb-split/0006/cmd | 1 + test/cases/verb-split/0006/experr | 0 test/cases/verb-split/0006/expout | 0 test/cases/verb-split/0006/postcmp | 3 ++ .../cases/verb-split/0006/split_1.json.expect | 47 +++++++++++++++++++ .../cases/verb-split/0006/split_2.json.expect | 47 +++++++++++++++++++ test/cases/verb-split/0007/cmd | 1 + test/cases/verb-split/0007/experr | 0 test/cases/verb-split/0007/expout | 11 +++++ test/cases/verb-split/0007/postcmp | 3 ++ test/cases/verb-split/0007/split_1.csv.expect | 6 +++ test/cases/verb-split/0007/split_2.csv.expect | 6 +++ 51 files changed, 261 insertions(+) create mode 100644 test/cases/verb-split/0001/cmd create mode 100644 test/cases/verb-split/0001/experr create mode 100644 test/cases/verb-split/0001/expout create mode 100644 test/cases/verb-split/0001/postcmp create mode 100644 test/cases/verb-split/0001/split_1.csv.expect create mode 100644 test/cases/verb-split/0001/split_2.csv.expect create mode 100644 test/cases/verb-split/0002/cmd create mode 100644 test/cases/verb-split/0002/experr create mode 100644 test/cases/verb-split/0002/expout create mode 100644 test/cases/verb-split/0002/postcmp create mode 100644 test/cases/verb-split/0002/split_1.csv.expect create mode 100644 test/cases/verb-split/0002/split_2.csv.expect create mode 100644 test/cases/verb-split/0002/split_3.csv.expect create mode 100644 test/cases/verb-split/0002/split_4.csv.expect create mode 100644 test/cases/verb-split/0002/split_5.csv.expect create mode 100644 test/cases/verb-split/0003/cmd create mode 100644 test/cases/verb-split/0003/experr create mode 100644 test/cases/verb-split/0003/expout create mode 100644 test/cases/verb-split/0003/postcmp create mode 100644 test/cases/verb-split/0003/split_circle.csv.expect create mode 100644 test/cases/verb-split/0003/split_square.csv.expect create mode 100644 test/cases/verb-split/0003/split_triangle.csv.expect create mode 100644 
test/cases/verb-split/0004/cmd create mode 100644 test/cases/verb-split/0004/experr create mode 100644 test/cases/verb-split/0004/expout create mode 100644 test/cases/verb-split/0004/postcmp create mode 100644 test/cases/verb-split/0004/split_purple_square.csv.expect create mode 100644 test/cases/verb-split/0004/split_purple_triangle.csv.expect create mode 100644 test/cases/verb-split/0004/split_red_circle.csv.expect create mode 100644 test/cases/verb-split/0004/split_red_square.csv.expect create mode 100644 test/cases/verb-split/0004/split_yellow_circle.csv.expect create mode 100644 test/cases/verb-split/0004/split_yellow_triangle.csv.expect create mode 100644 test/cases/verb-split/0005/cmd create mode 100644 test/cases/verb-split/0005/experr create mode 100644 test/cases/verb-split/0005/expout create mode 100644 test/cases/verb-split/0005/postcmp create mode 100644 test/cases/verb-split/0005/split_1.dat.expect create mode 100644 test/cases/verb-split/0005/split_2.dat.expect create mode 100644 test/cases/verb-split/0006/cmd create mode 100644 test/cases/verb-split/0006/experr create mode 100644 test/cases/verb-split/0006/expout create mode 100644 test/cases/verb-split/0006/postcmp create mode 100644 test/cases/verb-split/0006/split_1.json.expect create mode 100644 test/cases/verb-split/0006/split_2.json.expect create mode 100644 test/cases/verb-split/0007/cmd create mode 100644 test/cases/verb-split/0007/experr create mode 100644 test/cases/verb-split/0007/expout create mode 100644 test/cases/verb-split/0007/postcmp create mode 100644 test/cases/verb-split/0007/split_1.csv.expect create mode 100644 test/cases/verb-split/0007/split_2.csv.expect diff --git a/test/cases/cli-help/0001/expout b/test/cases/cli-help/0001/expout index 3da8c64e1..14fae72c6 100644 --- a/test/cases/cli-help/0001/expout +++ b/test/cases/cli-help/0001/expout @@ -929,6 +929,47 @@ Options: -r Recursively sort subobjects/submaps, e.g. for JSON input. -h|--help Show this message. 
+================================================================ +split +Usage: mlr split [options] {filename} +Options: +-n {n}: Cap file sizes at N records. +-m {m}: Produce M files, round-robining records among them. +-g {a,b,c}: Write separate files with records having distinct values for fields named a,b,c. +Exactly one of -m, -n, or -g must be supplied. +--prefix {p} Specify filename prefix; default "split". +--suffix {s} Specify filename suffix; default is from mlr output format, e.g. "csv". +-a Append to existing file(s), if any, rather than overwriting. +-v Send records along to downstream verbs as well as splitting to files. +-h|--help Show this message. +Any of the output-format command-line flags (see mlr -h). For example, using + mlr --icsv --from myfile.csv split --ojson -n 1000 +the input is CSV, but the output files are JSON. + +Examples: Suppose myfile.csv has 1,000,000 records. + +100 output files, 10,000 records each. First 10,000 records in split_1.csv, next in split_2.csv, etc. + mlr --csv --from myfile.csv split -n 10000 + +10 output files, 100,000 records each. Records 1,11,21,etc in split_1.csv, records 2,12,22, etc in split_2.csv, etc. + mlr --csv --from myfile.csv split -m 10 +Same, but with JSON output. + mlr --csv --from myfile.csv split -m 10 -o json + +Same but instead of split_1.csv, split_2.csv, etc. there are test_1.dat, test_2.dat, etc. + mlr --csv --from myfile.csv split -m 10 --prefix test --suffix dat +Same, but written to the /tmp/ directory. + mlr --csv --from myfile.csv split -m 10 --prefix /tmp/test --suffix dat + +If the shape field has values triangle and square, then there will be split_triangle.csv and split_square.csv. + mlr --csv --from myfile.csv split -g shape + +If the color field has values yellow and green, and the shape field has values triangle and square, +then there will be split_yellow_triangle.csv, split_yellow_square.csv, etc. 
+ mlr --csv --from myfile.csv split -g color,shape + +See also the "tee" DSL function which lets you do more ad-hoc customization. + ================================================================ stats1 Usage: mlr stats1 [options] diff --git a/test/cases/verb-split/0001/cmd b/test/cases/verb-split/0001/cmd new file mode 100644 index 000000000..8ef25a57d --- /dev/null +++ b/test/cases/verb-split/0001/cmd @@ -0,0 +1 @@ +mlr --csv split -m 2 --prefix ${CASEDIR}/split test/input/example.csv diff --git a/test/cases/verb-split/0001/experr b/test/cases/verb-split/0001/experr new file mode 100644 index 000000000..e69de29bb diff --git a/test/cases/verb-split/0001/expout b/test/cases/verb-split/0001/expout new file mode 100644 index 000000000..e69de29bb diff --git a/test/cases/verb-split/0001/postcmp b/test/cases/verb-split/0001/postcmp new file mode 100644 index 000000000..97b9056ae --- /dev/null +++ b/test/cases/verb-split/0001/postcmp @@ -0,0 +1,3 @@ +${CASEDIR}/split_1.csv.expect ${CASEDIR}/split_1.csv +${CASEDIR}/split_2.csv.expect ${CASEDIR}/split_2.csv + diff --git a/test/cases/verb-split/0001/split_1.csv.expect b/test/cases/verb-split/0001/split_1.csv.expect new file mode 100644 index 000000000..f228ed651 --- /dev/null +++ b/test/cases/verb-split/0001/split_1.csv.expect @@ -0,0 +1,6 @@ +color,shape,flag,k,index,quantity,rate +yellow,triangle,true,1,11,43.6498,9.8870 +red,circle,true,3,16,13.8103,2.9010 +purple,triangle,false,5,51,81.2290,8.5910 +purple,triangle,false,7,65,80.1405,5.8240 +yellow,circle,true,9,87,63.5058,8.3350 diff --git a/test/cases/verb-split/0001/split_2.csv.expect b/test/cases/verb-split/0001/split_2.csv.expect new file mode 100644 index 000000000..cf8dd0bd4 --- /dev/null +++ b/test/cases/verb-split/0001/split_2.csv.expect @@ -0,0 +1,6 @@ +color,shape,flag,k,index,quantity,rate +red,square,true,2,15,79.2778,0.0130 +red,square,false,4,48,77.5542,7.4670 +red,square,false,6,64,77.1991,9.5310 +yellow,circle,true,8,73,63.9785,4.2370 
+purple,square,false,10,91,72.3735,8.2430 diff --git a/test/cases/verb-split/0002/cmd b/test/cases/verb-split/0002/cmd new file mode 100644 index 000000000..81c9af7dd --- /dev/null +++ b/test/cases/verb-split/0002/cmd @@ -0,0 +1 @@ +mlr --csv split -n 2 --prefix ${CASEDIR}/split test/input/example.csv diff --git a/test/cases/verb-split/0002/experr b/test/cases/verb-split/0002/experr new file mode 100644 index 000000000..e69de29bb diff --git a/test/cases/verb-split/0002/expout b/test/cases/verb-split/0002/expout new file mode 100644 index 000000000..e69de29bb diff --git a/test/cases/verb-split/0002/postcmp b/test/cases/verb-split/0002/postcmp new file mode 100644 index 000000000..b0cb3514e --- /dev/null +++ b/test/cases/verb-split/0002/postcmp @@ -0,0 +1,6 @@ +${CASEDIR}/split_1.csv.expect ${CASEDIR}/split_1.csv +${CASEDIR}/split_2.csv.expect ${CASEDIR}/split_2.csv +${CASEDIR}/split_3.csv.expect ${CASEDIR}/split_3.csv +${CASEDIR}/split_4.csv.expect ${CASEDIR}/split_4.csv +${CASEDIR}/split_5.csv.expect ${CASEDIR}/split_5.csv + diff --git a/test/cases/verb-split/0002/split_1.csv.expect b/test/cases/verb-split/0002/split_1.csv.expect new file mode 100644 index 000000000..6203cbca0 --- /dev/null +++ b/test/cases/verb-split/0002/split_1.csv.expect @@ -0,0 +1,3 @@ +color,shape,flag,k,index,quantity,rate +yellow,triangle,true,1,11,43.6498,9.8870 +red,square,true,2,15,79.2778,0.0130 diff --git a/test/cases/verb-split/0002/split_2.csv.expect b/test/cases/verb-split/0002/split_2.csv.expect new file mode 100644 index 000000000..9ad680950 --- /dev/null +++ b/test/cases/verb-split/0002/split_2.csv.expect @@ -0,0 +1,3 @@ +color,shape,flag,k,index,quantity,rate +red,circle,true,3,16,13.8103,2.9010 +red,square,false,4,48,77.5542,7.4670 diff --git a/test/cases/verb-split/0002/split_3.csv.expect b/test/cases/verb-split/0002/split_3.csv.expect new file mode 100644 index 000000000..bc2e5ba37 --- /dev/null +++ b/test/cases/verb-split/0002/split_3.csv.expect @@ -0,0 +1,3 @@ 
+color,shape,flag,k,index,quantity,rate +purple,triangle,false,5,51,81.2290,8.5910 +red,square,false,6,64,77.1991,9.5310 diff --git a/test/cases/verb-split/0002/split_4.csv.expect b/test/cases/verb-split/0002/split_4.csv.expect new file mode 100644 index 000000000..0be4a6258 --- /dev/null +++ b/test/cases/verb-split/0002/split_4.csv.expect @@ -0,0 +1,3 @@ +color,shape,flag,k,index,quantity,rate +purple,triangle,false,7,65,80.1405,5.8240 +yellow,circle,true,8,73,63.9785,4.2370 diff --git a/test/cases/verb-split/0002/split_5.csv.expect b/test/cases/verb-split/0002/split_5.csv.expect new file mode 100644 index 000000000..577f20e31 --- /dev/null +++ b/test/cases/verb-split/0002/split_5.csv.expect @@ -0,0 +1,3 @@ +color,shape,flag,k,index,quantity,rate +yellow,circle,true,9,87,63.5058,8.3350 +purple,square,false,10,91,72.3735,8.2430 diff --git a/test/cases/verb-split/0003/cmd b/test/cases/verb-split/0003/cmd new file mode 100644 index 000000000..32b90536e --- /dev/null +++ b/test/cases/verb-split/0003/cmd @@ -0,0 +1 @@ +mlr --csv split -g shape --prefix ${CASEDIR}/split test/input/example.csv diff --git a/test/cases/verb-split/0003/experr b/test/cases/verb-split/0003/experr new file mode 100644 index 000000000..e69de29bb diff --git a/test/cases/verb-split/0003/expout b/test/cases/verb-split/0003/expout new file mode 100644 index 000000000..e69de29bb diff --git a/test/cases/verb-split/0003/postcmp b/test/cases/verb-split/0003/postcmp new file mode 100644 index 000000000..d00abe7f1 --- /dev/null +++ b/test/cases/verb-split/0003/postcmp @@ -0,0 +1,3 @@ +${CASEDIR}/split_square.csv.expect ${CASEDIR}/split_square.csv +${CASEDIR}/split_circle.csv.expect ${CASEDIR}/split_circle.csv +${CASEDIR}/split_triangle.csv.expect ${CASEDIR}/split_triangle.csv diff --git a/test/cases/verb-split/0003/split_circle.csv.expect b/test/cases/verb-split/0003/split_circle.csv.expect new file mode 100644 index 000000000..6ea6a0a93 --- /dev/null +++ 
b/test/cases/verb-split/0003/split_circle.csv.expect @@ -0,0 +1,4 @@ +color,shape,flag,k,index,quantity,rate +red,circle,true,3,16,13.8103,2.9010 +yellow,circle,true,8,73,63.9785,4.2370 +yellow,circle,true,9,87,63.5058,8.3350 diff --git a/test/cases/verb-split/0003/split_square.csv.expect b/test/cases/verb-split/0003/split_square.csv.expect new file mode 100644 index 000000000..122663bfe --- /dev/null +++ b/test/cases/verb-split/0003/split_square.csv.expect @@ -0,0 +1,5 @@ +color,shape,flag,k,index,quantity,rate +red,square,true,2,15,79.2778,0.0130 +red,square,false,4,48,77.5542,7.4670 +red,square,false,6,64,77.1991,9.5310 +purple,square,false,10,91,72.3735,8.2430 diff --git a/test/cases/verb-split/0003/split_triangle.csv.expect b/test/cases/verb-split/0003/split_triangle.csv.expect new file mode 100644 index 000000000..70bce77e6 --- /dev/null +++ b/test/cases/verb-split/0003/split_triangle.csv.expect @@ -0,0 +1,4 @@ +color,shape,flag,k,index,quantity,rate +yellow,triangle,true,1,11,43.6498,9.8870 +purple,triangle,false,5,51,81.2290,8.5910 +purple,triangle,false,7,65,80.1405,5.8240 diff --git a/test/cases/verb-split/0004/cmd b/test/cases/verb-split/0004/cmd new file mode 100644 index 000000000..938e16043 --- /dev/null +++ b/test/cases/verb-split/0004/cmd @@ -0,0 +1 @@ +mlr --csv split -g color,shape --prefix ${CASEDIR}/split test/input/example.csv diff --git a/test/cases/verb-split/0004/experr b/test/cases/verb-split/0004/experr new file mode 100644 index 000000000..e69de29bb diff --git a/test/cases/verb-split/0004/expout b/test/cases/verb-split/0004/expout new file mode 100644 index 000000000..e69de29bb diff --git a/test/cases/verb-split/0004/postcmp b/test/cases/verb-split/0004/postcmp new file mode 100644 index 000000000..25cc0362d --- /dev/null +++ b/test/cases/verb-split/0004/postcmp @@ -0,0 +1,7 @@ +${CASEDIR}/split_purple_square.csv.expect ${CASEDIR}/split_purple_square.csv +${CASEDIR}/split_purple_triangle.csv.expect ${CASEDIR}/split_purple_triangle.csv 
+${CASEDIR}/split_red_circle.csv.expect ${CASEDIR}/split_red_circle.csv +${CASEDIR}/split_red_square.csv.expect ${CASEDIR}/split_red_square.csv +${CASEDIR}/split_yellow_circle.csv.expect ${CASEDIR}/split_yellow_circle.csv +${CASEDIR}/split_yellow_triangle.csv.expect ${CASEDIR}/split_yellow_triangle.csv + diff --git a/test/cases/verb-split/0004/split_purple_square.csv.expect b/test/cases/verb-split/0004/split_purple_square.csv.expect new file mode 100644 index 000000000..019f93431 --- /dev/null +++ b/test/cases/verb-split/0004/split_purple_square.csv.expect @@ -0,0 +1,2 @@ +color,shape,flag,k,index,quantity,rate +purple,square,false,10,91,72.3735,8.2430 diff --git a/test/cases/verb-split/0004/split_purple_triangle.csv.expect b/test/cases/verb-split/0004/split_purple_triangle.csv.expect new file mode 100644 index 000000000..7201c5aba --- /dev/null +++ b/test/cases/verb-split/0004/split_purple_triangle.csv.expect @@ -0,0 +1,3 @@ +color,shape,flag,k,index,quantity,rate +purple,triangle,false,5,51,81.2290,8.5910 +purple,triangle,false,7,65,80.1405,5.8240 diff --git a/test/cases/verb-split/0004/split_red_circle.csv.expect b/test/cases/verb-split/0004/split_red_circle.csv.expect new file mode 100644 index 000000000..79d82eb67 --- /dev/null +++ b/test/cases/verb-split/0004/split_red_circle.csv.expect @@ -0,0 +1,2 @@ +color,shape,flag,k,index,quantity,rate +red,circle,true,3,16,13.8103,2.9010 diff --git a/test/cases/verb-split/0004/split_red_square.csv.expect b/test/cases/verb-split/0004/split_red_square.csv.expect new file mode 100644 index 000000000..439afffa0 --- /dev/null +++ b/test/cases/verb-split/0004/split_red_square.csv.expect @@ -0,0 +1,4 @@ +color,shape,flag,k,index,quantity,rate +red,square,true,2,15,79.2778,0.0130 +red,square,false,4,48,77.5542,7.4670 +red,square,false,6,64,77.1991,9.5310 diff --git a/test/cases/verb-split/0004/split_yellow_circle.csv.expect b/test/cases/verb-split/0004/split_yellow_circle.csv.expect new file mode 100644 index 
000000000..cbeb34546 --- /dev/null +++ b/test/cases/verb-split/0004/split_yellow_circle.csv.expect @@ -0,0 +1,3 @@ +color,shape,flag,k,index,quantity,rate +yellow,circle,true,8,73,63.9785,4.2370 +yellow,circle,true,9,87,63.5058,8.3350 diff --git a/test/cases/verb-split/0004/split_yellow_triangle.csv.expect b/test/cases/verb-split/0004/split_yellow_triangle.csv.expect new file mode 100644 index 000000000..cc98d358e --- /dev/null +++ b/test/cases/verb-split/0004/split_yellow_triangle.csv.expect @@ -0,0 +1,2 @@ +color,shape,flag,k,index,quantity,rate +yellow,triangle,true,1,11,43.6498,9.8870 diff --git a/test/cases/verb-split/0005/cmd b/test/cases/verb-split/0005/cmd new file mode 100644 index 000000000..77ddf9077 --- /dev/null +++ b/test/cases/verb-split/0005/cmd @@ -0,0 +1 @@ +mlr --csv split -m 2 --prefix ${CASEDIR}/split --suffix dat test/input/example.csv diff --git a/test/cases/verb-split/0005/experr b/test/cases/verb-split/0005/experr new file mode 100644 index 000000000..e69de29bb diff --git a/test/cases/verb-split/0005/expout b/test/cases/verb-split/0005/expout new file mode 100644 index 000000000..e69de29bb diff --git a/test/cases/verb-split/0005/postcmp b/test/cases/verb-split/0005/postcmp new file mode 100644 index 000000000..a40bc1194 --- /dev/null +++ b/test/cases/verb-split/0005/postcmp @@ -0,0 +1,3 @@ +${CASEDIR}/split_1.dat.expect ${CASEDIR}/split_1.dat +${CASEDIR}/split_2.dat.expect ${CASEDIR}/split_2.dat + diff --git a/test/cases/verb-split/0005/split_1.dat.expect b/test/cases/verb-split/0005/split_1.dat.expect new file mode 100644 index 000000000..f228ed651 --- /dev/null +++ b/test/cases/verb-split/0005/split_1.dat.expect @@ -0,0 +1,6 @@ +color,shape,flag,k,index,quantity,rate +yellow,triangle,true,1,11,43.6498,9.8870 +red,circle,true,3,16,13.8103,2.9010 +purple,triangle,false,5,51,81.2290,8.5910 +purple,triangle,false,7,65,80.1405,5.8240 +yellow,circle,true,9,87,63.5058,8.3350 diff --git a/test/cases/verb-split/0005/split_2.dat.expect 
b/test/cases/verb-split/0005/split_2.dat.expect new file mode 100644 index 000000000..cf8dd0bd4 --- /dev/null +++ b/test/cases/verb-split/0005/split_2.dat.expect @@ -0,0 +1,6 @@ +color,shape,flag,k,index,quantity,rate +red,square,true,2,15,79.2778,0.0130 +red,square,false,4,48,77.5542,7.4670 +red,square,false,6,64,77.1991,9.5310 +yellow,circle,true,8,73,63.9785,4.2370 +purple,square,false,10,91,72.3735,8.2430 diff --git a/test/cases/verb-split/0006/cmd b/test/cases/verb-split/0006/cmd new file mode 100644 index 000000000..a93d29864 --- /dev/null +++ b/test/cases/verb-split/0006/cmd @@ -0,0 +1 @@ +mlr --csv split -m 2 --prefix ${CASEDIR}/split --ojson test/input/example.csv diff --git a/test/cases/verb-split/0006/experr b/test/cases/verb-split/0006/experr new file mode 100644 index 000000000..e69de29bb diff --git a/test/cases/verb-split/0006/expout b/test/cases/verb-split/0006/expout new file mode 100644 index 000000000..e69de29bb diff --git a/test/cases/verb-split/0006/postcmp b/test/cases/verb-split/0006/postcmp new file mode 100644 index 000000000..e63a309cf --- /dev/null +++ b/test/cases/verb-split/0006/postcmp @@ -0,0 +1,3 @@ +${CASEDIR}/split_1.json.expect ${CASEDIR}/split_1.json +${CASEDIR}/split_2.json.expect ${CASEDIR}/split_2.json + diff --git a/test/cases/verb-split/0006/split_1.json.expect b/test/cases/verb-split/0006/split_1.json.expect new file mode 100644 index 000000000..00534646c --- /dev/null +++ b/test/cases/verb-split/0006/split_1.json.expect @@ -0,0 +1,47 @@ +[ +{ + "color": "yellow", + "shape": "triangle", + "flag": "true", + "k": 1, + "index": 11, + "quantity": 43.6498, + "rate": 9.8870 +}, +{ + "color": "red", + "shape": "circle", + "flag": "true", + "k": 3, + "index": 16, + "quantity": 13.8103, + "rate": 2.9010 +}, +{ + "color": "purple", + "shape": "triangle", + "flag": "false", + "k": 5, + "index": 51, + "quantity": 81.2290, + "rate": 8.5910 +}, +{ + "color": "purple", + "shape": "triangle", + "flag": "false", + "k": 7, + "index": 65, + 
"quantity": 80.1405, + "rate": 5.8240 +}, +{ + "color": "yellow", + "shape": "circle", + "flag": "true", + "k": 9, + "index": 87, + "quantity": 63.5058, + "rate": 8.3350 +} +] diff --git a/test/cases/verb-split/0006/split_2.json.expect b/test/cases/verb-split/0006/split_2.json.expect new file mode 100644 index 000000000..d2d370f1e --- /dev/null +++ b/test/cases/verb-split/0006/split_2.json.expect @@ -0,0 +1,47 @@ +[ +{ + "color": "red", + "shape": "square", + "flag": "true", + "k": 2, + "index": 15, + "quantity": 79.2778, + "rate": 0.0130 +}, +{ + "color": "red", + "shape": "square", + "flag": "false", + "k": 4, + "index": 48, + "quantity": 77.5542, + "rate": 7.4670 +}, +{ + "color": "red", + "shape": "square", + "flag": "false", + "k": 6, + "index": 64, + "quantity": 77.1991, + "rate": 9.5310 +}, +{ + "color": "yellow", + "shape": "circle", + "flag": "true", + "k": 8, + "index": 73, + "quantity": 63.9785, + "rate": 4.2370 +}, +{ + "color": "purple", + "shape": "square", + "flag": "false", + "k": 10, + "index": 91, + "quantity": 72.3735, + "rate": 8.2430 +} +] diff --git a/test/cases/verb-split/0007/cmd b/test/cases/verb-split/0007/cmd new file mode 100644 index 000000000..44f51882f --- /dev/null +++ b/test/cases/verb-split/0007/cmd @@ -0,0 +1 @@ +mlr --csv split -m 2 -v --prefix ${CASEDIR}/split test/input/example.csv diff --git a/test/cases/verb-split/0007/experr b/test/cases/verb-split/0007/experr new file mode 100644 index 000000000..e69de29bb diff --git a/test/cases/verb-split/0007/expout b/test/cases/verb-split/0007/expout new file mode 100644 index 000000000..bf79dd5f7 --- /dev/null +++ b/test/cases/verb-split/0007/expout @@ -0,0 +1,11 @@ +color,shape,flag,k,index,quantity,rate +yellow,triangle,true,1,11,43.6498,9.8870 +red,square,true,2,15,79.2778,0.0130 +red,circle,true,3,16,13.8103,2.9010 +red,square,false,4,48,77.5542,7.4670 +purple,triangle,false,5,51,81.2290,8.5910 +red,square,false,6,64,77.1991,9.5310 +purple,triangle,false,7,65,80.1405,5.8240 
+yellow,circle,true,8,73,63.9785,4.2370 +yellow,circle,true,9,87,63.5058,8.3350 +purple,square,false,10,91,72.3735,8.2430 diff --git a/test/cases/verb-split/0007/postcmp b/test/cases/verb-split/0007/postcmp new file mode 100644 index 000000000..97b9056ae --- /dev/null +++ b/test/cases/verb-split/0007/postcmp @@ -0,0 +1,3 @@ +${CASEDIR}/split_1.csv.expect ${CASEDIR}/split_1.csv +${CASEDIR}/split_2.csv.expect ${CASEDIR}/split_2.csv + diff --git a/test/cases/verb-split/0007/split_1.csv.expect b/test/cases/verb-split/0007/split_1.csv.expect new file mode 100644 index 000000000..f228ed651 --- /dev/null +++ b/test/cases/verb-split/0007/split_1.csv.expect @@ -0,0 +1,6 @@ +color,shape,flag,k,index,quantity,rate +yellow,triangle,true,1,11,43.6498,9.8870 +red,circle,true,3,16,13.8103,2.9010 +purple,triangle,false,5,51,81.2290,8.5910 +purple,triangle,false,7,65,80.1405,5.8240 +yellow,circle,true,9,87,63.5058,8.3350 diff --git a/test/cases/verb-split/0007/split_2.csv.expect b/test/cases/verb-split/0007/split_2.csv.expect new file mode 100644 index 000000000..cf8dd0bd4 --- /dev/null +++ b/test/cases/verb-split/0007/split_2.csv.expect @@ -0,0 +1,6 @@ +color,shape,flag,k,index,quantity,rate +red,square,true,2,15,79.2778,0.0130 +red,square,false,4,48,77.5542,7.4670 +red,square,false,6,64,77.1991,9.5310 +yellow,circle,true,8,73,63.9785,4.2370 +purple,square,false,10,91,72.3735,8.2430 From c8b71779d995a24c00465072ac02cd896273d5d9 Mon Sep 17 00:00:00 2001 From: John Kerl Date: Wed, 26 Jan 2022 23:11:17 -0500 Subject: [PATCH 3/3] doc-build artifacts --- docs/src/manpage.md | 46 +++++++++++++++++++++++++++-- docs/src/manpage.txt | 46 +++++++++++++++++++++++++++-- docs/src/reference-verbs.md | 46 +++++++++++++++++++++++++++++ docs/src/reference-verbs.md.in | 6 ++++ man/manpage.txt | 46 +++++++++++++++++++++++++++-- man/mlr.1 | 54 +++++++++++++++++++++++++++++++--- todo.txt | 11 +++---- 7 files changed, 235 insertions(+), 20 deletions(-) diff --git a/docs/src/manpage.md 
b/docs/src/manpage.md index 77331400b..9492fe5b1 100644 --- a/docs/src/manpage.md +++ b/docs/src/manpage.md @@ -195,8 +195,8 @@ VERB LIST json-stringify join label least-frequent merge-fields most-frequent nest nothing put regularize remove-empty-columns rename reorder repeat reshape sample sec2gmtdate sec2gmt seqgen shuffle skip-trivial-records sort - sort-within-records stats1 stats2 step tac tail tee template top unflatten - uniq unsparsify + sort-within-records split stats1 stats2 step tac tail tee template top + unflatten uniq unsparsify FUNCTION LIST abs acos acosh any append apply arrayify asin asinh asserting_absent @@ -1737,6 +1737,46 @@ VERBS -r Recursively sort subobjects/submaps, e.g. for JSON input. -h|--help Show this message. + split + Usage: mlr split [options] {filename} + Options: + -n {n}: Cap file sizes at N records. + -m {m}: Produce M files, round-robining records among them. + -g {a,b,c}: Write separate files with records having distinct values for fields named a,b,c. + Exactly one of -m, -n, or -g must be supplied. + --prefix {p} Specify filename prefix; default "split". + --suffix {s} Specify filename suffix; default is from mlr output format, e.g. "csv". + -a Append to existing file(s), if any, rather than overwriting. + -v Send records along to downstream verbs as well as splitting to files. + -h|--help Show this message. + Any of the output-format command-line flags (see mlr -h). For example, using + mlr --icsv --from myfile.csv split --ojson -n 1000 + the input is CSV, but the output files are JSON. + + Examples: Suppose myfile.csv has 1,000,000 records. + + 100 output files, 10,000 records each. First 10,000 records in split_1.csv, next in split_2.csv, etc. + mlr --csv --from myfile.csv split -n 10000 + + 10 output files, 100,000 records each. Records 1,11,21,etc in split_1.csv, records 2,12,22, etc in split_2.csv, etc. + mlr --csv --from myfile.csv split -m 10 + Same, but with JSON output. 
+ mlr --csv --from myfile.csv split -m 10 -o json + + Same but instead of split_1.csv, split_2.csv, etc. there are test_1.dat, test_2.dat, etc. + mlr --csv --from myfile.csv split -m 10 --prefix test --suffix dat + Same, but written to the /tmp/ directory. + mlr --csv --from myfile.csv split -m 10 --prefix /tmp/test --suffix dat + + If the shape field has values triangle and square, then there will be split_triangle.csv and split_square.csv. + mlr --csv --from myfile.csv split -g shape + + If the color field has values yellow and green, and the shape field has values triangle and square, + then there will be split_yellow_triangle.csv, split_yellow_square.csv, etc. + mlr --csv --from myfile.csv split -g color,shape + + See also the "tee" DSL function which lets you do more ad-hoc customization. + stats1 Usage: mlr stats1 [options] Computes univariate statistics for one or more given fields, accumulated across @@ -3091,5 +3131,5 @@ SEE ALSO - 2022-01-25 MILLER(1) + 2022-01-27 MILLER(1) diff --git a/docs/src/manpage.txt b/docs/src/manpage.txt index 8aa7753f3..bdba32306 100644 --- a/docs/src/manpage.txt +++ b/docs/src/manpage.txt @@ -174,8 +174,8 @@ VERB LIST json-stringify join label least-frequent merge-fields most-frequent nest nothing put regularize remove-empty-columns rename reorder repeat reshape sample sec2gmtdate sec2gmt seqgen shuffle skip-trivial-records sort - sort-within-records stats1 stats2 step tac tail tee template top unflatten - uniq unsparsify + sort-within-records split stats1 stats2 step tac tail tee template top + unflatten uniq unsparsify FUNCTION LIST abs acos acosh any append apply arrayify asin asinh asserting_absent @@ -1716,6 +1716,46 @@ VERBS -r Recursively sort subobjects/submaps, e.g. for JSON input. -h|--help Show this message. + split + Usage: mlr split [options] {filename} + Options: + -n {n}: Cap file sizes at N records. + -m {m}: Produce M files, round-robining records among them. 
+ -g {a,b,c}: Write separate files with records having distinct values for fields named a,b,c. + Exactly one of -m, -n, or -g must be supplied. + --prefix {p} Specify filename prefix; default "split". + --suffix {s} Specify filename suffix; default is from mlr output format, e.g. "csv". + -a Append to existing file(s), if any, rather than overwriting. + -v Send records along to downstream verbs as well as splitting to files. + -h|--help Show this message. + Any of the output-format command-line flags (see mlr -h). For example, using + mlr --icsv --from myfile.csv split --ojson -n 1000 + the input is CSV, but the output files are JSON. + + Examples: Suppose myfile.csv has 1,000,000 records. + + 100 output files, 10,000 records each. First 10,000 records in split_1.csv, next in split_2.csv, etc. + mlr --csv --from myfile.csv split -n 10000 + + 10 output files, 100,000 records each. Records 1,11,21,etc in split_1.csv, records 2,12,22, etc in split_2.csv, etc. + mlr --csv --from myfile.csv split -m 10 + Same, but with JSON output. + mlr --csv --from myfile.csv split -m 10 -o json + + Same but instead of split_1.csv, split_2.csv, etc. there are test_1.dat, test_2.dat, etc. + mlr --csv --from myfile.csv split -m 10 --prefix test --suffix dat + Same, but written to the /tmp/ directory. + mlr --csv --from myfile.csv split -m 10 --prefix /tmp/test --suffix dat + + If the shape field has values triangle and square, then there will be split_triangle.csv and split_square.csv. + mlr --csv --from myfile.csv split -g shape + + If the color field has values yellow and green, and the shape field has values triangle and square, + then there will be split_yellow_triangle.csv, split_yellow_square.csv, etc. + mlr --csv --from myfile.csv split -g color,shape + + See also the "tee" DSL function which lets you do more ad-hoc customization. 
+ stats1 Usage: mlr stats1 [options] Computes univariate statistics for one or more given fields, accumulated across @@ -3070,4 +3110,4 @@ SEE ALSO - 2022-01-25 MILLER(1) + 2022-01-27 MILLER(1) diff --git a/docs/src/reference-verbs.md b/docs/src/reference-verbs.md index bd74fa02f..991a14748 100644 --- a/docs/src/reference-verbs.md +++ b/docs/src/reference-verbs.md @@ -2978,6 +2978,52 @@ a b c 9 8 7 +## split + +
+mlr split --help
+
+
+Usage: mlr split [options] {filename}
+Options:
+-n {n}:      Cap file sizes at N records.
+-m {m}:      Produce M files, round-robining records among them.
+-g {a,b,c}:  Write separate files with records having distinct values for fields named a,b,c.
+Exactly one of -m, -n, or -g must be supplied.
+--prefix {p} Specify filename prefix; default "split".
+--suffix {s} Specify filename suffix; default is from mlr output format, e.g. "csv".
+-a           Append to existing file(s), if any, rather than overwriting.
+-v           Send records along to downstream verbs as well as splitting to files.
+-h|--help    Show this message.
+Any of the output-format command-line flags (see mlr -h) may also be given. For example, using
+  mlr --icsv --from myfile.csv split --ojson -n 1000
+the input is CSV, but the output files are JSON.
+
+Examples: Suppose myfile.csv has 1,000,000 records.
+
+100 output files, 10,000 records each. First 10,000 records in split_1.csv, next in split_2.csv, etc.
+  mlr --csv --from myfile.csv split -n 10000
+
+10 output files, 100,000 records each. Records 1, 11, 21, etc. in split_1.csv; records 2, 12, 22, etc. in split_2.csv; and so on.
+  mlr --csv --from myfile.csv split -m 10
+Same, but with JSON output.
+  mlr --csv --from myfile.csv split -m 10 -o json
+
+Same, but instead of split_1.csv, split_2.csv, etc. there are test_1.dat, test_2.dat, etc.
+  mlr --csv --from myfile.csv split -m 10 --prefix test --suffix dat
+Same, but written to the /tmp/ directory.
+  mlr --csv --from myfile.csv split -m 10 --prefix /tmp/test --suffix dat
+
+If the shape field has values triangle and square, then there will be split_triangle.csv and split_square.csv.
+  mlr --csv --from myfile.csv split -g shape
+
+If the color field has values yellow and green, and the shape field has values triangle and square,
+then there will be split_yellow_triangle.csv, split_yellow_square.csv, etc.
+  mlr --csv --from myfile.csv split -g color,shape
+
+See also the "tee" DSL function which lets you do more ad-hoc customization.
+
+ ## stats1
diff --git a/docs/src/reference-verbs.md.in b/docs/src/reference-verbs.md.in
index 0addd5d57..2c371a554 100644
--- a/docs/src/reference-verbs.md.in
+++ b/docs/src/reference-verbs.md.in
@@ -936,6 +936,12 @@ GENMD-RUN-COMMAND
 mlr --ijson --opprint sort-within-records data/sort-within-records.json
 GENMD-EOF
 
+## split
+
+GENMD-RUN-COMMAND
+mlr split --help
+GENMD-EOF
+
 ## stats1
 
 GENMD-RUN-COMMAND
diff --git a/man/manpage.txt b/man/manpage.txt
index 8aa7753f3..bdba32306 100644
--- a/man/manpage.txt
+++ b/man/manpage.txt
@@ -174,8 +174,8 @@ VERB LIST
        json-stringify join label least-frequent merge-fields most-frequent nest
        nothing put regularize remove-empty-columns rename reorder repeat reshape
        sample sec2gmtdate sec2gmt seqgen shuffle skip-trivial-records sort
-       sort-within-records stats1 stats2 step tac tail tee template top unflatten
-       uniq unsparsify
+       sort-within-records split stats1 stats2 step tac tail tee template top
+       unflatten uniq unsparsify
 
 FUNCTION LIST
        abs acos acosh any append apply arrayify asin asinh asserting_absent
@@ -1716,6 +1716,46 @@ VERBS
        -r        Recursively sort subobjects/submaps, e.g. for JSON input.
        -h|--help Show this message.
 
+   split
+       Usage: mlr split [options] {filename}
+       Options:
+       -n {n}:      Cap file sizes at N records.
+       -m {m}:      Produce M files, round-robining records among them.
+       -g {a,b,c}:  Write separate files with records having distinct values for fields named a,b,c.
+       Exactly one of -m, -n, or -g must be supplied.
+       --prefix {p} Specify filename prefix; default "split".
+       --suffix {s} Specify filename suffix; default is from mlr output format, e.g. "csv".
+       -a           Append to existing file(s), if any, rather than overwriting.
+       -v           Send records along to downstream verbs as well as splitting to files.
+       -h|--help    Show this message.
+       Any of the output-format command-line flags (see mlr -h) may also be given. For example, using
+         mlr --icsv --from myfile.csv split --ojson -n 1000
+       the input is CSV, but the output files are JSON.
+
+       Examples: Suppose myfile.csv has 1,000,000 records.
+
+       100 output files, 10,000 records each. First 10,000 records in split_1.csv, next in split_2.csv, etc.
+         mlr --csv --from myfile.csv split -n 10000
+
+       10 output files, 100,000 records each. Records 1, 11, 21, etc. in split_1.csv; records 2, 12, 22, etc. in split_2.csv; and so on.
+         mlr --csv --from myfile.csv split -m 10
+       Same, but with JSON output.
+         mlr --csv --from myfile.csv split -m 10 -o json
+
+       Same, but instead of split_1.csv, split_2.csv, etc. there are test_1.dat, test_2.dat, etc.
+         mlr --csv --from myfile.csv split -m 10 --prefix test --suffix dat
+       Same, but written to the /tmp/ directory.
+         mlr --csv --from myfile.csv split -m 10 --prefix /tmp/test --suffix dat
+
+       If the shape field has values triangle and square, then there will be split_triangle.csv and split_square.csv.
+         mlr --csv --from myfile.csv split -g shape
+
+       If the color field has values yellow and green, and the shape field has values triangle and square,
+       then there will be split_yellow_triangle.csv, split_yellow_square.csv, etc.
+         mlr --csv --from myfile.csv split -g color,shape
+
+       See also the "tee" DSL function which lets you do more ad-hoc customization.
+
    stats1
        Usage: mlr stats1 [options]
        Computes univariate statistics for one or more given fields, accumulated across
@@ -3070,4 +3110,4 @@ SEE ALSO
 
 
 
-                                  2022-01-25                         MILLER(1)
+                                  2022-01-27                         MILLER(1)
diff --git a/man/mlr.1 b/man/mlr.1
index 6eb74a052..57011f3e4 100644
--- a/man/mlr.1
+++ b/man/mlr.1
@@ -2,12 +2,12 @@
 .\"     Title: mlr
 .\"    Author: [see the "AUTHOR" section]
 .\" Generator: ./mkman.rb
-.\"      Date: 2022-01-25
+.\"      Date: 2022-01-27
 .\"    Manual: \ \&
 .\"    Source: \ \&
 .\"  Language: English
 .\"
-.TH "MILLER" "1" "2022-01-25" "\ \&" "\ \&"
+.TH "MILLER" "1" "2022-01-27" "\ \&" "\ \&"
 .\" -----------------------------------------------------------------
 .\" * Portability definitions
 .\" ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@@ -215,8 +215,8 @@ fraction gap grep group-by group-like having-fields head histogram json-parse
 json-stringify join label least-frequent merge-fields most-frequent nest
 nothing put regularize remove-empty-columns rename reorder repeat reshape
 sample sec2gmtdate sec2gmt seqgen shuffle skip-trivial-records sort
-sort-within-records stats1 stats2 step tac tail tee template top unflatten
-uniq unsparsify
+sort-within-records split stats1 stats2 step tac tail tee template top
+unflatten uniq unsparsify
 .fi
 .if n \{\
 .RE
@@ -2169,6 +2169,52 @@ Options:
 .fi
 .if n \{\
 .RE
+.SS "split"
+.if n \{\
+.RS 0
+.\}
+.nf
+Usage: mlr split [options] {filename}
+Options:
+-n {n}:      Cap file sizes at N records.
+-m {m}:      Produce M files, round-robining records among them.
+-g {a,b,c}:  Write separate files with records having distinct values for fields named a,b,c.
+Exactly one of -m, -n, or -g must be supplied.
+--prefix {p} Specify filename prefix; default "split".
+--suffix {s} Specify filename suffix; default is from mlr output format, e.g. "csv".
+-a           Append to existing file(s), if any, rather than overwriting.
+-v           Send records along to downstream verbs as well as splitting to files.
+-h|--help    Show this message.
+Any of the output-format command-line flags (see mlr -h) may also be given. For example, using
+  mlr --icsv --from myfile.csv split --ojson -n 1000
+the input is CSV, but the output files are JSON.
+
+Examples: Suppose myfile.csv has 1,000,000 records.
+
+100 output files, 10,000 records each. First 10,000 records in split_1.csv, next in split_2.csv, etc.
+  mlr --csv --from myfile.csv split -n 10000
+
+10 output files, 100,000 records each. Records 1, 11, 21, etc. in split_1.csv; records 2, 12, 22, etc. in split_2.csv; and so on.
+  mlr --csv --from myfile.csv split -m 10
+Same, but with JSON output.
+  mlr --csv --from myfile.csv split -m 10 -o json
+
+Same, but instead of split_1.csv, split_2.csv, etc. there are test_1.dat, test_2.dat, etc.
+  mlr --csv --from myfile.csv split -m 10 --prefix test --suffix dat
+Same, but written to the /tmp/ directory.
+  mlr --csv --from myfile.csv split -m 10 --prefix /tmp/test --suffix dat
+
+If the shape field has values triangle and square, then there will be split_triangle.csv and split_square.csv.
+  mlr --csv --from myfile.csv split -g shape
+
+If the color field has values yellow and green, and the shape field has values triangle and square,
+then there will be split_yellow_triangle.csv, split_yellow_square.csv, etc.
+  mlr --csv --from myfile.csv split -g color,shape
+
+See also the "tee" DSL function which lets you do more ad-hoc customization.
+.fi
+.if n \{\
+.RE
 .SS "stats1"
 .if n \{\
 .RS 0
diff --git a/todo.txt b/todo.txt
index a6e7a7439..3539fde95 100644
--- a/todo.txt
+++ b/todo.txt
@@ -1,4 +1,4 @@
-================================================================
+===============================================================
 RELEASES
 
 * follow ...
@@ -27,6 +27,9 @@ FEATURES
   o strmatch
   o =~
 * separate examples from FAQs
+* mlr split -- needs an example page along with the tee DSL function
+* new example entry, with ccump and pgr
+  o slwin --prune (or somesuch) to only emit averages over full windows -- ?
 
 ----------------------------------------------------------------
 k better print-interpolate with {} etc
@@ -34,16 +37,10 @@ k better print-interpolate with {} etc
 ----------------------------------------------------------------
 ! sysdate, sysdate_local; datediff ...
 
-----------------------------------------------------------------
-mlr split ... -n, -g -- ?
-- how to specify filenames?
-
 ----------------------------------------------------------------
 ! strmatch https://github.com/johnkerl/miller/issues/77#issuecomment-538790927
 
 ----------------------------------------------------------------
-* new example entry, with ccump and pgr
-  o slwin --prune (or somesuch) to only emit averages over full windows -- ?
 * make a lag-by-n and lead-by-n
 
 ----------------------------------------------------------------