diff --git a/docs/src/manpage.md b/docs/src/manpage.md index 77331400b..9492fe5b1 100644 --- a/docs/src/manpage.md +++ b/docs/src/manpage.md @@ -195,8 +195,8 @@ VERB LIST json-stringify join label least-frequent merge-fields most-frequent nest nothing put regularize remove-empty-columns rename reorder repeat reshape sample sec2gmtdate sec2gmt seqgen shuffle skip-trivial-records sort - sort-within-records stats1 stats2 step tac tail tee template top unflatten - uniq unsparsify + sort-within-records split stats1 stats2 step tac tail tee template top + unflatten uniq unsparsify FUNCTION LIST abs acos acosh any append apply arrayify asin asinh asserting_absent @@ -1737,6 +1737,46 @@ VERBS -r Recursively sort subobjects/submaps, e.g. for JSON input. -h|--help Show this message. + split + Usage: mlr split [options] {filename} + Options: + -n {n}: Cap file sizes at N records. + -m {m}: Produce M files, round-robining records among them. + -g {a,b,c}: Write separate files with records having distinct values for fields named a,b,c. + Exactly one of -m, -n, or -g must be supplied. + --prefix {p} Specify filename prefix; default "split". + --suffix {s} Specify filename suffix; default is from mlr output format, e.g. "csv". + -a Append to existing file(s), if any, rather than overwriting. + -v Send records along to downstream verbs as well as splitting to files. + -h|--help Show this message. + Any of the output-format command-line flags (see mlr -h). For example, using + mlr --icsv --from myfile.csv split --ojson -n 1000 + the input is CSV, but the output files are JSON. + + Examples: Suppose myfile.csv has 1,000,000 records. + + 100 output files, 10,000 records each. First 10,000 records in split_1.csv, next in split_2.csv, etc. + mlr --csv --from myfile.csv split -n 10000 + + 10 output files, 100,000 records each. Records 1,11,21,etc in split_1.csv, records 2,12,22, etc in split_2.csv, etc. + mlr --csv --from myfile.csv split -m 10 + Same, but with JSON output. 
+ mlr --csv --from myfile.csv split -m 10 -o json + + Same but instead of split_1.csv, split_2.csv, etc. there are test_1.dat, test_2.dat, etc. + mlr --csv --from myfile.csv split -m 10 --prefix test --suffix dat + Same, but written to the /tmp/ directory. + mlr --csv --from myfile.csv split -m 10 --prefix /tmp/test --suffix dat + + If the shape field has values triangle and square, then there will be split_triangle.csv and split_square.csv. + mlr --csv --from myfile.csv split -g shape + + If the color field has values yellow and green, and the shape field has values triangle and square, + then there will be split_yellow_triangle.csv, split_yellow_square.csv, etc. + mlr --csv --from myfile.csv split -g color,shape + + See also the "tee" DSL function which lets you do more ad-hoc customization. + stats1 Usage: mlr stats1 [options] Computes univariate statistics for one or more given fields, accumulated across @@ -3091,5 +3131,5 @@ SEE ALSO - 2022-01-25 MILLER(1) + 2022-01-27 MILLER(1) diff --git a/docs/src/manpage.txt b/docs/src/manpage.txt index 8aa7753f3..bdba32306 100644 --- a/docs/src/manpage.txt +++ b/docs/src/manpage.txt @@ -174,8 +174,8 @@ VERB LIST json-stringify join label least-frequent merge-fields most-frequent nest nothing put regularize remove-empty-columns rename reorder repeat reshape sample sec2gmtdate sec2gmt seqgen shuffle skip-trivial-records sort - sort-within-records stats1 stats2 step tac tail tee template top unflatten - uniq unsparsify + sort-within-records split stats1 stats2 step tac tail tee template top + unflatten uniq unsparsify FUNCTION LIST abs acos acosh any append apply arrayify asin asinh asserting_absent @@ -1716,6 +1716,46 @@ VERBS -r Recursively sort subobjects/submaps, e.g. for JSON input. -h|--help Show this message. + split + Usage: mlr split [options] {filename} + Options: + -n {n}: Cap file sizes at N records. + -m {m}: Produce M files, round-robining records among them. 
+ -g {a,b,c}: Write separate files with records having distinct values for fields named a,b,c. + Exactly one of -m, -n, or -g must be supplied. + --prefix {p} Specify filename prefix; default "split". + --suffix {s} Specify filename suffix; default is from mlr output format, e.g. "csv". + -a Append to existing file(s), if any, rather than overwriting. + -v Send records along to downstream verbs as well as splitting to files. + -h|--help Show this message. + Any of the output-format command-line flags (see mlr -h). For example, using + mlr --icsv --from myfile.csv split --ojson -n 1000 + the input is CSV, but the output files are JSON. + + Examples: Suppose myfile.csv has 1,000,000 records. + + 100 output files, 10,000 records each. First 10,000 records in split_1.csv, next in split_2.csv, etc. + mlr --csv --from myfile.csv split -n 10000 + + 10 output files, 100,000 records each. Records 1,11,21,etc in split_1.csv, records 2,12,22, etc in split_2.csv, etc. + mlr --csv --from myfile.csv split -m 10 + Same, but with JSON output. + mlr --csv --from myfile.csv split -m 10 -o json + + Same but instead of split_1.csv, split_2.csv, etc. there are test_1.dat, test_2.dat, etc. + mlr --csv --from myfile.csv split -m 10 --prefix test --suffix dat + Same, but written to the /tmp/ directory. + mlr --csv --from myfile.csv split -m 10 --prefix /tmp/test --suffix dat + + If the shape field has values triangle and square, then there will be split_triangle.csv and split_square.csv. + mlr --csv --from myfile.csv split -g shape + + If the color field has values yellow and green, and the shape field has values triangle and square, + then there will be split_yellow_triangle.csv, split_yellow_square.csv, etc. + mlr --csv --from myfile.csv split -g color,shape + + See also the "tee" DSL function which lets you do more ad-hoc customization. 
+ stats1 Usage: mlr stats1 [options] Computes univariate statistics for one or more given fields, accumulated across @@ -3070,4 +3110,4 @@ SEE ALSO - 2022-01-25 MILLER(1) + 2022-01-27 MILLER(1) diff --git a/docs/src/reference-verbs.md b/docs/src/reference-verbs.md index bd74fa02f..991a14748 100644 --- a/docs/src/reference-verbs.md +++ b/docs/src/reference-verbs.md @@ -2978,6 +2978,52 @@ a b c 9 8 7 +## split + +
+mlr split --help ++
+Usage: mlr split [options] {filename}
+Options:
+-n {n}: Cap file sizes at N records.
+-m {m}: Produce M files, round-robining records among them.
+-g {a,b,c}: Write separate files with records having distinct values for fields named a,b,c.
+Exactly one of -m, -n, or -g must be supplied.
+--prefix {p} Specify filename prefix; default "split".
+--suffix {s} Specify filename suffix; default is from mlr output format, e.g. "csv".
+-a Append to existing file(s), if any, rather than overwriting.
+-v Send records along to downstream verbs as well as splitting to files.
+-h|--help Show this message.
+Any of the output-format command-line flags (see mlr -h). For example, using
+ mlr --icsv --from myfile.csv split --ojson -n 1000
+the input is CSV, but the output files are JSON.
+
+Examples: Suppose myfile.csv has 1,000,000 records.
+
+100 output files, 10,000 records each. First 10,000 records in split_1.csv, next in split_2.csv, etc.
+ mlr --csv --from myfile.csv split -n 10000
+
+10 output files, 100,000 records each. Records 1,11,21,etc in split_1.csv, records 2,12,22, etc in split_2.csv, etc.
+ mlr --csv --from myfile.csv split -m 10
+Same, but with JSON output.
+ mlr --csv --from myfile.csv split -m 10 -o json
+
+Same but instead of split_1.csv, split_2.csv, etc. there are test_1.dat, test_2.dat, etc.
+ mlr --csv --from myfile.csv split -m 10 --prefix test --suffix dat
+Same, but written to the /tmp/ directory.
+ mlr --csv --from myfile.csv split -m 10 --prefix /tmp/test --suffix dat
+
+If the shape field has values triangle and square, then there will be split_triangle.csv and split_square.csv.
+ mlr --csv --from myfile.csv split -g shape
+
+If the color field has values yellow and green, and the shape field has values triangle and square,
+then there will be split_yellow_triangle.csv, split_yellow_square.csv, etc.
+ mlr --csv --from myfile.csv split -g color,shape
+
+See also the "tee" DSL function which lets you do more ad-hoc customization.
+
+
## stats1
diff --git a/docs/src/reference-verbs.md.in b/docs/src/reference-verbs.md.in
index 0addd5d57..2c371a554 100644
--- a/docs/src/reference-verbs.md.in
+++ b/docs/src/reference-verbs.md.in
@@ -936,6 +936,12 @@ GENMD-RUN-COMMAND
mlr --ijson --opprint sort-within-records data/sort-within-records.json
GENMD-EOF
+## split
+
+GENMD-RUN-COMMAND
+mlr split --help
+GENMD-EOF
+
## stats1
GENMD-RUN-COMMAND
diff --git a/internal/pkg/output/file_output_handlers.go b/internal/pkg/output/file_output_handlers.go
index b5e1df510..cd7c3f896 100644
--- a/internal/pkg/output/file_output_handlers.go
+++ b/internal/pkg/output/file_output_handlers.go
@@ -56,6 +56,17 @@ type MultiOutputHandlerManager struct {
}
// ----------------------------------------------------------------
+// NewFileOutputHandlerManager selects the append or write manager per doAppend.
+func NewFileOutputHandlerManager(
+	recordWriterOptions *cli.TWriterOptions,
+	doAppend bool,
+) *MultiOutputHandlerManager {
+	if !doAppend {
+		return NewFileWritetHandlerManager(recordWriterOptions)
+	}
+	return NewFileAppendHandlerManager(recordWriterOptions)
+}
+
func NewFileWritetHandlerManager(
recordWriterOptions *cli.TWriterOptions,
) *MultiOutputHandlerManager {
@@ -228,6 +239,18 @@ func newOutputHandlerCommon(
}
// ----------------------------------------------------------------
+// NewFileOutputHandler returns the append or write handler for filename, per doAppend.
+func NewFileOutputHandler(
+	filename string,
+	recordWriterOptions *cli.TWriterOptions,
+	doAppend bool,
+) (*FileOutputHandler, error) {
+	if !doAppend {
+		return NewFileWriteOutputHandler(filename, recordWriterOptions)
+	}
+	return NewFileAppendOutputHandler(filename, recordWriterOptions)
+}
+
func NewFileWriteOutputHandler(
filename string,
recordWriterOptions *cli.TWriterOptions,
diff --git a/internal/pkg/transformers/aaa_transformer_table.go b/internal/pkg/transformers/aaa_transformer_table.go
index ed6c0a84d..463b745a4 100644
--- a/internal/pkg/transformers/aaa_transformer_table.go
+++ b/internal/pkg/transformers/aaa_transformer_table.go
@@ -59,6 +59,7 @@ var TRANSFORMER_LOOKUP_TABLE = []TransformerSetup{
SkipTrivialRecordsSetup,
SortSetup,
SortWithinRecordsSetup,
+ SplitSetup,
Stats1Setup,
Stats2Setup,
StepSetup,
diff --git a/internal/pkg/transformers/split.go b/internal/pkg/transformers/split.go
new file mode 100644
index 000000000..287b42768
--- /dev/null
+++ b/internal/pkg/transformers/split.go
@@ -0,0 +1,437 @@
+package transformers
+
+import (
+ "bytes"
+ "container/list"
+ "fmt"
+ "net/url"
+ "os"
+ "strings"
+
+ "github.com/johnkerl/miller/internal/pkg/cli"
+ "github.com/johnkerl/miller/internal/pkg/mlrval"
+ "github.com/johnkerl/miller/internal/pkg/output"
+ "github.com/johnkerl/miller/internal/pkg/types"
+)
+
+// ----------------------------------------------------------------
+const (
+	verbNameSplit                    = "split"
+	splitDefaultOutputFileNamePrefix = "split"
+)
+// SplitSetup is the split verb's entry for TRANSFORMER_LOOKUP_TABLE.
+var SplitSetup = TransformerSetup{
+	Verb: verbNameSplit, UsageFunc: transformerSplitUsage,
+	ParseCLIFunc: transformerSplitParseCLI, IgnoresInput: false,
+}
+
+// transformerSplitUsage writes the split verb's help text to o. When doExit is
+// true it then terminates the process with exitCode; otherwise it simply
+// returns to the caller.
+func transformerSplitUsage(o *os.File, doExit bool, exitCode int) {
+	fmt.Fprintf(o, "Usage: %s %s [options] {filename}\n", "mlr", verbNameSplit)
+	fmt.Fprintf(o,
+		`Options:
+-n {n}: Cap file sizes at N records.
+-m {m}: Produce M files, round-robining records among them.
+-g {a,b,c}: Write separate files with records having distinct values for fields named a,b,c.
+Exactly one of -m, -n, or -g must be supplied.
+--prefix {p} Specify filename prefix; default "`+splitDefaultOutputFileNamePrefix+`".
+--suffix {s} Specify filename suffix; default is from mlr output format, e.g. "csv".
+-a Append to existing file(s), if any, rather than overwriting.
+-v Send records along to downstream verbs as well as splitting to files.
+-h|--help Show this message.
+Any of the output-format command-line flags (see mlr -h). For example, using
+ mlr --icsv --from myfile.csv split --ojson -n 1000
+the input is CSV, but the output files are JSON.
+
+Examples: Suppose myfile.csv has 1,000,000 records.
+
+100 output files, 10,000 records each. First 10,000 records in split_1.csv, next in split_2.csv, etc.
+ mlr --csv --from myfile.csv split -n 10000
+
+10 output files, 100,000 records each. Records 1,11,21,etc in split_1.csv, records 2,12,22, etc in split_2.csv, etc.
+ mlr --csv --from myfile.csv split -m 10
+Same, but with JSON output.
+ mlr --csv --from myfile.csv split -m 10 -o json
+
+Same but instead of split_1.csv, split_2.csv, etc. there are test_1.dat, test_2.dat, etc.
+ mlr --csv --from myfile.csv split -m 10 --prefix test --suffix dat
+Same, but written to the /tmp/ directory.
+ mlr --csv --from myfile.csv split -m 10 --prefix /tmp/test --suffix dat
+
+If the shape field has values triangle and square, then there will be split_triangle.csv and split_square.csv.
+ mlr --csv --from myfile.csv split -g shape
+
+If the color field has values yellow and green, and the shape field has values triangle and square,
+then there will be split_yellow_triangle.csv, split_yellow_square.csv, etc.
+ mlr --csv --from myfile.csv split -g color,shape
+
+See also the "tee" DSL function which lets you do more ad-hoc customization.
+`)
+	if !doExit {
+		return
+	}
+	os.Exit(exitCode)
+}
+
+// transformerSplitParseCLI parses the split verb's flags. args[*pargi] is the
+// verb name; on return *pargi points just past the verb's flags. Usage errors
+// print to stderr and exit 1. Returns nil when doConstruct is false.
+func transformerSplitParseCLI(
+	pargi *int,
+	argc int,
+	args []string,
+	mainOptions *cli.TOptions,
+	doConstruct bool, // false for first pass of CLI-parse, true for second pass
+) IRecordTransformer {
+	// Skip the verb name from the current spot in the mlr command line
+	argi := *pargi
+	verb := args[argi]
+	argi++
+
+	var n int = 0
+	var doMod bool = false
+	var doSize bool = false
+	var groupByFieldNames []string = nil
+	var emitDownstream bool = false
+	var doAppend bool = false
+	var outputFileNamePrefix string = splitDefaultOutputFileNamePrefix
+	var outputFileNameSuffix string = "uninit"
+	haveOutputFileNameSuffix := false
+
+	var localOptions *cli.TOptions = nil
+	if mainOptions != nil {
+		copyThereof := *mainOptions // struct copy
+		localOptions = &copyThereof
+	}
+
+	// Parse local flags.
+	for argi < argc /* variable increment: 1 or 2 depending on flag */ {
+		opt := args[argi]
+		if !strings.HasPrefix(opt, "-") {
+			break // No more flag options to process
+		}
+		if args[argi] == "--" {
+			break // All transformers must do this so main-flags can follow verb-flags
+		}
+		argi++
+
+		if opt == "-h" || opt == "--help" {
+			transformerSplitUsage(os.Stdout, true, 0)
+		} else if opt == "-n" {
+			n = cli.VerbGetIntArgOrDie(verb, opt, args, &argi, argc)
+			doSize = true
+		} else if opt == "-m" {
+			n = cli.VerbGetIntArgOrDie(verb, opt, args, &argi, argc)
+			doMod = true
+		} else if opt == "-g" {
+			groupByFieldNames = cli.VerbGetStringArrayArgOrDie(verb, opt, args, &argi, argc)
+		} else if opt == "--prefix" {
+			outputFileNamePrefix = cli.VerbGetStringArgOrDie(verb, opt, args, &argi, argc)
+		} else if opt == "--suffix" {
+			outputFileNameSuffix = cli.VerbGetStringArgOrDie(verb, opt, args, &argi, argc)
+			haveOutputFileNameSuffix = true
+		} else if opt == "-a" {
+			doAppend = true
+		} else if opt == "-v" {
+			emitDownstream = true
+		} else {
+			// This is inelegant. For error-proofing we advance argi already in our
+			// loop (so individual if-statements don't need to). However,
+			// ParseWriterOptions expects it unadvanced.
+			largi := argi - 1
+			if cli.FLAG_TABLE.Parse(args, argc, &largi, localOptions) {
+				// This lets mlr main and mlr split have different output formats.
+				// Nothing else to handle here.
+				argi = largi
+			} else {
+				transformerSplitUsage(os.Stderr, true, 1)
+			}
+		}
+	}
+
+	doGroup := groupByFieldNames != nil
+	if !doMod && !doSize && !doGroup {
+		fmt.Fprintf(os.Stderr, "mlr %s: At least one of -m, -n, or -g is required.\n", verb)
+		os.Exit(1)
+	}
+	if (doMod && doSize) || (doMod && doGroup) || (doSize && doGroup) {
+		fmt.Fprintf(os.Stderr, "mlr %s: Only one of -m, -n, or -g may be supplied.\n", verb)
+		os.Exit(1)
+	}
+	// n is a per-record divisor (-n) or modulus (-m); zero would panic at run time.
+	if (doMod || doSize) && n <= 0 {
+		fmt.Fprintf(os.Stderr, "mlr %s: the count for -m or -n must be positive; got %d.\n", verb, n)
+		os.Exit(1)
+	}
+
+	*pargi = argi
+	if !doConstruct { // All transformers must do this for main command-line parsing
+		return nil
+	}
+
+	// Only the constructing pass needs finalized writer options. Doing this after
+	// the early return avoids touching localOptions (nil whenever mainOptions is).
+	cli.FinalizeWriterOptions(&localOptions.WriterOptions)
+	if !haveOutputFileNameSuffix {
+		outputFileNameSuffix = localOptions.WriterOptions.OutputFileFormat
+	}
+
+	transformer, err := NewTransformerSplit(
+		n,
+		doMod,
+		doSize,
+		groupByFieldNames,
+		emitDownstream,
+		doAppend,
+		outputFileNamePrefix,
+		outputFileNameSuffix,
+		&localOptions.WriterOptions,
+	)
+	if err != nil {
+		// Error message already printed out
+		os.Exit(1)
+	}
+	return transformer
+}
+
+// ----------------------------------------------------------------
+// TransformerSplit holds the state for the split verb.
+type TransformerSplit struct {
+	n                    int // divisor in the -n handler, modulus in the -m handler
+	outputFileNamePrefix string
+	outputFileNameSuffix string
+	emitDownstream       bool // when true, records are also pushed to the output list
+	ungroupedCounter     int  // records handled so far by the ungrouped handlers
+	groupByFieldNames    []string
+	recordWriterOptions  *cli.TWriterOptions
+	doAppend             bool // forwarded when output handlers are created
+	// For doSize ungrouped: only one file open at a time
+	outputHandler    output.OutputHandler
+	previousQuotient int // file index of the handler currently open; starts at -1
+
+	// For all other cases: multiple files open at a time
+	outputHandlerManager output.OutputHandlerManager
+	// Per-record handler, selected once by NewTransformerSplit
+	recordTransformerFunc RecordTransformerFunc
+}
+
+// NewTransformerSplit builds a split transformer and picks its per-record
+// handler once: grouped when groupByFieldNames is non-nil, else round-robin when
+// doMod is set, else size-capped. doSize is not consulted; the error is always nil.
+func NewTransformerSplit(
+	n int,
+	doMod bool,
+	doSize bool,
+	groupByFieldNames []string,
+	emitDownstream bool,
+	doAppend bool,
+	outputFileNamePrefix string,
+	outputFileNameSuffix string,
+	recordWriterOptions *cli.TWriterOptions,
+) (*TransformerSplit, error) {
+	tr := &TransformerSplit{
+		n:                    n,
+		outputFileNamePrefix: outputFileNamePrefix,
+		outputFileNameSuffix: outputFileNameSuffix,
+		emitDownstream:       emitDownstream,
+		ungroupedCounter:     0,
+		groupByFieldNames:    groupByFieldNames,
+		recordWriterOptions:  recordWriterOptions,
+		doAppend:             doAppend,
+		outputHandler:        nil,
+		previousQuotient:     -1,
+		outputHandlerManager: output.NewFileOutputHandlerManager(recordWriterOptions, doAppend),
+	}
+
+	switch {
+	case groupByFieldNames != nil:
+		tr.recordTransformerFunc = tr.splitGrouped
+	case doMod:
+		tr.recordTransformerFunc = tr.splitModUngrouped
+	default:
+		tr.recordTransformerFunc = tr.splitSizeUngrouped
+	}
+	return tr, nil
+}
+
+// Transform calls HandleDefaultDownstreamDone, then the handler chosen at construction.
+func (tr *TransformerSplit) Transform(
+	inrecAndContext *types.RecordAndContext,
+	outputRecordsAndContexts *list.List, // list of *types.RecordAndContext
+	inputDownstreamDoneChannel <-chan bool,
+	outputDownstreamDoneChannel chan<- bool,
+) {
+	HandleDefaultDownstreamDone(inputDownstreamDoneChannel, outputDownstreamDoneChannel)
+	tr.recordTransformerFunc(inrecAndContext, outputRecordsAndContexts, inputDownstreamDoneChannel, outputDownstreamDoneChannel)
+}
+
+// splitModUngrouped handles -m: record k (counting from zero) is written to file
+// number 1 + (k mod n). At end of stream the handler manager is closed.
+func (tr *TransformerSplit) splitModUngrouped(
+	inrecAndContext *types.RecordAndContext,
+	outputRecordsAndContexts *list.List, // list of *types.RecordAndContext
+	inputDownstreamDoneChannel <-chan bool,
+	outputDownstreamDoneChannel chan<- bool,
+) {
+	if inrecAndContext.EndOfStream {
+		outputRecordsAndContexts.PushBack(inrecAndContext) // end-of-stream marker
+		closeErrs := tr.outputHandlerManager.Close()
+		for _, closeErr := range closeErrs {
+			fmt.Fprintf(os.Stderr, "mlr: file-close error: %v\n", closeErr)
+		}
+		if len(closeErrs) > 0 {
+			os.Exit(1)
+		}
+		return
+	}
+
+	fileIndex := 1 + (tr.ungroupedCounter % tr.n)
+	filename := tr.makeUngroupedOutputFileName(fileIndex)
+	if err := tr.outputHandlerManager.WriteRecordAndContext(inrecAndContext, filename); err != nil {
+		fmt.Fprintf(os.Stderr, "mlr: file-write error: %v\n", err)
+		os.Exit(1)
+	}
+
+	if tr.emitDownstream {
+		outputRecordsAndContexts.PushBack(inrecAndContext)
+	}
+
+	tr.ungroupedCounter++
+}
+
+func (tr *TransformerSplit) splitSizeUngrouped(
+ inrecAndContext *types.RecordAndContext,
+ outputRecordsAndContexts *list.List, // list of *types.RecordAndContext
+ inputDownstreamDoneChannel <-chan bool,
+ outputDownstreamDoneChannel chan<- bool,
+) {
+ var err error
+ if !inrecAndContext.EndOfStream {
+ quotient := 1 + (tr.ungroupedCounter / tr.n)
+
+ if quotient != tr.previousQuotient {
+ if tr.outputHandler != nil {
+ err = tr.outputHandler.Close()
+ if err != nil {
+ fmt.Fprintf(os.Stderr, "mlr: file-close error: %v\n", err)
+ os.Exit(1)
+ }
+ }
+
+ filename := tr.makeUngroupedOutputFileName(quotient)
+ tr.outputHandler, err = output.NewFileOutputHandler(
+ filename,
+ tr.recordWriterOptions,
+ tr.doAppend,
+ )
+ if err != nil {
+ fmt.Fprintf(os.Stderr, "mlr: file-open error: %v\n", err)
+ os.Exit(1)
+ }
+
+ tr.previousQuotient = quotient
+ }
+
+ err = tr.outputHandler.WriteRecordAndContext(inrecAndContext)
+ if err != nil {
+ fmt.Fprintf(os.Stderr, "mlr: file-write error: %v\n", err)
+ os.Exit(1)
+ }
+
+ if tr.emitDownstream {
+ outputRecordsAndContexts.PushBack(inrecAndContext)
+ }
+
+ tr.ungroupedCounter++
+
+ } else {
+ outputRecordsAndContexts.PushBack(inrecAndContext) // end-of-stream marker
+
+ if tr.outputHandler != nil {
+ err := tr.outputHandler.Close()
+ if err != nil {
+ fmt.Fprintf(os.Stderr, "mlr: file-close error: %v\n", err)
+ os.Exit(1)
+ }
+ }
+ }
+}
+
+func (tr *TransformerSplit) splitGrouped(
+ inrecAndContext *types.RecordAndContext,
+ outputRecordsAndContexts *list.List, // list of *types.RecordAndContext
+ inputDownstreamDoneChannel <-chan bool,
+ outputDownstreamDoneChannel chan<- bool,
+) {
+ if !inrecAndContext.EndOfStream {
+ var filename string
+ groupByFieldValues, ok := inrecAndContext.Record.GetSelectedValues(tr.groupByFieldNames)
+ if !ok {
+ filename = fmt.Sprintf("%s_ungrouped.%s", tr.outputFileNamePrefix, tr.outputFileNameSuffix)
+ } else {
+ filename = tr.makeGroupedOutputFileName(groupByFieldValues)
+ }
+ err := tr.outputHandlerManager.WriteRecordAndContext(inrecAndContext, filename)
+ if err != nil {
+ fmt.Fprintf(os.Stderr, "mlr: %v\n", err)
+ os.Exit(1)
+ }
+
+ if tr.emitDownstream {
+ outputRecordsAndContexts.PushBack(inrecAndContext)
+ }
+
+ } else {
+ outputRecordsAndContexts.PushBack(inrecAndContext) // emit end-of-stream marker
+
+ errs := tr.outputHandlerManager.Close()
+ if len(errs) > 0 {
+ for _, err := range errs {
+ fmt.Fprintf(os.Stderr, "mlr: file-close error: %v\n", err)
+ }
+ os.Exit(1)
+ }
+ }
+}
+
+// makeUngroupedOutputFileName example: "split_53.csv"
+func (tr *TransformerSplit) makeUngroupedOutputFileName(k int) string {
+	return fmt.Sprintf("%s_%d", tr.outputFileNamePrefix, k) + "." + tr.outputFileNameSuffix
+}
+
+// makeGroupedOutputFileName example: "split_orange.csv"
+func (tr *TransformerSplit) makeGroupedOutputFileName(
+ groupByFieldValues []*mlrval.Mlrval,
+) string {
+ var buffer bytes.Buffer
+ buffer.WriteString(tr.outputFileNamePrefix)
+ for _, groupByFieldValue := range groupByFieldValues {
+ buffer.WriteString("_")
+ buffer.WriteString(url.QueryEscape(groupByFieldValue.String()))
+ }
+ buffer.WriteString(".")
+ buffer.WriteString(tr.outputFileNameSuffix)
+ return buffer.String()
+}
+
+// makeGroupedIndexedOutputFileName example: "split_yellow_53.csv"
+// NOTE(review): no caller is visible in this change -- confirm it is needed.
+func (tr *TransformerSplit) makeGroupedIndexedOutputFileName(
+	groupByFieldValues []*mlrval.Mlrval,
+	index int,
+) string {
+	// Field values come from data and may contain '/' and the like, so they
+	// are URL-escaped. The prefix is deliberately left unescaped so that
+	// prefixes such as '/tmp/split' can direct output to the /tmp directory.
+	var name bytes.Buffer
+	name.WriteString(tr.outputFileNamePrefix)
+	for i := range groupByFieldValues {
+		name.WriteByte('_')
+		name.WriteString(url.QueryEscape(groupByFieldValues[i].String()))
+	}
+
+	fmt.Fprintf(&name, "_%d.%s", index, tr.outputFileNameSuffix)
+	return name.String()
+}
diff --git a/man/manpage.txt b/man/manpage.txt
index 8aa7753f3..bdba32306 100644
--- a/man/manpage.txt
+++ b/man/manpage.txt
@@ -174,8 +174,8 @@ VERB LIST
json-stringify join label least-frequent merge-fields most-frequent nest
nothing put regularize remove-empty-columns rename reorder repeat reshape
sample sec2gmtdate sec2gmt seqgen shuffle skip-trivial-records sort
- sort-within-records stats1 stats2 step tac tail tee template top unflatten
- uniq unsparsify
+ sort-within-records split stats1 stats2 step tac tail tee template top
+ unflatten uniq unsparsify
FUNCTION LIST
abs acos acosh any append apply arrayify asin asinh asserting_absent
@@ -1716,6 +1716,46 @@ VERBS
-r Recursively sort subobjects/submaps, e.g. for JSON input.
-h|--help Show this message.
+ split
+ Usage: mlr split [options] {filename}
+ Options:
+ -n {n}: Cap file sizes at N records.
+ -m {m}: Produce M files, round-robining records among them.
+ -g {a,b,c}: Write separate files with records having distinct values for fields named a,b,c.
+ Exactly one of -m, -n, or -g must be supplied.
+ --prefix {p} Specify filename prefix; default "split".
+ --suffix {s} Specify filename suffix; default is from mlr output format, e.g. "csv".
+ -a Append to existing file(s), if any, rather than overwriting.
+ -v Send records along to downstream verbs as well as splitting to files.
+ -h|--help Show this message.
+ Any of the output-format command-line flags (see mlr -h). For example, using
+ mlr --icsv --from myfile.csv split --ojson -n 1000
+ the input is CSV, but the output files are JSON.
+
+ Examples: Suppose myfile.csv has 1,000,000 records.
+
+ 100 output files, 10,000 records each. First 10,000 records in split_1.csv, next in split_2.csv, etc.
+ mlr --csv --from myfile.csv split -n 10000
+
+ 10 output files, 100,000 records each. Records 1,11,21,etc in split_1.csv, records 2,12,22, etc in split_2.csv, etc.
+ mlr --csv --from myfile.csv split -m 10
+ Same, but with JSON output.
+ mlr --csv --from myfile.csv split -m 10 -o json
+
+ Same but instead of split_1.csv, split_2.csv, etc. there are test_1.dat, test_2.dat, etc.
+ mlr --csv --from myfile.csv split -m 10 --prefix test --suffix dat
+ Same, but written to the /tmp/ directory.
+ mlr --csv --from myfile.csv split -m 10 --prefix /tmp/test --suffix dat
+
+ If the shape field has values triangle and square, then there will be split_triangle.csv and split_square.csv.
+ mlr --csv --from myfile.csv split -g shape
+
+ If the color field has values yellow and green, and the shape field has values triangle and square,
+ then there will be split_yellow_triangle.csv, split_yellow_square.csv, etc.
+ mlr --csv --from myfile.csv split -g color,shape
+
+ See also the "tee" DSL function which lets you do more ad-hoc customization.
+
stats1
Usage: mlr stats1 [options]
Computes univariate statistics for one or more given fields, accumulated across
@@ -3070,4 +3110,4 @@ SEE ALSO
- 2022-01-25 MILLER(1)
+ 2022-01-27 MILLER(1)
diff --git a/man/mlr.1 b/man/mlr.1
index 6eb74a052..57011f3e4 100644
--- a/man/mlr.1
+++ b/man/mlr.1
@@ -2,12 +2,12 @@
.\" Title: mlr
.\" Author: [see the "AUTHOR" section]
.\" Generator: ./mkman.rb
-.\" Date: 2022-01-25
+.\" Date: 2022-01-27
.\" Manual: \ \&
.\" Source: \ \&
.\" Language: English
.\"
-.TH "MILLER" "1" "2022-01-25" "\ \&" "\ \&"
+.TH "MILLER" "1" "2022-01-27" "\ \&" "\ \&"
.\" -----------------------------------------------------------------
.\" * Portability definitions
.\" ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@@ -215,8 +215,8 @@ fraction gap grep group-by group-like having-fields head histogram json-parse
json-stringify join label least-frequent merge-fields most-frequent nest
nothing put regularize remove-empty-columns rename reorder repeat reshape
sample sec2gmtdate sec2gmt seqgen shuffle skip-trivial-records sort
-sort-within-records stats1 stats2 step tac tail tee template top unflatten
-uniq unsparsify
+sort-within-records split stats1 stats2 step tac tail tee template top
+unflatten uniq unsparsify
.fi
.if n \{\
.RE
@@ -2169,6 +2169,52 @@ Options:
.fi
.if n \{\
.RE
+.SS "split"
+.if n \{\
+.RS 0
+.\}
+.nf
+Usage: mlr split [options] {filename}
+Options:
+-n {n}: Cap file sizes at N records.
+-m {m}: Produce M files, round-robining records among them.
+-g {a,b,c}: Write separate files with records having distinct values for fields named a,b,c.
+Exactly one of -m, -n, or -g must be supplied.
+--prefix {p} Specify filename prefix; default "split".
+--suffix {s} Specify filename suffix; default is from mlr output format, e.g. "csv".
+-a Append to existing file(s), if any, rather than overwriting.
+-v Send records along to downstream verbs as well as splitting to files.
+-h|--help Show this message.
+Any of the output-format command-line flags (see mlr -h). For example, using
+ mlr --icsv --from myfile.csv split --ojson -n 1000
+the input is CSV, but the output files are JSON.
+
+Examples: Suppose myfile.csv has 1,000,000 records.
+
+100 output files, 10,000 records each. First 10,000 records in split_1.csv, next in split_2.csv, etc.
+ mlr --csv --from myfile.csv split -n 10000
+
+10 output files, 100,000 records each. Records 1,11,21,etc in split_1.csv, records 2,12,22, etc in split_2.csv, etc.
+ mlr --csv --from myfile.csv split -m 10
+Same, but with JSON output.
+ mlr --csv --from myfile.csv split -m 10 -o json
+
+Same but instead of split_1.csv, split_2.csv, etc. there are test_1.dat, test_2.dat, etc.
+ mlr --csv --from myfile.csv split -m 10 --prefix test --suffix dat
+Same, but written to the /tmp/ directory.
+ mlr --csv --from myfile.csv split -m 10 --prefix /tmp/test --suffix dat
+
+If the shape field has values triangle and square, then there will be split_triangle.csv and split_square.csv.
+ mlr --csv --from myfile.csv split -g shape
+
+If the color field has values yellow and green, and the shape field has values triangle and square,
+then there will be split_yellow_triangle.csv, split_yellow_square.csv, etc.
+ mlr --csv --from myfile.csv split -g color,shape
+
+See also the "tee" DSL function which lets you do more ad-hoc customization.
+.fi
+.if n \{\
+.RE
.SS "stats1"
.if n \{\
.RS 0
diff --git a/test/cases/cli-help/0001/expout b/test/cases/cli-help/0001/expout
index 3da8c64e1..14fae72c6 100644
--- a/test/cases/cli-help/0001/expout
+++ b/test/cases/cli-help/0001/expout
@@ -929,6 +929,47 @@ Options:
-r Recursively sort subobjects/submaps, e.g. for JSON input.
-h|--help Show this message.
+================================================================
+split
+Usage: mlr split [options] {filename}
+Options:
+-n {n}: Cap file sizes at N records.
+-m {m}: Produce M files, round-robining records among them.
+-g {a,b,c}: Write separate files with records having distinct values for fields named a,b,c.
+Exactly one of -m, -n, or -g must be supplied.
+--prefix {p} Specify filename prefix; default "split".
+--suffix {s} Specify filename suffix; default is from mlr output format, e.g. "csv".
+-a Append to existing file(s), if any, rather than overwriting.
+-v Send records along to downstream verbs as well as splitting to files.
+-h|--help Show this message.
+Any of the output-format command-line flags (see mlr -h). For example, using
+ mlr --icsv --from myfile.csv split --ojson -n 1000
+the input is CSV, but the output files are JSON.
+
+Examples: Suppose myfile.csv has 1,000,000 records.
+
+100 output files, 10,000 records each. First 10,000 records in split_1.csv, next in split_2.csv, etc.
+ mlr --csv --from myfile.csv split -n 10000
+
+10 output files, 100,000 records each. Records 1,11,21,etc in split_1.csv, records 2,12,22, etc in split_2.csv, etc.
+ mlr --csv --from myfile.csv split -m 10
+Same, but with JSON output.
+ mlr --csv --from myfile.csv split -m 10 -o json
+
+Same but instead of split_1.csv, split_2.csv, etc. there are test_1.dat, test_2.dat, etc.
+ mlr --csv --from myfile.csv split -m 10 --prefix test --suffix dat
+Same, but written to the /tmp/ directory.
+ mlr --csv --from myfile.csv split -m 10 --prefix /tmp/test --suffix dat
+
+If the shape field has values triangle and square, then there will be split_triangle.csv and split_square.csv.
+ mlr --csv --from myfile.csv split -g shape
+
+If the color field has values yellow and green, and the shape field has values triangle and square,
+then there will be split_yellow_triangle.csv, split_yellow_square.csv, etc.
+ mlr --csv --from myfile.csv split -g color,shape
+
+See also the "tee" DSL function which lets you do more ad-hoc customization.
+
================================================================
stats1
Usage: mlr stats1 [options]
diff --git a/test/cases/verb-split/0001/cmd b/test/cases/verb-split/0001/cmd
new file mode 100644
index 000000000..8ef25a57d
--- /dev/null
+++ b/test/cases/verb-split/0001/cmd
@@ -0,0 +1 @@
+mlr --csv split -m 2 --prefix ${CASEDIR}/split test/input/example.csv
diff --git a/test/cases/verb-split/0001/experr b/test/cases/verb-split/0001/experr
new file mode 100644
index 000000000..e69de29bb
diff --git a/test/cases/verb-split/0001/expout b/test/cases/verb-split/0001/expout
new file mode 100644
index 000000000..e69de29bb
diff --git a/test/cases/verb-split/0001/postcmp b/test/cases/verb-split/0001/postcmp
new file mode 100644
index 000000000..97b9056ae
--- /dev/null
+++ b/test/cases/verb-split/0001/postcmp
@@ -0,0 +1,3 @@
+${CASEDIR}/split_1.csv.expect ${CASEDIR}/split_1.csv
+${CASEDIR}/split_2.csv.expect ${CASEDIR}/split_2.csv
+
diff --git a/test/cases/verb-split/0001/split_1.csv.expect b/test/cases/verb-split/0001/split_1.csv.expect
new file mode 100644
index 000000000..f228ed651
--- /dev/null
+++ b/test/cases/verb-split/0001/split_1.csv.expect
@@ -0,0 +1,6 @@
+color,shape,flag,k,index,quantity,rate
+yellow,triangle,true,1,11,43.6498,9.8870
+red,circle,true,3,16,13.8103,2.9010
+purple,triangle,false,5,51,81.2290,8.5910
+purple,triangle,false,7,65,80.1405,5.8240
+yellow,circle,true,9,87,63.5058,8.3350
diff --git a/test/cases/verb-split/0001/split_2.csv.expect b/test/cases/verb-split/0001/split_2.csv.expect
new file mode 100644
index 000000000..cf8dd0bd4
--- /dev/null
+++ b/test/cases/verb-split/0001/split_2.csv.expect
@@ -0,0 +1,6 @@
+color,shape,flag,k,index,quantity,rate
+red,square,true,2,15,79.2778,0.0130
+red,square,false,4,48,77.5542,7.4670
+red,square,false,6,64,77.1991,9.5310
+yellow,circle,true,8,73,63.9785,4.2370
+purple,square,false,10,91,72.3735,8.2430
diff --git a/test/cases/verb-split/0002/cmd b/test/cases/verb-split/0002/cmd
new file mode 100644
index 000000000..81c9af7dd
--- /dev/null
+++ b/test/cases/verb-split/0002/cmd
@@ -0,0 +1 @@
+mlr --csv split -n 2 --prefix ${CASEDIR}/split test/input/example.csv
diff --git a/test/cases/verb-split/0002/experr b/test/cases/verb-split/0002/experr
new file mode 100644
index 000000000..e69de29bb
diff --git a/test/cases/verb-split/0002/expout b/test/cases/verb-split/0002/expout
new file mode 100644
index 000000000..e69de29bb
diff --git a/test/cases/verb-split/0002/postcmp b/test/cases/verb-split/0002/postcmp
new file mode 100644
index 000000000..b0cb3514e
--- /dev/null
+++ b/test/cases/verb-split/0002/postcmp
@@ -0,0 +1,6 @@
+${CASEDIR}/split_1.csv.expect ${CASEDIR}/split_1.csv
+${CASEDIR}/split_2.csv.expect ${CASEDIR}/split_2.csv
+${CASEDIR}/split_3.csv.expect ${CASEDIR}/split_3.csv
+${CASEDIR}/split_4.csv.expect ${CASEDIR}/split_4.csv
+${CASEDIR}/split_5.csv.expect ${CASEDIR}/split_5.csv
+
diff --git a/test/cases/verb-split/0002/split_1.csv.expect b/test/cases/verb-split/0002/split_1.csv.expect
new file mode 100644
index 000000000..6203cbca0
--- /dev/null
+++ b/test/cases/verb-split/0002/split_1.csv.expect
@@ -0,0 +1,3 @@
+color,shape,flag,k,index,quantity,rate
+yellow,triangle,true,1,11,43.6498,9.8870
+red,square,true,2,15,79.2778,0.0130
diff --git a/test/cases/verb-split/0002/split_2.csv.expect b/test/cases/verb-split/0002/split_2.csv.expect
new file mode 100644
index 000000000..9ad680950
--- /dev/null
+++ b/test/cases/verb-split/0002/split_2.csv.expect
@@ -0,0 +1,3 @@
+color,shape,flag,k,index,quantity,rate
+red,circle,true,3,16,13.8103,2.9010
+red,square,false,4,48,77.5542,7.4670
diff --git a/test/cases/verb-split/0002/split_3.csv.expect b/test/cases/verb-split/0002/split_3.csv.expect
new file mode 100644
index 000000000..bc2e5ba37
--- /dev/null
+++ b/test/cases/verb-split/0002/split_3.csv.expect
@@ -0,0 +1,3 @@
+color,shape,flag,k,index,quantity,rate
+purple,triangle,false,5,51,81.2290,8.5910
+red,square,false,6,64,77.1991,9.5310
diff --git a/test/cases/verb-split/0002/split_4.csv.expect b/test/cases/verb-split/0002/split_4.csv.expect
new file mode 100644
index 000000000..0be4a6258
--- /dev/null
+++ b/test/cases/verb-split/0002/split_4.csv.expect
@@ -0,0 +1,3 @@
+color,shape,flag,k,index,quantity,rate
+purple,triangle,false,7,65,80.1405,5.8240
+yellow,circle,true,8,73,63.9785,4.2370
diff --git a/test/cases/verb-split/0002/split_5.csv.expect b/test/cases/verb-split/0002/split_5.csv.expect
new file mode 100644
index 000000000..577f20e31
--- /dev/null
+++ b/test/cases/verb-split/0002/split_5.csv.expect
@@ -0,0 +1,3 @@
+color,shape,flag,k,index,quantity,rate
+yellow,circle,true,9,87,63.5058,8.3350
+purple,square,false,10,91,72.3735,8.2430
diff --git a/test/cases/verb-split/0003/cmd b/test/cases/verb-split/0003/cmd
new file mode 100644
index 000000000..32b90536e
--- /dev/null
+++ b/test/cases/verb-split/0003/cmd
@@ -0,0 +1 @@
+mlr --csv split -g shape --prefix ${CASEDIR}/split test/input/example.csv
diff --git a/test/cases/verb-split/0003/experr b/test/cases/verb-split/0003/experr
new file mode 100644
index 000000000..e69de29bb
diff --git a/test/cases/verb-split/0003/expout b/test/cases/verb-split/0003/expout
new file mode 100644
index 000000000..e69de29bb
diff --git a/test/cases/verb-split/0003/postcmp b/test/cases/verb-split/0003/postcmp
new file mode 100644
index 000000000..d00abe7f1
--- /dev/null
+++ b/test/cases/verb-split/0003/postcmp
@@ -0,0 +1,3 @@
+${CASEDIR}/split_square.csv.expect ${CASEDIR}/split_square.csv
+${CASEDIR}/split_circle.csv.expect ${CASEDIR}/split_circle.csv
+${CASEDIR}/split_triangle.csv.expect ${CASEDIR}/split_triangle.csv
diff --git a/test/cases/verb-split/0003/split_circle.csv.expect b/test/cases/verb-split/0003/split_circle.csv.expect
new file mode 100644
index 000000000..6ea6a0a93
--- /dev/null
+++ b/test/cases/verb-split/0003/split_circle.csv.expect
@@ -0,0 +1,4 @@
+color,shape,flag,k,index,quantity,rate
+red,circle,true,3,16,13.8103,2.9010
+yellow,circle,true,8,73,63.9785,4.2370
+yellow,circle,true,9,87,63.5058,8.3350
diff --git a/test/cases/verb-split/0003/split_square.csv.expect b/test/cases/verb-split/0003/split_square.csv.expect
new file mode 100644
index 000000000..122663bfe
--- /dev/null
+++ b/test/cases/verb-split/0003/split_square.csv.expect
@@ -0,0 +1,5 @@
+color,shape,flag,k,index,quantity,rate
+red,square,true,2,15,79.2778,0.0130
+red,square,false,4,48,77.5542,7.4670
+red,square,false,6,64,77.1991,9.5310
+purple,square,false,10,91,72.3735,8.2430
diff --git a/test/cases/verb-split/0003/split_triangle.csv.expect b/test/cases/verb-split/0003/split_triangle.csv.expect
new file mode 100644
index 000000000..70bce77e6
--- /dev/null
+++ b/test/cases/verb-split/0003/split_triangle.csv.expect
@@ -0,0 +1,4 @@
+color,shape,flag,k,index,quantity,rate
+yellow,triangle,true,1,11,43.6498,9.8870
+purple,triangle,false,5,51,81.2290,8.5910
+purple,triangle,false,7,65,80.1405,5.8240
diff --git a/test/cases/verb-split/0004/cmd b/test/cases/verb-split/0004/cmd
new file mode 100644
index 000000000..938e16043
--- /dev/null
+++ b/test/cases/verb-split/0004/cmd
@@ -0,0 +1 @@
+mlr --csv split -g color,shape --prefix ${CASEDIR}/split test/input/example.csv
diff --git a/test/cases/verb-split/0004/experr b/test/cases/verb-split/0004/experr
new file mode 100644
index 000000000..e69de29bb
diff --git a/test/cases/verb-split/0004/expout b/test/cases/verb-split/0004/expout
new file mode 100644
index 000000000..e69de29bb
diff --git a/test/cases/verb-split/0004/postcmp b/test/cases/verb-split/0004/postcmp
new file mode 100644
index 000000000..25cc0362d
--- /dev/null
+++ b/test/cases/verb-split/0004/postcmp
@@ -0,0 +1,7 @@
+${CASEDIR}/split_purple_square.csv.expect ${CASEDIR}/split_purple_square.csv
+${CASEDIR}/split_purple_triangle.csv.expect ${CASEDIR}/split_purple_triangle.csv
+${CASEDIR}/split_red_circle.csv.expect ${CASEDIR}/split_red_circle.csv
+${CASEDIR}/split_red_square.csv.expect ${CASEDIR}/split_red_square.csv
+${CASEDIR}/split_yellow_circle.csv.expect ${CASEDIR}/split_yellow_circle.csv
+${CASEDIR}/split_yellow_triangle.csv.expect ${CASEDIR}/split_yellow_triangle.csv
+
diff --git a/test/cases/verb-split/0004/split_purple_square.csv.expect b/test/cases/verb-split/0004/split_purple_square.csv.expect
new file mode 100644
index 000000000..019f93431
--- /dev/null
+++ b/test/cases/verb-split/0004/split_purple_square.csv.expect
@@ -0,0 +1,2 @@
+color,shape,flag,k,index,quantity,rate
+purple,square,false,10,91,72.3735,8.2430
diff --git a/test/cases/verb-split/0004/split_purple_triangle.csv.expect b/test/cases/verb-split/0004/split_purple_triangle.csv.expect
new file mode 100644
index 000000000..7201c5aba
--- /dev/null
+++ b/test/cases/verb-split/0004/split_purple_triangle.csv.expect
@@ -0,0 +1,3 @@
+color,shape,flag,k,index,quantity,rate
+purple,triangle,false,5,51,81.2290,8.5910
+purple,triangle,false,7,65,80.1405,5.8240
diff --git a/test/cases/verb-split/0004/split_red_circle.csv.expect b/test/cases/verb-split/0004/split_red_circle.csv.expect
new file mode 100644
index 000000000..79d82eb67
--- /dev/null
+++ b/test/cases/verb-split/0004/split_red_circle.csv.expect
@@ -0,0 +1,2 @@
+color,shape,flag,k,index,quantity,rate
+red,circle,true,3,16,13.8103,2.9010
diff --git a/test/cases/verb-split/0004/split_red_square.csv.expect b/test/cases/verb-split/0004/split_red_square.csv.expect
new file mode 100644
index 000000000..439afffa0
--- /dev/null
+++ b/test/cases/verb-split/0004/split_red_square.csv.expect
@@ -0,0 +1,4 @@
+color,shape,flag,k,index,quantity,rate
+red,square,true,2,15,79.2778,0.0130
+red,square,false,4,48,77.5542,7.4670
+red,square,false,6,64,77.1991,9.5310
diff --git a/test/cases/verb-split/0004/split_yellow_circle.csv.expect b/test/cases/verb-split/0004/split_yellow_circle.csv.expect
new file mode 100644
index 000000000..cbeb34546
--- /dev/null
+++ b/test/cases/verb-split/0004/split_yellow_circle.csv.expect
@@ -0,0 +1,3 @@
+color,shape,flag,k,index,quantity,rate
+yellow,circle,true,8,73,63.9785,4.2370
+yellow,circle,true,9,87,63.5058,8.3350
diff --git a/test/cases/verb-split/0004/split_yellow_triangle.csv.expect b/test/cases/verb-split/0004/split_yellow_triangle.csv.expect
new file mode 100644
index 000000000..cc98d358e
--- /dev/null
+++ b/test/cases/verb-split/0004/split_yellow_triangle.csv.expect
@@ -0,0 +1,2 @@
+color,shape,flag,k,index,quantity,rate
+yellow,triangle,true,1,11,43.6498,9.8870
diff --git a/test/cases/verb-split/0005/cmd b/test/cases/verb-split/0005/cmd
new file mode 100644
index 000000000..77ddf9077
--- /dev/null
+++ b/test/cases/verb-split/0005/cmd
@@ -0,0 +1 @@
+mlr --csv split -m 2 --prefix ${CASEDIR}/split --suffix dat test/input/example.csv
diff --git a/test/cases/verb-split/0005/experr b/test/cases/verb-split/0005/experr
new file mode 100644
index 000000000..e69de29bb
diff --git a/test/cases/verb-split/0005/expout b/test/cases/verb-split/0005/expout
new file mode 100644
index 000000000..e69de29bb
diff --git a/test/cases/verb-split/0005/postcmp b/test/cases/verb-split/0005/postcmp
new file mode 100644
index 000000000..a40bc1194
--- /dev/null
+++ b/test/cases/verb-split/0005/postcmp
@@ -0,0 +1,3 @@
+${CASEDIR}/split_1.dat.expect ${CASEDIR}/split_1.dat
+${CASEDIR}/split_2.dat.expect ${CASEDIR}/split_2.dat
+
diff --git a/test/cases/verb-split/0005/split_1.dat.expect b/test/cases/verb-split/0005/split_1.dat.expect
new file mode 100644
index 000000000..f228ed651
--- /dev/null
+++ b/test/cases/verb-split/0005/split_1.dat.expect
@@ -0,0 +1,6 @@
+color,shape,flag,k,index,quantity,rate
+yellow,triangle,true,1,11,43.6498,9.8870
+red,circle,true,3,16,13.8103,2.9010
+purple,triangle,false,5,51,81.2290,8.5910
+purple,triangle,false,7,65,80.1405,5.8240
+yellow,circle,true,9,87,63.5058,8.3350
diff --git a/test/cases/verb-split/0005/split_2.dat.expect b/test/cases/verb-split/0005/split_2.dat.expect
new file mode 100644
index 000000000..cf8dd0bd4
--- /dev/null
+++ b/test/cases/verb-split/0005/split_2.dat.expect
@@ -0,0 +1,6 @@
+color,shape,flag,k,index,quantity,rate
+red,square,true,2,15,79.2778,0.0130
+red,square,false,4,48,77.5542,7.4670
+red,square,false,6,64,77.1991,9.5310
+yellow,circle,true,8,73,63.9785,4.2370
+purple,square,false,10,91,72.3735,8.2430
diff --git a/test/cases/verb-split/0006/cmd b/test/cases/verb-split/0006/cmd
new file mode 100644
index 000000000..a93d29864
--- /dev/null
+++ b/test/cases/verb-split/0006/cmd
@@ -0,0 +1 @@
+mlr --csv split -m 2 --prefix ${CASEDIR}/split --ojson test/input/example.csv
diff --git a/test/cases/verb-split/0006/experr b/test/cases/verb-split/0006/experr
new file mode 100644
index 000000000..e69de29bb
diff --git a/test/cases/verb-split/0006/expout b/test/cases/verb-split/0006/expout
new file mode 100644
index 000000000..e69de29bb
diff --git a/test/cases/verb-split/0006/postcmp b/test/cases/verb-split/0006/postcmp
new file mode 100644
index 000000000..e63a309cf
--- /dev/null
+++ b/test/cases/verb-split/0006/postcmp
@@ -0,0 +1,3 @@
+${CASEDIR}/split_1.json.expect ${CASEDIR}/split_1.json
+${CASEDIR}/split_2.json.expect ${CASEDIR}/split_2.json
+
diff --git a/test/cases/verb-split/0006/split_1.json.expect b/test/cases/verb-split/0006/split_1.json.expect
new file mode 100644
index 000000000..00534646c
--- /dev/null
+++ b/test/cases/verb-split/0006/split_1.json.expect
@@ -0,0 +1,47 @@
+[
+{
+ "color": "yellow",
+ "shape": "triangle",
+ "flag": "true",
+ "k": 1,
+ "index": 11,
+ "quantity": 43.6498,
+ "rate": 9.8870
+},
+{
+ "color": "red",
+ "shape": "circle",
+ "flag": "true",
+ "k": 3,
+ "index": 16,
+ "quantity": 13.8103,
+ "rate": 2.9010
+},
+{
+ "color": "purple",
+ "shape": "triangle",
+ "flag": "false",
+ "k": 5,
+ "index": 51,
+ "quantity": 81.2290,
+ "rate": 8.5910
+},
+{
+ "color": "purple",
+ "shape": "triangle",
+ "flag": "false",
+ "k": 7,
+ "index": 65,
+ "quantity": 80.1405,
+ "rate": 5.8240
+},
+{
+ "color": "yellow",
+ "shape": "circle",
+ "flag": "true",
+ "k": 9,
+ "index": 87,
+ "quantity": 63.5058,
+ "rate": 8.3350
+}
+]
diff --git a/test/cases/verb-split/0006/split_2.json.expect b/test/cases/verb-split/0006/split_2.json.expect
new file mode 100644
index 000000000..d2d370f1e
--- /dev/null
+++ b/test/cases/verb-split/0006/split_2.json.expect
@@ -0,0 +1,47 @@
+[
+{
+ "color": "red",
+ "shape": "square",
+ "flag": "true",
+ "k": 2,
+ "index": 15,
+ "quantity": 79.2778,
+ "rate": 0.0130
+},
+{
+ "color": "red",
+ "shape": "square",
+ "flag": "false",
+ "k": 4,
+ "index": 48,
+ "quantity": 77.5542,
+ "rate": 7.4670
+},
+{
+ "color": "red",
+ "shape": "square",
+ "flag": "false",
+ "k": 6,
+ "index": 64,
+ "quantity": 77.1991,
+ "rate": 9.5310
+},
+{
+ "color": "yellow",
+ "shape": "circle",
+ "flag": "true",
+ "k": 8,
+ "index": 73,
+ "quantity": 63.9785,
+ "rate": 4.2370
+},
+{
+ "color": "purple",
+ "shape": "square",
+ "flag": "false",
+ "k": 10,
+ "index": 91,
+ "quantity": 72.3735,
+ "rate": 8.2430
+}
+]
diff --git a/test/cases/verb-split/0007/cmd b/test/cases/verb-split/0007/cmd
new file mode 100644
index 000000000..44f51882f
--- /dev/null
+++ b/test/cases/verb-split/0007/cmd
@@ -0,0 +1 @@
+mlr --csv split -m 2 -v --prefix ${CASEDIR}/split test/input/example.csv
diff --git a/test/cases/verb-split/0007/experr b/test/cases/verb-split/0007/experr
new file mode 100644
index 000000000..e69de29bb
diff --git a/test/cases/verb-split/0007/expout b/test/cases/verb-split/0007/expout
new file mode 100644
index 000000000..bf79dd5f7
--- /dev/null
+++ b/test/cases/verb-split/0007/expout
@@ -0,0 +1,11 @@
+color,shape,flag,k,index,quantity,rate
+yellow,triangle,true,1,11,43.6498,9.8870
+red,square,true,2,15,79.2778,0.0130
+red,circle,true,3,16,13.8103,2.9010
+red,square,false,4,48,77.5542,7.4670
+purple,triangle,false,5,51,81.2290,8.5910
+red,square,false,6,64,77.1991,9.5310
+purple,triangle,false,7,65,80.1405,5.8240
+yellow,circle,true,8,73,63.9785,4.2370
+yellow,circle,true,9,87,63.5058,8.3350
+purple,square,false,10,91,72.3735,8.2430
diff --git a/test/cases/verb-split/0007/postcmp b/test/cases/verb-split/0007/postcmp
new file mode 100644
index 000000000..97b9056ae
--- /dev/null
+++ b/test/cases/verb-split/0007/postcmp
@@ -0,0 +1,3 @@
+${CASEDIR}/split_1.csv.expect ${CASEDIR}/split_1.csv
+${CASEDIR}/split_2.csv.expect ${CASEDIR}/split_2.csv
+
diff --git a/test/cases/verb-split/0007/split_1.csv.expect b/test/cases/verb-split/0007/split_1.csv.expect
new file mode 100644
index 000000000..f228ed651
--- /dev/null
+++ b/test/cases/verb-split/0007/split_1.csv.expect
@@ -0,0 +1,6 @@
+color,shape,flag,k,index,quantity,rate
+yellow,triangle,true,1,11,43.6498,9.8870
+red,circle,true,3,16,13.8103,2.9010
+purple,triangle,false,5,51,81.2290,8.5910
+purple,triangle,false,7,65,80.1405,5.8240
+yellow,circle,true,9,87,63.5058,8.3350
diff --git a/test/cases/verb-split/0007/split_2.csv.expect b/test/cases/verb-split/0007/split_2.csv.expect
new file mode 100644
index 000000000..cf8dd0bd4
--- /dev/null
+++ b/test/cases/verb-split/0007/split_2.csv.expect
@@ -0,0 +1,6 @@
+color,shape,flag,k,index,quantity,rate
+red,square,true,2,15,79.2778,0.0130
+red,square,false,4,48,77.5542,7.4670
+red,square,false,6,64,77.1991,9.5310
+yellow,circle,true,8,73,63.9785,4.2370
+purple,square,false,10,91,72.3735,8.2430
diff --git a/test/input/example.csv b/test/input/example.csv
new file mode 100644
index 000000000..bf79dd5f7
--- /dev/null
+++ b/test/input/example.csv
@@ -0,0 +1,11 @@
+color,shape,flag,k,index,quantity,rate
+yellow,triangle,true,1,11,43.6498,9.8870
+red,square,true,2,15,79.2778,0.0130
+red,circle,true,3,16,13.8103,2.9010
+red,square,false,4,48,77.5542,7.4670
+purple,triangle,false,5,51,81.2290,8.5910
+red,square,false,6,64,77.1991,9.5310
+purple,triangle,false,7,65,80.1405,5.8240
+yellow,circle,true,8,73,63.9785,4.2370
+yellow,circle,true,9,87,63.5058,8.3350
+purple,square,false,10,91,72.3735,8.2430
diff --git a/todo.txt b/todo.txt
index 92fda4e5f..3539fde95 100644
--- a/todo.txt
+++ b/todo.txt
@@ -1,4 +1,4 @@
-================================================================
+===============================================================
RELEASES
* follow ...
@@ -26,6 +26,10 @@ FEATURES
o format/unformat
o strmatch
o =~
+* separate examples from FAQs
+* mlr split -- needs an example page along with the tee DSL function
+* new example entry, with ccump and pgr
+ o slwin --prune (or somesuch) to only emit averages over full windows -- ?
----------------------------------------------------------------
k better print-interpolate with {} etc
@@ -33,15 +37,10 @@ k better print-interpolate with {} etc
----------------------------------------------------------------
! sysdate, sysdate_local; datediff ...
-----------------------------------------------------------------
-mlr split ... -n, -g -- ?
-- how to specify filenames?
-
----------------------------------------------------------------
! strmatch https://github.com/johnkerl/miller/issues/77#issuecomment-538790927
----------------------------------------------------------------
-* new example entry, with ccump and pgr
* make a lag-by-n and lead-by-n
----------------------------------------------------------------