diff --git a/docs/src/reference-verbs.md b/docs/src/reference-verbs.md index cfa66dd82..417aa4d9b 100644 --- a/docs/src/reference-verbs.md +++ b/docs/src/reference-verbs.md @@ -3126,6 +3126,23 @@ a b c 9 8 7 +## sparsify + +
+mlr sparsify --help ++
+Usage: mlr sparsify [options]
+Unsets fields for which the value is the empty string (or, optionally, another
+specified value). Only makes sense with output format not being CSV or TSV.
+Options:
+-s {filler string} What values to remove. Defaults to the empty string.
+-f {a,b,c} Specify field names to be operated on; any other fields won't be
+ modified. The default is to modify all fields.
+-h|--help Show this message.
+Example: if input is a=1,b=,c=3 then output is a=1,c=3.
+
+
## split
@@ -3409,14 +3426,14 @@ fields, optionally categorized by one or more fields. data/medium
-x_y_cov 0.000042574820827444476 -x_y_corr 0.0005042001844467462 -y_y_cov 0.08461122467974003 +x_y_cov 0.00004257482082749404 +x_y_corr 0.0005042001844473328 +y_y_cov 0.08461122467974005 y_y_corr 1 -x2_xy_cov 0.04188382281779374 -x2_xy_corr 0.630174342037994 -x2_y2_cov -0.00030953725962542085 -x2_y2_corr -0.0034249088761121966 +x2_xy_cov 0.041883822817793716 +x2_xy_corr 0.6301743420379936 +x2_y2_cov -0.0003095372596253918 +x2_y2_corr -0.003424908876111875
@@ -3425,12 +3442,12 @@ x2_y2_corr -0.0034249088761121966 data/medium
-a x_y_ols_m x_y_ols_b x_y_ols_n x_y_r2 y_y_ols_m y_y_ols_b y_y_ols_n y_y_r2 xy_y2_ols_m xy_y2_ols_b xy_y2_ols_n xy_y2_r2 -pan 0.01702551273681908 0.5004028922897639 2081 0.00028691820445814767 1 0 2081 1 0.8781320866715662 0.11908230147563566 2081 0.41749827377311266 -eks 0.0407804923685586 0.48140207967651016 1965 0.0016461239223448587 1 0 1965 1 0.8978728611690183 0.10734054433612333 1965 0.45563223864254526 -wye -0.03915349075204814 0.5255096523974456 1966 0.0015051268704373607 1 0 1966 1 0.8538317334220835 0.1267454301662969 1966 0.38991721818599295 -zee 0.0027812364960399147 0.5043070448033061 2047 0.000007751652858786137 1 0 2047 1 0.8524439912011013 0.12401684308018937 2047 0.39356598090006495 -hat -0.018620577041095078 0.5179005397264935 1941 0.0003520036646055585 1 0 1941 1 0.8412305086345014 0.13557328318623216 1941 0.3687944261732265 +a x_y_ols_m x_y_ols_b x_y_ols_n x_y_r2 y_y_ols_m y_y_ols_b y_y_ols_n y_y_r2 xy_y2_ols_m xy_y2_ols_b xy_y2_ols_n xy_y2_r2 +pan 0.017025512736819345 0.500402892289764 2081 0.00028691820445815624 1 -0.00000000000000002890430283104539 2081 1 0.8781320866715664 0.11908230147563569 2081 0.4174982737731127 +eks 0.04078049236855813 0.4814020796765104 1965 0.0016461239223448218 1 0.00000000000000017862676354313703 1965 1 0.897872861169018 0.1073405443361234 1965 0.4556322386425451 +wye -0.03915349075204785 0.5255096523974457 1966 0.0015051268704373377 1 0.00000000000000004464425401127647 1966 1 0.8538317334220837 0.1267454301662969 1966 0.3899172181859931 +zee 0.0027812364960401333 0.5043070448033061 2047 0.000007751652858787357 1 0.00000000000000004819404567023685 2047 1 0.8524439912011011 0.12401684308018947 2047 0.39356598090006495 +hat -0.018620577041095272 0.5179005397264937 1941 0.00035200366460556604 1 -0.00000000000000003400445761787692 1941 1 0.8412305086345017 0.13557328318623207 1941 0.3687944261732266Here's an example simple line-fit. 
The `x` and `y` @@ -3516,11 +3533,11 @@ upsec_count_pca_quality 0.9999590846136102 donesec 92.33051350964094 color purple -upsec_count_pca_m -39.03009744795354 -upsec_count_pca_b 979.9883413064914 +upsec_count_pca_m -39.030097447953594 +upsec_count_pca_b 979.9883413064917 upsec_count_pca_n 21 upsec_count_pca_quality 0.9999908956206317 -donesec 25.10852919630297 +donesec 25.108529196302943 ## step @@ -3797,9 +3814,9 @@ distinct_count 5 5 10000 10000 10000 mode pan wye 1 0.3467901443380824 0.7268028627434533 sum 0 0 50005000 4986.019681679581 5062.057444929905 mean - - 5000.5 0.49860196816795804 0.5062057444929905 -stddev - - 2886.8956799071675 0.2902925151144007 0.290880086426933 -var - - 8334166.666666667 0.08426974433144456 0.08461122467974003 -skewness - - 0 -0.0006899591185521965 -0.017849760120133784 +stddev - - 2886.8956799071675 0.29029251511440074 0.2908800864269331 +var - - 8334166.666666667 0.08426974433144457 0.08461122467974005 +skewness - - 0 -0.0006899591185517494 -0.01784976012013298 minlen 3 3 1 15 13 maxlen 3 3 5 22 22 min eks eks 1 0.00004509679127584487 0.00008818962627266114 diff --git a/docs/src/reference-verbs.md.in b/docs/src/reference-verbs.md.in index 44feda3de..8959ebf6b 100644 --- a/docs/src/reference-verbs.md.in +++ b/docs/src/reference-verbs.md.in @@ -995,6 +995,12 @@ GENMD-RUN-COMMAND mlr --ijson --opprint sort-within-records data/sort-within-records.json GENMD-EOF +## sparsify + +GENMD-RUN-COMMAND +mlr sparsify --help +GENMD-EOF + ## split GENMD-RUN-COMMAND diff --git a/pkg/transformers/aaa_transformer_table.go b/pkg/transformers/aaa_transformer_table.go index ed98af07f..34a5b6ea8 100644 --- a/pkg/transformers/aaa_transformer_table.go +++ b/pkg/transformers/aaa_transformer_table.go @@ -62,6 +62,7 @@ var TRANSFORMER_LOOKUP_TABLE = []TransformerSetup{ SkipTrivialRecordsSetup, SortSetup, SortWithinRecordsSetup, + SparsifySetup, SplitSetup, SsubSetup, Stats1Setup, diff --git a/pkg/transformers/sparsify.go b/pkg/transformers/sparsify.go new 
file mode 100644 index 000000000..b6ae40c51 --- /dev/null +++ b/pkg/transformers/sparsify.go @@ -0,0 +1,192 @@ +package transformers + +import ( + "container/list" + "fmt" + "os" + "strings" + + "github.com/johnkerl/miller/pkg/cli" + "github.com/johnkerl/miller/pkg/lib" + "github.com/johnkerl/miller/pkg/mlrval" + "github.com/johnkerl/miller/pkg/types" +) + +// ---------------------------------------------------------------- +const verbNameSparsify = "sparsify" + +var SparsifySetup = TransformerSetup{ + Verb: verbNameSparsify, + UsageFunc: transformerSparsifyUsage, + ParseCLIFunc: transformerSparsifyParseCLI, + IgnoresInput: false, +} + +func transformerSparsifyUsage( + o *os.File, +) { + fmt.Fprintf(o, "Usage: %s %s [options]\n", "mlr", verbNameSparsify) + fmt.Fprint(o, + `Unsets fields for which the key is the empty string (or, optionally, another +specified value). Only makes sense with output format not being CSV or TSV. +`) + + fmt.Fprintf(o, "Options:\n") + fmt.Fprintf(o, "-s {filler string} What values to remove. Defaults to the empty string.\n") + fmt.Fprintf(o, "-f {a,b,c} Specify field names to be operated on; any other fields won't be\n") + fmt.Fprintf(o, " modified. The default is to modify all fields.\n") + fmt.Fprintf(o, "-h|--help Show this message.\n") + + fmt.Fprint(o, + `Example: if input is a=1,b=,c=3 then output is a=1,c=3. 
+`) +} + +func transformerSparsifyParseCLI( + pargi *int, + argc int, + args []string, + _ *cli.TOptions, + doConstruct bool, // false for first pass of CLI-parse, true for second pass +) IRecordTransformer { + + // Skip the verb name from the current spot in the mlr command line + argi := *pargi + verb := args[argi] + argi++ + + fillerString := "" + var specifiedFieldNames []string = nil + + for argi < argc /* variable increment: 1 or 2 depending on flag */ { + opt := args[argi] + if !strings.HasPrefix(opt, "-") { + break // No more flag options to process + } + if args[argi] == "--" { + break // All transformers must do this so main-flags can follow verb-flags + } + argi++ + + if opt == "-h" || opt == "--help" { + transformerSparsifyUsage(os.Stdout) + os.Exit(0) + + } else if opt == "-s" { + fillerString = cli.VerbGetStringArgOrDie(verb, opt, args, &argi, argc) + + } else if opt == "-f" { + specifiedFieldNames = cli.VerbGetStringArrayArgOrDie(verb, opt, args, &argi, argc) + + } else { + transformerSparsifyUsage(os.Stderr) + os.Exit(1) + } + } + + *pargi = argi + if !doConstruct { // All transformers must do this for main command-line parsing + return nil + } + + transformer, err := NewTransformerSparsify( + fillerString, + specifiedFieldNames, + ) + if err != nil { + fmt.Fprintln(os.Stderr, err) + os.Exit(1) + } + + return transformer +} + +// ---------------------------------------------------------------- +type TransformerSparsify struct { + fillerString string + fieldNamesSet map[string]bool + recordTransformerFunc RecordTransformerFunc +} + +func NewTransformerSparsify( + fillerString string, + specifiedFieldNames []string, +) (*TransformerSparsify, error) { + + tr := &TransformerSparsify{ + fillerString: fillerString, + fieldNamesSet: lib.StringListToSet(specifiedFieldNames), + } + if specifiedFieldNames == nil { + tr.recordTransformerFunc = tr.transformAll + } else { + tr.recordTransformerFunc = tr.transformSome + } + + return tr, nil +} + +func (tr 
*TransformerSparsify) Transform( + inrecAndContext *types.RecordAndContext, + outputRecordsAndContexts *list.List, // list of *types.RecordAndContext + inputDownstreamDoneChannel <-chan bool, + outputDownstreamDoneChannel chan<- bool, +) { + HandleDefaultDownstreamDone(inputDownstreamDoneChannel, outputDownstreamDoneChannel) + + if !inrecAndContext.EndOfStream { + tr.recordTransformerFunc( + inrecAndContext, + outputRecordsAndContexts, + inputDownstreamDoneChannel, + outputDownstreamDoneChannel, + ) + } else { + outputRecordsAndContexts.PushBack(inrecAndContext) // end-of-stream marker + } +} + +func (tr *TransformerSparsify) transformAll( + inrecAndContext *types.RecordAndContext, + outputRecordsAndContexts *list.List, // list of *types.RecordAndContext + inputDownstreamDoneChannel <-chan bool, + outputDownstreamDoneChannel chan<- bool, +) { + inrec := inrecAndContext.Record + outrec := mlrval.NewMlrmapAsRecord() + + for pe := inrec.Head; pe != nil; pe = pe.Next { + if pe.Value.String() != tr.fillerString { + // Reference OK because ownership transfer + outrec.PutReference(pe.Key, pe.Value) + } + } + + outrecAndContext := types.NewRecordAndContext(outrec, &inrecAndContext.Context) + outputRecordsAndContexts.PushBack(outrecAndContext) +} + +// ---------------------------------------------------------------- +func (tr *TransformerSparsify) transformSome( + inrecAndContext *types.RecordAndContext, + outputRecordsAndContexts *list.List, // list of *types.RecordAndContext + inputDownstreamDoneChannel <-chan bool, + outputDownstreamDoneChannel chan<- bool, +) { + inrec := inrecAndContext.Record + outrec := mlrval.NewMlrmapAsRecord() + + for pe := inrec.Head; pe != nil; pe = pe.Next { + if tr.fieldNamesSet[pe.Key] { + if pe.Value.String() != tr.fillerString { + // Reference OK because ownership transfer + outrec.PutReference(pe.Key, pe.Value) + } + } else { + outrec.PutReference(pe.Key, pe.Value) + } + } + + outrecAndContext := types.NewRecordAndContext(outrec, 
&inrecAndContext.Context) + outputRecordsAndContexts.PushBack(outrecAndContext) +} diff --git a/test/cases/cli-help/0001/expout b/test/cases/cli-help/0001/expout index b25e4a56d..95b4d3f14 100644 --- a/test/cases/cli-help/0001/expout +++ b/test/cases/cli-help/0001/expout @@ -988,6 +988,18 @@ Options: -r Recursively sort subobjects/submaps, e.g. for JSON input. -h|--help Show this message. +================================================================ +sparsify +Usage: mlr sparsify [options] +Unsets fields for which the key is the empty string (or, optionally, another +specified value). Only makes sense with output format not being CSV or TSV. +Options: +-s {filler string} What values to remove. Defaults to the empty string. +-f {a,b,c} Specify field names to be operated on; any other fields won't be + modified. The default is to modify all fields. +-h|--help Show this message. +Example: if input is a=1,b=,c=3 then output is a=1,c=3. + ================================================================ split Usage: mlr split [options] {filename} diff --git a/test/cases/verb-sparsify/0001/cmd b/test/cases/verb-sparsify/0001/cmd new file mode 100644 index 000000000..38ec29b15 --- /dev/null +++ b/test/cases/verb-sparsify/0001/cmd @@ -0,0 +1 @@ +mlr --c2j --from test/input/sparsify-input.csv sparsify diff --git a/test/cases/verb-sparsify/0001/experr b/test/cases/verb-sparsify/0001/experr new file mode 100644 index 000000000..e69de29bb diff --git a/test/cases/verb-sparsify/0001/expout b/test/cases/verb-sparsify/0001/expout new file mode 100644 index 000000000..e9c9893a9 --- /dev/null +++ b/test/cases/verb-sparsify/0001/expout @@ -0,0 +1,17 @@ +[ +{ + "a": 1, + "b": 2, + "c": 3 +}, +{ + "a": 4, + "b": 5 +}, +{}, +{ + "a": 7, + "b": 8, + "c": 9 +} +] diff --git a/test/cases/verb-sparsify/0002/cmd b/test/cases/verb-sparsify/0002/cmd new file mode 100644 index 000000000..3ac1c9630 --- /dev/null +++ b/test/cases/verb-sparsify/0002/cmd @@ -0,0 +1 @@ +mlr --c2j --from 
test/input/sparsify-input.csv sparsify -f a diff --git a/test/cases/verb-sparsify/0002/experr b/test/cases/verb-sparsify/0002/experr new file mode 100644 index 000000000..e69de29bb diff --git a/test/cases/verb-sparsify/0002/expout b/test/cases/verb-sparsify/0002/expout new file mode 100644 index 000000000..8bc89d0aa --- /dev/null +++ b/test/cases/verb-sparsify/0002/expout @@ -0,0 +1,21 @@ +[ +{ + "a": 1, + "b": 2, + "c": 3 +}, +{ + "a": 4, + "b": 5, + "c": "" +}, +{ + "b": "", + "c": "" +}, +{ + "a": 7, + "b": 8, + "c": 9 +} +] diff --git a/test/cases/verb-sparsify/0003/cmd b/test/cases/verb-sparsify/0003/cmd new file mode 100644 index 000000000..fc08ebef9 --- /dev/null +++ b/test/cases/verb-sparsify/0003/cmd @@ -0,0 +1 @@ +mlr --c2j --from test/input/sparsify-input.csv sparsify -f b diff --git a/test/cases/verb-sparsify/0003/experr b/test/cases/verb-sparsify/0003/experr new file mode 100644 index 000000000..e69de29bb diff --git a/test/cases/verb-sparsify/0003/expout b/test/cases/verb-sparsify/0003/expout new file mode 100644 index 000000000..b607e3893 --- /dev/null +++ b/test/cases/verb-sparsify/0003/expout @@ -0,0 +1,21 @@ +[ +{ + "a": 1, + "b": 2, + "c": 3 +}, +{ + "a": 4, + "b": 5, + "c": "" +}, +{ + "a": "", + "c": "" +}, +{ + "a": 7, + "b": 8, + "c": 9 +} +] diff --git a/test/cases/verb-sparsify/0004/cmd b/test/cases/verb-sparsify/0004/cmd new file mode 100644 index 000000000..5ea1aa7bd --- /dev/null +++ b/test/cases/verb-sparsify/0004/cmd @@ -0,0 +1 @@ +mlr --c2j --from test/input/sparsify-input.csv sparsify -f b,c diff --git a/test/cases/verb-sparsify/0004/experr b/test/cases/verb-sparsify/0004/experr new file mode 100644 index 000000000..e69de29bb diff --git a/test/cases/verb-sparsify/0004/expout b/test/cases/verb-sparsify/0004/expout new file mode 100644 index 000000000..ebf9878cd --- /dev/null +++ b/test/cases/verb-sparsify/0004/expout @@ -0,0 +1,19 @@ +[ +{ + "a": 1, + "b": 2, + "c": 3 +}, +{ + "a": 4, + "b": 5 +}, +{ + "a": "" +}, +{ + "a": 7, + "b": 
8, + "c": 9 +} +] diff --git a/test/cases/verb-sparsify/0005/cmd b/test/cases/verb-sparsify/0005/cmd new file mode 100644 index 000000000..012aee2b6 --- /dev/null +++ b/test/cases/verb-sparsify/0005/cmd @@ -0,0 +1 @@ +mlr --c2j --from test/input/sparsify-input.csv sparsify -s 1 diff --git a/test/cases/verb-sparsify/0005/experr b/test/cases/verb-sparsify/0005/experr new file mode 100644 index 000000000..e69de29bb diff --git a/test/cases/verb-sparsify/0005/expout b/test/cases/verb-sparsify/0005/expout new file mode 100644 index 000000000..839476d58 --- /dev/null +++ b/test/cases/verb-sparsify/0005/expout @@ -0,0 +1,21 @@ +[ +{ + "b": 2, + "c": 3 +}, +{ + "a": 4, + "b": 5, + "c": "" +}, +{ + "a": "", + "b": "", + "c": "" +}, +{ + "a": 7, + "b": 8, + "c": 9 +} +] diff --git a/test/cases/verb-sparsify/0006/cmd b/test/cases/verb-sparsify/0006/cmd new file mode 100644 index 000000000..42567786a --- /dev/null +++ b/test/cases/verb-sparsify/0006/cmd @@ -0,0 +1 @@ +mlr --c2j --from test/input/sparsify-input.csv sparsify -f a -s 1 diff --git a/test/cases/verb-sparsify/0006/experr b/test/cases/verb-sparsify/0006/experr new file mode 100644 index 000000000..e69de29bb diff --git a/test/cases/verb-sparsify/0006/expout b/test/cases/verb-sparsify/0006/expout new file mode 100644 index 000000000..839476d58 --- /dev/null +++ b/test/cases/verb-sparsify/0006/expout @@ -0,0 +1,21 @@ +[ +{ + "b": 2, + "c": 3 +}, +{ + "a": 4, + "b": 5, + "c": "" +}, +{ + "a": "", + "b": "", + "c": "" +}, +{ + "a": 7, + "b": 8, + "c": 9 +} +] diff --git a/test/cases/verb-sparsify/0007/cmd b/test/cases/verb-sparsify/0007/cmd new file mode 100644 index 000000000..99b590da4 --- /dev/null +++ b/test/cases/verb-sparsify/0007/cmd @@ -0,0 +1 @@ +mlr --c2j --from test/input/sparsify-input.csv sparsify -f b -s 1 diff --git a/test/cases/verb-sparsify/0007/experr b/test/cases/verb-sparsify/0007/experr new file mode 100644 index 000000000..e69de29bb diff --git a/test/cases/verb-sparsify/0007/expout 
b/test/cases/verb-sparsify/0007/expout new file mode 100644 index 000000000..d7f95feba --- /dev/null +++ b/test/cases/verb-sparsify/0007/expout @@ -0,0 +1,22 @@ +[ +{ + "a": 1, + "b": 2, + "c": 3 +}, +{ + "a": 4, + "b": 5, + "c": "" +}, +{ + "a": "", + "b": "", + "c": "" +}, +{ + "a": 7, + "b": 8, + "c": 9 +} +] diff --git a/test/cases/verb-sparsify/0008/cmd b/test/cases/verb-sparsify/0008/cmd new file mode 100644 index 000000000..b943d2c79 --- /dev/null +++ b/test/cases/verb-sparsify/0008/cmd @@ -0,0 +1 @@ +mlr --c2j --from test/input/sparsify-input.csv sparsify -f b,c -s 1 diff --git a/test/cases/verb-sparsify/0008/experr b/test/cases/verb-sparsify/0008/experr new file mode 100644 index 000000000..e69de29bb diff --git a/test/cases/verb-sparsify/0008/expout b/test/cases/verb-sparsify/0008/expout new file mode 100644 index 000000000..d7f95feba --- /dev/null +++ b/test/cases/verb-sparsify/0008/expout @@ -0,0 +1,22 @@ +[ +{ + "a": 1, + "b": 2, + "c": 3 +}, +{ + "a": 4, + "b": 5, + "c": "" +}, +{ + "a": "", + "b": "", + "c": "" +}, +{ + "a": 7, + "b": 8, + "c": 9 +} +] diff --git a/test/input/sparsify-input.csv b/test/input/sparsify-input.csv new file mode 100644 index 000000000..16916596e --- /dev/null +++ b/test/input/sparsify-input.csv @@ -0,0 +1,5 @@ +a,b,c +1,2,3 +4,5, +,, +7,8,9