From 4e9d32ed7fc9752c56f8cfbd43bb01d3338978f3 Mon Sep 17 00:00:00 2001 From: John Kerl Date: Sun, 11 Jun 2017 08:53:09 -0700 Subject: [PATCH] weighted-means cookbook example --- c/draft-release-notes.md | 37 +++++++++++++++++++++------ c/mapping/mapper_uniq.c | 10 ++++---- c/reg_test/expected/out | 10 ++++++-- c/todo.txt | 20 +++------------ doc/content-for-cookbook2.html | 11 ++++++++ doc/cookbook2.html | 46 ++++++++++++++++++++++++++++++++++ doc/data/weighted-mean.sh | 25 ++++++++++++++++++ doc/reference-verbs.html | 12 +++++++-- 8 files changed, 138 insertions(+), 33 deletions(-) create mode 100644 doc/data/weighted-mean.sh diff --git a/c/draft-release-notes.md b/c/draft-release-notes.md index 2461dc0f1..027bb1057 100644 --- a/c/draft-release-notes.md +++ b/c/draft-release-notes.md @@ -1,26 +1,49 @@ -This is a relatively minor release, containing feature requests. +This release contains mostly feature requests. **Features:** -* There is a new DSL function [**mapexcept**](http://johnkerl.org/miller-releases/miller-5.2.0/doc/reference-dsl.html#mapexcept) which returns a copy of the argument with specified key(s), if any, unset. Likewise, [**mapselect**](http://johnkerl.org/miller-releases/miller-5.2.0/doc/reference-dsl.html#mapselect) returns a copy of the argument with only specified key(s), if any, set. This resolves https://github.com/johnkerl/miller/issues/137. +* There is a new DSL function +[**mapexcept**](http://johnkerl.org/miller-releases/miller-5.2.0/doc/reference-dsl.html#mapexcept) which returns a +copy of the argument with specified key(s), if any, unset. The motivating use-case is to split records to multiple +filenames depending on a particular field value, which is omitted from the output: `mlr --from f.dat put 'tee > +"/tmp/data-".$a, mapexcept($*, "a")'`. Likewise, +[**mapselect**](http://johnkerl.org/miller-releases/miller-5.2.0/doc/reference-dsl.html#mapselect) returns a copy of the +argument with only specified key(s), if any, set. 
This resolves https://github.com/johnkerl/miller/issues/137. -* xxx min/max functions and stats1/merge-fields min/max/percentile mix int and string. esp. string-only order statistics. doclink for mixed case. interpolation obv nonsensical. +* The [**min**](http://johnkerl.org/miller-releases/miller-5.2.0/doc/reference-dsl.html#min) +and [**max**](http://johnkerl.org/miller-releases/miller-5.2.0/doc/reference-dsl.html#max) DSL functions, and the +min/max/percentile aggregators for the +[**stats1**](http://johnkerl.org/miller-releases/miller-5.2.0/doc/reference-verbs.html#stats1) and +[**merge-fields**](http://johnkerl.org/miller-releases/miller-5.2.0/doc/reference-verbs.html#merge-fields) verbs, now +support numeric as well as string field values. (For mixed string/numeric fields, numbers compare before strings.) This +means in particular that order statistics are now possible on string-only fields. Interpolation is obviously nonsensical +for strings, so interpolated percentiles such as `mlr stats1 -a p50 -f a -i` yield an error for string-only fields. +Likewise, any other aggregations requiring arithmetic, such as mean, also produce an error on string-valued +input. * A new **-u** option for [**count-distinct**](http://johnkerl.org/miller-releases/miller-5.2.0/doc/reference-verbs.html#count-distinct) allows unlashed counts for multiple field names. For example, with `-f a,b` and without `-u`, `count-distinct` computes counts for distinct pairs of `a` and `b` field values. With `-f a,b` and with `-u`, it computes counts for distinct `a` field values and counts for distinct `b` field values separately. -* xxx `./configure` vs. `autoreconf -fiv` 1st, and which issue is resolved by this. +* If you [build from source](http://johnkerl.org/miller-releases/miller-5.2.0/doc/build.html), you can now +do `./configure` without first doing `autoreconf -fiv`. This resolves https://github.com/johnkerl/miller/issues/xxx. +**xxx to do**: figure out and fix the timestamp issue. 
+**xxx to do**: update the build.html page. -* xxx UTF-8 BOM strip for CSV files; resolves xxx +* The UTF-8 BOM sequence `0xef` `0xbb` `0xbf` is now automatically ignored from the start of CSV files. (The same is +already done for JSON files.) This resolves https://github.com/johnkerl/miller/issues/xxx. * For `put` and `filter` with `-S`, program literals such as the `6` in `$x = 6` were being parsed as strings. This is not sensible, since the `-S` option for `put` and `filter` is intended to suppress numeric conversion of record data, not program literals. To get string `6` one may use `$x = "6"`. **Documentation:** -* Suppose you have counters in a SQL database with different values in successive queries. A new cookbook example shows [**how to compute differences between successive queries**](http://www.johnkerl.org/miller-releases/miller-5.2.0/doc/cookbook.html#Showing_differences_between_successive_queries). +* A new cookbook example shows [**how to compute differences between successive +queries**](http://www.johnkerl.org/miller-releases/miller-5.2.0/doc/cookbook.html#Showing_differences_between_successive_queries), +e.g. to find out what changed in time-varying data when you run and rerun a SQL query. -* Another new cookbook example shows [**how to compute interquartile ranges**](http://www.johnkerl.org/miller-releases/miller-5.2.0/doc/cookbook2.html#Computing_interquartile_ranges) +* Another new cookbook example shows [**how to compute interquartile ranges**](http://www.johnkerl.org/miller-releases/miller-5.2.0/doc/cookbook2.html#Computing_interquartile_ranges). + +* A third new cookbook example shows [**how to compute weighted means**](http://www.johnkerl.org/miller-releases/miller-5.2.0/doc/cookbook2.html#Computing_weighted_means). 
**Bugfixes:** diff --git a/c/mapping/mapper_uniq.c b/c/mapping/mapper_uniq.c index 133697aac..758a57d47 100644 --- a/c/mapping/mapper_uniq.c +++ b/c/mapping/mapper_uniq.c @@ -223,16 +223,16 @@ static sllv_t* mapper_uniq_process_unlashed(lrec_t* pinrec, context_t* pctx, voi else { sllv_t* poutrecs = sllv_alloc(); for (lhmsve_t* pe = pstate->pcounts_unlashed->phead; pe != NULL; pe = pe->pnext) { - lrec_t* poutrec = lrec_unbacked_alloc(); char* field_name= pe->key; lhmsll_t* pcounts_for_field_name = pe->pvvalue; - lrec_put(poutrec, "field", field_name, NO_FREE); for (lhmslle_t* pf = pcounts_for_field_name->phead; pf != NULL; pf = pf->pnext) { char* field_value = pf->key; - lrec_put(poutrec, mlr_paste_2_strings(field_value, "_count"), mlr_alloc_string_from_ll(pf->value), - FREE_ENTRY_KEY|FREE_ENTRY_VALUE); + lrec_t* poutrec = lrec_unbacked_alloc(); + lrec_put(poutrec, "field", field_name, NO_FREE); + lrec_put(poutrec, "value", field_value, NO_FREE); + lrec_put(poutrec, "count", mlr_alloc_string_from_ll(pf->value), FREE_ENTRY_VALUE); + sllv_append(poutrecs, poutrec); } - sllv_append(poutrecs, poutrec); } sllv_append(poutrecs, NULL); return poutrecs; diff --git a/c/reg_test/expected/out b/c/reg_test/expected/out index 659450118..0c466cb4e 100644 --- a/c/reg_test/expected/out +++ b/c/reg_test/expected/out @@ -919,8 +919,14 @@ a=hat,b=wye,count=2 a=pan,b=wye,count=2 mlr count-distinct -f a,b -u ./reg_test/input/small ./reg_test/input/abixy -field=a,pan_count=4,eks_count=6,wye_count=4,zee_count=4,hat_count=2 -field=b,pan_count=8,wye_count=10,zee_count=2 +field=a,value=pan,count=4 +field=a,value=eks,count=6 +field=a,value=wye,count=4 +field=a,value=zee,count=4 +field=a,value=hat,count=2 +field=b,value=pan,count=8 +field=b,value=wye,count=10 +field=b,value=zee,count=2 mlr count-distinct -f a -n ./reg_test/input/small ./reg_test/input/abixy count=5 diff --git a/c/todo.txt b/c/todo.txt index d8bda360c..9bc178f39 100644 --- a/c/todo.txt +++ b/c/todo.txt @@ -11,37 +11,23 @@ 
BUGFIXES x=9223372036854775802,y=-9223372036854775806 x=9223372036854775805,y=-9223372036854775802 -mlr cat then sec2gmt: -Usage: mlr (null) [options] {comma-separated list of field names} -Replaces a numeric field representing seconds since the epoch with the -corresponding GMT timestamp; leaves non-numbers as-is. This is nothing -more than a keystroke-saver for the sec2gmt function: - mlr (null) time1,time2 -is the same as - ================================================================ FUNDAM: * synctool alias/flag handling ... ================================================================ -5.2.0 TO-DO: +5.3.0 TO-DO: ---------------------------------------------------------------- airable: -? count-distinct -u wtf ? - ! termcvt -I !!! aux-list -> main help; dox too * UT unhex ! faqent/cookbook/more: mlr termcvt --cr2lf foo.csv.cr > foo.csv -* IQR: - - IQR-put faqent - ? pn-pm aggr @ stats1 ?!? - ! !autoreconf doc note w/ as-of-5.2.0 caveat * reg_test/run --mlrexec flag @@ -141,13 +127,13 @@ MAPVAR CHECKLIST: * clarify ownership semantics in localstack & mlhmmv via function names, & top-of-file comments ================================================================ -5.2.0 ideas: +5.3.0 ideas: ---------------------------------------------------------------- ! multi-field x many verbs: -f/-r field-name-spec opportunities throughout which verbs: - * stats1 + k stats1 (done in 5.2.0) * stats2 * merge-fields -x - count-distinct diff --git a/doc/content-for-cookbook2.html b/doc/content-for-cookbook2.html index e3b5d0615..7b4c9e58d 100644 --- a/doc/content-for-cookbook2.html +++ b/doc/content-for-cookbook2.html @@ -117,6 +117,17 @@ POKI_INCLUDE_AND_RUN_ESCAPED(data/iqr1.sh)HERE POKI_INCLUDE_AND_RUN_ESCAPED(data/iqrn.sh)HERE + + +

Computing weighted means

+ +
+ +

This might be more elegantly implemented as an option within the stats1 verb. Meanwhile, it’s +expressible within the DSL: + +POKI_INCLUDE_AND_RUN_ESCAPED(data/weighted-mean.sh)HERE +

Generating random numbers from various distributions

diff --git a/doc/cookbook2.html b/doc/cookbook2.html index 47322bb93..fa68301e6 100644 --- a/doc/cookbook2.html +++ b/doc/cookbook2.html @@ -197,6 +197,7 @@ Miller commands were run with pretty-print-tabular output format. • Randomly generating jabberwocky words
• Program timing
• Computing interquartile ranges
+• Computing weighted means
• Generating random numbers from various distributions
• Sieve of Eratosthenes
• Mandelbrot-set generator
@@ -374,6 +375,51 @@ y_iqr 0.511866

+ + +

Computing weighted means

+ +
+ +

This might be more elegantly implemented as an option within the stats1 verb. Meanwhile, it’s +expressible within the DSL: + +

+

+
+$ mlr --from data/medium put -q '
+  # Using the y field for weighting in this example
+  weight = $y;
+
+  # Using the a field for weighted aggregation in this example
+  @sumwx[$a] += weight * $i;
+  @sumw[$a] += weight;
+
+  @sumx[$a] += $i;
+  @sumn[$a] += 1;
+
+  end {
+    map wmean = {};
+    map mean  = {};
+    for (a in @sumwx) {
+      wmean[a] = @sumwx[a] / @sumw[a]
+    }
+    for (a in @sumx) {
+      mean[a] = @sumx[a] / @sumn[a]
+    }
+    #emit wmean, "a";
+    #emit mean, "a";
+    emit (wmean, mean), "a";
+  }'
+a=pan,wmean=4979.563722,mean=5028.259010
+a=eks,wmean=4890.381593,mean=4956.290076
+a=wye,wmean=4946.987746,mean=4920.001017
+a=zee,wmean=5164.719685,mean=5123.092330
+a=hat,wmean=4925.533162,mean=4967.743946
+
+
+

+

Generating random numbers from various distributions

diff --git a/doc/data/weighted-mean.sh b/doc/data/weighted-mean.sh new file mode 100644 index 000000000..1b627b782 --- /dev/null +++ b/doc/data/weighted-mean.sh @@ -0,0 +1,25 @@ +mlr --from data/medium put -q ' + # Using the y field for weighting in this example + weight = $y; + + # Using the a field for weighted aggregation in this example + @sumwx[$a] += weight * $i; + @sumw[$a] += weight; + + @sumx[$a] += $i; + @sumn[$a] += 1; + + end { + map wmean = {}; + map mean = {}; + for (a in @sumwx) { + wmean[a] = @sumwx[a] / @sumw[a] + } + for (a in @sumx) { + mean[a] = @sumx[a] / @sumn[a] + } + #emit wmean, "a"; + #emit mean, "a"; + emit (wmean, mean), "a"; + }' + diff --git a/doc/reference-verbs.html b/doc/reference-verbs.html index bb069e4ab..e4bbcd231 100644 --- a/doc/reference-verbs.html +++ b/doc/reference-verbs.html @@ -655,8 +655,16 @@ a=eks,b=zee,count=357
 $ mlr count-distinct -u -f a,b data/medium
-field=a,pan_count=2081,eks_count=1965,wye_count=1966,zee_count=2047,hat_count=1941
-field=b,pan_count=1942,wye_count=2057,zee_count=1943,eks_count=2008,hat_count=2050
+field=a,value=pan,count=2081
+field=a,value=eks,count=1965
+field=a,value=wye,count=1966
+field=a,value=zee,count=2047
+field=a,value=hat,count=1941
+field=b,value=pan,count=1942
+field=b,value=wye,count=2057
+field=b,value=zee,count=1943
+field=b,value=eks,count=2008
+field=b,value=hat,count=2050