DSL functions for summary stats over arrays / maps (#1364)

* DSL stats functions [WIP]

* refactor

* move percentile computation to bifs module; iterate

* mode and antimode

* percentile iterate

* percentile sketching

* neaten

* unit-test iterate

* unify old & new min & max functions

* unit-test cases

* code-dedupe between mode and antimode

* make mode/antimode ties deterministic via first-found-wins rule

* online help strings for new stats DSL functions

* artifacts from `make dev`

* help info on how min/max now recurse into collections

* artifacts from `make dev`

* typofix
This commit is contained in:
John Kerl 2023-08-26 16:02:30 -04:00 committed by GitHub
parent 392b34fd04
commit d341cc6dd3
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
93 changed files with 3731 additions and 361 deletions

View file

@ -160,11 +160,11 @@ CITRUS COUNTY 1332.9 79974.9 483785.1
<b> stats2 -a corr,linreg-ols,r2 -f tiv_2011,tiv_2012</b>
</pre>
<pre class="pre-non-highlight-in-pair">
tiv_2011_tiv_2012_corr 0.9730497632351692
tiv_2011_tiv_2012_ols_m 0.9835583980337723
tiv_2011_tiv_2012_ols_b 433854.6428968317
tiv_2011_tiv_2012_corr 0.9730497632351701
tiv_2011_tiv_2012_ols_m 0.9835583980337732
tiv_2011_tiv_2012_ols_b 433854.6428968301
tiv_2011_tiv_2012_ols_n 36634
tiv_2011_tiv_2012_r2 0.9468258417320189
tiv_2011_tiv_2012_r2 0.9468258417320204
</pre>
<pre class="pre-highlight-in-pair">
@ -322,7 +322,7 @@ Look at bivariate stats by color and shape. In particular, `u,v` pairwise correl
</pre>
<pre class="pre-non-highlight-in-pair">
u_v_corr w_x_corr
0.1334180491027861 -0.011319841199866178
0.1334180491027861 -0.011319841199852926
</pre>
<pre class="pre-highlight-in-pair">
@ -332,22 +332,22 @@ Look at bivariate stats by color and shape. In particular, `u,v` pairwise correl
</pre>
<pre class="pre-non-highlight-in-pair">
color shape u_v_corr w_x_corr
red circle 0.9807984401887236 -0.01856553658708754
orange square 0.17685855992752927 -0.07104431573806054
green circle 0.05764419437577255 0.01179572988801509
red square 0.05574477124893523 -0.0006801456507510942
yellow triangle 0.04457273771962798 0.024604310103081825
yellow square 0.04379172927296089 -0.04462197201631237
purple circle 0.03587354936895086 0.1341133954140899
blue square 0.03241153095761164 -0.053507648119643196
blue triangle 0.015356427073158766 -0.0006089997461435399
orange circle 0.010518953877704048 -0.16279397329279383
red triangle 0.00809782571528034 0.012486621357942596
purple triangle 0.005155190909099334 -0.045057909256220656
purple square -0.025680276963377404 0.05769429647930396
green square -0.0257760734502851 -0.003265173252087127
orange triangle -0.030456661186085785 -0.1318699981926352
yellow circle -0.06477331572781474 0.07369449819706045
blue circle -0.10234761901929677 -0.030528539069837757
green triangle -0.10901825107358765 -0.04848782060162929
red circle 0.9807984401887242 -0.018565536587084836
orange square 0.17685855992752933 -0.07104431573805543
green circle 0.05764419437577257 0.011795729888018455
red square 0.0557447712489348 -0.0006801456507506415
yellow triangle 0.0445727377196281 0.024604310103079844
yellow square 0.0437917292729612 -0.044621972016306265
purple circle 0.03587354936895115 0.13411339541407613
blue square 0.03241153095761152 -0.05350764811965621
blue triangle 0.015356427073158612 -0.0006089997461408209
orange circle 0.010518953877704181 -0.1627939732927932
red triangle 0.00809782571528054 0.01248662135795501
purple triangle 0.005155190909099739 -0.04505790925621933
purple square -0.02568027696337717 0.057694296479293694
green square -0.025776073450284875 -0.0032651732520739014
orange triangle -0.030456661186085584 -0.13186999819263814
yellow circle -0.06477331572781515 0.0736944981970553
blue circle -0.1023476190192966 -0.030528539069839333
green triangle -0.10901825107358747 -0.04848782060162855
</pre>

View file

@ -251,6 +251,7 @@ a=eks,b=pan,i=2,y=0.522151,ab=ekspan,iy=2.522151,ta=String,tb=String,ti=Integer,
a=wye,b=wye,i=3,y=0.338318,ab=wyewye,iy=3.338318,ta=String,tb=String,ti=Integer,ty=Float,tab=String,tiy=Float
a=eks,b=wye,i=4,y=0.134188,ab=ekswye,iy=4.134188,ta=String,tb=String,ti=Integer,ty=Float,tab=String,tiy=Float
a=wye,b=pan,i=5,y=0.863624,ab=wyepan,iy=5.863624,ta=String,tb=String,ti=Integer,ty=Float,tab=String,tiy=Float
/System/Library/Frameworks/Ruby.framework/Versions/2.6/usr/lib/ruby/2.6.0/universal-darwin22/rbconfig.rb:21: warning: Insecure world writable dir /usr/local/bin in PATH, mode 040777
</pre>
Run as-is, then pipe to Miller for pretty-printing:
@ -265,4 +266,5 @@ eks pan 2 0.522151 ekspan 2.522151 String String Integer Float String Float
wye wye 3 0.338318 wyewye 3.338318 String String Integer Float String Float
eks wye 4 0.134188 ekswye 4.134188 String String Integer Float String Float
wye pan 5 0.863624 wyepan 5.863624 String String Integer Float String Float
/System/Library/Frameworks/Ruby.framework/Versions/2.6/usr/lib/ruby/2.6.0/universal-darwin22/rbconfig.rb:21: warning: Insecure world writable dir /usr/local/bin in PATH, mode 040777
</pre>

View file

@ -203,32 +203,34 @@ MILLER(1) MILLER(1)
unsparsify
1mFUNCTION LIST0m
abs acos acosh any append apply arrayify asin asinh asserting_absent
abs acos acosh antimode any append apply arrayify asin asinh asserting_absent
asserting_array asserting_bool asserting_boolean asserting_empty
asserting_empty_map asserting_error asserting_float asserting_int
asserting_map asserting_nonempty_map asserting_not_array asserting_not_empty
asserting_not_map asserting_not_null asserting_null asserting_numeric
asserting_present asserting_string atan atan2 atanh bitcount boolean
capitalize cbrt ceil clean_whitespace collapse_whitespace concat cos cosh
depth dhms2fsec dhms2sec erf erfc every exec exp expm1 flatten float floor
fmtifnum fmtnum fold format fsec2dhms fsec2hms get_keys get_values
gmt2localtime gmt2nsec gmt2sec gssub gsub haskey hexfmt hms2fsec hms2sec
hostname index int invqnorm is_absent is_array is_bool is_boolean is_empty
is_empty_map is_error is_float is_int is_map is_nan is_nonempty_map
count depth dhms2fsec dhms2sec distinct_count erf erfc every exec exp expm1
flatten float floor fmtifnum fmtnum fold format fsec2dhms fsec2hms get_keys
get_values gmt2localtime gmt2nsec gmt2sec gssub gsub haskey hexfmt hms2fsec
hms2sec hostname index int invqnorm is_absent is_array is_bool is_boolean
is_empty is_empty_map is_error is_float is_int is_map is_nan is_nonempty_map
is_not_array is_not_empty is_not_map is_not_null is_null is_numeric is_present
is_string joink joinkv joinv json_parse json_stringify latin1_to_utf8
is_string joink joinkv joinv json_parse json_stringify kurtosis latin1_to_utf8
leafcount leftpad length localtime2gmt localtime2nsec localtime2sec log log10
log1p logifit lstrip madd mapdiff mapexcept mapselect mapsum max md5 mexp min
mmul msub nsec2gmt nsec2gmtdate nsec2localdate nsec2localtime os pow qnorm
log1p logifit lstrip madd mapdiff mapexcept mapselect mapsum max maxlen md5
mean meaneb median mexp min minlen mmul mode msub nsec2gmt nsec2gmtdate
nsec2localdate nsec2localtime null_count os percentile percentiles pow qnorm
reduce regextract regextract_or_else rightpad round roundm rstrip sec2dhms
sec2gmt sec2gmtdate sec2hms sec2localdate sec2localtime select sgn sha1 sha256
sha512 sin sinh sort splita splitax splitkv splitkvx splitnv splitnvx sqrt
ssub strfntime strfntime_local strftime strftime_local string strip strlen
strpntime strpntime_local strptime strptime_local sub substr substr0 substr1
sysntime system systime systimeint tan tanh tolower toupper truncate typeof
unflatten unformat unformatx upntime uptime urand urand32 urandelement
urandint urandrange utf8_to_latin1 version ! != !=~ % & && * ** + - . .* .+ .-
./ / // &lt; &lt;&lt; &lt;= &lt;=&gt; == =~ &gt; &gt;= &gt;&gt; &gt;&gt;&gt; ?: ?? ??? ^ ^^ | || ~
sha512 sin sinh skewness sort sort_collection splita splitax splitkv splitkvx
splitnv splitnvx sqrt ssub stddev strfntime strfntime_local strftime
strftime_local string strip strlen strpntime strpntime_local strptime
strptime_local sub substr substr0 substr1 sum sum2 sum3 sum4 sysntime system
systime systimeint tan tanh tolower toupper truncate typeof unflatten unformat
unformatx upntime uptime urand urand32 urandelement urandint urandrange
utf8_to_latin1 variance version ! != !=~ % & && * ** + - . .* .+ .- ./ / // &lt;
&lt;&lt; &lt;= &lt;=&gt; == =~ &gt; &gt;= &gt;&gt; &gt;&gt;&gt; ?: ?? ??? ^ ^^ | || ~
1mCOMMENTS-IN-DATA FLAGS0m
Miller lets you put comments in your data, such as
@ -2185,6 +2187,12 @@ MILLER(1) MILLER(1)
1macosh0m
(class=math #args=1) Inverse hyperbolic cosine.
1mantimode0m
(class=stats #args=1) Returns the least frequently occurring value in an array or map. Returns error for non-array/non-map types. Values are stringified for comparison, so for example string "1" and integer 1 are not distinct. In cases of ties, first-found wins.
Examples:
antimode([3,3,4,4,4]) is 3
antimode([3,3,4,4]) is 3
1many0m
(class=higher-order-functions #args=2) Given a map or array as first argument and a function as second argument, yields a boolean true if the argument function returns true for any array/map element, false otherwise. For arrays, the function should take one argument, for array element; for maps, it should take two, for map-element key and value. In either case it should return a boolean.
Examples:
@ -2309,6 +2317,12 @@ MILLER(1) MILLER(1)
1mcosh0m
(class=math #args=1) Hyperbolic cosine.
1mcount0m
(class=stats #args=1) Returns the length of an array or map. Returns error for non-array/non-map types.
Examples:
count([7,8,9]) is 3
count({"a":7,"b":8,"c":9}) is 3
1mdepth0m
(class=collections #args=1) Prints maximum depth of map/array. Scalars have depth 0.
@ -2318,6 +2332,13 @@ MILLER(1) MILLER(1)
1mdhms2sec0m
(class=time #args=1) Recovers integer seconds as in dhms2sec("5d18h53m20s") = 500000
1mdistinct_count0m
(class=stats #args=1) Returns the number of distinct values in an array or map. Returns error for non-array/non-map types. Values are stringified for comparison, so for example string "1" and integer 1 are not distinct.
Examples:
distinct_count([7,8,9,7]) is 3
distinct_count([1,"1"]) is 1
distinct_count([1,1.0]) is 2
1merf0m
(class=math #args=1) Error function.
@ -2542,6 +2563,11 @@ MILLER(1) MILLER(1)
1mjson_stringify0m
(class=collections #args=1,2) Converts value to JSON-formatted string. Default output is single-line. With optional second boolean argument set to true, produces multiline output.
1mkurtosis0m
(class=stats #args=1) Returns the sample kurtosis of values in an array or map. Returns "" AKA void for array/map of length less than two; returns error for non-array/non-map types.
Example:
kurtosis([4,5,9,10,11]) is -1.6703688
1mlatin1_to_utf80m
(class=string #args=1) Tries to convert Latin-1-encoded string to UTF-8-encoded string. If argument is array or map, recurses into it.
Examples:
@ -2610,20 +2636,53 @@ MILLER(1) MILLER(1)
(class=collections #args=variadic) With 0 args, returns empty map. With &gt;= 1 arg, returns a map with key-value pairs from all arguments. Rightmost collisions win, e.g. 'mapsum({1:2,3:4},{1:5})' is '{1:5,3:4}'.
1mmax0m
(class=math #args=variadic) Max of n numbers; null loses.
(class=math #args=variadic) Max of n numbers; null loses. The min and max functions also recurse into arrays and maps, so they can be used to get min/max stats on array/map values.
1mmaxlen0m
(class=stats #args=1) Returns the maximum string length of values in an array or map. Returns "" AKA void for array/map of length less than two; returns error for non-array/non-map types.
Example:
maxlen(["año", "alto"]) is 4
1mmd50m
(class=hashing #args=1) MD5 hash.
1mmean0m
(class=stats #args=1) Returns the arithmetic mean of values in an array or map. Returns "" AKA void for empty array/map; returns error for non-array/non-map types.
Example:
mean([4,5,7,10]) is 6.5
1mmeaneb0m
(class=stats #args=1) Returns the error bar for arithmetic mean of values in an array or map, assuming the values are independent and identically distributed. Returns "" AKA void for empty array/map; returns error for non-array/non-map types.
Example:
meaneb([4,5,7,10]) is 1.3228756
1mmedian0m
(class=stats #args=1,2) Returns the median of values in an array or map. Returns "" AKA void for empty array/map; returns error for non-array/non-map types. Please see the percentiles function for information on optional flags, and on performance for large inputs.
Examples:
median([3,4,5,6,9,10]) is 6
median([3,4,5,6,9,10],{"interpolate_linearly":true}) is 5.5
median(["abc", "def", "ghi", "ghi"]) is "ghi"
1mmexp0m
(class=arithmetic #args=3) a ** b mod m (integers)
1mmin0m
(class=math #args=variadic) Min of n numbers; null loses.
(class=math #args=variadic) Min of n numbers; null loses. The min and max functions also recurse into arrays and maps, so they can be used to get min/max stats on array/map values.
1mminlen0m
(class=stats #args=1) Returns the minimum string length of values in an array or map. Returns "" AKA void for array/map of length less than two; returns error for non-array/non-map types.
Example:
minlen(["año", "alto"]) is 3
1mmmul0m
(class=arithmetic #args=3) a * b mod m (integers)
1mmode0m
(class=stats #args=1) Returns the most frequently occurring value in an array or map. Returns error for non-array/non-map types. Values are stringified for comparison, so for example string "1" and integer 1 are not distinct. In cases of ties, first-found wins.
Examples:
mode([3,3,4,4,4]) is 4
mode([3,3,4,4]) is 3
1mmsub0m
(class=arithmetic #args=3) a - b mod m (integers)
@ -2653,9 +2712,70 @@ MILLER(1) MILLER(1)
nsec2localtime(1234567890123456789, 6) = "2009-02-14 01:31:30.123456" with TZ="Asia/Istanbul"
nsec2localtime(1234567890123456789, 6, "Asia/Istanbul") = "2009-02-14 01:31:30.123456"
1mnull_count0m
(class=stats #args=1) Returns the number of values in an array or map which are empty-string (AKA void) or JSON null. Returns error for non-array/non-map types. Values are stringified for comparison, so for example string "1" and integer 1 are not distinct.
Example:
null_count(["a", "", "c"]) is 1
1mos0m
(class=system #args=0) Returns the operating-system name as a string.
1mpercentile0m
(class=stats #args=2,3) Returns the given percentile of values in an array or map. Returns "" AKA void for empty array/map; returns error for non-array/non-map types. Please see the percentiles function for information on optional flags, and on performance for large inputs.
Examples:
percentile([3,4,5,6,9,10], 90) is 10
percentile([3,4,5,6,9,10], 90, {"interpolate_linearly":true}) is 9.5
percentile(["abc", "def", "ghi", "ghi"], 90) is "ghi"
1mpercentiles0m
(class=stats #args=2,3) Returns the given percentiles of values in an array or map. Returns "" AKA void for empty array/map; returns error for non-array/non-map types. See examples for information on the three option flags.
Examples:
Defaults are to not interpolate linearly, to produce a map keyed by percentile name, and to sort
the input before computing percentiles:
percentiles([3,4,5,6,9,10], [25,75]) is { "25": 4, "75": 9 }
percentiles(["abc", "def", "ghi", "ghi"], [25,75]) is { "25": "def", "75": "ghi" }
Use "output_array_not_map" (or shorthand "oa") to get the outputs as an array:
percentiles([3,4,5,6,9,10], [25,75], {"output_array_not_map":true}) is [4, 9]
Use "interpolate_linearly" (or shorthand "il") to do linear interpolation -- note this produces
an error on string inputs:
percentiles([3,4,5,6,9,10], [25,75], {"interpolate_linearly":true}) is { "25": 4.25, "75": 8.25 }
The percentiles function always sorts its inputs before computing percentiles. If you know your input
is already sorted -- see also the sort_collection function -- then computation will be faster on
large input if you pass in "array_is_sorted":
x = [6,5,9,10,4,3]
percentiles(x, [25,75], {"array_is_sorted":true}) gives { "25": 5, "75": 4 } which is incorrect
x = sort_collection(x)
percentiles(x, [25,75], {"array_is_sorted":true}) gives { "25": 4, "75": 9 } which is correct
You can also leverage this feature to compute percentiles on a sort of your choosing. For example:
Non-sorted input:
x = splitax("the quick brown fox jumped loquaciously over the lazy dogs", " ")
x is: ["the", "quick", "brown", "fox", "jumped", "loquaciously", "over", "the", "lazy", "dogs"]
Percentiles are taken over the original positions of the words in the array -- "dogs" is last
and hence appears as p99:
percentiles(x, [50, 99], {"oa":true, "ais":true}) gives ["loquaciously", "dogs"]
With sorting done inside percentiles, "the" is alphabetically last and is therefore the p99:
percentiles(x, [50, 99], {"oa":true}) gives ["loquaciously", "the"]
With default sorting done outside percentiles, the same:
x = sort(x) # or x = sort_collection(x)
x is: ["brown", "dogs", "fox", "jumped", "lazy", "loquaciously", "over", "quick", "the", "the"]
percentiles(x, [50, 99], {"oa":true, "ais":true}) gives ["loquaciously", "the"]
percentiles(x, [50, 99], {"oa":true}) gives ["loquaciously", "the"]
Now sorting by word length, "loquaciously" is longest and hence is the p99:
x = sort(x, func(a,b) { return strlen(a) &lt;=&gt; strlen(b) } )
x is: ["fox", "the", "the", "dogs", "lazy", "over", "brown", "quick", "jumped", "loquaciously"]
percentiles(x, [50, 99], {"oa":true, "ais":true})
["over", "loquaciously"]
1mpow0m
(class=arithmetic #args=2) Exponentiation. Same as **, but as a function.
@ -2752,6 +2872,11 @@ MILLER(1) MILLER(1)
1msinh0m
(class=math #args=1) Hyperbolic sine.
1mskewness0m
(class=stats #args=1) Returns the sample skewness of values in an array or map. Returns "" AKA void for array/map of length less than two; returns error for non-array/non-map types.
Example:
skewness([4,5,9,10,11]) is -0.2097285
1msort0m
(class=higher-order-functions #args=1-2) Given a map or array as first argument and string flags or function as optional second argument, returns a sorted copy of the input. With one argument, sorts array elements with numbers first numerically and then strings lexically, and map elements likewise by map keys. If the second argument is a string, it can contain any of "f" for lexical ("n" is for the above default), "c" for case-folded lexical, or "t" for natural sort order. An additional "r" in that string is for reverse. An additional "v" in that string means sort maps by value, rather than by key. If the second argument is a function, then for arrays it should take two arguments a and b, returning &lt; 0, 0, or &gt; 0 as a &lt; b, a == b, or a &gt; b respectively; for maps the function should take four arguments ak, av, bk, and bv, again returning &lt; 0, 0, or &gt; 0, using a and b's keys and values.
Examples:
@ -2768,6 +2893,9 @@ MILLER(1) MILLER(1)
Map without function: sort({"c":2,"a":3,"b":1}, "v") returns {"b":1,"c":2,"a":3}.
Map without function: sort({"c":2,"a":3,"b":1}, "vnr") returns {"a":3,"c":2,"b":1}.
1msort_collection0m
(class=stats #args=1) This is a helper function for the percentiles function; please see its online help for details.
1msplita0m
(class=conversion #args=2) Splits string into array with type inference. First argument is string to split; second is the separator to split on.
Example:
@ -2806,6 +2934,11 @@ MILLER(1) MILLER(1)
Example:
ssub("abc.def", ".", "X") gives "abcXdef"
1mstddev0m
(class=stats #args=1) Returns the sample standard deviation of values in an array or map. Returns "" AKA void for array/map of length less than two; returns error for non-array/non-map types.
Example:
stddev([4,5,9,10,11]) is 3.1144823
1mstrfntime0m
(class=time #args=2) Formats integer nanoseconds since the epoch as timestamp. Format strings are as at https://pkg.go.dev/github.com/lestrrat-go/strftime, with the Miller-specific addition of "%1S" through "%9S" which format the seconds with 1 through 9 decimal places, respectively. ("%S" uses no decimal places.) See also https://miller.readthedocs.io/en/latest/reference-dsl-time/ for more information on the differences from the C library ("man strftime" on your system). See also strftime_local.
Examples:
@ -2893,6 +3026,26 @@ MILLER(1) MILLER(1)
1msubstr10m
(class=string #args=3) substr1(s,m,n) gives substring of s from 1-up position m to n inclusive. Negative indices -len .. -1 alias to 1 .. len. See also substr and substr0.
1msum0m
(class=stats #args=1) Returns the sum of values in an array or map. Returns error for non-array/non-map types.
Example:
sum([1,2,3,4,5]) is 15
1msum20m
(class=stats #args=1) Returns the sum of squares of values in an array or map. Returns error for non-array/non-map types.
Example:
sum2([1,2,3,4,5]) is 55
1msum30m
(class=stats #args=1) Returns the sum of cubes of values in an array or map. Returns error for non-array/non-map types.
Example:
sum3([1,2,3,4,5]) is 225
1msum40m
(class=stats #args=1) Returns the sum of fourth powers of values in an array or map. Returns error for non-array/non-map types.
Example:
sum4([1,2,3,4,5]) is 979
1msysntime0m
(class=time #args=0) Returns the system time in 64-bit nanoseconds since the epoch.
@ -2971,6 +3124,11 @@ MILLER(1) MILLER(1)
$y = utf8_to_latin1($x)
$* = utf8_to_latin1($*)
1mvariance0m
(class=stats #args=1) Returns the sample variance of values in an array or map. Returns "" AKA void for array/map of length less than two; returns error for non-array/non-map types.
Example:
variance([4,5,9,10,11]) is 9.7
1mversion0m
(class=system #args=0) Returns the Miller version as a string.
@ -3472,5 +3630,5 @@ MILLER(1) MILLER(1)
2023-08-23 MILLER(1)
2023-08-26 MILLER(1)
</pre>

View file

@ -182,32 +182,34 @@ MILLER(1) MILLER(1)
unsparsify
1mFUNCTION LIST0m
abs acos acosh any append apply arrayify asin asinh asserting_absent
abs acos acosh antimode any append apply arrayify asin asinh asserting_absent
asserting_array asserting_bool asserting_boolean asserting_empty
asserting_empty_map asserting_error asserting_float asserting_int
asserting_map asserting_nonempty_map asserting_not_array asserting_not_empty
asserting_not_map asserting_not_null asserting_null asserting_numeric
asserting_present asserting_string atan atan2 atanh bitcount boolean
capitalize cbrt ceil clean_whitespace collapse_whitespace concat cos cosh
depth dhms2fsec dhms2sec erf erfc every exec exp expm1 flatten float floor
fmtifnum fmtnum fold format fsec2dhms fsec2hms get_keys get_values
gmt2localtime gmt2nsec gmt2sec gssub gsub haskey hexfmt hms2fsec hms2sec
hostname index int invqnorm is_absent is_array is_bool is_boolean is_empty
is_empty_map is_error is_float is_int is_map is_nan is_nonempty_map
count depth dhms2fsec dhms2sec distinct_count erf erfc every exec exp expm1
flatten float floor fmtifnum fmtnum fold format fsec2dhms fsec2hms get_keys
get_values gmt2localtime gmt2nsec gmt2sec gssub gsub haskey hexfmt hms2fsec
hms2sec hostname index int invqnorm is_absent is_array is_bool is_boolean
is_empty is_empty_map is_error is_float is_int is_map is_nan is_nonempty_map
is_not_array is_not_empty is_not_map is_not_null is_null is_numeric is_present
is_string joink joinkv joinv json_parse json_stringify latin1_to_utf8
is_string joink joinkv joinv json_parse json_stringify kurtosis latin1_to_utf8
leafcount leftpad length localtime2gmt localtime2nsec localtime2sec log log10
log1p logifit lstrip madd mapdiff mapexcept mapselect mapsum max md5 mexp min
mmul msub nsec2gmt nsec2gmtdate nsec2localdate nsec2localtime os pow qnorm
log1p logifit lstrip madd mapdiff mapexcept mapselect mapsum max maxlen md5
mean meaneb median mexp min minlen mmul mode msub nsec2gmt nsec2gmtdate
nsec2localdate nsec2localtime null_count os percentile percentiles pow qnorm
reduce regextract regextract_or_else rightpad round roundm rstrip sec2dhms
sec2gmt sec2gmtdate sec2hms sec2localdate sec2localtime select sgn sha1 sha256
sha512 sin sinh sort splita splitax splitkv splitkvx splitnv splitnvx sqrt
ssub strfntime strfntime_local strftime strftime_local string strip strlen
strpntime strpntime_local strptime strptime_local sub substr substr0 substr1
sysntime system systime systimeint tan tanh tolower toupper truncate typeof
unflatten unformat unformatx upntime uptime urand urand32 urandelement
urandint urandrange utf8_to_latin1 version ! != !=~ % & && * ** + - . .* .+ .-
./ / // < << <= <=> == =~ > >= >> >>> ?: ?? ??? ^ ^^ | || ~
sha512 sin sinh skewness sort sort_collection splita splitax splitkv splitkvx
splitnv splitnvx sqrt ssub stddev strfntime strfntime_local strftime
strftime_local string strip strlen strpntime strpntime_local strptime
strptime_local sub substr substr0 substr1 sum sum2 sum3 sum4 sysntime system
systime systimeint tan tanh tolower toupper truncate typeof unflatten unformat
unformatx upntime uptime urand urand32 urandelement urandint urandrange
utf8_to_latin1 variance version ! != !=~ % & && * ** + - . .* .+ .- ./ / // <
<< <= <=> == =~ > >= >> >>> ?: ?? ??? ^ ^^ | || ~
1mCOMMENTS-IN-DATA FLAGS0m
Miller lets you put comments in your data, such as
@ -2164,6 +2166,12 @@ MILLER(1) MILLER(1)
1macosh0m
(class=math #args=1) Inverse hyperbolic cosine.
1mantimode0m
(class=stats #args=1) Returns the least frequently occurring value in an array or map. Returns error for non-array/non-map types. Values are stringified for comparison, so for example string "1" and integer 1 are not distinct. In cases of ties, first-found wins.
Examples:
antimode([3,3,4,4,4]) is 3
antimode([3,3,4,4]) is 3
1many0m
(class=higher-order-functions #args=2) Given a map or array as first argument and a function as second argument, yields a boolean true if the argument function returns true for any array/map element, false otherwise. For arrays, the function should take one argument, for array element; for maps, it should take two, for map-element key and value. In either case it should return a boolean.
Examples:
@ -2288,6 +2296,12 @@ MILLER(1) MILLER(1)
1mcosh0m
(class=math #args=1) Hyperbolic cosine.
1mcount0m
(class=stats #args=1) Returns the length of an array or map. Returns error for non-array/non-map types.
Examples:
count([7,8,9]) is 3
count({"a":7,"b":8,"c":9}) is 3
1mdepth0m
(class=collections #args=1) Prints maximum depth of map/array. Scalars have depth 0.
@ -2297,6 +2311,13 @@ MILLER(1) MILLER(1)
1mdhms2sec0m
(class=time #args=1) Recovers integer seconds as in dhms2sec("5d18h53m20s") = 500000
1mdistinct_count0m
(class=stats #args=1) Returns the number of distinct values in an array or map. Returns error for non-array/non-map types. Values are stringified for comparison, so for example string "1" and integer 1 are not distinct.
Examples:
distinct_count([7,8,9,7]) is 3
distinct_count([1,"1"]) is 1
distinct_count([1,1.0]) is 2
1merf0m
(class=math #args=1) Error function.
@ -2521,6 +2542,11 @@ MILLER(1) MILLER(1)
1mjson_stringify0m
(class=collections #args=1,2) Converts value to JSON-formatted string. Default output is single-line. With optional second boolean argument set to true, produces multiline output.
1mkurtosis0m
(class=stats #args=1) Returns the sample kurtosis of values in an array or map. Returns "" AKA void for array/map of length less than two; returns error for non-array/non-map types.
Example:
kurtosis([4,5,9,10,11]) is -1.6703688
1mlatin1_to_utf80m
(class=string #args=1) Tries to convert Latin-1-encoded string to UTF-8-encoded string. If argument is array or map, recurses into it.
Examples:
@ -2589,20 +2615,53 @@ MILLER(1) MILLER(1)
(class=collections #args=variadic) With 0 args, returns empty map. With >= 1 arg, returns a map with key-value pairs from all arguments. Rightmost collisions win, e.g. 'mapsum({1:2,3:4},{1:5})' is '{1:5,3:4}'.
1mmax0m
(class=math #args=variadic) Max of n numbers; null loses.
(class=math #args=variadic) Max of n numbers; null loses. The min and max functions also recurse into arrays and maps, so they can be used to get min/max stats on array/map values.
1mmaxlen0m
(class=stats #args=1) Returns the maximum string length of values in an array or map. Returns "" AKA void for array/map of length less than two; returns error for non-array/non-map types.
Example:
maxlen(["año", "alto"]) is 4
1mmd50m
(class=hashing #args=1) MD5 hash.
1mmean0m
(class=stats #args=1) Returns the arithmetic mean of values in an array or map. Returns "" AKA void for empty array/map; returns error for non-array/non-map types.
Example:
mean([4,5,7,10]) is 6.5
1mmeaneb0m
(class=stats #args=1) Returns the error bar for arithmetic mean of values in an array or map, assuming the values are independent and identically distributed. Returns "" AKA void for empty array/map; returns error for non-array/non-map types.
Example:
meaneb([4,5,7,10]) is 1.3228756
1mmedian0m
(class=stats #args=1,2) Returns the median of values in an array or map. Returns "" AKA void for empty array/map; returns error for non-array/non-map types. Please see the percentiles function for information on optional flags, and on performance for large inputs.
Examples:
median([3,4,5,6,9,10]) is 6
median([3,4,5,6,9,10],{"interpolate_linearly":true}) is 5.5
median(["abc", "def", "ghi", "ghi"]) is "ghi"
1mmexp0m
(class=arithmetic #args=3) a ** b mod m (integers)
1mmin0m
(class=math #args=variadic) Min of n numbers; null loses.
(class=math #args=variadic) Min of n numbers; null loses. The min and max functions also recurse into arrays and maps, so they can be used to get min/max stats on array/map values.
1mminlen0m
(class=stats #args=1) Returns the minimum string length of values in an array or map. Returns "" AKA void for array/map of length less than two; returns error for non-array/non-map types.
Example:
minlen(["año", "alto"]) is 3
1mmmul0m
(class=arithmetic #args=3) a * b mod m (integers)
1mmode0m
(class=stats #args=1) Returns the most frequently occurring value in an array or map. Returns error for non-array/non-map types. Values are stringified for comparison, so for example string "1" and integer 1 are not distinct. In cases of ties, first-found wins.
Examples:
mode([3,3,4,4,4]) is 4
mode([3,3,4,4]) is 3
1mmsub0m
(class=arithmetic #args=3) a - b mod m (integers)
@ -2632,9 +2691,70 @@ MILLER(1) MILLER(1)
nsec2localtime(1234567890123456789, 6) = "2009-02-14 01:31:30.123456" with TZ="Asia/Istanbul"
nsec2localtime(1234567890123456789, 6, "Asia/Istanbul") = "2009-02-14 01:31:30.123456"
1mnull_count0m
(class=stats #args=1) Returns the number of values in an array or map which are empty-string (AKA void) or JSON null. Returns error for non-array/non-map types. Values are stringified for comparison, so for example string "1" and integer 1 are not distinct.
Example:
null_count(["a", "", "c"]) is 1
1mos0m
(class=system #args=0) Returns the operating-system name as a string.
1mpercentile0m
(class=stats #args=2,3) Returns the given percentile of values in an array or map. Returns "" AKA void for empty array/map; returns error for non-array/non-map types. Please see the percentiles function for information on optional flags, and on performance for large inputs.
Examples:
percentile([3,4,5,6,9,10], 90) is 10
percentile([3,4,5,6,9,10], 90, {"interpolate_linearly":true}) is 9.5
percentile(["abc", "def", "ghi", "ghi"], 90) is "ghi"
1mpercentiles0m
(class=stats #args=2,3) Returns the given percentiles of values in an array or map. Returns "" AKA void for empty array/map; returns error for non-array/non-map types. See examples for information on the three option flags.
Examples:
Defaults are to not interpolate linearly, to produce a map keyed by percentile name, and to sort
the input before computing percentiles:
percentiles([3,4,5,6,9,10], [25,75]) is { "25": 4, "75": 9 }
percentiles(["abc", "def", "ghi", "ghi"], [25,75]) is { "25": "def", "75": "ghi" }
Use "output_array_not_map" (or shorthand "oa") to get the outputs as an array:
percentiles([3,4,5,6,9,10], [25,75], {"output_array_not_map":true}) is [4, 9]
Use "interpolate_linearly" (or shorthand "il") to do linear interpolation -- note this produces
an error on string inputs:
percentiles([3,4,5,6,9,10], [25,75], {"interpolate_linearly":true}) is { "25": 4.25, "75": 8.25 }
The percentiles function always sorts its inputs before computing percentiles. If you know your input
is already sorted -- see also the sort_collection function -- then computation will be faster on
large input if you pass in "array_is_sorted":
x = [6,5,9,10,4,3]
percentiles(x, [25,75], {"array_is_sorted":true}) gives { "25": 5, "75": 4 } which is incorrect
x = sort_collection(x)
percentiles(x, [25,75], {"array_is_sorted":true}) gives { "25": 4, "75": 9 } which is correct
You can also leverage this feature to compute percentiles on a sort of your choosing. For example:
Non-sorted input:
x = splitax("the quick brown fox jumped loquaciously over the lazy dogs", " ")
x is: ["the", "quick", "brown", "fox", "jumped", "loquaciously", "over", "the", "lazy", "dogs"]
Percentiles are taken over the original positions of the words in the array -- "dogs" is last
and hence appears as p99:
percentiles(x, [50, 99], {"oa":true, "ais":true}) gives ["loquaciously", "dogs"]
With sorting done inside percentiles, "the" is alphabetically last and is therefore the p99:
percentiles(x, [50, 99], {"oa":true}) gives ["loquaciously", "the"]
With default sorting done outside percentiles, the same:
x = sort(x) # or x = sort_collection(x)
x is: ["brown", "dogs", "fox", "jumped", "lazy", "loquaciously", "over", "quick", "the", "the"]
percentiles(x, [50, 99], {"oa":true, "ais":true}) gives ["loquaciously", "the"]
percentiles(x, [50, 99], {"oa":true}) gives ["loquaciously", "the"]
Now sorting by word length, "loquaciously" is longest and hence is the p99:
x = sort(x, func(a,b) { return strlen(a) <=> strlen(b) } )
x is: ["fox", "the", "the", "dogs", "lazy", "over", "brown", "quick", "jumped", "loquaciously"]
percentiles(x, [50, 99], {"oa":true, "ais":true})
["over", "loquaciously"]
1mpow0m
(class=arithmetic #args=2) Exponentiation. Same as **, but as a function.
@ -2731,6 +2851,11 @@ MILLER(1) MILLER(1)
1msinh0m
(class=math #args=1) Hyperbolic sine.
1mskewness0m
(class=stats #args=1) Returns the sample skewness of values in an array or map. Returns "" AKA void for array/map of length less than two; returns error for non-array/non-map types.
Example:
skewness([4,5,9,10,11]) is -0.2097285
1msort0m
(class=higher-order-functions #args=1-2) Given a map or array as first argument and string flags or function as optional second argument, returns a sorted copy of the input. With one argument, sorts array elements with numbers first numerically and then strings lexically, and map elements likewise by map keys. If the second argument is a string, it can contain any of "f" for lexical ("n" is for the above default), "c" for case-folded lexical, or "t" for natural sort order. An additional "r" in that string is for reverse. An additional "v" in that string means sort maps by value, rather than by key. If the second argument is a function, then for arrays it should take two arguments a and b, returning < 0, 0, or > 0 as a < b, a == b, or a > b respectively; for maps the function should take four arguments ak, av, bk, and bv, again returning < 0, 0, or > 0, using a and b's keys and values.
Examples:
@ -2747,6 +2872,9 @@ MILLER(1) MILLER(1)
Map without function: sort({"c":2,"a":3,"b":1}, "v") returns {"b":1,"c":2,"a":3}.
Map without function: sort({"c":2,"a":3,"b":1}, "vnr") returns {"a":3,"c":2,"b":1}.
1msort_collection0m
(class=stats #args=1) This is a helper function for the percentiles function; please see its online help for details.
1msplita0m
(class=conversion #args=2) Splits string into array with type inference. First argument is string to split; second is the separator to split on.
Example:
@ -2785,6 +2913,11 @@ MILLER(1) MILLER(1)
Example:
ssub("abc.def", ".", "X") gives "abcXdef"
1mstddev0m
(class=stats #args=1) Returns the sample standard deviation of values in an array or map. Returns "" AKA void for array/map of length less than two; returns error for non-array/non-map types.
Example:
stddev([4,5,9,10,11]) is 3.1144823
1mstrfntime0m
(class=time #args=2) Formats integer nanoseconds since the epoch as timestamp. Format strings are as at https://pkg.go.dev/github.com/lestrrat-go/strftime, with the Miller-specific addition of "%1S" through "%9S" which format the seconds with 1 through 9 decimal places, respectively. ("%S" uses no decimal places.) See also https://miller.readthedocs.io/en/latest/reference-dsl-time/ for more information on the differences from the C library ("man strftime" on your system). See also strftime_local.
Examples:
@ -2872,6 +3005,26 @@ MILLER(1) MILLER(1)
1msubstr10m
(class=string #args=3) substr1(s,m,n) gives substring of s from 1-up position m to n inclusive. Negative indices -len .. -1 alias to 1 .. len. See also substr and substr0.
1msum0m
(class=stats #args=1) Returns the sum of values in an array or map. Returns error for non-array/non-map types.
Example:
sum([1,2,3,4,5]) is 15
1msum20m
(class=stats #args=1) Returns the sum of squares of values in an array or map. Returns error for non-array/non-map types.
Example:
sum2([1,2,3,4,5]) is 55
1msum30m
(class=stats #args=1) Returns the sum of cubes of values in an array or map. Returns error for non-array/non-map types.
Example:
sum3([1,2,3,4,5]) is 225
1msum40m
(class=stats #args=1) Returns the sum of fourth powers of values in an array or map. Returns error for non-array/non-map types.
Example:
sum4([1,2,3,4,5]) is 979
1msysntime0m
(class=time #args=0) Returns the system time in 64-bit nanoseconds since the epoch.
@ -2950,6 +3103,11 @@ MILLER(1) MILLER(1)
$y = utf8_to_latin1($x)
$* = utf8_to_latin1($*)
1mvariance0m
(class=stats #args=1) Returns the sample variance of values in an array or map. Returns "" AKA void for array/map of length less than two; returns error for non-array/non-map types.
Example:
variance([4,5,9,10,11]) is 9.7
1mversion0m
(class=system #args=0) Returns the Miller version as a string.
@ -3451,4 +3609,4 @@ MILLER(1) MILLER(1)
2023-08-23 MILLER(1)
2023-08-26 MILLER(1)

View file

@ -74,6 +74,7 @@ is 2. Unary operators such as `!` and `~` show argument-count of 1; the ternary
* [**Hashing functions**](#hashing-functions): [md5](#md5), [sha1](#sha1), [sha256](#sha256), [sha512](#sha512).
* [**Higher-order-functions functions**](#higher-order-functions-functions): [any](#any), [apply](#apply), [every](#every), [fold](#fold), [reduce](#reduce), [select](#select), [sort](#sort).
* [**Math functions**](#math-functions): [abs](#abs), [acos](#acos), [acosh](#acosh), [asin](#asin), [asinh](#asinh), [atan](#atan), [atan2](#atan2), [atanh](#atanh), [cbrt](#cbrt), [ceil](#ceil), [cos](#cos), [cosh](#cosh), [erf](#erf), [erfc](#erfc), [exp](#exp), [expm1](#expm1), [floor](#floor), [invqnorm](#invqnorm), [log](#log), [log10](#log10), [log1p](#log1p), [logifit](#logifit), [max](#max), [min](#min), [qnorm](#qnorm), [round](#round), [roundm](#roundm), [sgn](#sgn), [sin](#sin), [sinh](#sinh), [sqrt](#sqrt), [tan](#tan), [tanh](#tanh), [urand](#urand), [urand32](#urand32), [urandelement](#urandelement), [urandint](#urandint), [urandrange](#urandrange).
* [**Stats functions**](#stats-functions): [antimode](#antimode), [count](#count), [distinct_count](#distinct_count), [kurtosis](#kurtosis), [maxlen](#maxlen), [mean](#mean), [meaneb](#meaneb), [median](#median), [minlen](#minlen), [mode](#mode), [null_count](#null_count), [percentile](#percentile), [percentiles](#percentiles), [skewness](#skewness), [sort_collection](#sort_collection), [stddev](#stddev), [sum](#sum), [sum2](#sum2), [sum3](#sum3), [sum4](#sum4), [variance](#variance).
* [**String functions**](#string-functions): [capitalize](#capitalize), [clean_whitespace](#clean_whitespace), [collapse_whitespace](#collapse_whitespace), [format](#format), [gssub](#gssub), [gsub](#gsub), [index](#index), [latin1_to_utf8](#latin1_to_utf8), [leftpad](#leftpad), [lstrip](#lstrip), [regextract](#regextract), [regextract_or_else](#regextract_or_else), [rightpad](#rightpad), [rstrip](#rstrip), [ssub](#ssub), [strip](#strip), [strlen](#strlen), [sub](#sub), [substr](#substr), [substr0](#substr0), [substr1](#substr1), [tolower](#tolower), [toupper](#toupper), [truncate](#truncate), [unformat](#unformat), [unformatx](#unformatx), [utf8_to_latin1](#utf8_to_latin1), [\.](#dot).
* [**System functions**](#system-functions): [exec](#exec), [hostname](#hostname), [os](#os), [system](#system), [version](#version).
* [**Time functions**](#time-functions): [dhms2fsec](#dhms2fsec), [dhms2sec](#dhms2sec), [fsec2dhms](#fsec2dhms), [fsec2hms](#fsec2hms), [gmt2localtime](#gmt2localtime), [gmt2nsec](#gmt2nsec), [gmt2sec](#gmt2sec), [hms2fsec](#hms2fsec), [hms2sec](#hms2sec), [localtime2gmt](#localtime2gmt), [localtime2nsec](#localtime2nsec), [localtime2sec](#localtime2sec), [nsec2gmt](#nsec2gmt), [nsec2gmtdate](#nsec2gmtdate), [nsec2localdate](#nsec2localdate), [nsec2localtime](#nsec2localtime), [sec2dhms](#sec2dhms), [sec2gmt](#sec2gmt), [sec2gmtdate](#sec2gmtdate), [sec2hms](#sec2hms), [sec2localdate](#sec2localdate), [sec2localtime](#sec2localtime), [strfntime](#strfntime), [strfntime_local](#strfntime_local), [strftime](#strftime), [strftime_local](#strftime_local), [strpntime](#strpntime), [strpntime_local](#strpntime_local), [strptime](#strptime), [strptime_local](#strptime_local), [sysntime](#sysntime), [systime](#systime), [systimeint](#systimeint), [upntime](#upntime), [uptime](#uptime).
@ -877,13 +878,13 @@ logifit (class=math #args=3) Given m and b from logistic regression, compute fi
### max
<pre class="pre-non-highlight-non-pair">
max (class=math #args=variadic) Max of n numbers; null loses.
max (class=math #args=variadic) Max of n numbers; null loses. The min and max functions also recurse into arrays and maps, so they can be used to get min/max stats on array/map values.
</pre>
### min
<pre class="pre-non-highlight-non-pair">
min (class=math #args=variadic) Min of n numbers; null loses.
min (class=math #args=variadic) Min of n numbers; null loses. The min and max functions also recurse into arrays and maps, so they can be used to get min/max stats on array/map values.
</pre>
@ -972,6 +973,227 @@ urandint (class=math #args=2) Integer uniformly distributed between inclusive i
urandrange (class=math #args=2) Floating-point numbers uniformly distributed on the interval [a, b).
</pre>
## Stats functions
### antimode
<pre class="pre-non-highlight-non-pair">
antimode (class=stats #args=1) Returns the least frequently occurring value in an array or map. Returns error for non-array/non-map types. Values are stringified for comparison, so for example string "1" and integer 1 are not distinct. In cases of ties, first-found wins.
Examples:
antimode([3,3,4,4,4]) is 3
antimode([3,3,4,4]) is 3
</pre>
### count
<pre class="pre-non-highlight-non-pair">
count (class=stats #args=1) Returns the length of an array or map. Returns error for non-array/non-map types.
Examples:
count([7,8,9]) is 3
count({"a":7,"b":8,"c":9}) is 3
</pre>
### distinct_count
<pre class="pre-non-highlight-non-pair">
distinct_count (class=stats #args=1) Returns the number of distinct values in an array or map. Returns error for non-array/non-map types. Values are stringified for comparison, so for example string "1" and integer 1 are not distinct.
Examples:
distinct_count([7,8,9,7]) is 3
distinct_count([1,"1"]) is 1
distinct_count([1,1.0]) is 2
</pre>
### kurtosis
<pre class="pre-non-highlight-non-pair">
kurtosis (class=stats #args=1) Returns the sample kurtosis of values in an array or map. Returns "" AKA void for array/map of length less than two; returns error for non-array/non-map types.
Example:
kurtosis([4,5,9,10,11]) is -1.6703688
</pre>
### maxlen
<pre class="pre-non-highlight-non-pair">
maxlen (class=stats #args=1) Returns the maximum string length of values in an array or map. Returns "" AKA void for empty array/map; returns error for non-array/non-map types.
Example:
maxlen(["año", "alto"]) is 4
</pre>
### mean
<pre class="pre-non-highlight-non-pair">
mean (class=stats #args=1) Returns the arithmetic mean of values in an array or map. Returns "" AKA void for empty array/map; returns error for non-array/non-map types.
Example:
mean([4,5,7,10]) is 6.5
</pre>
### meaneb
<pre class="pre-non-highlight-non-pair">
meaneb (class=stats #args=1) Returns the error bar for arithmetic mean of values in an array or map, assuming the values are independent and identically distributed. Returns "" AKA void for empty array/map; returns error for non-array/non-map types.
Example:
meaneb([4,5,7,10]) is 1.3228756
</pre>
### median
<pre class="pre-non-highlight-non-pair">
median (class=stats #args=1,2) Returns the median of values in an array or map. Returns "" AKA void for empty array/map; returns error for non-array/non-map types. Please see the percentiles function for information on optional flags, and on performance for large inputs.
Examples:
median([3,4,5,6,9,10]) is 6
median([3,4,5,6,9,10],{"interpolate_linearly":true}) is 5.5
median(["abc", "def", "ghi", "ghi"]) is "ghi"
</pre>
### minlen
<pre class="pre-non-highlight-non-pair">
minlen (class=stats #args=1) Returns the minimum string length of values in an array or map. Returns "" AKA void for empty array/map; returns error for non-array/non-map types.
Example:
minlen(["año", "alto"]) is 3
</pre>
### mode
<pre class="pre-non-highlight-non-pair">
mode (class=stats #args=1) Returns the most frequently occurring value in an array or map. Returns error for non-array/non-map types. Values are stringified for comparison, so for example string "1" and integer 1 are not distinct. In cases of ties, first-found wins.
Examples:
mode([3,3,4,4,4]) is 4
mode([3,3,4,4]) is 3
</pre>
### null_count
<pre class="pre-non-highlight-non-pair">
null_count (class=stats #args=1) Returns the number of values in an array or map which are empty-string (AKA void) or JSON null. Returns error for non-array/non-map types. Values are stringified for comparison, so for example string "1" and integer 1 are not distinct.
Example:
null_count(["a", "", "c"]) is 1
</pre>
### percentile
<pre class="pre-non-highlight-non-pair">
percentile (class=stats #args=2,3) Returns the given percentile of values in an array or map. Returns "" AKA void for empty array/map; returns error for non-array/non-map types. Please see the percentiles function for information on optional flags, and on performance for large inputs.
Examples:
percentile([3,4,5,6,9,10], 90) is 10
percentile([3,4,5,6,9,10], 90, {"interpolate_linearly":true}) is 9.5
percentile(["abc", "def", "ghi", "ghi"], 90) is "ghi"
</pre>
### percentiles
<pre class="pre-non-highlight-non-pair">
percentiles (class=stats #args=2,3) Returns the given percentiles of values in an array or map. Returns "" AKA void for empty array/map; returns error for non-array/non-map types. See examples for information on the three option flags.
Examples:
Defaults are to not interpolate linearly, to produce a map keyed by percentile name, and to sort
the input before computing percentiles:
percentiles([3,4,5,6,9,10], [25,75]) is { "25": 4, "75": 9 }
percentiles(["abc", "def", "ghi", "ghi"], [25,75]) is { "25": "def", "75": "ghi" }
Use "output_array_not_map" (or shorthand "oa") to get the outputs as an array:
percentiles([3,4,5,6,9,10], [25,75], {"output_array_not_map":true}) is [4, 9]
Use "interpolate_linearly" (or shorthand "il") to do linear interpolation -- note this produces
error on string inputs:
percentiles([3,4,5,6,9,10], [25,75], {"interpolate_linearly":true}) is { "25": 4.25, "75": 8.25 }
By default, the percentiles function sorts its inputs before computing percentiles. If you know your input
is already sorted -- see also the sort_collection function -- then computation will be faster on
large input if you pass in "array_is_sorted" (or shorthand "ais"):
x = [6,5,9,10,4,3]
percentiles(x, [25,75], {"array_is_sorted":true}) gives { "25": 5, "75": 4 } which is incorrect
x = sort_collection(x)
percentiles(x, [25,75], {"array_is_sorted":true}) gives { "25": 4, "75": 9 } which is correct
You can also leverage this feature to compute percentiles on a sort of your choosing. For example:
Non-sorted input:
x = splitax("the quick brown fox jumped loquaciously over the lazy dogs", " ")
x is: ["the", "quick", "brown", "fox", "jumped", "loquaciously", "over", "the", "lazy", "dogs"]
Percentiles are taken over the original positions of the words in the array -- "dogs" is last
and hence appears as p99:
percentiles(x, [50, 99], {"oa":true, "ais":true}) gives ["loquaciously", "dogs"]
With sorting done inside percentiles, "the" is alphabetically last and is therefore the p99:
percentiles(x, [50, 99], {"oa":true}) gives ["loquaciously", "the"]
With default sorting done outside percentiles, the same:
x = sort(x) # or x = sort_collection(x)
x is: ["brown", "dogs", "fox", "jumped", "lazy", "loquaciously", "over", "quick", "the", "the"]
percentiles(x, [50, 99], {"oa":true, "ais":true}) gives ["loquaciously", "the"]
percentiles(x, [50, 99], {"oa":true}) gives ["loquaciously", "the"]
Now sorting by word length, "loquaciously" is longest and hence is the p99:
x = sort(x, func(a,b) { return strlen(a) <=> strlen(b) } )
x is: ["fox", "the", "the", "dogs", "lazy", "over", "brown", "quick", "jumped", "loquaciously"]
percentiles(x, [50, 99], {"oa":true, "ais":true})
["over", "loquaciously"]
</pre>
### skewness
<pre class="pre-non-highlight-non-pair">
skewness (class=stats #args=1) Returns the sample skewness of values in an array or map. Returns "" AKA void for array/map of length less than two; returns error for non-array/non-map types.
Example:
skewness([4,5,9,10,11]) is -0.2097285
</pre>
### sort_collection
<pre class="pre-non-highlight-non-pair">
sort_collection (class=stats #args=1) This is a helper function for the percentiles function; please see its online help for details.
</pre>
### stddev
<pre class="pre-non-highlight-non-pair">
stddev (class=stats #args=1) Returns the sample standard deviation of values in an array or map. Returns "" AKA void for array/map of length less than two; returns error for non-array/non-map types.
Example:
stddev([4,5,9,10,11]) is 3.1144823
</pre>
### sum
<pre class="pre-non-highlight-non-pair">
sum (class=stats #args=1) Returns the sum of values in an array or map. Returns error for non-array/non-map types.
Example:
sum([1,2,3,4,5]) is 15
</pre>
### sum2
<pre class="pre-non-highlight-non-pair">
sum2 (class=stats #args=1) Returns the sum of squares of values in an array or map. Returns error for non-array/non-map types.
Example:
sum2([1,2,3,4,5]) is 55
</pre>
### sum3
<pre class="pre-non-highlight-non-pair">
sum3 (class=stats #args=1) Returns the sum of cubes of values in an array or map. Returns error for non-array/non-map types.
Example:
sum3([1,2,3,4,5]) is 225
</pre>
### sum4
<pre class="pre-non-highlight-non-pair">
sum4 (class=stats #args=1) Returns the sum of fourth powers of values in an array or map. Returns error for non-array/non-map types.
Example:
sum4([1,2,3,4,5]) is 979
</pre>
### variance
<pre class="pre-non-highlight-non-pair">
variance (class=stats #args=1) Returns the sample variance of values in an array or map. Returns "" AKA void for array/map of length less than two; returns error for non-array/non-map types.
Example:
variance([4,5,9,10,11]) is 9.7
</pre>
## String functions
@ -1765,3 +1987,4 @@ is_string (class=typing #args=1) True if field is present with string (includin
typeof (class=typing #args=1) Convert argument to type of argument (e.g. "str"). For debug.
</pre>
/System/Library/Frameworks/Ruby.framework/Versions/2.6/usr/lib/ruby/2.6.0/universal-darwin22/rbconfig.rb:21: warning: Insecure world writable dir /usr/local/bin in PATH, mode 040777

View file

@ -35,6 +35,7 @@ i j k
7 8 15
8 9 17
9 10 19
/System/Library/Frameworks/Ruby.framework/Versions/2.6/usr/lib/ruby/2.6.0/universal-darwin22/rbconfig.rb:21: warning: Insecure world writable dir /usr/local/bin in PATH, mode 040777
</pre>
Newlines within the expression are ignored, which can help increase legibility of complex expressions:

View file

@ -495,3 +495,4 @@ Notes about all other separators:
* `--repifs`: Let IFS be repeated: e.g. for splitting on multiple spaces.
* `--rs {string}`: Specify RS for input and output.
/System/Library/Frameworks/Ruby.framework/Versions/2.6/usr/lib/ruby/2.6.0/universal-darwin22/rbconfig.rb:21: warning: Insecure world writable dir /usr/local/bin in PATH, mode 040777

View file

@ -3406,14 +3406,14 @@ fields, optionally categorized by one or more fields.
<b> data/medium</b>
</pre>
<pre class="pre-non-highlight-in-pair">
x_y_cov 0.000042574820827444476
x_y_corr 0.0005042001844467462
y_y_cov 0.08461122467974003
x_y_cov 0.00004257482082749404
x_y_corr 0.0005042001844473328
y_y_cov 0.08461122467974005
y_y_corr 1
x2_xy_cov 0.04188382281779374
x2_xy_corr 0.630174342037994
x2_y2_cov -0.00030953725962542085
x2_y2_corr -0.0034249088761121966
x2_xy_cov 0.041883822817793716
x2_xy_corr 0.6301743420379936
x2_y2_cov -0.0003095372596253918
x2_y2_corr -0.003424908876111875
</pre>
<pre class="pre-highlight-in-pair">
@ -3422,12 +3422,12 @@ x2_y2_corr -0.0034249088761121966
<b> data/medium</b>
</pre>
<pre class="pre-non-highlight-in-pair">
a x_y_ols_m x_y_ols_b x_y_ols_n x_y_r2 y_y_ols_m y_y_ols_b y_y_ols_n y_y_r2 xy_y2_ols_m xy_y2_ols_b xy_y2_ols_n xy_y2_r2
pan 0.01702551273681908 0.5004028922897639 2081 0.00028691820445814767 1 0 2081 1 0.8781320866715662 0.11908230147563566 2081 0.41749827377311266
eks 0.0407804923685586 0.48140207967651016 1965 0.0016461239223448587 1 0 1965 1 0.8978728611690183 0.10734054433612333 1965 0.45563223864254526
wye -0.03915349075204814 0.5255096523974456 1966 0.0015051268704373607 1 0 1966 1 0.8538317334220835 0.1267454301662969 1966 0.38991721818599295
zee 0.0027812364960399147 0.5043070448033061 2047 0.000007751652858786137 1 0 2047 1 0.8524439912011013 0.12401684308018937 2047 0.39356598090006495
hat -0.018620577041095078 0.5179005397264935 1941 0.0003520036646055585 1 0 1941 1 0.8412305086345014 0.13557328318623216 1941 0.3687944261732265
a x_y_ols_m x_y_ols_b x_y_ols_n x_y_r2 y_y_ols_m y_y_ols_b y_y_ols_n y_y_r2 xy_y2_ols_m xy_y2_ols_b xy_y2_ols_n xy_y2_r2
pan 0.017025512736819345 0.500402892289764 2081 0.00028691820445815624 1 -0.00000000000000002890430283104539 2081 1 0.8781320866715664 0.11908230147563569 2081 0.4174982737731127
eks 0.04078049236855813 0.4814020796765104 1965 0.0016461239223448218 1 0.00000000000000017862676354313703 1965 1 0.897872861169018 0.1073405443361234 1965 0.4556322386425451
wye -0.03915349075204785 0.5255096523974457 1966 0.0015051268704373377 1 0.00000000000000004464425401127647 1966 1 0.8538317334220837 0.1267454301662969 1966 0.3899172181859931
zee 0.0027812364960401333 0.5043070448033061 2047 0.000007751652858787357 1 0.00000000000000004819404567023685 2047 1 0.8524439912011011 0.12401684308018947 2047 0.39356598090006495
hat -0.018620577041095272 0.5179005397264937 1941 0.00035200366460556604 1 -0.00000000000000003400445761787692 1941 1 0.8412305086345017 0.13557328318623207 1941 0.3687944261732266
</pre>
Here's an example simple line-fit. The `x` and `y`
@ -3513,11 +3513,11 @@ upsec_count_pca_quality 0.9999590846136102
donesec 92.33051350964094
color purple
upsec_count_pca_m -39.03009744795354
upsec_count_pca_b 979.9883413064914
upsec_count_pca_m -39.030097447953594
upsec_count_pca_b 979.9883413064917
upsec_count_pca_n 21
upsec_count_pca_quality 0.9999908956206317
donesec 25.10852919630297
donesec 25.108529196302943
</pre>
## step
@ -3794,9 +3794,9 @@ distinct_count 5 5 10000 10000 10000
mode pan wye 1 0.3467901443380824 0.7268028627434533
sum 0 0 50005000 4986.019681679581 5062.057444929905
mean - - 5000.5 0.49860196816795804 0.5062057444929905
stddev - - 2886.8956799071675 0.2902925151144007 0.290880086426933
var - - 8334166.666666667 0.08426974433144456 0.08461122467974003
skewness - - 0 -0.0006899591185521965 -0.017849760120133784
stddev - - 2886.8956799071675 0.29029251511440074 0.2908800864269331
var - - 8334166.666666667 0.08426974433144457 0.08461122467974005
skewness - - 0 -0.0006899591185517494 -0.01784976012013298
minlen 3 3 1 15 13
maxlen 3 3 5 22 22
min eks eks 1 0.00004509679127584487 0.00008818962627266114

View file

@ -598,8 +598,8 @@ hat pan 0.4643355557376876
x_count 10000
x_sum 4986.019681679581
x_mean 0.49860196816795804
x_var 0.08426974433144456
x_stddev 0.2902925151144007
x_var 0.08426974433144457
x_stddev 0.29029251511440074
</pre>
<pre class="pre-highlight-in-pair">

View file

@ -3,6 +3,7 @@ package bifs
import (
"math"
"github.com/johnkerl/miller/internal/pkg/lib"
"github.com/johnkerl/miller/internal/pkg/mlrval"
)
@ -793,7 +794,7 @@ func min_s_ss(input1, input2 *mlrval.Mlrval) *mlrval.Mlrval {
}
var min_dispositions = [mlrval.MT_DIM][mlrval.MT_DIM]BinaryFunc{
// . INT FLOAT BOOL VOID STRING ARRAY MAP FUNC ERROR NULL ABSENT
// . INT FLOAT BOOL VOID STRING ARRAY MAP FUNC ERROR NULL ABSENT
/*INT */ {min_i_ii, min_f_if, _1___, _1___, _1___, _absn, _absn, _erro, _erro, _1___, _1___},
/*FLOAT */ {min_f_fi, min_f_ff, _1___, _1___, _1___, _absn, _absn, _erro, _erro, _1___, _1___},
/*BOOL */ {_2___, _2___, min_b_bb, _1___, _1___, _absn, _absn, _erro, _erro, _1___, _1___},
@ -807,6 +808,8 @@ var min_dispositions = [mlrval.MT_DIM][mlrval.MT_DIM]BinaryFunc{
/*ABSENT */ {_2___, _2___, _2___, _2___, _2___, _absn, _absn, _erro, _erro, _null, _absn},
}
// BIF_min_binary returns the min of two values by looking up the handler for
// the pair of argument types in min_dispositions and invoking it. It is not
// itself a DSL function: it is a helper within this package, and it is
// exported so the stats1 verb can use it too.
func BIF_min_binary(input1, input2 *mlrval.Mlrval) *mlrval.Mlrval {
	handler := min_dispositions[input1.Type()][input2.Type()]
	return handler(input1, input2)
}
@ -814,15 +817,91 @@ func BIF_min_binary(input1, input2 *mlrval.Mlrval) *mlrval.Mlrval {
// BIF_min_variadic computes the min over any number of arguments.
//
// With no arguments it returns void. Otherwise it folds over the arguments
// with BIF_min_binary, first passing each operand through bif_min_unary so
// that array- and map-valued arguments are reduced to the min of their
// contents. This is what lets users write either min(1,2,3) or min([1,2,3]).
//
// The fold is seeded with the (reduced) first argument rather than with a
// fixed constant, so the result always comes from the inputs themselves.
//
// The former index-based loop over BIF_min_binary has been removed: it
// returned from both arms of the if/else, which left the fold below
// unreachable and disabled the recursion into collections.
func BIF_min_variadic(mlrvals []*mlrval.Mlrval) *mlrval.Mlrval {
	if len(mlrvals) == 0 {
		return mlrval.VOID
	}
	return mlrval.ArrayFold(
		mlrvals,
		bif_min_unary(mlrvals[0]),
		func(a, b *mlrval.Mlrval) *mlrval.Mlrval {
			return BIF_min_binary(bif_min_unary(a), bif_min_unary(b))
		},
	)
}
// BIF_min_within_map_values computes the min over the values of a map.
// An empty map yields void; otherwise the values are folded pairwise with
// BIF_min_binary, seeded with the first entry's value.
func BIF_min_within_map_values(m *mlrval.Mlrmap) *mlrval.Mlrval {
	first := m.Head
	if first == nil {
		return mlrval.VOID
	}

	pairwiseMin := func(acc, next *mlrval.Mlrval) *mlrval.Mlrval {
		return BIF_min_binary(acc, next)
	}
	return mlrval.MapFold(m, first.Value, pairwiseMin)
}
// bif_min_unary_array is the array case of bif_min_unary: it hands the
// array's elements to BIF_min_variadic. Together with bif_min_unary this is
// what allows recursion into arguments, so users can do either min(1,2,3) or
// min([1,2,3]).
func bif_min_unary_array(input1 *mlrval.Mlrval) *mlrval.Mlrval {
	elements := input1.AcquireArrayValue()
	return BIF_min_variadic(elements)
}
// bif_min_unary_map is the map case of bif_min_unary: it computes the min
// over the map's values via BIF_min_within_map_values.
func bif_min_unary_map(input1 *mlrval.Mlrval) *mlrval.Mlrval {
	entries := input1.AcquireMapValue()
	return BIF_min_within_map_values(entries)
}
// min_unary_dispositions is the per-type dispatch table used by
// bif_min_unary, which indexes it by the type code of its argument.
//
// We get a Golang "initialization loop" due to recursive depth computation
// if this is defined statically. So, we use a "package init" function.
// (The table refers to bif_min_unary_array, which calls BIF_min_variadic,
// which calls bif_min_unary, which reads this table.)
var min_unary_dispositions = [mlrval.MT_DIM]UnaryFunc{}
// init populates min_unary_dispositions. Entries are positional: each slot
// is labeled with the type it serves, and the order must not be changed.
// Arrays and maps are reduced to the min of their contents; the remaining
// slots use shared helpers defined elsewhere in the package (presumably
// pass-through for scalars, and error/null/absent returns for the others --
// confirm against their definitions).
func init() {
min_unary_dispositions = [mlrval.MT_DIM]UnaryFunc{
/*INT */ _1u___,
/*FLOAT */ _1u___,
/*BOOL */ _1u___,
/*VOID */ _1u___,
/*STRING */ _1u___,
/*ARRAY */ bif_min_unary_array,
/*MAP */ bif_min_unary_map,
/*FUNC */ _erro1,
/*ERROR */ _erro1,
/*NULL */ _null1,
/*ABSENT */ _absn1,
}
}
// bif_min_unary reduces a single argument by dispatching on its type through
// min_unary_dispositions; array and map arguments are reduced to the min of
// their contents.
func bif_min_unary(input1 *mlrval.Mlrval) *mlrval.Mlrval {
	handler := min_unary_dispositions[input1.Type()]
	return handler(input1)
}
// ----------------------------------------------------------------
// BIF_minlen_variadic returns the smallest UTF-8 string length among the
// original string representations of the given values, or void when there
// are no values.
func BIF_minlen_variadic(mlrvals []*mlrval.Mlrval) *mlrval.Mlrval {
	if len(mlrvals) == 0 {
		return mlrval.VOID
	}

	// Track the running minimum as a native int, not a Mlrval, to avoid
	// unnecessary allocation; wrap it only once at the end.
	shortest := lib.UTF8Strlen(mlrvals[0].OriginalString())
	for _, element := range mlrvals[1:] {
		if candidate := lib.UTF8Strlen(element.OriginalString()); candidate < shortest {
			shortest = candidate
		}
	}
	return mlrval.FromInt(shortest)
}
// BIF_minlen_within_map_values returns the smallest UTF-8 string length among
// the original string representations of a map's values, or void when the
// map is empty.
func BIF_minlen_within_map_values(m *mlrval.Mlrmap) *mlrval.Mlrval {
	entry := m.Head
	if entry == nil {
		return mlrval.VOID
	}

	// Track the running minimum as a native int, not a Mlrval, to avoid
	// unnecessary allocation; wrap it only once at the end.
	shortest := lib.UTF8Strlen(entry.Value.OriginalString())
	for entry = entry.Next; entry != nil; entry = entry.Next {
		if candidate := lib.UTF8Strlen(entry.Value.OriginalString()); candidate < shortest {
			shortest = candidate
		}
	}
	return mlrval.FromInt(shortest)
}
// ----------------------------------------------------------------
@ -891,6 +970,8 @@ var max_dispositions = [mlrval.MT_DIM][mlrval.MT_DIM]BinaryFunc{
/*ABSENT */ {_2___, _2___, _2___, _2___, _2___, _absn, _absn, _erro, _erro, _absn, _absn},
}
// BIF_max_binary returns the max of two values by looking up the handler for
// the pair of argument types in max_dispositions and invoking it. It is not
// itself a DSL function: it is a helper within this package, and it is
// exported so the stats1 verb can use it too.
func BIF_max_binary(input1, input2 *mlrval.Mlrval) *mlrval.Mlrval {
	handler := max_dispositions[input1.Type()][input2.Type()]
	return handler(input1, input2)
}
@ -898,13 +979,89 @@ func BIF_max_binary(input1, input2 *mlrval.Mlrval) *mlrval.Mlrval {
// BIF_max_variadic computes the max over any number of arguments.
//
// With no arguments it returns void. Otherwise it folds over the arguments
// with BIF_max_binary, first passing each operand through bif_max_unary so
// that array- and map-valued arguments are reduced to the max of their
// contents. This is what lets users write either max(1,2,3) or max([1,2,3]).
//
// The fold is seeded with the (reduced) first argument rather than with a
// fixed constant, so the result always comes from the inputs themselves.
//
// The former index-based loop over BIF_max_binary has been removed: it
// returned from both arms of the if/else, which left the fold below
// unreachable and disabled the recursion into collections.
func BIF_max_variadic(mlrvals []*mlrval.Mlrval) *mlrval.Mlrval {
	if len(mlrvals) == 0 {
		return mlrval.VOID
	}
	return mlrval.ArrayFold(
		mlrvals,
		bif_max_unary(mlrvals[0]),
		func(a, b *mlrval.Mlrval) *mlrval.Mlrval {
			return BIF_max_binary(bif_max_unary(a), bif_max_unary(b))
		},
	)
}
// BIF_max_within_map_values computes the max over the values of a map.
// An empty map yields void; otherwise the values are folded pairwise with
// BIF_max_binary, seeded with the first entry's value.
func BIF_max_within_map_values(m *mlrval.Mlrmap) *mlrval.Mlrval {
	first := m.Head
	if first == nil {
		return mlrval.VOID
	}

	pairwiseMax := func(acc, next *mlrval.Mlrval) *mlrval.Mlrval {
		return BIF_max_binary(acc, next)
	}
	return mlrval.MapFold(m, first.Value, pairwiseMax)
}
// bif_max_unary_array is the array case of bif_max_unary: it hands the
// array's elements to BIF_max_variadic. Together with bif_max_unary this is
// what allows recursion into arguments, so users can do either max(1,2,3) or
// max([1,2,3]).
func bif_max_unary_array(input1 *mlrval.Mlrval) *mlrval.Mlrval {
	elements := input1.AcquireArrayValue()
	return BIF_max_variadic(elements)
}
// bif_max_unary_map is the map case of bif_max_unary: it computes the max
// over the map's values via BIF_max_within_map_values.
func bif_max_unary_map(input1 *mlrval.Mlrval) *mlrval.Mlrval {
	entries := input1.AcquireMapValue()
	return BIF_max_within_map_values(entries)
}
// max_unary_dispositions is the per-type dispatch table used by
// bif_max_unary, which indexes it by the type code of its argument.
//
// We get a Golang "initialization loop" due to recursive depth computation
// if this is defined statically. So, we use a "package init" function.
// (The table refers to bif_max_unary_array, which calls BIF_max_variadic,
// which calls bif_max_unary, which reads this table.)
var max_unary_dispositions = [mlrval.MT_DIM]UnaryFunc{}
// init populates max_unary_dispositions. Entries are positional: each slot
// is labeled with the type it serves, and the order must not be changed.
// Arrays and maps are reduced to the max of their contents; the remaining
// slots use shared helpers defined elsewhere in the package (presumably
// pass-through for scalars, and error/null/absent returns for the others --
// confirm against their definitions).
func init() {
max_unary_dispositions = [mlrval.MT_DIM]UnaryFunc{
/*INT */ _1u___,
/*FLOAT */ _1u___,
/*BOOL */ _1u___,
/*VOID */ _1u___,
/*STRING */ _1u___,
/*ARRAY */ bif_max_unary_array,
/*MAP */ bif_max_unary_map,
/*FUNC */ _erro1,
/*ERROR */ _erro1,
/*NULL */ _null1,
/*ABSENT */ _absn1,
}
}
// bif_max_unary reduces a single argument by dispatching on its type through
// max_unary_dispositions; array and map arguments are reduced to the max of
// their contents.
func bif_max_unary(input1 *mlrval.Mlrval) *mlrval.Mlrval {
	handler := max_unary_dispositions[input1.Type()]
	return handler(input1)
}
// ----------------------------------------------------------------
// BIF_maxlen_variadic returns the largest UTF-8 string length among the
// original string representations of the given values, or void when there
// are no values.
func BIF_maxlen_variadic(mlrvals []*mlrval.Mlrval) *mlrval.Mlrval {
	if len(mlrvals) == 0 {
		return mlrval.VOID
	}

	// Track the running maximum as a native int, not a Mlrval, to avoid
	// unnecessary allocation; wrap it only once at the end.
	longest := lib.UTF8Strlen(mlrvals[0].OriginalString())
	for _, element := range mlrvals[1:] {
		if candidate := lib.UTF8Strlen(element.OriginalString()); candidate > longest {
			longest = candidate
		}
	}
	return mlrval.FromInt(longest)
}
// BIF_maxlen_within_map_values returns the largest UTF-8 string length among
// the original string representations of a map's values, or void when the
// map is empty.
func BIF_maxlen_within_map_values(m *mlrval.Mlrmap) *mlrval.Mlrval {
	entry := m.Head
	if entry == nil {
		return mlrval.VOID
	}

	// Track the running maximum as a native int, not a Mlrval, to avoid
	// unnecessary allocation; wrap it only once at the end.
	longest := lib.UTF8Strlen(entry.Value.OriginalString())
	for entry = entry.Next; entry != nil; entry = entry.Next {
		if candidate := lib.UTF8Strlen(entry.Value.OriginalString()); candidate > longest {
			longest = candidate
		}
	}
	return mlrval.FromInt(longest)
}

View file

@ -0,0 +1,217 @@
package bifs
import (
"math"
"github.com/johnkerl/miller/internal/pkg/mlrval"
)
// GetPercentileLinearlyInterpolated returns the p-th percentile of the first
// n elements of array, interpolating linearly between the two elements that
// bracket the fractional position (p/100)*(n-1).
//
// Parameters:
//   - array: the values to take the percentile over. NOTE(review): presumably
//     the caller has already sorted these; this function does no sorting.
//   - n: the number of elements of array to consider.
//   - p: the percentile, nominally in [0, 100].
//
// Returns:
//   - void if n <= 0 (nothing to take a percentile of);
//   - a copy of the first element for p <= 0;
//   - a copy of the last element (index n-1) for p >= 100 -- out-of-range
//     percentiles are clamped rather than indexing past the end of the array;
//   - otherwise array[i] + frac * (array[i+1] - array[i]), computed with the
//     Mlrval arithmetic functions.
func GetPercentileLinearlyInterpolated(
	array []*mlrval.Mlrval,
	n int,
	p float64,
) *mlrval.Mlrval {
	if n <= 0 {
		return mlrval.VOID
	}

	last := n - 1
	findex := (p / 100.0) * float64(last)
	if findex < 0.0 {
		findex = 0.0
	}
	// Clamp on the float side, before converting to int: this handles p > 100
	// (which would otherwise index beyond n-1) and also avoids converting an
	// out-of-range float to int.
	if findex >= float64(last) {
		return array[last].Copy()
	}

	iindex := int(math.Floor(findex))
	// TODO: just do this in float64:
	// array[iindex] + frac * (array[iindex+1] - array[iindex])
	frac := mlrval.FromFloat(findex - float64(iindex))
	diff := BIF_minus_binary(array[iindex+1], array[iindex])
	prod := BIF_times(frac, diff)
	return BIF_plus_binary(array[iindex], prod)
}
// ================================================================
// Non-interpolated percentiles (see also https://en.wikipedia.org/wiki/Percentile)
// ----------------------------------------------------------------
// OPTION 1: int index = p*n/100.0;
//
// x
// 0
// 20
// 40
// 60
// 80
// 100
//
// x_p00 0 x_p10 0 x_p20 20 x_p30 20 x_p40 40 x_p50 60 x_p60 60 x_p70 80 x_p80 80 x_p90 100 x_p100 100
// x_p01 0 x_p11 0 x_p21 20 x_p31 20 x_p41 40 x_p51 60 x_p61 60 x_p71 80 x_p81 80 x_p91 100
// x_p02 0 x_p12 0 x_p22 20 x_p32 20 x_p42 40 x_p52 60 x_p62 60 x_p72 80 x_p82 80 x_p92 100
// x_p03 0 x_p13 0 x_p23 20 x_p33 20 x_p43 40 x_p53 60 x_p63 60 x_p73 80 x_p83 80 x_p93 100
// x_p04 0 x_p14 0 x_p24 20 x_p34 40 x_p44 40 x_p54 60 x_p64 60 x_p74 80 x_p84 100 x_p94 100
// x_p05 0 x_p15 0 x_p25 20 x_p35 40 x_p45 40 x_p55 60 x_p65 60 x_p75 80 x_p85 100 x_p95 100
// x_p06 0 x_p16 0 x_p26 20 x_p36 40 x_p46 40 x_p56 60 x_p66 60 x_p76 80 x_p86 100 x_p96 100
// x_p07 0 x_p17 20 x_p27 20 x_p37 40 x_p47 40 x_p57 60 x_p67 80 x_p77 80 x_p87 100 x_p97 100
// x_p08 0 x_p18 20 x_p28 20 x_p38 40 x_p48 40 x_p58 60 x_p68 80 x_p78 80 x_p88 100 x_p98 100
// x_p09 0 x_p19 20 x_p29 20 x_p39 40 x_p49 40 x_p59 60 x_p69 80 x_p79 80 x_p89 100 x_p99 100
//
// x
// 0
// 25
// 50
// 75
// 100
//
// x_p00 0 x_p10 0 x_p20 25 x_p30 25 x_p40 50 x_p50 50 x_p60 75 x_p70 75 x_p80 100 x_p90 100 x_p100 100
// x_p01 0 x_p11 0 x_p21 25 x_p31 25 x_p41 50 x_p51 50 x_p61 75 x_p71 75 x_p81 100 x_p91 100
// x_p02 0 x_p12 0 x_p22 25 x_p32 25 x_p42 50 x_p52 50 x_p62 75 x_p72 75 x_p82 100 x_p92 100
// x_p03 0 x_p13 0 x_p23 25 x_p33 25 x_p43 50 x_p53 50 x_p63 75 x_p73 75 x_p83 100 x_p93 100
// x_p04 0 x_p14 0 x_p24 25 x_p34 25 x_p44 50 x_p54 50 x_p64 75 x_p74 75 x_p84 100 x_p94 100
// x_p05 0 x_p15 0 x_p25 25 x_p35 25 x_p45 50 x_p55 50 x_p65 75 x_p75 75 x_p85 100 x_p95 100
// x_p06 0 x_p16 0 x_p26 25 x_p36 25 x_p46 50 x_p56 50 x_p66 75 x_p76 75 x_p86 100 x_p96 100
// x_p07 0 x_p17 0 x_p27 25 x_p37 25 x_p47 50 x_p57 50 x_p67 75 x_p77 75 x_p87 100 x_p97 100
// x_p08 0 x_p18 0 x_p28 25 x_p38 25 x_p48 50 x_p58 50 x_p68 75 x_p78 75 x_p88 100 x_p98 100
// x_p09 0 x_p19 0 x_p29 25 x_p39 25 x_p49 50 x_p59 50 x_p69 75 x_p79 75 x_p89 100 x_p99 100
//
// ----------------------------------------------------------------
// OPTION 2: int index = p*(n-1)/100.0;
//
// x
// 0
// 20
// 40
// 60
// 80
// 100
//
// x_p00 0 x_p10 0 x_p20 20 x_p30 20 x_p40 40 x_p50 40 x_p60 60 x_p70 60 x_p80 80 x_p90 80 x_p100 100
// x_p01 0 x_p11 0 x_p21 20 x_p31 20 x_p41 40 x_p51 40 x_p61 60 x_p71 60 x_p81 80 x_p91 80
// x_p02 0 x_p12 0 x_p22 20 x_p32 20 x_p42 40 x_p52 40 x_p62 60 x_p72 60 x_p82 80 x_p92 80
// x_p03 0 x_p13 0 x_p23 20 x_p33 20 x_p43 40 x_p53 40 x_p63 60 x_p73 60 x_p83 80 x_p93 80
// x_p04 0 x_p14 0 x_p24 20 x_p34 20 x_p44 40 x_p54 40 x_p64 60 x_p74 60 x_p84 80 x_p94 80
// x_p05 0 x_p15 0 x_p25 20 x_p35 20 x_p45 40 x_p55 40 x_p65 60 x_p75 60 x_p85 80 x_p95 80
// x_p06 0 x_p16 0 x_p26 20 x_p36 20 x_p46 40 x_p56 40 x_p66 60 x_p76 60 x_p86 80 x_p96 80
// x_p07 0 x_p17 0 x_p27 20 x_p37 20 x_p47 40 x_p57 40 x_p67 60 x_p77 60 x_p87 80 x_p97 80
// x_p08 0 x_p18 0 x_p28 20 x_p38 20 x_p48 40 x_p58 40 x_p68 60 x_p78 60 x_p88 80 x_p98 80
// x_p09 0 x_p19 0 x_p29 20 x_p39 20 x_p49 40 x_p59 40 x_p69 60 x_p79 60 x_p89 80 x_p99 80
//
// x
// 0
// 25
// 50
// 75
// 100
//
// x_p00 0 x_p10 0 x_p20 0 x_p30 25 x_p40 25 x_p50 50 x_p60 50 x_p70 50 x_p80 75 x_p90 75 x_p100 100
// x_p01 0 x_p11 0 x_p21 0 x_p31 25 x_p41 25 x_p51 50 x_p61 50 x_p71 50 x_p81 75 x_p91 75
// x_p02 0 x_p12 0 x_p22 0 x_p32 25 x_p42 25 x_p52 50 x_p62 50 x_p72 50 x_p82 75 x_p92 75
// x_p03 0 x_p13 0 x_p23 0 x_p33 25 x_p43 25 x_p53 50 x_p63 50 x_p73 50 x_p83 75 x_p93 75
// x_p04 0 x_p14 0 x_p24 0 x_p34 25 x_p44 25 x_p54 50 x_p64 50 x_p74 50 x_p84 75 x_p94 75
// x_p05 0 x_p15 0 x_p25 25 x_p35 25 x_p45 25 x_p55 50 x_p65 50 x_p75 75 x_p85 75 x_p95 75
// x_p06 0 x_p16 0 x_p26 25 x_p36 25 x_p46 25 x_p56 50 x_p66 50 x_p76 75 x_p86 75 x_p96 75
// x_p07 0 x_p17 0 x_p27 25 x_p37 25 x_p47 25 x_p57 50 x_p67 50 x_p77 75 x_p87 75 x_p97 75
// x_p08 0 x_p18 0 x_p28 25 x_p38 25 x_p48 25 x_p58 50 x_p68 50 x_p78 75 x_p88 75 x_p98 75
// x_p09 0 x_p19 0 x_p29 25 x_p39 25 x_p49 25 x_p59 50 x_p69 50 x_p79 75 x_p89 75 x_p99 75
//
// ----------------------------------------------------------------
// OPTION 3: int index = (int)ceil(p*(n-1)/100.0);
//
// x
// 0
// 20
// 40
// 60
// 80
// 100
//
// x_p00 0 x_p10 20 x_p20 20 x_p30 40 x_p40 40 x_p50 60 x_p60 60 x_p70 80 x_p80 80 x_p90 100 x_p100 100
// x_p01 20 x_p11 20 x_p21 40 x_p31 40 x_p41 60 x_p51 60 x_p61 80 x_p71 80 x_p81 100 x_p91 100
// x_p02 20 x_p12 20 x_p22 40 x_p32 40 x_p42 60 x_p52 60 x_p62 80 x_p72 80 x_p82 100 x_p92 100
// x_p03 20 x_p13 20 x_p23 40 x_p33 40 x_p43 60 x_p53 60 x_p63 80 x_p73 80 x_p83 100 x_p93 100
// x_p04 20 x_p14 20 x_p24 40 x_p34 40 x_p44 60 x_p54 60 x_p64 80 x_p74 80 x_p84 100 x_p94 100
// x_p05 20 x_p15 20 x_p25 40 x_p35 40 x_p45 60 x_p55 60 x_p65 80 x_p75 80 x_p85 100 x_p95 100
// x_p06 20 x_p16 20 x_p26 40 x_p36 40 x_p46 60 x_p56 60 x_p66 80 x_p76 80 x_p86 100 x_p96 100
// x_p07 20 x_p17 20 x_p27 40 x_p37 40 x_p47 60 x_p57 60 x_p67 80 x_p77 80 x_p87 100 x_p97 100
// x_p08 20 x_p18 20 x_p28 40 x_p38 40 x_p48 60 x_p58 60 x_p68 80 x_p78 80 x_p88 100 x_p98 100
// x_p09 20 x_p19 20 x_p29 40 x_p39 40 x_p49 60 x_p59 60 x_p69 80 x_p79 80 x_p89 100 x_p99 100
//
// x
// 0
// 25
// 50
// 75
// 100
//
// x_p00 0 x_p10 25 x_p20 25 x_p30 50 x_p40 50 x_p50 50 x_p60 75 x_p70 75 x_p80 100 x_p90 100 x_p100 100
// x_p01 25 x_p11 25 x_p21 25 x_p31 50 x_p41 50 x_p51 75 x_p61 75 x_p71 75 x_p81 100 x_p91 100
// x_p02 25 x_p12 25 x_p22 25 x_p32 50 x_p42 50 x_p52 75 x_p62 75 x_p72 75 x_p82 100 x_p92 100
// x_p03 25 x_p13 25 x_p23 25 x_p33 50 x_p43 50 x_p53 75 x_p63 75 x_p73 75 x_p83 100 x_p93 100
// x_p04 25 x_p14 25 x_p24 25 x_p34 50 x_p44 50 x_p54 75 x_p64 75 x_p74 75 x_p84 100 x_p94 100
// x_p05 25 x_p15 25 x_p25 25 x_p35 50 x_p45 50 x_p55 75 x_p65 75 x_p75 75 x_p85 100 x_p95 100
// x_p06 25 x_p16 25 x_p26 50 x_p36 50 x_p46 50 x_p56 75 x_p66 75 x_p76 100 x_p86 100 x_p96 100
// x_p07 25 x_p17 25 x_p27 50 x_p37 50 x_p47 50 x_p57 75 x_p67 75 x_p77 100 x_p87 100 x_p97 100
// x_p08 25 x_p18 25 x_p28 50 x_p38 50 x_p48 50 x_p58 75 x_p68 75 x_p78 100 x_p88 100 x_p98 100
// x_p09 25 x_p19 25 x_p29 50 x_p39 50 x_p49 50 x_p59 75 x_p69 75 x_p79 100 x_p89 100 x_p99 100
//
// ----------------------------------------------------------------
// OPTION 4: int index = (int)ceil(-0.5 + p*(n-1)/100.0);
//
// x
// 0
// 20
// 40
// 60
// 80
// 100
//
// x_p00 0 x_p10 0 x_p20 20 x_p30 20 x_p40 40 x_p50 40 x_p60 60 x_p70 60 x_p80 80 x_p90 80 x_p100 100
// x_p01 0 x_p11 20 x_p21 20 x_p31 40 x_p41 40 x_p51 60 x_p61 60 x_p71 80 x_p81 80 x_p91 100
// x_p02 0 x_p12 20 x_p22 20 x_p32 40 x_p42 40 x_p52 60 x_p62 60 x_p72 80 x_p82 80 x_p92 100
// x_p03 0 x_p13 20 x_p23 20 x_p33 40 x_p43 40 x_p53 60 x_p63 60 x_p73 80 x_p83 80 x_p93 100
// x_p04 0 x_p14 20 x_p24 20 x_p34 40 x_p44 40 x_p54 60 x_p64 60 x_p74 80 x_p84 80 x_p94 100
// x_p05 0 x_p15 20 x_p25 20 x_p35 40 x_p45 40 x_p55 60 x_p65 60 x_p75 80 x_p85 80 x_p95 100
// x_p06 0 x_p16 20 x_p26 20 x_p36 40 x_p46 40 x_p56 60 x_p66 60 x_p76 80 x_p86 80 x_p96 100
// x_p07 0 x_p17 20 x_p27 20 x_p37 40 x_p47 40 x_p57 60 x_p67 60 x_p77 80 x_p87 80 x_p97 100
// x_p08 0 x_p18 20 x_p28 20 x_p38 40 x_p48 40 x_p58 60 x_p68 60 x_p78 80 x_p88 80 x_p98 100
// x_p09 0 x_p19 20 x_p29 20 x_p39 40 x_p49 40 x_p59 60 x_p69 60 x_p79 80 x_p89 80 x_p99 100
//
// x
// 0
// 25
// 50
// 75
// 100
//
// x_p00 0 x_p10 0 x_p20 25 x_p30 25 x_p40 50 x_p50 50 x_p60 50 x_p70 75 x_p80 75 x_p90 100 x_p100 100
// x_p01 0 x_p11 0 x_p21 25 x_p31 25 x_p41 50 x_p51 50 x_p61 50 x_p71 75 x_p81 75 x_p91 100
// x_p02 0 x_p12 0 x_p22 25 x_p32 25 x_p42 50 x_p52 50 x_p62 50 x_p72 75 x_p82 75 x_p92 100
// x_p03 0 x_p13 25 x_p23 25 x_p33 25 x_p43 50 x_p53 50 x_p63 75 x_p73 75 x_p83 75 x_p93 100
// x_p04 0 x_p14 25 x_p24 25 x_p34 25 x_p44 50 x_p54 50 x_p64 75 x_p74 75 x_p84 75 x_p94 100
// x_p05 0 x_p15 25 x_p25 25 x_p35 25 x_p45 50 x_p55 50 x_p65 75 x_p75 75 x_p85 75 x_p95 100
// x_p06 0 x_p16 25 x_p26 25 x_p36 25 x_p46 50 x_p56 50 x_p66 75 x_p76 75 x_p86 75 x_p96 100
// x_p07 0 x_p17 25 x_p27 25 x_p37 25 x_p47 50 x_p57 50 x_p67 75 x_p77 75 x_p87 75 x_p97 100
// x_p08 0 x_p18 25 x_p28 25 x_p38 50 x_p48 50 x_p58 50 x_p68 75 x_p78 75 x_p88 100 x_p98 100
// x_p09 0 x_p19 25 x_p29 25 x_p39 50 x_p49 50 x_p59 50 x_p69 75 x_p79 75 x_p89 100 x_p99 100
//
// ----------------------------------------------------------------
// CONCLUSION:
// * I like option 2 for its simplicity ...
// * ... but option 1 matches R's quantile with type=1.
// * (Note that Miller's interpolated percentiles match R's quantile with type=7)
// ----------------------------------------------------------------
// GetPercentileNonInterpolated returns a copy of the element of the sorted
// array which sits at the p-th percentile, using OPTION 1 from the comparison
// above: index = p*n/100, truncated, then clamped into [0, n-1].
//
// Parameters:
//   - array: the values, already sorted ascending by the caller.
//   - n:     how many leading elements of array to consider; callers in this
//     package pass len(array). Values larger than len(array) are
//     reduced to len(array) so the lookup can never run off the end.
//   - p:     the percentile, nominally between 0 and 100. Out-of-range values
//     are clamped to the first/last element.
//
// Returns void when there are no elements to choose from, which is the same
// value the DSL percentile functions report for an empty collection.
func GetPercentileNonInterpolated(
	array []*mlrval.Mlrval,
	n int,
	p float64,
) *mlrval.Mlrval {
	if n > len(array) {
		n = len(array)
	}
	if n <= 0 {
		// Previously this fell through to array[0] and panicked.
		return mlrval.VOID
	}

	// Clamp in the floating-point domain, before converting to int: Go leaves
	// the result of converting an out-of-range, infinite, or NaN float to an
	// integer implementation-dependent, so a huge p could otherwise map to
	// index 0 rather than to the last element.
	findex := p * float64(n) / 100.0
	index := 0
	switch {
	case findex >= float64(n-1):
		index = n - 1
	case findex > 0:
		index = int(findex)
	}
	// NaN fails both comparisons and so selects index 0.

	return array[index].Copy()
}

View file

@ -2,6 +2,7 @@ package bifs
import (
"math"
"sort"
"github.com/johnkerl/miller/internal/pkg/lib"
"github.com/johnkerl/miller/internal/pkg/mlrval"
@ -24,7 +25,7 @@ import (
// output = [m, b, math.sqrt(var_m), math.sqrt(var_b)]
// ----------------------------------------------------------------
func BIF_get_var(mn, msum, msum2 *mlrval.Mlrval) *mlrval.Mlrval {
func BIF_finalize_variance(mn, msum, msum2 *mlrval.Mlrval) *mlrval.Mlrval {
n, isInt := mn.GetIntValue()
lib.InternalCodingErrorIf(!isInt)
sum, isNumber := msum.GetNumericToFloatValue()
@ -46,8 +47,8 @@ func BIF_get_var(mn, msum, msum2 *mlrval.Mlrval) *mlrval.Mlrval {
}
// ----------------------------------------------------------------
func BIF_get_stddev(mn, msum, msum2 *mlrval.Mlrval) *mlrval.Mlrval {
mvar := BIF_get_var(mn, msum, msum2)
func BIF_finalize_stddev(mn, msum, msum2 *mlrval.Mlrval) *mlrval.Mlrval {
mvar := BIF_finalize_variance(mn, msum, msum2)
if mvar.IsVoid() {
return mvar
}
@ -55,8 +56,8 @@ func BIF_get_stddev(mn, msum, msum2 *mlrval.Mlrval) *mlrval.Mlrval {
}
// ----------------------------------------------------------------
func BIF_get_mean_EB(mn, msum, msum2 *mlrval.Mlrval) *mlrval.Mlrval {
mvar := BIF_get_var(mn, msum, msum2)
func BIF_finalize_mean_eb(mn, msum, msum2 *mlrval.Mlrval) *mlrval.Mlrval {
mvar := BIF_finalize_variance(mn, msum, msum2)
if mvar.IsVoid() {
return mvar
}
@ -87,7 +88,7 @@ func BIF_get_mean_EB(mn, msum, msum2 *mlrval.Mlrval) *mlrval.Mlrval {
// = sumx2 - n mean^2
// ----------------------------------------------------------------
func BIF_get_skewness(mn, msum, msum2, msum3 *mlrval.Mlrval) *mlrval.Mlrval {
func BIF_finalize_skewness(mn, msum, msum2, msum3 *mlrval.Mlrval) *mlrval.Mlrval {
n, isInt := mn.GetIntValue()
lib.InternalCodingErrorIf(!isInt)
if n < 2 {
@ -124,7 +125,7 @@ func BIF_get_skewness(mn, msum, msum2, msum3 *mlrval.Mlrval) *mlrval.Mlrval {
// = sumx4 - mean*(4 sumx3 - mean*(6 sumx2 - 3 n mean^2))
// ----------------------------------------------------------------
func BIF_get_kurtosis(mn, msum, msum2, msum3, msum4 *mlrval.Mlrval) *mlrval.Mlrval {
func BIF_finalize_kurtosis(mn, msum, msum2, msum3, msum4 *mlrval.Mlrval) *mlrval.Mlrval {
n, isInt := mn.GetIntValue()
lib.InternalCodingErrorIf(!isInt)
if n < 2 {
@ -149,3 +150,485 @@ func BIF_get_kurtosis(mn, msum, msum2, msum3, msum4 *mlrval.Mlrval) *mlrval.Mlrv
return mlrval.FromFloat(numerator/denominator - 3.0)
}
// ================================================================
// STATS ROUTINES -- other than min/max which are placed separately.
// check_collection is the shared guard for BIFs which accept only an array or
// a map. The boolean reports whether the argument is such a collection. When
// it is not, the second return value is what the calling BIF should return:
// absent for absent input, error for any other type.
func check_collection(c *mlrval.Mlrval) (bool, *mlrval.Mlrval) {
	switch c.Type() {
	case mlrval.MT_ARRAY, mlrval.MT_MAP:
		return true, c
	case mlrval.MT_ABSENT:
		return false, mlrval.ABSENT
	}
	return false, mlrval.ERROR
}
// collection_sum_of_function folds an array or map down to the sum of
// f(value) over its values, starting from integer zero: with the right f this
// gives the sum of values, the sum of their squares, and so on.
func collection_sum_of_function(
	collection *mlrval.Mlrval,
	f func(element *mlrval.Mlrval) *mlrval.Mlrval,
) *mlrval.Mlrval {
	zero := mlrval.FromInt(0)
	addTransformed := func(sumSoFar, item *mlrval.Mlrval) *mlrval.Mlrval {
		return BIF_plus_binary(sumSoFar, f(item))
	}
	return mlrval.CollectionFold(collection, zero, addTransformed)
}
// BIF_count returns, as an int, the length of an array or the field count of
// a map. For anything else it returns what check_collection prescribes.
func BIF_count(collection *mlrval.Mlrval) *mlrval.Mlrval {
	if ok, fallback := check_collection(collection); !ok {
		return fallback
	}
	if !collection.IsArray() {
		return mlrval.FromInt(collection.AcquireMapValue().FieldCount)
	}
	return mlrval.FromInt(int64(len(collection.AcquireArrayValue())))
}
// BIF_null_count returns how many values in an array or map are void (empty
// string) or null. For non-collections it returns what check_collection
// prescribes.
func BIF_null_count(collection *mlrval.Mlrval) *mlrval.Mlrval {
	if ok, fallback := check_collection(collection); !ok {
		return fallback
	}
	return mlrval.CollectionFold(
		collection,
		mlrval.FromInt(0),
		func(tally, item *mlrval.Mlrval) *mlrval.Mlrval {
			// Each value contributes one if it is void or null, else zero.
			if item.IsVoid() || item.IsNull() {
				return BIF_plus_binary(tally, mlrval.FromInt(1))
			}
			return BIF_plus_binary(tally, mlrval.FromInt(0))
		},
	)
}
// BIF_distinct_count returns the number of distinct values in an array or
// map, where two values are the same if their original strings are equal.
// For non-collections it returns what check_collection prescribes.
func BIF_distinct_count(collection *mlrval.Mlrval) *mlrval.Mlrval {
	if ok, fallback := check_collection(collection); !ok {
		return fallback
	}

	// Only membership matters, so a set of strings suffices.
	seen := make(map[string]struct{})
	if collection.IsArray() {
		for _, element := range collection.AcquireArrayValue() {
			seen[element.OriginalString()] = struct{}{}
		}
	} else {
		for entry := collection.AcquireMapValue().Head; entry != nil; entry = entry.Next {
			seen[entry.Value.OriginalString()] = struct{}{}
		}
	}

	return mlrval.FromInt(int64(len(seen)))
}
// BIF_mode returns the most frequently occurring value in an array or map.
// The comparison is strict, so on ties the first-found value wins.
func BIF_mode(collection *mlrval.Mlrval) *mlrval.Mlrval {
	strictlyMore := func(candidate, incumbent int) bool { return candidate > incumbent }
	return bif_mode_or_antimode(collection, strictlyMore)
}
// BIF_antimode returns the least frequently occurring value in an array or
// map. The comparison is strict, so on ties the first-found value wins.
func BIF_antimode(collection *mlrval.Mlrval) *mlrval.Mlrval {
	strictlyFewer := func(candidate, incumbent int) bool { return candidate < incumbent }
	return bif_mode_or_antimode(collection, strictlyFewer)
}
// bif_mode_or_antimode is the common body of BIF_mode and BIF_antimode. It
// counts occurrences of each value in the array or map, then picks the value
// whose count beats all earlier ones according to cmp(candidate, incumbent).
// An empty array or map yields void; non-collections yield what
// check_collection prescribes.
func bif_mode_or_antimode(
	collection *mlrval.Mlrval,
	cmp func(int, int) bool,
) *mlrval.Mlrval {
	if ok, fallback := check_collection(collection); !ok {
		return fallback
	}

	// Insertion-ordered rather than a Go map[string]int: with a plain map the
	// winner among ties (e.g. input [3,3,4,4]) would vary from run to run,
	// which is bad for unit tests and bad for users.
	counts := lib.NewOrderedMap()

	// Values are compared by their string form, but the result should keep its
	// type (the mode of an array of ints is an int, not a string). So for each
	// distinct string we remember the first value which produced it.
	reps := lib.NewOrderedMap()

	tally := func(value *mlrval.Mlrval) {
		key := value.OriginalString()
		if !counts.Has(key) {
			counts.Put(key, 1)
			reps.Put(key, value)
			return
		}
		counts.Put(key, counts.Get(key).(int)+1)
	}

	if collection.IsArray() {
		elements := collection.AcquireArrayValue()
		if len(elements) == 0 {
			return mlrval.VOID
		}
		for _, element := range elements {
			tally(element)
		}
	} else {
		fields := collection.AcquireMapValue()
		if fields.Head == nil {
			return mlrval.VOID
		}
		for entry := fields.Head; entry != nil; entry = entry.Next {
			tally(entry.Value)
		}
	}

	// The first distinct value is taken unconditionally; later ones must win
	// the comparison outright, which is what makes ties go to the first found.
	haveWinner := false
	winnerKey := ""
	winnerCount := -1
	for entry := counts.Head; entry != nil; entry = entry.Next {
		count := entry.Value.(int)
		if !haveWinner || cmp(count, winnerCount) {
			winnerKey = entry.Key
			winnerCount = count
			haveWinner = true
		}
	}

	// The ordered map stores interface{} values, hence the type assertion. The
	// copy keeps the caller from receiving a pointer into the input data.
	return reps.Get(winnerKey).(*mlrval.Mlrval).Copy()
}
// BIF_sum returns the sum of the values in an array or map. For
// non-collections it returns what check_collection prescribes.
func BIF_sum(collection *mlrval.Mlrval) *mlrval.Mlrval {
	if ok, fallback := check_collection(collection); !ok {
		return fallback
	}
	identity := func(element *mlrval.Mlrval) *mlrval.Mlrval { return element }
	return collection_sum_of_function(collection, identity)
}
// BIF_sum2 returns the sum of squares of the values in an array or map. For
// non-collections it returns what check_collection prescribes.
func BIF_sum2(collection *mlrval.Mlrval) *mlrval.Mlrval {
	if ok, fallback := check_collection(collection); !ok {
		return fallback
	}
	return collection_sum_of_function(
		collection,
		func(x *mlrval.Mlrval) *mlrval.Mlrval {
			return BIF_times(x, x)
		},
	)
}
// BIF_sum3 returns the sum of cubes of the values in an array or map. For
// non-collections it returns what check_collection prescribes.
func BIF_sum3(collection *mlrval.Mlrval) *mlrval.Mlrval {
	if ok, fallback := check_collection(collection); !ok {
		return fallback
	}
	return collection_sum_of_function(
		collection,
		func(x *mlrval.Mlrval) *mlrval.Mlrval {
			squared := BIF_times(x, x)
			return BIF_times(x, squared)
		},
	)
}
// BIF_sum4 returns the sum of fourth powers of the values in an array or map.
// For non-collections it returns what check_collection prescribes.
func BIF_sum4(collection *mlrval.Mlrval) *mlrval.Mlrval {
	if ok, fallback := check_collection(collection); !ok {
		return fallback
	}
	return collection_sum_of_function(
		collection,
		func(x *mlrval.Mlrval) *mlrval.Mlrval {
			// Fourth power as the square of the square.
			squared := BIF_times(x, x)
			return BIF_times(squared, squared)
		},
	)
}
// BIF_mean returns the arithmetic mean of the values in an array or map: the
// sum divided by the count. An empty collection yields void; non-collections
// yield what check_collection prescribes.
func BIF_mean(collection *mlrval.Mlrval) *mlrval.Mlrval {
	if ok, fallback := check_collection(collection); !ok {
		return fallback
	}
	count := BIF_count(collection)
	if count.AcquireIntValue() != 0 {
		return BIF_divide(BIF_sum(collection), count)
	}
	return mlrval.VOID
}
// BIF_meaneb returns the error bar for the mean of the values in an array or
// map, computed by BIF_finalize_mean_eb from the count, sum, and sum of
// squares. Non-collections yield what check_collection prescribes.
func BIF_meaneb(collection *mlrval.Mlrval) *mlrval.Mlrval {
	if ok, fallback := check_collection(collection); !ok {
		return fallback
	}
	return BIF_finalize_mean_eb(
		BIF_count(collection),
		BIF_sum(collection),
		BIF_sum2(collection),
	)
}
// BIF_variance returns the variance of the values in an array or map,
// computed by BIF_finalize_variance from the count, sum, and sum of squares.
// Non-collections yield what check_collection prescribes.
func BIF_variance(collection *mlrval.Mlrval) *mlrval.Mlrval {
	if ok, fallback := check_collection(collection); !ok {
		return fallback
	}
	return BIF_finalize_variance(
		BIF_count(collection),
		BIF_sum(collection),
		BIF_sum2(collection),
	)
}
// BIF_stddev returns the standard deviation of the values in an array or map,
// computed by BIF_finalize_stddev from the count, sum, and sum of squares.
// Non-collections yield what check_collection prescribes.
func BIF_stddev(collection *mlrval.Mlrval) *mlrval.Mlrval {
	if ok, fallback := check_collection(collection); !ok {
		return fallback
	}
	return BIF_finalize_stddev(
		BIF_count(collection),
		BIF_sum(collection),
		BIF_sum2(collection),
	)
}
// BIF_skewness returns the skewness of the values in an array or map,
// computed by BIF_finalize_skewness from the count and the sums of first,
// second, and third powers. Non-collections yield what check_collection
// prescribes.
func BIF_skewness(collection *mlrval.Mlrval) *mlrval.Mlrval {
	if ok, fallback := check_collection(collection); !ok {
		return fallback
	}
	return BIF_finalize_skewness(
		BIF_count(collection),
		BIF_sum(collection),
		BIF_sum2(collection),
		BIF_sum3(collection),
	)
}
// BIF_kurtosis returns the kurtosis of the values in an array or map,
// computed by BIF_finalize_kurtosis from the count and the sums of first
// through fourth powers. Non-collections yield what check_collection
// prescribes.
func BIF_kurtosis(collection *mlrval.Mlrval) *mlrval.Mlrval {
	if ok, fallback := check_collection(collection); !ok {
		return fallback
	}
	return BIF_finalize_kurtosis(
		BIF_count(collection),
		BIF_sum(collection),
		BIF_sum2(collection),
		BIF_sum3(collection),
		BIF_sum4(collection),
	)
}
// BIF_minlen returns the minimum string length over the values of an array or
// map, delegating to the array-specific or map-specific helper. Non-collections
// yield what check_collection prescribes.
func BIF_minlen(collection *mlrval.Mlrval) *mlrval.Mlrval {
	if ok, fallback := check_collection(collection); !ok {
		return fallback
	}
	if !collection.IsArray() {
		return BIF_minlen_within_map_values(collection.AcquireMapValue())
	}
	return BIF_minlen_variadic(collection.AcquireArrayValue())
}
// BIF_maxlen returns the maximum string length over the values of an array or
// map, delegating to the array-specific or map-specific helper. Non-collections
// yield what check_collection prescribes.
func BIF_maxlen(collection *mlrval.Mlrval) *mlrval.Mlrval {
	if ok, fallback := check_collection(collection); !ok {
		return fallback
	}
	if !collection.IsArray() {
		return BIF_maxlen_within_map_values(collection.AcquireMapValue())
	}
	return BIF_maxlen_variadic(collection.AcquireArrayValue())
}
// BIF_sort_collection returns a new array holding copies of the values of the
// given array or map, sorted ascending by mlrval.LessThan. The input is not
// modified. Non-collections yield what check_collection prescribes.
func BIF_sort_collection(collection *mlrval.Mlrval) *mlrval.Mlrval {
	if ok, fallback := check_collection(collection); !ok {
		return fallback
	}

	var sorted []*mlrval.Mlrval
	if collection.IsArray() {
		source := collection.AcquireArrayValue()
		sorted = make([]*mlrval.Mlrval, len(source))
		for i, element := range source {
			sorted[i] = element.Copy()
		}
	} else {
		fields := collection.AcquireMapValue()
		sorted = make([]*mlrval.Mlrval, fields.FieldCount)
		slot := 0
		for entry := fields.Head; entry != nil; entry = entry.Next {
			sorted[slot] = entry.Value.Copy()
			slot++
		}
	}

	byLessThan := func(i, j int) bool {
		return mlrval.LessThan(sorted[i], sorted[j])
	}
	sort.Slice(sorted, byLessThan)

	return mlrval.FromArray(sorted)
}
// BIF_median returns the median of the values in an array or map, which is
// their 50th percentile with default options.
func BIF_median(
	collection *mlrval.Mlrval,
) *mlrval.Mlrval {
	fiftieth := mlrval.FromFloat(50.0)
	return BIF_percentile(collection, fiftieth)
}
// BIF_median_with_options returns the median of the values in an array or
// map, which is their 50th percentile computed with the given options map.
func BIF_median_with_options(
	collection *mlrval.Mlrval,
	options *mlrval.Mlrval,
) *mlrval.Mlrval {
	fiftieth := mlrval.FromFloat(50.0)
	return BIF_percentile_with_options(collection, fiftieth, options)
}
// BIF_percentile returns a single percentile of the values in an array or
// map, using default options.
func BIF_percentile(
	collection *mlrval.Mlrval,
	percentile *mlrval.Mlrval,
) *mlrval.Mlrval {
	// A nil options pointer selects the defaults.
	var defaults *mlrval.Mlrval = nil
	return BIF_percentile_with_options(collection, percentile, defaults)
}
// BIF_percentile_with_options returns a single percentile of the values in an
// array or map. It wraps the percentile in a one-element array, delegates to
// BIF_percentiles_with_options, and unwraps the single result.
//
// Parameters:
//   - collection: the array or map of values.
//   - percentile: the percentile to compute.
//   - options:    nil for defaults, else a map of option flags as accepted by
//     BIF_percentiles_with_options.
//
// Returns the single percentile value. If the delegate returns something other
// than a non-empty map or array -- error for a non-collection input or for bad
// options, absent for absent input -- that value is passed through unchanged.
func BIF_percentile_with_options(
	collection *mlrval.Mlrval,
	percentile *mlrval.Mlrval,
	options *mlrval.Mlrval,
) *mlrval.Mlrval {
	percentiles := mlrval.FromSingletonArray(percentile)
	outputs := BIF_percentiles_with_options(collection, percentiles, options)

	// Default output shape: a map keyed by percentile, with one entry.
	if m := outputs.GetMap(); m != nil {
		if m.Head == nil {
			return mlrval.ERROR
		}
		return m.Head.Value
	}

	// Output shape when the caller set output_array_not_map: a one-element
	// array. Previously this case was sent through the map accessor.
	if a := outputs.GetArray(); a != nil {
		if len(a) == 0 {
			return mlrval.ERROR
		}
		return a[0]
	}

	// Neither map nor array: the delegate reported error or absent, so there
	// is nothing to unwrap.
	return outputs
}
// BIF_percentiles returns the requested percentiles of the values in an array
// or map, using default options.
func BIF_percentiles(
	collection *mlrval.Mlrval,
	percentiles *mlrval.Mlrval,
) *mlrval.Mlrval {
	// A nil options pointer selects the defaults.
	var defaults *mlrval.Mlrval = nil
	return BIF_percentiles_with_options(collection, percentiles, defaults)
}
// BIF_percentiles_with_options returns the requested percentiles of the values
// in an array or map.
//
// The options argument is either nil (all defaults) or a map with any of these
// boolean flags, each of which defaults to false:
//   - "array_is_sorted" / "ais": the input is an already-sorted array, so the
//     sort is skipped. It is an error to set this for a map input.
//   - "interpolate_linearly" / "il": use linear interpolation.
//   - "output_array_not_map" / "oa": return an array rather than a map.
//
// Unrecognized option keys are ignored. Returns error if options is not a map
// or a recognized flag is not a boolean; non-collection input yields what
// check_collection prescribes.
func BIF_percentiles_with_options(
	collection *mlrval.Mlrval,
	percentiles *mlrval.Mlrval,
	options *mlrval.Mlrval,
) *mlrval.Mlrval {
	if ok, fallback := check_collection(collection); !ok {
		return fallback
	}

	array_is_sorted := false
	interpolate_linearly := false
	output_array_not_map := false

	if options != nil {
		om := options.GetMap()
		if om == nil { // not a map
			return mlrval.ERROR
		}
		for pe := om.Head; pe != nil; pe = pe.Next {
			// Find which flag, if any, this key addresses.
			var flag *bool
			switch pe.Key {
			case "array_is_sorted", "ais":
				flag = &array_is_sorted
			case "interpolate_linearly", "il":
				flag = &interpolate_linearly
			case "output_array_not_map", "oa":
				flag = &output_array_not_map
			default:
				continue
			}

			// All flags share the same strict true/false parsing.
			if mlrval.Equals(pe.Value, mlrval.TRUE) {
				*flag = true
			} else if mlrval.Equals(pe.Value, mlrval.FALSE) {
				*flag = false
			} else {
				return mlrval.ERROR
			}
		}
	}

	sorted_array := collection
	if array_is_sorted {
		// The caller vouches for the ordering, which only makes sense for an
		// array.
		if !collection.IsArray() {
			return mlrval.ERROR
		}
	} else {
		sorted_array = BIF_sort_collection(collection)
	}

	return bif_percentiles(
		sorted_array.AcquireArrayValue(),
		percentiles,
		interpolate_linearly,
		output_array_not_map,
	)
}
// bif_percentiles computes each requested percentile over an already-sorted
// array of values.
//
// The percentiles argument must be an array, else error is returned. For each
// entry the output is: error if the entry is not numeric; void if the sorted
// array is empty; otherwise the interpolated or non-interpolated percentile,
// as selected by interpolate_linearly.
//
// The result is an array of outputs in request order if output_array_not_map
// is set, else a map from each requested percentile's string form to its
// output.
func bif_percentiles(
	sorted_array []*mlrval.Mlrval,
	percentiles *mlrval.Mlrval,
	interpolate_linearly bool,
	output_array_not_map bool,
) *mlrval.Mlrval {
	ps := percentiles.GetArray()
	if ps == nil { // not an array
		return mlrval.ERROR
	}

	n := len(sorted_array)
	outputs := make([]*mlrval.Mlrval, len(ps))

	for i := range ps {
		p, ok := ps[i].GetNumericToFloatValue()
		switch {
		case !ok:
			outputs[i] = mlrval.ERROR.Copy()
		case n == 0:
			outputs[i] = mlrval.VOID
		case interpolate_linearly:
			outputs[i] = GetPercentileLinearlyInterpolated(sorted_array, n, p)
		default:
			outputs[i] = GetPercentileNonInterpolated(sorted_array, n, p)
		}
	}

	if output_array_not_map {
		return mlrval.FromArray(outputs)
	}

	m := mlrval.NewMlrmap()
	for i := range ps {
		m.PutCopy(ps[i].String(), outputs[i])
	}
	return mlrval.FromMap(m)
}

View file

@ -0,0 +1,192 @@
package bifs
import (
"fmt"
"testing"
"github.com/stretchr/testify/assert"
"github.com/johnkerl/miller/internal/pkg/mlrval"
)
// stats_test_array builds the array [0, 1, ..., n-1] of ints for use as test
// input.
func stats_test_array(n int) *mlrval.Mlrval {
	elements := make([]*mlrval.Mlrval, 0, n)
	for value := int64(0); value < int64(n); value++ {
		elements = append(elements, mlrval.FromInt(value))
	}
	return mlrval.FromArray(elements)
}
// array_to_map_for_test converts an array into a map holding copies of the
// same values in the same order, keyed by the decimal string of each value's
// zero-based position. This lets each test run once on an array and once on
// the equivalent map.
func array_to_map_for_test(a *mlrval.Mlrval) *mlrval.Mlrval {
	m := mlrval.NewMlrmap()
	for position, element := range a.AcquireArrayValue() {
		m.PutCopy(fmt.Sprint(position), element)
	}
	return mlrval.FromMap(m)
}
// TestBIF_count checks that BIF_count rejects a scalar, and that it reports
// the right length for arrays and maps of sizes 0 through 4.
func TestBIF_count(t *testing.T) {
	// Needs array or map
	scalar := mlrval.FromInt(3)
	assert.True(t, BIF_count(scalar).IsError())

	for n := 0; n < 5; n++ {
		asArray := stats_test_array(n)
		assert.True(t, mlrval.Equals(BIF_count(asArray), mlrval.FromInt(int64(n))))

		asMap := array_to_map_for_test(asArray)
		assert.True(t, mlrval.Equals(BIF_count(asMap), mlrval.FromInt(int64(n))))
	}
}
// TestBIF_distinct_count checks that BIF_distinct_count rejects a scalar, and
// that it counts repeated values once, for both array and map input.
func TestBIF_distinct_count(t *testing.T) {
	// Needs array or map. This must exercise the function under test, not
	// BIF_count.
	input := mlrval.FromInt(3)
	output := BIF_distinct_count(input)
	assert.True(t, output.IsError())

	// Values 1 and 2 each appear twice, so there are three distinct values.
	input = mlrval.FromArray([]*mlrval.Mlrval{
		mlrval.FromInt(1),
		mlrval.FromInt(2),
		mlrval.FromInt(3),
		mlrval.FromInt(1),
		mlrval.FromInt(2),
	})
	assert.True(t, mlrval.Equals(BIF_distinct_count(input), mlrval.FromInt(3)))

	input = array_to_map_for_test(input)
	assert.True(t, mlrval.Equals(BIF_distinct_count(input), mlrval.FromInt(3)))
}
// TestBIF_null_count checks that BIF_null_count rejects a scalar, and that it
// counts exactly the empty-string and null values, for both array and map
// input.
func TestBIF_null_count(t *testing.T) {
	// Needs array or map. This must exercise the function under test, not
	// BIF_count.
	input := mlrval.FromInt(3)
	output := BIF_null_count(input)
	assert.True(t, output.IsError())

	input = mlrval.FromArray([]*mlrval.Mlrval{
		mlrval.FromInt(1),
		mlrval.FromString("two"),
		mlrval.FromString(""), // this counts
		mlrval.ERROR,
		mlrval.ABSENT,
		mlrval.NULL, // this counts
	})
	assert.True(t, mlrval.Equals(BIF_null_count(input), mlrval.FromInt(2)))

	input = array_to_map_for_test(input)
	assert.True(t, mlrval.Equals(BIF_null_count(input), mlrval.FromInt(2)))
}
// TestBIF_mode_and_antimode checks BIF_mode and BIF_antimode on a scalar, on
// empty collections, on inputs with a clear winner, and on inputs with ties
// (where the first-found value must win), for both array and map input.
func TestBIF_mode_and_antimode(t *testing.T) {
	// Needs array or map. This must exercise the functions under test, not
	// BIF_count.
	input := mlrval.FromInt(3)
	assert.True(t, BIF_mode(input).IsError())
	assert.True(t, BIF_antimode(input).IsError())

	// Empty array
	input = mlrval.FromArray([]*mlrval.Mlrval{})
	assert.True(t, mlrval.Equals(BIF_mode(input), mlrval.VOID))
	assert.True(t, mlrval.Equals(BIF_antimode(input), mlrval.VOID))

	// Empty map
	input = array_to_map_for_test(input)
	assert.True(t, mlrval.Equals(BIF_mode(input), mlrval.VOID))
	assert.True(t, mlrval.Equals(BIF_antimode(input), mlrval.VOID))

	// Clear winner as array
	input = mlrval.FromArray([]*mlrval.Mlrval{
		mlrval.FromInt(1),
		mlrval.FromInt(2),
		mlrval.FromInt(3),
		mlrval.FromInt(1),
		mlrval.FromInt(1),
		mlrval.FromInt(2),
	})
	assert.True(t, mlrval.Equals(BIF_mode(input), mlrval.FromInt(1)))
	assert.True(t, mlrval.Equals(BIF_antimode(input), mlrval.FromInt(3)))

	// Clear winner as map
	input = array_to_map_for_test(input)
	assert.True(t, mlrval.Equals(BIF_mode(input), mlrval.FromInt(1)))
	assert.True(t, mlrval.Equals(BIF_antimode(input), mlrval.FromInt(3)))

	// Ties as array -- first-found breaks the tie
	input = mlrval.FromArray([]*mlrval.Mlrval{
		mlrval.FromInt(1),
		mlrval.FromInt(1),
		mlrval.FromInt(1),
		mlrval.FromInt(2),
		mlrval.FromInt(2),
		mlrval.FromInt(2),
	})
	assert.True(t, mlrval.Equals(BIF_mode(input), mlrval.FromInt(1)))
	assert.True(t, mlrval.Equals(BIF_antimode(input), mlrval.FromInt(1)))

	// Ties as map -- first-found breaks the tie
	input = array_to_map_for_test(input)
	assert.True(t, mlrval.Equals(BIF_mode(input), mlrval.FromInt(1)))
	assert.True(t, mlrval.Equals(BIF_antimode(input), mlrval.FromInt(1)))
}
// TestBIF_sum checks that BIF_sum, BIF_sum2, BIF_sum3, and BIF_sum4 reject a
// scalar, and that on the arrays [0..n-1] for n = 1 through 4 -- and on the
// equivalent maps -- they match sums of powers computed directly in Go.
func TestBIF_sum(t *testing.T) {
	// Needs array or map. This must exercise the functions under test, not
	// BIF_count.
	input := mlrval.FromInt(3)
	assert.True(t, BIF_sum(input).IsError())
	assert.True(t, BIF_sum2(input).IsError())
	assert.True(t, BIF_sum3(input).IsError())
	assert.True(t, BIF_sum4(input).IsError())

	// TODO: test empty array/map

	for n := 1; n < 5; n++ {
		input = stats_test_array(n)

		// Reference values computed with plain integer arithmetic.
		var isum1 int64
		var isum2 int64
		var isum3 int64
		var isum4 int64
		for _, e := range input.AcquireArrayValue() {
			v := e.AcquireIntValue()
			isum1 += v
			isum2 += v * v
			isum3 += v * v * v
			isum4 += v * v * v * v
		}

		assert.True(t, mlrval.Equals(BIF_sum(input), mlrval.FromInt(isum1)))
		assert.True(t, mlrval.Equals(BIF_sum2(input), mlrval.FromInt(isum2)))
		assert.True(t, mlrval.Equals(BIF_sum3(input), mlrval.FromInt(isum3)))
		assert.True(t, mlrval.Equals(BIF_sum4(input), mlrval.FromInt(isum4)))

		input = array_to_map_for_test(input)
		assert.True(t, mlrval.Equals(BIF_sum(input), mlrval.FromInt(isum1)))
		assert.True(t, mlrval.Equals(BIF_sum2(input), mlrval.FromInt(isum2)))
		assert.True(t, mlrval.Equals(BIF_sum3(input), mlrval.FromInt(isum3)))
		assert.True(t, mlrval.Equals(BIF_sum4(input), mlrval.FromInt(isum4)))
	}
}
// More easily tested (much lower keystroking) within the regression-test framework:
// BIF_mean
// BIF_meaneb
// BIF_variance
// BIF_stddev
// BIF_skewness
// BIF_kurtosis
// BIF_min
// BIF_max
// BIF_minlen
// BIF_maxlen
// BIF_median
// BIF_median_with_options
// BIF_percentile
// BIF_percentile_with_options
// BIF_percentiles
// BIF_percentiles_with_options
// BIF_sort_collection

View file

@ -29,6 +29,7 @@ type TFunctionClass string
const (
FUNC_CLASS_ARITHMETIC TFunctionClass = "arithmetic"
FUNC_CLASS_MATH TFunctionClass = "math"
FUNC_CLASS_STATS TFunctionClass = "stats"
FUNC_CLASS_BOOLEAN TFunctionClass = "boolean"
FUNC_CLASS_STRING TFunctionClass = "string"
FUNC_CLASS_HASHING TFunctionClass = "hashing"
@ -846,14 +847,14 @@ is normally distributed.`,
{
name: "max",
class: FUNC_CLASS_MATH,
help: `Max of n numbers; null loses.`,
help: `Max of n numbers; null loses. The min and max functions also recurse into arrays and maps, so they can be used to get min/max stats on array/map values.`,
variadicFunc: bifs.BIF_max_variadic,
},
{
name: "min",
class: FUNC_CLASS_MATH,
help: `Min of n numbers; null loses.`,
help: `Min of n numbers; null loses. The min and max functions also recurse into arrays and maps, so they can be used to get min/max stats on array/map values.`,
variadicFunc: bifs.BIF_min_variadic,
},
@ -958,6 +959,276 @@ is normally distributed.`,
unaryFunc: bifs.BIF_urandelement,
},
// ----------------------------------------------------------------
// FUNC_CLASS_STATS
{
name: "count",
class: FUNC_CLASS_STATS,
help: `Returns the length of an array or map. Returns error for non-array/non-map types.`,
unaryFunc: bifs.BIF_count,
examples: []string{
"count([7,8,9]) is 3",
`count({"a":7,"b":8,"c":9}) is 3`,
},
},
{
	// Lookup-table entry for the DSL function distinct_count.
	name:      "distinct_count",
	class:     FUNC_CLASS_STATS,
	help:      `Returns the number of distinct values in an array or map. Returns error for non-array/non-map types. Values are stringified for comparison, so for example string "1" and integer 1 are not distinct.`,
	unaryFunc: bifs.BIF_distinct_count,
	examples: []string{
		`distinct_count([7,8,9,7]) is 3`,
		`distinct_count([1,"1"]) is 1`,
		`distinct_count([1,1.0]) is 2`,
	},
},
{
	// Lookup-table entry for the DSL function null_count. The help text does
	// not mention stringified comparison: bifs.BIF_null_count tests each value
	// for void or null directly.
	name:      "null_count",
	class:     FUNC_CLASS_STATS,
	help:      `Returns the number of values in an array or map which are empty-string (AKA void) or JSON null. Returns error for non-array/non-map types.`,
	unaryFunc: bifs.BIF_null_count,
	examples: []string{
		`null_count(["a", "", "c"]) is 1`,
	},
},
{
name: "mode",
class: FUNC_CLASS_STATS,
help: `Returns the most frequently occurring value in an array or map. Returns error for non-array/non-map types. Values are stringified for comparison, so for example string "1" and integer 1 are not distinct. In cases of ties, first-found wins.`,
unaryFunc: bifs.BIF_mode,
examples: []string{
`mode([3,3,4,4,4]) is 4`,
`mode([3,3,4,4]) is 3`,
},
},
{
	// Lookup-table entry for the DSL function antimode: the least frequently
	// occurring value, with ties going to the first-found value.
	name:      "antimode",
	class:     FUNC_CLASS_STATS,
	help:      `Returns the least frequently occurring value in an array or map. Returns error for non-array/non-map types. Values are stringified for comparison, so for example string "1" and integer 1 are not distinct. In cases of ties, first-found wins.`,
	unaryFunc: bifs.BIF_antimode,
	examples: []string{
		`antimode([3,3,4,4,4]) is 3`,
		`antimode([3,3,4,4]) is 3`,
	},
},
{
name: "sum",
class: FUNC_CLASS_STATS,
help: `Returns the sum of values in an array or map. Returns error for non-array/non-map types.`,
unaryFunc: bifs.BIF_sum,
examples: []string{
`sum([1,2,3,4,5]) is 15`,
},
},
{
name: "sum2",
class: FUNC_CLASS_STATS,
help: `Returns the sum of squares of values in an array or map. Returns error for non-array/non-map types.`,
unaryFunc: bifs.BIF_sum2,
examples: []string{
`sum2([1,2,3,4,5]) is 55`,
},
},
{
name: "sum3",
class: FUNC_CLASS_STATS,
help: `Returns the sum of cubes of values in an array or map. Returns error for non-array/non-map types.`,
unaryFunc: bifs.BIF_sum3,
examples: []string{
`sum3([1,2,3,4,5]) is 225`,
},
},
{
name: "sum4",
class: FUNC_CLASS_STATS,
help: `Returns the sum of fourth powers of values in an array or map. Returns error for non-array/non-map types.`,
unaryFunc: bifs.BIF_sum4,
examples: []string{
`sum4([1,2,3,4,5]) is 979`,
},
},
{
name: "mean",
class: FUNC_CLASS_STATS,
help: `Returns the arithmetic mean of values in an array or map. Returns "" AKA void for empty array/map; returns error for non-array/non-map types.`,
unaryFunc: bifs.BIF_mean,
examples: []string{
`mean([4,5,7,10]) is 6.5`,
},
},
{
name: "meaneb",
class: FUNC_CLASS_STATS,
help: `Returns the error bar for arithmetic mean of values in an array or map, assuming the values are independent and identically distributed. Returns "" AKA void for empty array/map; returns error for non-array/non-map types.`,
unaryFunc: bifs.BIF_meaneb,
examples: []string{
`meaneb([4,5,7,10]) is 1.3228756`,
},
},
{
name: "variance",
class: FUNC_CLASS_STATS,
help: `Returns the sample variance of values in an array or map. Returns "" AKA void for array/map of length less than two; returns error for non-array/non-map types.`,
unaryFunc: bifs.BIF_variance,
examples: []string{
`variance([4,5,9,10,11]) is 9.7`,
},
},
{
name: "stddev",
class: FUNC_CLASS_STATS,
help: `Returns the sample standard deviation of values in an array or map. Returns "" AKA void for array/map of length less than two; returns error for non-array/non-map types.`,
unaryFunc: bifs.BIF_stddev,
examples: []string{
`stddev([4,5,9,10,11]) is 3.1144823`,
},
},
{
name: "skewness",
class: FUNC_CLASS_STATS,
help: `Returns the sample skewness of values in an array or map. Returns "" AKA void for array/map of length less than two; returns error for non-array/non-map types.`,
unaryFunc: bifs.BIF_skewness,
examples: []string{
`skewness([4,5,9,10,11]) is -0.2097285`,
},
},
{
name: "kurtosis",
class: FUNC_CLASS_STATS,
help: `Returns the sample kurtosis of values in an array or map. Returns "" AKA void for array/map of length less than two; returns error for non-array/non-map types.`,
unaryFunc: bifs.BIF_kurtosis,
examples: []string{
`kurtosis([4,5,9,10,11]) is -1.6703688`,
},
},
// minlen: single-argument stats function over an array or map, implemented by bifs.BIF_minlen.
// The example shows lengths being counted in characters rather than bytes ("año" counts as 3).
// NOTE(review): the "length less than two" wording in the help string matches the
// variance/stddev entries and may be a copy-paste; confirm what BIF_minlen returns for empty and
// single-element collections before relying on it.
{
name: "minlen",
class: FUNC_CLASS_STATS,
help: `Returns the minimum string length of values in an array or map. Returns "" AKA void for array/map of length less than two; returns error for non-array/non-map types.`,
unaryFunc: bifs.BIF_minlen,
examples: []string{
`minlen(["año", "alto"]) is 3`,
},
},
// maxlen: single-argument stats function over an array or map, implemented by bifs.BIF_maxlen.
// NOTE(review): the "length less than two" wording in the help string matches the
// variance/stddev entries and may be a copy-paste; confirm what BIF_maxlen returns for empty and
// single-element collections before relying on it.
{
name: "maxlen",
class: FUNC_CLASS_STATS,
help: `Returns the maximum string length of values in an array or map. Returns "" AKA void for array/map of length less than two; returns error for non-array/non-map types.`,
unaryFunc: bifs.BIF_maxlen,
examples: []string{
`maxlen(["año", "alto"]) is 4`,
},
},
// median: callable with one argument (the collection, via bifs.BIF_median) or two (collection
// plus an options map, via bifs.BIF_median_with_options); hasMultipleArities marks that both
// forms are registered. The option flags themselves are described under "percentiles".
{
name: "median",
class: FUNC_CLASS_STATS,
help: `Returns the median of values in an array or map. Returns "" AKA void for empty array/map; returns error for non-array/non-map types. Please see the percentiles function for information on optional flags, and on performance for large inputs.`,
unaryFunc: bifs.BIF_median,
binaryFunc: bifs.BIF_median_with_options,
hasMultipleArities: true,
examples: []string{
`median([3,4,5,6,9,10]) is 6`,
`median([3,4,5,6,9,10],{"interpolate_linearly":true}) is 5.5`,
`median(["abc", "def", "ghi", "ghi"]) is "ghi"`,
},
},
// percentile: callable with two arguments (collection and percentile, via bifs.BIF_percentile)
// or three (adding an options map, via bifs.BIF_percentile_with_options); hasMultipleArities
// marks that both forms are registered. The option flags themselves are described under
// "percentiles".
{
name: "percentile",
class: FUNC_CLASS_STATS,
help: `Returns the given percentile of values in an array or map. Returns "" AKA void for empty array/map; returns error for non-array/non-map types. Please see the percentiles function for information on optional flags, and on performance for large inputs.`,
binaryFunc: bifs.BIF_percentile,
ternaryFunc: bifs.BIF_percentile_with_options,
hasMultipleArities: true,
examples: []string{
`percentile([3,4,5,6,9,10], 90) is 10`,
`percentile([3,4,5,6,9,10], 90, {"interpolate_linearly":true}) is 9.5`,
`percentile(["abc", "def", "ghi", "ghi"], 90) is "ghi"`,
},
},
// percentiles: callable with two arguments (collection and list of percentiles, via
// bifs.BIF_percentiles) or three (adding an options map, via bifs.BIF_percentiles_with_options);
// hasMultipleArities marks that both forms are registered. The examples slice doubles as the
// long-form documentation for the option flags shared with median and percentile, so each
// element is one line of help output (empty strings are blank lines).
{
name: "percentiles",
class: FUNC_CLASS_STATS,
help: `Returns the given percentiles of values in an array or map. Returns "" AKA void for empty array/map; returns error for non-array/non-map types. See examples for information on the three option flags.`,
binaryFunc: bifs.BIF_percentiles,
ternaryFunc: bifs.BIF_percentiles_with_options,
hasMultipleArities: true,
examples: []string{
``,
`Defaults are to not interpolate linearly, to produce a map keyed by percentile name, and to sort`,
`the input before computing percentiles:`,
``,
` percentiles([3,4,5,6,9,10], [25,75]) is { "25": 4, "75": 9 }`,
` percentiles(["abc", "def", "ghi", "ghi"], [25,75]) is { "25": "def", "75": "ghi" }`,
``,
`Use "output_array_not_map" (or shorthand "oa") to get the outputs as an array:`,
``,
` percentiles([3,4,5,6,9,10], [25,75], {"output_array_not_map":true}) is [4, 9]`,
``,
`Use "interpolate_linearly" (or shorthand "il") to do linear interpolation -- note this produces`,
`error on string inputs:`,
``,
` percentiles([3,4,5,6,9,10], [25,75], {"interpolate_linearly":true}) is { "25": 4.25, "75": 8.25 }`,
``,
`The percentiles function always sorts its inputs before computing percentiles. If you know your input`,
`is already sorted -- see also the sort_collection function -- then computation will be faster on`,
`large input if you pass in "array_is_sorted":`,
``,
` x = [6,5,9,10,4,3]`,
` percentiles(x, [25,75], {"array_is_sorted":true}) gives { "25": 5, "75": 4 } which is incorrect`,
` x = sort_collection(x)`,
` percentiles(x, [25,75], {"array_is_sorted":true}) gives { "25": 4, "75": 9 } which is correct`,
``,
`You can also leverage this feature to compute percentiles on a sort of your choosing. For example:`,
``,
` Non-sorted input:`,
` x = splitax("the quick brown fox jumped loquaciously over the lazy dogs", " ")`,
` x is: ["the", "quick", "brown", "fox", "jumped", "loquaciously", "over", "the", "lazy", "dogs"]`,
` Percentiles are taken over the original positions of the words in the array -- "dogs" is last`,
` and hence appears as p99:`,
` percentiles(x, [50, 99], {"oa":true, "ais":true}) gives ["loquaciously", "dogs"]`,
` With sorting done inside percentiles, "the" is alphabetically last and is therefore the p99:`,
` percentiles(x, [50, 99], {"oa":true}) gives ["loquaciously", "the"]`,
` With default sorting done outside percentiles, the same:`,
` x = sort(x) # or x = sort_collection(x)`,
` x is: ["brown", "dogs", "fox", "jumped", "lazy", "loquaciously", "over", "quick", "the", "the"]`,
` percentiles(x, [50, 99], {"oa":true, "ais":true}) gives ["loquaciously", "the"]`,
` percentiles(x, [50, 99], {"oa":true}) gives ["loquaciously", "the"]`,
` Now sorting by word length, "loquaciously" is longest and hence is the p99:`,
` x = sort(x, func(a,b) { return strlen(a) <=> strlen(b) } )`,
` x is: ["fox", "the", "the", "dogs", "lazy", "over", "brown", "quick", "jumped", "loquaciously"]`,
` percentiles(x, [50, 99], {"oa":true, "ais":true})`,
` ["over", "loquaciously"]`,
},
},
// sort_collection: single-argument function implemented by bifs.BIF_sort_collection. It carries
// no examples of its own; its help text defers to the percentiles entry.
{
name: "sort_collection",
class: FUNC_CLASS_STATS,
help: `This is a helper function for the percentiles function; please see its online help for details.`,
unaryFunc: bifs.BIF_sort_collection,
examples: []string{},
},
// ----------------------------------------------------------------
// FUNC_CLASS_TIME

View file

@ -739,3 +739,51 @@ func LengthenMlrvalArray(array *[]*Mlrval, newLength64 int) {
*array = newArray
}
}
// ArrayFold collapses the elements of an array into one value. The accumulator begins as initval,
// and each element is merged into it, left to right, via f(accumulator, element). For instance,
// with initval 0 and f(a,b) = a+b, the result is the sum of the array's elements. An empty array
// yields initval itself.
func ArrayFold(
	a []*Mlrval,
	initval *Mlrval,
	f func(a, b *Mlrval) *Mlrval,
) *Mlrval {
	result := initval
	for i := 0; i < len(a); i++ {
		result = f(result, a[i])
	}
	return result
}
// MapFold collapses the values of a map into one value. The accumulator begins as initval, and
// each entry's value is merged into it via f(accumulator, value), walking the entries from
// m.Head along the Next links. For instance, with initval 0 and f(a,b) = a+b, the result is the
// sum of the map's values. Keys are never consulted. An empty map yields initval itself.
func MapFold(
	m *Mlrmap,
	initval *Mlrval,
	f func(a, b *Mlrval) *Mlrval,
) *Mlrval {
	result := initval
	entry := m.Head
	for entry != nil {
		result = f(result, entry.Value)
		entry = entry.Next
	}
	return result
}
// CollectionFold dispatches to ArrayFold when c is an array and to MapFold when c is a map.
//
// Anything else panics. That is deliberately blunt: adding an error return would make the API
// more cumbersome, and callers (internal/library functions, not directly user-facing) are
// expected to have verified beforehand that c is an array or a map. The panic is only a
// last-resort backstop, not the primary validation.
func CollectionFold(
	c *Mlrval,
	initval *Mlrval,
	f func(a, b *Mlrval) *Mlrval,
) *Mlrval {
	if c.IsArray() {
		return ArrayFold(c.AcquireArrayValue(), initval, f)
	}
	if c.IsMap() {
		return MapFold(c.AcquireMapValue(), initval, f)
	}
	panic("CollectionFold argument is neither array nor map")
}

View file

@ -222,6 +222,12 @@ func FromArray(arrayval []*Mlrval) *Mlrval {
}
}
// FromSingletonArray wraps a single element in a one-element array and returns the result of
// FromArray on it. The element pointer is stored as given, not copied.
func FromSingletonArray(element *Mlrval) *Mlrval {
	return FromArray([]*Mlrval{element})
}
// FromEmptyArray returns the result of FromArray on a freshly allocated, zero-length array.
func FromEmptyArray() *Mlrval {
	return FromArray([]*Mlrval{})
}

View file

@ -6,7 +6,6 @@ package utils
import (
"fmt"
"math"
"sort"
"github.com/johnkerl/miller/internal/pkg/bifs"
@ -55,209 +54,6 @@ func (keeper *PercentileKeeper) Ingest(value *mlrval.Mlrval) {
keeper.sorted = false
}
// ================================================================
// Non-interpolated percentiles (see also https://en.wikipedia.org/wiki/Percentile)
// ----------------------------------------------------------------
// OPTION 1: int index = p*n/100.0;
//
// x
// 0
// 20
// 40
// 60
// 80
// 100
//
// x_p00 0 x_p10 0 x_p20 20 x_p30 20 x_p40 40 x_p50 60 x_p60 60 x_p70 80 x_p80 80 x_p90 100 x_p100 100
// x_p01 0 x_p11 0 x_p21 20 x_p31 20 x_p41 40 x_p51 60 x_p61 60 x_p71 80 x_p81 80 x_p91 100
// x_p02 0 x_p12 0 x_p22 20 x_p32 20 x_p42 40 x_p52 60 x_p62 60 x_p72 80 x_p82 80 x_p92 100
// x_p03 0 x_p13 0 x_p23 20 x_p33 20 x_p43 40 x_p53 60 x_p63 60 x_p73 80 x_p83 80 x_p93 100
// x_p04 0 x_p14 0 x_p24 20 x_p34 40 x_p44 40 x_p54 60 x_p64 60 x_p74 80 x_p84 100 x_p94 100
// x_p05 0 x_p15 0 x_p25 20 x_p35 40 x_p45 40 x_p55 60 x_p65 60 x_p75 80 x_p85 100 x_p95 100
// x_p06 0 x_p16 0 x_p26 20 x_p36 40 x_p46 40 x_p56 60 x_p66 60 x_p76 80 x_p86 100 x_p96 100
// x_p07 0 x_p17 20 x_p27 20 x_p37 40 x_p47 40 x_p57 60 x_p67 80 x_p77 80 x_p87 100 x_p97 100
// x_p08 0 x_p18 20 x_p28 20 x_p38 40 x_p48 40 x_p58 60 x_p68 80 x_p78 80 x_p88 100 x_p98 100
// x_p09 0 x_p19 20 x_p29 20 x_p39 40 x_p49 40 x_p59 60 x_p69 80 x_p79 80 x_p89 100 x_p99 100
//
// x
// 0
// 25
// 50
// 75
// 100
//
// x_p00 0 x_p10 0 x_p20 25 x_p30 25 x_p40 50 x_p50 50 x_p60 75 x_p70 75 x_p80 100 x_p90 100 x_p100 100
// x_p01 0 x_p11 0 x_p21 25 x_p31 25 x_p41 50 x_p51 50 x_p61 75 x_p71 75 x_p81 100 x_p91 100
// x_p02 0 x_p12 0 x_p22 25 x_p32 25 x_p42 50 x_p52 50 x_p62 75 x_p72 75 x_p82 100 x_p92 100
// x_p03 0 x_p13 0 x_p23 25 x_p33 25 x_p43 50 x_p53 50 x_p63 75 x_p73 75 x_p83 100 x_p93 100
// x_p04 0 x_p14 0 x_p24 25 x_p34 25 x_p44 50 x_p54 50 x_p64 75 x_p74 75 x_p84 100 x_p94 100
// x_p05 0 x_p15 0 x_p25 25 x_p35 25 x_p45 50 x_p55 50 x_p65 75 x_p75 75 x_p85 100 x_p95 100
// x_p06 0 x_p16 0 x_p26 25 x_p36 25 x_p46 50 x_p56 50 x_p66 75 x_p76 75 x_p86 100 x_p96 100
// x_p07 0 x_p17 0 x_p27 25 x_p37 25 x_p47 50 x_p57 50 x_p67 75 x_p77 75 x_p87 100 x_p97 100
// x_p08 0 x_p18 0 x_p28 25 x_p38 25 x_p48 50 x_p58 50 x_p68 75 x_p78 75 x_p88 100 x_p98 100
// x_p09 0 x_p19 0 x_p29 25 x_p39 25 x_p49 50 x_p59 50 x_p69 75 x_p79 75 x_p89 100 x_p99 100
//
// ----------------------------------------------------------------
// OPTION 2: int index = p*(n-1)/100.0;
//
// x
// 0
// 20
// 40
// 60
// 80
// 100
//
// x_p00 0 x_p10 0 x_p20 20 x_p30 20 x_p40 40 x_p50 40 x_p60 60 x_p70 60 x_p80 80 x_p90 80 x_p100 100
// x_p01 0 x_p11 0 x_p21 20 x_p31 20 x_p41 40 x_p51 40 x_p61 60 x_p71 60 x_p81 80 x_p91 80
// x_p02 0 x_p12 0 x_p22 20 x_p32 20 x_p42 40 x_p52 40 x_p62 60 x_p72 60 x_p82 80 x_p92 80
// x_p03 0 x_p13 0 x_p23 20 x_p33 20 x_p43 40 x_p53 40 x_p63 60 x_p73 60 x_p83 80 x_p93 80
// x_p04 0 x_p14 0 x_p24 20 x_p34 20 x_p44 40 x_p54 40 x_p64 60 x_p74 60 x_p84 80 x_p94 80
// x_p05 0 x_p15 0 x_p25 20 x_p35 20 x_p45 40 x_p55 40 x_p65 60 x_p75 60 x_p85 80 x_p95 80
// x_p06 0 x_p16 0 x_p26 20 x_p36 20 x_p46 40 x_p56 40 x_p66 60 x_p76 60 x_p86 80 x_p96 80
// x_p07 0 x_p17 0 x_p27 20 x_p37 20 x_p47 40 x_p57 40 x_p67 60 x_p77 60 x_p87 80 x_p97 80
// x_p08 0 x_p18 0 x_p28 20 x_p38 20 x_p48 40 x_p58 40 x_p68 60 x_p78 60 x_p88 80 x_p98 80
// x_p09 0 x_p19 0 x_p29 20 x_p39 20 x_p49 40 x_p59 40 x_p69 60 x_p79 60 x_p89 80 x_p99 80
//
// x
// 0
// 25
// 50
// 75
// 100
//
// x_p00 0 x_p10 0 x_p20 0 x_p30 25 x_p40 25 x_p50 50 x_p60 50 x_p70 50 x_p80 75 x_p90 75 x_p100 100
// x_p01 0 x_p11 0 x_p21 0 x_p31 25 x_p41 25 x_p51 50 x_p61 50 x_p71 50 x_p81 75 x_p91 75
// x_p02 0 x_p12 0 x_p22 0 x_p32 25 x_p42 25 x_p52 50 x_p62 50 x_p72 50 x_p82 75 x_p92 75
// x_p03 0 x_p13 0 x_p23 0 x_p33 25 x_p43 25 x_p53 50 x_p63 50 x_p73 50 x_p83 75 x_p93 75
// x_p04 0 x_p14 0 x_p24 0 x_p34 25 x_p44 25 x_p54 50 x_p64 50 x_p74 50 x_p84 75 x_p94 75
// x_p05 0 x_p15 0 x_p25 25 x_p35 25 x_p45 25 x_p55 50 x_p65 50 x_p75 75 x_p85 75 x_p95 75
// x_p06 0 x_p16 0 x_p26 25 x_p36 25 x_p46 25 x_p56 50 x_p66 50 x_p76 75 x_p86 75 x_p96 75
// x_p07 0 x_p17 0 x_p27 25 x_p37 25 x_p47 25 x_p57 50 x_p67 50 x_p77 75 x_p87 75 x_p97 75
// x_p08 0 x_p18 0 x_p28 25 x_p38 25 x_p48 25 x_p58 50 x_p68 50 x_p78 75 x_p88 75 x_p98 75
// x_p09 0 x_p19 0 x_p29 25 x_p39 25 x_p49 25 x_p59 50 x_p69 50 x_p79 75 x_p89 75 x_p99 75
//
// ----------------------------------------------------------------
// OPTION 3: int index = (int)ceil(p*(n-1)/100.0);
//
// x
// 0
// 20
// 40
// 60
// 80
// 100
//
// x_p00 0 x_p10 20 x_p20 20 x_p30 40 x_p40 40 x_p50 60 x_p60 60 x_p70 80 x_p80 80 x_p90 100 x_p100 100
// x_p01 20 x_p11 20 x_p21 40 x_p31 40 x_p41 60 x_p51 60 x_p61 80 x_p71 80 x_p81 100 x_p91 100
// x_p02 20 x_p12 20 x_p22 40 x_p32 40 x_p42 60 x_p52 60 x_p62 80 x_p72 80 x_p82 100 x_p92 100
// x_p03 20 x_p13 20 x_p23 40 x_p33 40 x_p43 60 x_p53 60 x_p63 80 x_p73 80 x_p83 100 x_p93 100
// x_p04 20 x_p14 20 x_p24 40 x_p34 40 x_p44 60 x_p54 60 x_p64 80 x_p74 80 x_p84 100 x_p94 100
// x_p05 20 x_p15 20 x_p25 40 x_p35 40 x_p45 60 x_p55 60 x_p65 80 x_p75 80 x_p85 100 x_p95 100
// x_p06 20 x_p16 20 x_p26 40 x_p36 40 x_p46 60 x_p56 60 x_p66 80 x_p76 80 x_p86 100 x_p96 100
// x_p07 20 x_p17 20 x_p27 40 x_p37 40 x_p47 60 x_p57 60 x_p67 80 x_p77 80 x_p87 100 x_p97 100
// x_p08 20 x_p18 20 x_p28 40 x_p38 40 x_p48 60 x_p58 60 x_p68 80 x_p78 80 x_p88 100 x_p98 100
// x_p09 20 x_p19 20 x_p29 40 x_p39 40 x_p49 60 x_p59 60 x_p69 80 x_p79 80 x_p89 100 x_p99 100
//
// x
// 0
// 25
// 50
// 75
// 100
//
// x_p00 0 x_p10 25 x_p20 25 x_p30 50 x_p40 50 x_p50 50 x_p60 75 x_p70 75 x_p80 100 x_p90 100 x_p100 100
// x_p01 25 x_p11 25 x_p21 25 x_p31 50 x_p41 50 x_p51 75 x_p61 75 x_p71 75 x_p81 100 x_p91 100
// x_p02 25 x_p12 25 x_p22 25 x_p32 50 x_p42 50 x_p52 75 x_p62 75 x_p72 75 x_p82 100 x_p92 100
// x_p03 25 x_p13 25 x_p23 25 x_p33 50 x_p43 50 x_p53 75 x_p63 75 x_p73 75 x_p83 100 x_p93 100
// x_p04 25 x_p14 25 x_p24 25 x_p34 50 x_p44 50 x_p54 75 x_p64 75 x_p74 75 x_p84 100 x_p94 100
// x_p05 25 x_p15 25 x_p25 25 x_p35 50 x_p45 50 x_p55 75 x_p65 75 x_p75 75 x_p85 100 x_p95 100
// x_p06 25 x_p16 25 x_p26 50 x_p36 50 x_p46 50 x_p56 75 x_p66 75 x_p76 100 x_p86 100 x_p96 100
// x_p07 25 x_p17 25 x_p27 50 x_p37 50 x_p47 50 x_p57 75 x_p67 75 x_p77 100 x_p87 100 x_p97 100
// x_p08 25 x_p18 25 x_p28 50 x_p38 50 x_p48 50 x_p58 75 x_p68 75 x_p78 100 x_p88 100 x_p98 100
// x_p09 25 x_p19 25 x_p29 50 x_p39 50 x_p49 50 x_p59 75 x_p69 75 x_p79 100 x_p89 100 x_p99 100
//
// ----------------------------------------------------------------
// OPTION 4: int index = (int)ceil(-0.5 + p*(n-1)/100.0);
//
// x
// 0
// 20
// 40
// 60
// 80
// 100
//
// x_p00 0 x_p10 0 x_p20 20 x_p30 20 x_p40 40 x_p50 40 x_p60 60 x_p70 60 x_p80 80 x_p90 80 x_p100 100
// x_p01 0 x_p11 20 x_p21 20 x_p31 40 x_p41 40 x_p51 60 x_p61 60 x_p71 80 x_p81 80 x_p91 100
// x_p02 0 x_p12 20 x_p22 20 x_p32 40 x_p42 40 x_p52 60 x_p62 60 x_p72 80 x_p82 80 x_p92 100
// x_p03 0 x_p13 20 x_p23 20 x_p33 40 x_p43 40 x_p53 60 x_p63 60 x_p73 80 x_p83 80 x_p93 100
// x_p04 0 x_p14 20 x_p24 20 x_p34 40 x_p44 40 x_p54 60 x_p64 60 x_p74 80 x_p84 80 x_p94 100
// x_p05 0 x_p15 20 x_p25 20 x_p35 40 x_p45 40 x_p55 60 x_p65 60 x_p75 80 x_p85 80 x_p95 100
// x_p06 0 x_p16 20 x_p26 20 x_p36 40 x_p46 40 x_p56 60 x_p66 60 x_p76 80 x_p86 80 x_p96 100
// x_p07 0 x_p17 20 x_p27 20 x_p37 40 x_p47 40 x_p57 60 x_p67 60 x_p77 80 x_p87 80 x_p97 100
// x_p08 0 x_p18 20 x_p28 20 x_p38 40 x_p48 40 x_p58 60 x_p68 60 x_p78 80 x_p88 80 x_p98 100
// x_p09 0 x_p19 20 x_p29 20 x_p39 40 x_p49 40 x_p59 60 x_p69 60 x_p79 80 x_p89 80 x_p99 100
//
// x
// 0
// 25
// 50
// 75
// 100
//
// x_p00 0 x_p10 0 x_p20 25 x_p30 25 x_p40 50 x_p50 50 x_p60 50 x_p70 75 x_p80 75 x_p90 100 x_p100 100
// x_p01 0 x_p11 0 x_p21 25 x_p31 25 x_p41 50 x_p51 50 x_p61 50 x_p71 75 x_p81 75 x_p91 100
// x_p02 0 x_p12 0 x_p22 25 x_p32 25 x_p42 50 x_p52 50 x_p62 50 x_p72 75 x_p82 75 x_p92 100
// x_p03 0 x_p13 25 x_p23 25 x_p33 25 x_p43 50 x_p53 50 x_p63 75 x_p73 75 x_p83 75 x_p93 100
// x_p04 0 x_p14 25 x_p24 25 x_p34 25 x_p44 50 x_p54 50 x_p64 75 x_p74 75 x_p84 75 x_p94 100
// x_p05 0 x_p15 25 x_p25 25 x_p35 25 x_p45 50 x_p55 50 x_p65 75 x_p75 75 x_p85 75 x_p95 100
// x_p06 0 x_p16 25 x_p26 25 x_p36 25 x_p46 50 x_p56 50 x_p66 75 x_p76 75 x_p86 75 x_p96 100
// x_p07 0 x_p17 25 x_p27 25 x_p37 25 x_p47 50 x_p57 50 x_p67 75 x_p77 75 x_p87 75 x_p97 100
// x_p08 0 x_p18 25 x_p28 25 x_p38 50 x_p48 50 x_p58 50 x_p68 75 x_p78 75 x_p88 100 x_p98 100
// x_p09 0 x_p19 25 x_p29 25 x_p39 50 x_p49 50 x_p59 50 x_p69 75 x_p79 75 x_p89 100 x_p99 100
//
// ----------------------------------------------------------------
// CONCLUSION:
// * I like option 2 for its simplicity ...
// * ... but option 1 matches R's quantile with type=1.
// * (Note that Miller's interpolated percentiles match R's quantile with type=7)
// ----------------------------------------------------------------
// computeIndexNoninterpolated maps percentile p (nominally 0..100) onto an index into a sorted
// array of length n, using "option 1" from the discussion above: the integer part of p*n/100.
// The result is then forced into range: anything at or past n becomes n-1, and anything
// negative (including the n == 0 case, where n-1 is -1) becomes 0.
//
// The other candidate formulas from the discussion above, kept for reference:
//   option 2: p * (n - 1) / 100
//   option 3: ceil(p * (n - 1) / 100)
//   option 4: ceil(-0.5 + p*(n-1)/100)
func computeIndexNoninterpolated(n int, p float64) int {
	idx := int(p * float64(n) / 100.0)
	if idx >= n {
		idx = n - 1
	}
	if idx < 0 {
		return 0
	}
	return idx
}
// getPercentileLinearlyInterpolated returns the p-th percentile over the first n elements of
// array, interpolating linearly between the two neighboring elements when the fractional index
// (p/100)*(n-1) falls between positions.
//
// Out-of-range percentiles are clamped rather than indexed blindly: p below 0 yields the first
// element and p at or above 100 yields the last one. This matches the clamping done by
// computeIndexNoninterpolated. Previously p > 100 produced an index past n-1 and an
// out-of-bounds panic.
//
// Preconditions: n must be at least 1 and no greater than len(array); the empty case is not
// handled here. The array is presumably already sorted -- the PercentileKeeper caller in this
// file calls sortIfNecessary first -- TODO confirm for any other callers.
//
// xxx pending pointer-output refactor
func getPercentileLinearlyInterpolated(array []*mlrval.Mlrval, n int, p float64) mlrval.Mlrval {
	findex := (p / 100.0) * (float64(n) - 1)
	if findex < 0.0 {
		findex = 0.0
	}
	iindex := int(math.Floor(findex))
	if iindex >= n-1 {
		// At or beyond the last position there is no right-hand neighbor to interpolate with,
		// so return the last element itself.
		return *array[n-1].Copy()
	}
	// array[iindex] + frac * (array[iindex+1] - array[iindex])
	// TODO: just do this in float64.
	frac := mlrval.FromFloat(findex - float64(iindex))
	diff := bifs.BIF_minus_binary(array[iindex+1], array[iindex])
	prod := bifs.BIF_times(frac, diff)
	return *bifs.BIF_plus_binary(array[iindex], prod)
}
// ----------------------------------------------------------------
func (keeper *PercentileKeeper) sortIfNecessary() {
if !keeper.sorted {
@ -282,7 +78,7 @@ func (keeper *PercentileKeeper) EmitNonInterpolated(percentile float64) *mlrval.
return mlrval.VOID
}
keeper.sortIfNecessary()
return keeper.data[computeIndexNoninterpolated(int(len(keeper.data)), percentile)].Copy()
return bifs.GetPercentileNonInterpolated(keeper.data, int(len(keeper.data)), percentile)
}
func (keeper *PercentileKeeper) EmitLinearlyInterpolated(percentile float64) *mlrval.Mlrval {
@ -290,8 +86,7 @@ func (keeper *PercentileKeeper) EmitLinearlyInterpolated(percentile float64) *ml
return mlrval.VOID
}
keeper.sortIfNecessary()
output := getPercentileLinearlyInterpolated(keeper.data, int(len(keeper.data)), percentile)
return output.Copy()
return bifs.GetPercentileLinearlyInterpolated(keeper.data, int(len(keeper.data)), percentile)
}
// ----------------------------------------------------------------

View file

@ -615,7 +615,7 @@ func (acc *Stats1VarAccumulator) Ingest(value *mlrval.Mlrval) {
}
}
func (acc *Stats1VarAccumulator) Emit() *mlrval.Mlrval {
return bifs.BIF_get_var(mlrval.FromInt(acc.count), acc.sum, acc.sum2)
return bifs.BIF_finalize_variance(mlrval.FromInt(acc.count), acc.sum, acc.sum2)
}
func (acc *Stats1VarAccumulator) Reset() {
acc.count = 0
@ -646,7 +646,7 @@ func (acc *Stats1StddevAccumulator) Ingest(value *mlrval.Mlrval) {
}
}
func (acc *Stats1StddevAccumulator) Emit() *mlrval.Mlrval {
return bifs.BIF_get_stddev(mlrval.FromInt(acc.count), acc.sum, acc.sum2)
return bifs.BIF_finalize_stddev(mlrval.FromInt(acc.count), acc.sum, acc.sum2)
}
func (acc *Stats1StddevAccumulator) Reset() {
acc.count = 0
@ -678,7 +678,7 @@ func (acc *Stats1MeanEBAccumulator) Ingest(value *mlrval.Mlrval) {
}
func (acc *Stats1MeanEBAccumulator) Emit() *mlrval.Mlrval {
mcount := mlrval.FromInt(acc.count)
return bifs.BIF_get_mean_EB(mcount, acc.sum, acc.sum2)
return bifs.BIF_finalize_mean_eb(mcount, acc.sum, acc.sum2)
}
func (acc *Stats1MeanEBAccumulator) Reset() {
acc.count = 0
@ -714,7 +714,7 @@ func (acc *Stats1SkewnessAccumulator) Ingest(value *mlrval.Mlrval) {
}
func (acc *Stats1SkewnessAccumulator) Emit() *mlrval.Mlrval {
mcount := mlrval.FromInt(acc.count)
return bifs.BIF_get_skewness(mcount, acc.sum, acc.sum2, acc.sum3)
return bifs.BIF_finalize_skewness(mcount, acc.sum, acc.sum2, acc.sum3)
}
func (acc *Stats1SkewnessAccumulator) Reset() {
acc.count = 0
@ -755,7 +755,7 @@ func (acc *Stats1KurtosisAccumulator) Ingest(value *mlrval.Mlrval) {
}
func (acc *Stats1KurtosisAccumulator) Emit() *mlrval.Mlrval {
mcount := mlrval.FromInt(acc.count)
return bifs.BIF_get_kurtosis(mcount, acc.sum, acc.sum2, acc.sum3, acc.sum4)
return bifs.BIF_finalize_kurtosis(mcount, acc.sum, acc.sum2, acc.sum3, acc.sum4)
}
func (acc *Stats1KurtosisAccumulator) Reset() {
acc.count = 0

View file

@ -182,32 +182,34 @@ MILLER(1) MILLER(1)
unsparsify
1mFUNCTION LIST0m
abs acos acosh any append apply arrayify asin asinh asserting_absent
abs acos acosh antimode any append apply arrayify asin asinh asserting_absent
asserting_array asserting_bool asserting_boolean asserting_empty
asserting_empty_map asserting_error asserting_float asserting_int
asserting_map asserting_nonempty_map asserting_not_array asserting_not_empty
asserting_not_map asserting_not_null asserting_null asserting_numeric
asserting_present asserting_string atan atan2 atanh bitcount boolean
capitalize cbrt ceil clean_whitespace collapse_whitespace concat cos cosh
depth dhms2fsec dhms2sec erf erfc every exec exp expm1 flatten float floor
fmtifnum fmtnum fold format fsec2dhms fsec2hms get_keys get_values
gmt2localtime gmt2nsec gmt2sec gssub gsub haskey hexfmt hms2fsec hms2sec
hostname index int invqnorm is_absent is_array is_bool is_boolean is_empty
is_empty_map is_error is_float is_int is_map is_nan is_nonempty_map
count depth dhms2fsec dhms2sec distinct_count erf erfc every exec exp expm1
flatten float floor fmtifnum fmtnum fold format fsec2dhms fsec2hms get_keys
get_values gmt2localtime gmt2nsec gmt2sec gssub gsub haskey hexfmt hms2fsec
hms2sec hostname index int invqnorm is_absent is_array is_bool is_boolean
is_empty is_empty_map is_error is_float is_int is_map is_nan is_nonempty_map
is_not_array is_not_empty is_not_map is_not_null is_null is_numeric is_present
is_string joink joinkv joinv json_parse json_stringify latin1_to_utf8
is_string joink joinkv joinv json_parse json_stringify kurtosis latin1_to_utf8
leafcount leftpad length localtime2gmt localtime2nsec localtime2sec log log10
log1p logifit lstrip madd mapdiff mapexcept mapselect mapsum max md5 mexp min
mmul msub nsec2gmt nsec2gmtdate nsec2localdate nsec2localtime os pow qnorm
log1p logifit lstrip madd mapdiff mapexcept mapselect mapsum max maxlen md5
mean meaneb median mexp min minlen mmul mode msub nsec2gmt nsec2gmtdate
nsec2localdate nsec2localtime null_count os percentile percentiles pow qnorm
reduce regextract regextract_or_else rightpad round roundm rstrip sec2dhms
sec2gmt sec2gmtdate sec2hms sec2localdate sec2localtime select sgn sha1 sha256
sha512 sin sinh sort splita splitax splitkv splitkvx splitnv splitnvx sqrt
ssub strfntime strfntime_local strftime strftime_local string strip strlen
strpntime strpntime_local strptime strptime_local sub substr substr0 substr1
sysntime system systime systimeint tan tanh tolower toupper truncate typeof
unflatten unformat unformatx upntime uptime urand urand32 urandelement
urandint urandrange utf8_to_latin1 version ! != !=~ % & && * ** + - . .* .+ .-
./ / // < << <= <=> == =~ > >= >> >>> ?: ?? ??? ^ ^^ | || ~
sha512 sin sinh skewness sort sort_collection splita splitax splitkv splitkvx
splitnv splitnvx sqrt ssub stddev strfntime strfntime_local strftime
strftime_local string strip strlen strpntime strpntime_local strptime
strptime_local sub substr substr0 substr1 sum sum2 sum3 sum4 sysntime system
systime systimeint tan tanh tolower toupper truncate typeof unflatten unformat
unformatx upntime uptime urand urand32 urandelement urandint urandrange
utf8_to_latin1 variance version ! != !=~ % & && * ** + - . .* .+ .- ./ / // <
<< <= <=> == =~ > >= >> >>> ?: ?? ??? ^ ^^ | || ~
1mCOMMENTS-IN-DATA FLAGS0m
Miller lets you put comments in your data, such as
@ -2164,6 +2166,12 @@ MILLER(1) MILLER(1)
1macosh0m
(class=math #args=1) Inverse hyperbolic cosine.
1mantimode0m
(class=stats #args=1) Returns the least frequently occurring value in an array or map. Returns error for non-array/non-map types. Values are stringified for comparison, so for example string "1" and integer 1 are not distinct. In cases of ties, first-found wins.
Examples:
antimode([3,3,4,4,4]) is 3
antimode([3,3,4,4]) is 3
1many0m
(class=higher-order-functions #args=2) Given a map or array as first argument and a function as second argument, yields a boolean true if the argument function returns true for any array/map element, false otherwise. For arrays, the function should take one argument, for array element; for maps, it should take two, for map-element key and value. In either case it should return a boolean.
Examples:
@ -2288,6 +2296,12 @@ MILLER(1) MILLER(1)
1mcosh0m
(class=math #args=1) Hyperbolic cosine.
1mcount0m
(class=stats #args=1) Returns the length of an array or map. Returns error for non-array/non-map types.
Examples:
count([7,8,9]) is 3
count({"a":7,"b":8,"c":9}) is 3
1mdepth0m
(class=collections #args=1) Prints maximum depth of map/array. Scalars have depth 0.
@ -2297,6 +2311,13 @@ MILLER(1) MILLER(1)
1mdhms2sec0m
(class=time #args=1) Recovers integer seconds as in dhms2sec("5d18h53m20s") = 500000
1mdistinct_count0m
(class=stats #args=1) Returns the number of distinct values in an array or map. Returns error for non-array/non-map types. Values are stringified for comparison, so for example string "1" and integer 1 are not distinct.
Examples:
distinct_count([7,8,9,7]) is 3
distinct_count([1,"1"]) is 1
distinct_count([1,1.0]) is 2
1merf0m
(class=math #args=1) Error function.
@ -2521,6 +2542,11 @@ MILLER(1) MILLER(1)
1mjson_stringify0m
(class=collections #args=1,2) Converts value to JSON-formatted string. Default output is single-line. With optional second boolean argument set to true, produces multiline output.
1mkurtosis0m
(class=stats #args=1) Returns the sample kurtosis of values in an array or map. Returns "" AKA void for array/map of length less than two; returns error for non-array/non-map types.
Example:
kurtosis([4,5,9,10,11]) is -1.6703688
1mlatin1_to_utf80m
(class=string #args=1) Tries to convert Latin-1-encoded string to UTF-8-encoded string. If argument is array or map, recurses into it.
Examples:
@ -2589,20 +2615,53 @@ MILLER(1) MILLER(1)
(class=collections #args=variadic) With 0 args, returns empty map. With >= 1 arg, returns a map with key-value pairs from all arguments. Rightmost collisions win, e.g. 'mapsum({1:2,3:4},{1:5})' is '{1:5,3:4}'.
1mmax0m
(class=math #args=variadic) Max of n numbers; null loses.
(class=math #args=variadic) Max of n numbers; null loses. The min and max functions also recurse into arrays and maps, so they can be used to get min/max stats on array/map values.
1mmaxlen0m
(class=stats #args=1) Returns the maximum string length of values in an array or map. Returns "" AKA void for array/map of length less than two; returns error for non-array/non-map types.
Example:
maxlen(["año", "alto"]) is 4
1mmd50m
(class=hashing #args=1) MD5 hash.
1mmean0m
(class=stats #args=1) Returns the arithmetic mean of values in an array or map. Returns "" AKA void for empty array/map; returns error for non-array/non-map types.
Example:
mean([4,5,7,10]) is 6.5
1mmeaneb0m
(class=stats #args=1) Returns the error bar for arithmetic mean of values in an array or map, assuming the values are independent and identically distributed. Returns "" AKA void for empty array/map; returns error for non-array/non-map types.
Example:
meaneb([4,5,7,10]) is 1.3228756
1mmedian0m
(class=stats #args=1,2) Returns the median of values in an array or map. Returns "" AKA void for empty array/map; returns error for non-array/non-map types. Please see the percentiles function for information on optional flags, and on performance for large inputs.
Examples:
median([3,4,5,6,9,10]) is 6
median([3,4,5,6,9,10],{"interpolate_linearly":true}) is 5.5
median(["abc", "def", "ghi", "ghi"]) is "ghi"
1mmexp0m
(class=arithmetic #args=3) a ** b mod m (integers)
1mmin0m
(class=math #args=variadic) Min of n numbers; null loses.
(class=math #args=variadic) Min of n numbers; null loses. The min and max functions also recurse into arrays and maps, so they can be used to get min/max stats on array/map values.
1mminlen0m
(class=stats #args=1) Returns the minimum string length of values in an array or map. Returns "" AKA void for array/map of length less than two; returns error for non-array/non-map types.
Example:
minlen(["año", "alto"]) is 3
1mmmul0m
(class=arithmetic #args=3) a * b mod m (integers)
1mmode0m
(class=stats #args=1) Returns the most frequently occurring value in an array or map. Returns error for non-array/non-map types. Values are stringified for comparison, so for example string "1" and integer 1 are not distinct. In cases of ties, first-found wins.
Examples:
mode([3,3,4,4,4]) is 4
mode([3,3,4,4]) is 3
1mmsub0m
(class=arithmetic #args=3) a - b mod m (integers)
@ -2632,9 +2691,70 @@ MILLER(1) MILLER(1)
nsec2localtime(1234567890123456789, 6) = "2009-02-14 01:31:30.123456" with TZ="Asia/Istanbul"
nsec2localtime(1234567890123456789, 6, "Asia/Istanbul") = "2009-02-14 01:31:30.123456"
1mnull_count0m
(class=stats #args=1) Returns the number of values in an array or map which are empty-string (AKA void) or JSON null. Returns error for non-array/non-map types. Values are stringified for comparison, so for example string "1" and integer 1 are not distinct.
Example:
null_count(["a", "", "c"]) is 1
1mos0m
(class=system #args=0) Returns the operating-system name as a string.
1mpercentile0m
(class=stats #args=2,3) Returns the given percentile of values in an array or map. Returns "" AKA void for empty array/map; returns error for non-array/non-map types. Please see the percentiles function for information on optional flags, and on performance for large inputs.
Examples:
percentile([3,4,5,6,9,10], 90) is 10
percentile([3,4,5,6,9,10], 90, {"interpolate_linearly":true}) is 9.5
percentile(["abc", "def", "ghi", "ghi"], 90) is "ghi"
1mpercentiles0m
(class=stats #args=2,3) Returns the given percentiles of values in an array or map. Returns "" AKA void for empty array/map; returns error for non-array/non-map types. See examples for information on the three option flags.
Examples:
Defaults are to not interpolate linearly, to produce a map keyed by percentile name, and to sort
the input before computing percentiles:
percentiles([3,4,5,6,9,10], [25,75]) is { "25": 4, "75": 9 }
percentiles(["abc", "def", "ghi", "ghi"], [25,75]) is { "25": "def", "75": "ghi" }
Use "output_array_not_map" (or shorthand "oa") to get the outputs as an array:
percentiles([3,4,5,6,9,10], [25,75], {"output_array_not_map":true}) is [4, 9]
Use "interpolate_linearly" (or shorthand "il") to do linear interpolation -- note this produces
error on string inputs:
percentiles([3,4,5,6,9,10], [25,75], {"interpolate_linearly":true}) is { "25": 4.25, "75": 8.25 }
The percentiles function always sorts its inputs before computing percentiles. If you know your input
is already sorted -- see also the sort_collection function -- then computation will be faster on
large input if you pass in "array_is_sorted":
x = [6,5,9,10,4,3]
percentiles(x, [25,75], {"array_is_sorted":true}) gives { "25": 5, "75": 4 } which is incorrect
x = sort_collection(x)
percentiles(x, [25,75], {"array_is_sorted":true}) gives { "25": 4, "75": 9 } which is correct
You can also leverage this feature to compute percentiles on a sort of your choosing. For example:
Non-sorted input:
x = splitax("the quick brown fox jumped loquaciously over the lazy dogs", " ")
x is: ["the", "quick", "brown", "fox", "jumped", "loquaciously", "over", "the", "lazy", "dogs"]
Percentiles are taken over the original positions of the words in the array -- "dogs" is last
and hence appears as p99:
percentiles(x, [50, 99], {"oa":true, "ais":true}) gives ["loquaciously", "dogs"]
With sorting done inside percentiles, "the" is alphabetically last and is therefore the p99:
percentiles(x, [50, 99], {"oa":true}) gives ["loquaciously", "the"]
With default sorting done outside percentiles, the same:
x = sort(x) # or x = sort_collection(x)
x is: ["brown", "dogs", "fox", "jumped", "lazy", "loquaciously", "over", "quick", "the", "the"]
percentiles(x, [50, 99], {"oa":true, "ais":true}) gives ["loquaciously", "the"]
percentiles(x, [50, 99], {"oa":true}) gives ["loquaciously", "the"]
Now sorting by word length, "loquaciously" is longest and hence is the p99:
x = sort(x, func(a,b) { return strlen(a) <=> strlen(b) } )
x is: ["fox", "the", "the", "dogs", "lazy", "over", "brown", "quick", "jumped", "loquaciously"]
percentiles(x, [50, 99], {"oa":true, "ais":true})
["over", "loquaciously"]
1mpow0m
(class=arithmetic #args=2) Exponentiation. Same as **, but as a function.
@ -2731,6 +2851,11 @@ MILLER(1) MILLER(1)
1msinh0m
(class=math #args=1) Hyperbolic sine.
1mskewness0m
(class=stats #args=1) Returns the sample skewness of values in an array or map. Returns "" AKA void for array/map of length less than two; returns error for non-array/non-map types.
Example:
skewness([4,5,9,10,11]) is -0.2097285
1msort0m
(class=higher-order-functions #args=1-2) Given a map or array as first argument and string flags or function as optional second argument, returns a sorted copy of the input. With one argument, sorts array elements with numbers first numerically and then strings lexically, and map elements likewise by map keys. If the second argument is a string, it can contain any of "f" for lexical ("n" is for the above default), "c" for case-folded lexical, or "t" for natural sort order. An additional "r" in that string is for reverse. An additional "v" in that string means sort maps by value, rather than by key. If the second argument is a function, then for arrays it should take two arguments a and b, returning < 0, 0, or > 0 as a < b, a == b, or a > b respectively; for maps the function should take four arguments ak, av, bk, and bv, again returning < 0, 0, or > 0, using a and b's keys and values.
Examples:
@ -2747,6 +2872,9 @@ MILLER(1) MILLER(1)
Map without function: sort({"c":2,"a":3,"b":1}, "v") returns {"b":1,"c":2,"a":3}.
Map without function: sort({"c":2,"a":3,"b":1}, "vnr") returns {"a":3,"c":2,"b":1}.
1msort_collection0m
(class=stats #args=1) This is a helper function for the percentiles function; please see its online help for details.
1msplita0m
(class=conversion #args=2) Splits string into array with type inference. First argument is string to split; second is the separator to split on.
Example:
@ -2785,6 +2913,11 @@ MILLER(1) MILLER(1)
Example:
ssub("abc.def", ".", "X") gives "abcXdef"
1mstddev0m
(class=stats #args=1) Returns the sample standard deviation of values in an array or map. Returns "" AKA void for array/map of length less than two; returns error for non-array/non-map types.
Example:
stddev([4,5,9,10,11]) is 3.1144823
1mstrfntime0m
(class=time #args=2) Formats integer nanoseconds since the epoch as timestamp. Format strings are as at https://pkg.go.dev/github.com/lestrrat-go/strftime, with the Miller-specific addition of "%1S" through "%9S" which format the seconds with 1 through 9 decimal places, respectively. ("%S" uses no decimal places.) See also https://miller.readthedocs.io/en/latest/reference-dsl-time/ for more information on the differences from the C library ("man strftime" on your system). See also strftime_local.
Examples:
@ -2872,6 +3005,26 @@ MILLER(1) MILLER(1)
1msubstr10m
(class=string #args=3) substr1(s,m,n) gives substring of s from 1-up position m to n inclusive. Negative indices -len .. -1 alias to 1 .. len. See also substr and substr0.
1msum0m
(class=stats #args=1) Returns the sum of values in an array or map. Returns error for non-array/non-map types.
Example:
sum([1,2,3,4,5]) is 15
1msum20m
(class=stats #args=1) Returns the sum of squares of values in an array or map. Returns error for non-array/non-map types.
Example:
sum2([1,2,3,4,5]) is 55
1msum30m
(class=stats #args=1) Returns the sum of cubes of values in an array or map. Returns error for non-array/non-map types.
Example:
sum3([1,2,3,4,5]) is 225
1msum40m
(class=stats #args=1) Returns the sum of fourth powers of values in an array or map. Returns error for non-array/non-map types.
Example:
sum4([1,2,3,4,5]) is 979
1msysntime0m
(class=time #args=0) Returns the system time in 64-bit nanoseconds since the epoch.
@ -2950,6 +3103,11 @@ MILLER(1) MILLER(1)
$y = utf8_to_latin1($x)
$* = utf8_to_latin1($*)
1mvariance0m
(class=stats #args=1) Returns the sample variance of values in an array or map. Returns "" AKA void for array/map of length less than two; returns error for non-array/non-map types.
Example:
variance([4,5,9,10,11]) is 9.7
1mversion0m
(class=system #args=0) Returns the Miller version as a string.
@ -3451,4 +3609,4 @@ MILLER(1) MILLER(1)
2023-08-23 MILLER(1)
2023-08-26 MILLER(1)

324
man/mlr.1
View file

@ -2,12 +2,12 @@
.\" Title: mlr
.\" Author: [see the "AUTHOR" section]
.\" Generator: ./mkman.rb
.\" Date: 2023-08-23
.\" Date: 2023-08-26
.\" Manual: \ \&
.\" Source: \ \&
.\" Language: English
.\"
.TH "MILLER" "1" "2023-08-23" "\ \&" "\ \&"
.TH "MILLER" "1" "2023-08-26" "\ \&" "\ \&"
.\" -----------------------------------------------------------------
.\" * Portability definitions
.\" ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@ -229,32 +229,34 @@ unsparsify
.RS 0
.\}
.nf
abs acos acosh any append apply arrayify asin asinh asserting_absent
abs acos acosh antimode any append apply arrayify asin asinh asserting_absent
asserting_array asserting_bool asserting_boolean asserting_empty
asserting_empty_map asserting_error asserting_float asserting_int
asserting_map asserting_nonempty_map asserting_not_array asserting_not_empty
asserting_not_map asserting_not_null asserting_null asserting_numeric
asserting_present asserting_string atan atan2 atanh bitcount boolean
capitalize cbrt ceil clean_whitespace collapse_whitespace concat cos cosh
depth dhms2fsec dhms2sec erf erfc every exec exp expm1 flatten float floor
fmtifnum fmtnum fold format fsec2dhms fsec2hms get_keys get_values
gmt2localtime gmt2nsec gmt2sec gssub gsub haskey hexfmt hms2fsec hms2sec
hostname index int invqnorm is_absent is_array is_bool is_boolean is_empty
is_empty_map is_error is_float is_int is_map is_nan is_nonempty_map
count depth dhms2fsec dhms2sec distinct_count erf erfc every exec exp expm1
flatten float floor fmtifnum fmtnum fold format fsec2dhms fsec2hms get_keys
get_values gmt2localtime gmt2nsec gmt2sec gssub gsub haskey hexfmt hms2fsec
hms2sec hostname index int invqnorm is_absent is_array is_bool is_boolean
is_empty is_empty_map is_error is_float is_int is_map is_nan is_nonempty_map
is_not_array is_not_empty is_not_map is_not_null is_null is_numeric is_present
is_string joink joinkv joinv json_parse json_stringify latin1_to_utf8
is_string joink joinkv joinv json_parse json_stringify kurtosis latin1_to_utf8
leafcount leftpad length localtime2gmt localtime2nsec localtime2sec log log10
log1p logifit lstrip madd mapdiff mapexcept mapselect mapsum max md5 mexp min
mmul msub nsec2gmt nsec2gmtdate nsec2localdate nsec2localtime os pow qnorm
log1p logifit lstrip madd mapdiff mapexcept mapselect mapsum max maxlen md5
mean meaneb median mexp min minlen mmul mode msub nsec2gmt nsec2gmtdate
nsec2localdate nsec2localtime null_count os percentile percentiles pow qnorm
reduce regextract regextract_or_else rightpad round roundm rstrip sec2dhms
sec2gmt sec2gmtdate sec2hms sec2localdate sec2localtime select sgn sha1 sha256
sha512 sin sinh sort splita splitax splitkv splitkvx splitnv splitnvx sqrt
ssub strfntime strfntime_local strftime strftime_local string strip strlen
strpntime strpntime_local strptime strptime_local sub substr substr0 substr1
sysntime system systime systimeint tan tanh tolower toupper truncate typeof
unflatten unformat unformatx upntime uptime urand urand32 urandelement
urandint urandrange utf8_to_latin1 version ! != !=~ % & && * ** + - . .* .+ .-
\&./ / // < << <= <=> == =~ > >= >> >>> ?: ?? ??? ^ ^^ | || ~
sha512 sin sinh skewness sort sort_collection splita splitax splitkv splitkvx
splitnv splitnvx sqrt ssub stddev strfntime strfntime_local strftime
strftime_local string strip strlen strpntime strpntime_local strptime
strptime_local sub substr substr0 substr1 sum sum2 sum3 sum4 sysntime system
systime systimeint tan tanh tolower toupper truncate typeof unflatten unformat
unformatx upntime uptime urand urand32 urandelement urandint urandrange
utf8_to_latin1 variance version ! != !=~ % & && * ** + - . .* .+ .- ./ / // <
<< <= <=> == =~ > >= >> >>> ?: ?? ??? ^ ^^ | || ~
.fi
.if n \{\
.RE
@ -2765,6 +2767,18 @@ being 'b=3,c=4', then the output is the two records 'a=1,b=2,c=' and
.fi
.if n \{\
.RE
.SS "antimode"
.if n \{\
.RS 0
.\}
.nf
(class=stats #args=1) Returns the least frequently occurring value in an array or map. Returns error for non-array/non-map types. Values are stringified for comparison, so for example string "1" and integer 1 are not distinct. In cases of ties, first-found wins.
Examples:
antimode([3,3,4,4,4]) is 3
antimode([3,3,4,4]) is 3
.fi
.if n \{\
.RE
.SS "any"
.if n \{\
.RS 0
@ -3117,6 +3131,18 @@ concat([1,2],[3]) is [1,2,3]
.fi
.if n \{\
.RE
.SS "count"
.if n \{\
.RS 0
.\}
.nf
(class=stats #args=1) Returns the length of an array or map. Returns error for non-array/non-map types.
Examples:
count([7,8,9]) is 3
count({"a":7,"b":8,"c":9}) is 3
.fi
.if n \{\
.RE
.SS "depth"
.if n \{\
.RS 0
@ -3144,6 +3170,19 @@ concat([1,2],[3]) is [1,2,3]
.fi
.if n \{\
.RE
.SS "distinct_count"
.if n \{\
.RS 0
.\}
.nf
(class=stats #args=1) Returns the number of distinct values in an array or map. Returns error for non-array/non-map types. Values are stringified for comparison, so for example string "1" and integer 1 are not distinct.
Examples:
distinct_count([7,8,9,7]) is 3
distinct_count([1,"1"]) is 1
distinct_count([1,1.0]) is 2
.fi
.if n \{\
.RE
.SS "erf"
.if n \{\
.RS 0
@ -3698,6 +3737,17 @@ joinv({"a":3,"b":4,"c":5}, ",") = "3,4,5"
.fi
.if n \{\
.RE
.SS "kurtosis"
.if n \{\
.RS 0
.\}
.nf
(class=stats #args=1) Returns the sample kurtosis of values in an array or map. Returns "" AKA void for array/map of length less than two; returns error for non-array/non-map types.
Example:
kurtosis([4,5,9,10,11]) is -1.6703688
.fi
.if n \{\
.RE
.SS "latin1_to_utf8"
.if n \{\
.RS 0
@ -3872,7 +3922,18 @@ localtime2sec("2001-02-03 04:05:06", "Asia/Istanbul") = 981165906"
.RS 0
.\}
.nf
(class=math #args=variadic) Max of n numbers; null loses.
(class=math #args=variadic) Max of n numbers; null loses. The min and max functions also recurse into arrays and maps, so they can be used to get min/max stats on array/map values.
.fi
.if n \{\
.RE
.SS "maxlen"
.if n \{\
.RS 0
.\}
.nf
(class=stats #args=1) Returns the maximum string length of values in an array or map. Returns "" AKA void for empty array/map; returns error for non-array/non-map types.
Example:
maxlen(["año", "alto"]) is 4
.fi
.if n \{\
.RE
@ -3885,6 +3946,41 @@ localtime2sec("2001-02-03 04:05:06", "Asia/Istanbul") = 981165906"
.fi
.if n \{\
.RE
.SS "mean"
.if n \{\
.RS 0
.\}
.nf
(class=stats #args=1) Returns the arithmetic mean of values in an array or map. Returns "" AKA void for empty array/map; returns error for non-array/non-map types.
Example:
mean([4,5,7,10]) is 6.5
.fi
.if n \{\
.RE
.SS "meaneb"
.if n \{\
.RS 0
.\}
.nf
(class=stats #args=1) Returns the error bar for arithmetic mean of values in an array or map, assuming the values are independent and identically distributed. Returns "" AKA void for empty array/map; returns error for non-array/non-map types.
Example:
meaneb([4,5,7,10]) is 1.3228756
.fi
.if n \{\
.RE
.SS "median"
.if n \{\
.RS 0
.\}
.nf
(class=stats #args=1,2) Returns the median of values in an array or map. Returns "" AKA void for empty array/map; returns error for non-array/non-map types. Please see the percentiles function for information on optional flags, and on performance for large inputs.
Examples:
median([3,4,5,6,9,10]) is 6
median([3,4,5,6,9,10],{"interpolate_linearly":true}) is 5.5
median(["abc", "def", "ghi", "ghi"]) is "ghi"
.fi
.if n \{\
.RE
.SS "mexp"
.if n \{\
.RS 0
@ -3899,7 +3995,18 @@ localtime2sec("2001-02-03 04:05:06", "Asia/Istanbul") = 981165906"
.RS 0
.\}
.nf
(class=math #args=variadic) Min of n numbers; null loses.
(class=math #args=variadic) Min of n numbers; null loses. The min and max functions also recurse into arrays and maps, so they can be used to get min/max stats on array/map values.
.fi
.if n \{\
.RE
.SS "minlen"
.if n \{\
.RS 0
.\}
.nf
(class=stats #args=1) Returns the minimum string length of values in an array or map. Returns "" AKA void for empty array/map; returns error for non-array/non-map types.
Example:
minlen(["año", "alto"]) is 3
.fi
.if n \{\
.RE
@ -3912,6 +4019,18 @@ localtime2sec("2001-02-03 04:05:06", "Asia/Istanbul") = 981165906"
.fi
.if n \{\
.RE
.SS "mode"
.if n \{\
.RS 0
.\}
.nf
(class=stats #args=1) Returns the most frequently occurring value in an array or map. Returns error for non-array/non-map types. Values are stringified for comparison, so for example string "1" and integer 1 are not distinct. In cases of ties, first-found wins.
Examples:
mode([3,3,4,4,4]) is 4
mode([3,3,4,4]) is 3
.fi
.if n \{\
.RE
.SS "msub"
.if n \{\
.RS 0
@ -3971,6 +4090,17 @@ nsec2localtime(1234567890123456789, 6, "Asia/Istanbul") = "2009-02-14 01:31:30.1
.fi
.if n \{\
.RE
.SS "null_count"
.if n \{\
.RS 0
.\}
.nf
(class=stats #args=1) Returns the number of values in an array or map which are empty-string (AKA void) or JSON null. Returns error for non-array/non-map types. Values are stringified for comparison, so for example string "1" and integer 1 are not distinct.
Example:
null_count(["a", "", "c"]) is 1
.fi
.if n \{\
.RE
.SS "os"
.if n \{\
.RS 0
@ -3980,6 +4110,74 @@ nsec2localtime(1234567890123456789, 6, "Asia/Istanbul") = "2009-02-14 01:31:30.1
.fi
.if n \{\
.RE
.SS "percentile"
.if n \{\
.RS 0
.\}
.nf
(class=stats #args=2,3) Returns the given percentile of values in an array or map. Returns "" AKA void for empty array/map; returns error for non-array/non-map types. Please see the percentiles function for information on optional flags, and on performance for large inputs.
Examples:
percentile([3,4,5,6,9,10], 90) is 10
percentile([3,4,5,6,9,10], 90, {"interpolate_linearly":true}) is 9.5
percentile(["abc", "def", "ghi", "ghi"], 90) is "ghi"
.fi
.if n \{\
.RE
.SS "percentiles"
.if n \{\
.RS 0
.\}
.nf
(class=stats #args=2,3) Returns the given percentiles of values in an array or map. Returns "" AKA void for empty array/map; returns error for non-array/non-map types. See examples for information on the three option flags.
Examples:
Defaults are to not interpolate linearly, to produce a map keyed by percentile name, and to sort
the input before computing percentiles:
percentiles([3,4,5,6,9,10], [25,75]) is { "25": 4, "75": 9 }
percentiles(["abc", "def", "ghi", "ghi"], [25,75]) is { "25": "def", "75": "ghi" }
Use "output_array_not_map" (or shorthand "oa") to get the outputs as an array:
percentiles([3,4,5,6,9,10], [25,75], {"output_array_not_map":true}) is [4, 9]
Use "interpolate_linearly" (or shorthand "il") to do linear interpolation -- note this produces
an error on string inputs:
percentiles([3,4,5,6,9,10], [25,75], {"interpolate_linearly":true}) is { "25": 4.25, "75": 8.25 }
The percentiles function always sorts its inputs before computing percentiles. If you know your input
is already sorted -- see also the sort_collection function -- then computation will be faster on
large input if you pass in "array_is_sorted":
x = [6,5,9,10,4,3]
percentiles(x, [25,75], {"array_is_sorted":true}) gives { "25": 5, "75": 4 } which is incorrect
x = sort_collection(x)
percentiles(x, [25,75], {"array_is_sorted":true}) gives { "25": 4, "75": 9 } which is correct
You can also leverage this feature to compute percentiles on a sort of your choosing. For example:
Non-sorted input:
x = splitax("the quick brown fox jumped loquaciously over the lazy dogs", " ")
x is: ["the", "quick", "brown", "fox", "jumped", "loquaciously", "over", "the", "lazy", "dogs"]
Percentiles are taken over the original positions of the words in the array -- "dogs" is last
and hence appears as p99:
percentiles(x, [50, 99], {"oa":true, "ais":true}) gives ["loquaciously", "dogs"]
With sorting done inside percentiles, "the" is alphabetically last and is therefore the p99:
percentiles(x, [50, 99], {"oa":true}) gives ["loquaciously", "the"]
With default sorting done outside percentiles, the same:
x = sort(x) # or x = sort_collection(x)
x is: ["brown", "dogs", "fox", "jumped", "lazy", "loquaciously", "over", "quick", "the", "the"]
percentiles(x, [50, 99], {"oa":true, "ais":true}) gives ["loquaciously", "the"]
percentiles(x, [50, 99], {"oa":true}) gives ["loquaciously", "the"]
Now sorting by word length, "loquaciously" is longest and hence is the p99:
x = sort(x, func(a,b) { return strlen(a) <=> strlen(b) } )
x is: ["fox", "the", "the", "dogs", "lazy", "over", "brown", "quick", "jumped", "loquaciously"]
percentiles(x, [50, 99], {"oa":true, "ais":true})
["over", "loquaciously"]
.fi
.if n \{\
.RE
.SS "pow"
.if n \{\
.RS 0
@ -4208,6 +4406,17 @@ Map example: select({"a":1, "b":3, "c":5}, func(k,v) {return v >= 3}) returns {"
.fi
.if n \{\
.RE
.SS "skewness"
.if n \{\
.RS 0
.\}
.nf
(class=stats #args=1) Returns the sample skewness of values in an array or map. Returns "" AKA void for array/map of length less than two; returns error for non-array/non-map types.
Example:
skewness([4,5,9,10,11]) is -0.2097285
.fi
.if n \{\
.RE
.SS "sort"
.if n \{\
.RS 0
@ -4230,6 +4439,15 @@ Map without function: sort({"c":2,"a":3,"b":1}, "vnr") returns {"a":3,"c":2,"b":
.fi
.if n \{\
.RE
.SS "sort_collection"
.if n \{\
.RS 0
.\}
.nf
(class=stats #args=1) This is a helper function for the percentiles function; please see its online help for details.
.fi
.if n \{\
.RE
.SS "splita"
.if n \{\
.RS 0
@ -4316,6 +4534,17 @@ ssub("abc.def", ".", "X") gives "abcXdef"
.fi
.if n \{\
.RE
.SS "stddev"
.if n \{\
.RS 0
.\}
.nf
(class=stats #args=1) Returns the sample standard deviation of values in an array or map. Returns "" AKA void for array/map of length less than two; returns error for non-array/non-map types.
Example:
stddev([4,5,9,10,11]) is 3.1144823
.fi
.if n \{\
.RE
.SS "strfntime"
.if n \{\
.RS 0
@ -4493,6 +4722,50 @@ sub("prefix4529:suffix8567", "suffix([0-9]+)", "name\e1") gives "prefix4529:name
.fi
.if n \{\
.RE
.SS "sum"
.if n \{\
.RS 0
.\}
.nf
(class=stats #args=1) Returns the sum of values in an array or map. Returns error for non-array/non-map types.
Example:
sum([1,2,3,4,5]) is 15
.fi
.if n \{\
.RE
.SS "sum2"
.if n \{\
.RS 0
.\}
.nf
(class=stats #args=1) Returns the sum of squares of values in an array or map. Returns error for non-array/non-map types.
Example:
sum2([1,2,3,4,5]) is 55
.fi
.if n \{\
.RE
.SS "sum3"
.if n \{\
.RS 0
.\}
.nf
(class=stats #args=1) Returns the sum of cubes of values in an array or map. Returns error for non-array/non-map types.
Example:
sum3([1,2,3,4,5]) is 225
.fi
.if n \{\
.RE
.SS "sum4"
.if n \{\
.RS 0
.\}
.nf
(class=stats #args=1) Returns the sum of fourth powers of values in an array or map. Returns error for non-array/non-map types.
Example:
sum4([1,2,3,4,5]) is 979
.fi
.if n \{\
.RE
.SS "sysntime"
.if n \{\
.RS 0
@ -4697,6 +4970,17 @@ $* = utf8_to_latin1($*)
.fi
.if n \{\
.RE
.SS "variance"
.if n \{\
.RS 0
.\}
.nf
(class=stats #args=1) Returns the sample variance of values in an array or map. Returns "" AKA void for array/map of length less than two; returns error for non-array/non-map types.
Example:
variance([4,5,9,10,11]) is 9.7
.fi
.if n \{\
.RE
.SS "version"
.if n \{\
.RS 0

View file

@ -0,0 +1 @@
mlr -n --ofmtf 6 --xtab put -f ${CASEDIR}/mlr

View file

@ -0,0 +1,20 @@
count_0 (error)
count_0_type error
count_null (error)
count_null_type error
count_empty_array 0
count_empty_array_type int
count_array_1 1
count_array_1_type int
count_array_3 3
count_array_3_type int
count_array_nested 3
count_array_nested_type int
count_empty_map 0
count_empty_map_type int
count_map_1 1
count_map_1_type int
count_map_3 3
count_map_3_type int
count_map_nested 3
count_map_nested_type int

View file

@ -0,0 +1,26 @@
# Regression-test driver for the count DSL function: evaluates count on
# non-collection, array, and map arguments, then emits every result together
# with its typeof so the golden output pins both value and type.
end {
outputs = {};
# Non-collection arguments (an int literal and JSON null).
outputs["count_0"] = count(0);
outputs["count_null"] = count(null);
# nonesuch is never assigned anywhere in this script.
# NOTE(review): the golden output has no count_nonesuch entry -- presumably
# the assignment is skipped for an absent right-hand side; confirm.
outputs["count_nonesuch"] = count(nonesuch);
# Array arguments: empty, one element, three elements, and a three-element
# array whose middle element is itself an array.
outputs["count_empty_array"] = count([]);
outputs["count_array_1"] = count([7]);
outputs["count_array_3"] = count([7,8,9]);
outputs["count_array_nested"] = count([7,[80,90],9]);
# Map arguments, mirroring the array cases above.
outputs["count_empty_map"] = count({});
outputs["count_map_1"] = count({ "a" : 7} );
outputs["count_map_3"] = count({ "a" : 7, "b" : 8, "c" : 9 } );
outputs["count_map_nested"] = count({ "a" : 7, "b" : [80,90], "c" : 9 });
# Interleave each result with its type name under the key suffixed "_type".
typed_outputs = {};
for (k, v in outputs) {
typed_outputs[k] = v;
typed_outputs[k."_type"] = typeof(v);
}
emit typed_outputs;
}

View file

@ -0,0 +1 @@
mlr -n --ofmtf 6 --xtab put -f ${CASEDIR}/mlr

View file

@ -0,0 +1,32 @@
distinct_count_0 (error)
distinct_count_0_type error
distinct_count_null (error)
distinct_count_null_type error
distinct_count_empty_array 0
distinct_count_empty_array_type int
distinct_count_array_1 1
distinct_count_array_1_type int
distinct_count_array_3a 3
distinct_count_array_3a_type int
distinct_count_array_3b 2
distinct_count_array_3b_type int
distinct_count_array_3c 1
distinct_count_array_3c_type int
distinct_count_array_3d 1
distinct_count_array_3d_type int
distinct_count_array_nested 2
distinct_count_array_nested_type int
distinct_count_empty_map 0
distinct_count_empty_map_type int
distinct_count_map_1 1
distinct_count_map_1_type int
distinct_count_map_3a 3
distinct_count_map_3a_type int
distinct_count_map_3b 2
distinct_count_map_3b_type int
distinct_count_map_3c 1
distinct_count_map_3c_type int
distinct_count_map_3d 1
distinct_count_map_3d_type int
distinct_count_map_nested 2
distinct_count_map_nested_type int

View file

@ -0,0 +1,32 @@
# Regression-test driver for the distinct_count DSL function: evaluates it on
# non-collection, array, and map arguments, then emits every result together
# with its typeof so the golden output pins both value and type.
end {
outputs = {};
# Non-collection arguments (an int literal and JSON null).
outputs["distinct_count_0"] = distinct_count(0);
outputs["distinct_count_null"] = distinct_count(null);
# nonesuch is never assigned anywhere in this script.
# NOTE(review): the golden output has no distinct_count_nonesuch entry --
# presumably the assignment is skipped for an absent right-hand side; confirm.
outputs["distinct_count_nonesuch"] = distinct_count(nonesuch);
# Array arguments: empty, single element, then three-element arrays with
# three, two, and one distinct values (3a/3b/3c), and all-null (3d).
outputs["distinct_count_empty_array"] = distinct_count([]);
outputs["distinct_count_array_1"] = distinct_count([7]);
outputs["distinct_count_array_3a"] = distinct_count([7,8,9]);
outputs["distinct_count_array_3b"] = distinct_count([7,7,9]);
outputs["distinct_count_array_3c"] = distinct_count([7,7,7]);
outputs["distinct_count_array_3d"] = distinct_count([null,null,null]);
# The middle element is the array [7] rather than the scalar 7.
outputs["distinct_count_array_nested"] = distinct_count([7,[7],7]);
# Map arguments, mirroring the array cases above.
outputs["distinct_count_empty_map"] = distinct_count({});
outputs["distinct_count_map_1"] = distinct_count({ "a" : 7} );
outputs["distinct_count_map_3a"] = distinct_count({ "a" : 7, "b" : 8, "c" : 9 } );
outputs["distinct_count_map_3b"] = distinct_count({ "a" : 7, "b" : 7, "c" : 9 } );
outputs["distinct_count_map_3c"] = distinct_count({ "a" : 7, "b" : 7, "c" : 7 } );
outputs["distinct_count_map_3d"] = distinct_count({ "a" : null, "b" : null, "c" : null } );
outputs["distinct_count_map_nested"] = distinct_count({ "a" : 7, "b" : [7], "c" : 7 });
# Interleave each result with its type name under the key suffixed "_type".
typed_outputs = {};
for (k, v in outputs) {
typed_outputs[k] = v;
typed_outputs[k."_type"] = typeof(v);
}
emit typed_outputs;
}

View file

@ -0,0 +1 @@
mlr -n --ofmtf 6 --xtab put -f ${CASEDIR}/mlr

View file

View file

@ -0,0 +1,24 @@
mode_0 (error)
mode_0_type error
mode_null (error)
mode_null_type error
mode_empty_array
mode_empty_array_type empty
mode_array_1 7
mode_array_1_type int
mode_array_3a 7
mode_array_3a_type int
mode_array_3b 7
mode_array_3b_type int
mode_array_nested 9
mode_array_nested_type int
mode_empty_map
mode_empty_map_type empty
mode_map_1 7
mode_map_1_type int
mode_map_3a 7
mode_map_3a_type int
mode_map_3b 7
mode_map_3b_type int
mode_map_nested 9
mode_map_nested_type int

View file

@ -0,0 +1,28 @@
# Regression-test driver for the mode DSL function: evaluates mode on
# non-collection, array, and map arguments, then emits every result together
# with its typeof so the golden output pins both value and type.
end {
outputs = {};
# Non-collection arguments (an int literal and JSON null).
outputs["mode_0"] = mode(0);
outputs["mode_null"] = mode(null);
# nonesuch is never assigned anywhere in this script.
# NOTE(review): the golden output has no mode_nonesuch entry -- presumably
# the assignment is skipped for an absent right-hand side; confirm.
outputs["mode_nonesuch"] = mode(nonesuch);
# Array arguments: empty, single element, all-distinct (3a, a three-way tie),
# and one repeated value (3b).
outputs["mode_empty_array"] = mode([]);
outputs["mode_array_1"] = mode([7]);
outputs["mode_array_3a"] = mode([7,8,9]);
outputs["mode_array_3b"] = mode([7,8,7]);
# The nested array holds six 8s but is a single element of the outer array;
# 9 occurs three times at the top level.
outputs["mode_array_nested"] = mode([7,[8,8,8,8,8,8],9,9,9]);
# Map arguments, mirroring the array cases above.
outputs["mode_empty_map"] = mode({});
outputs["mode_map_1"] = mode({ "a" : 7} );
outputs["mode_map_3a"] = mode({ "a" : 7, "b" : 8, "c" : 9 } );
outputs["mode_map_3b"] = mode({ "a" : 7, "b" : 8, "c" : 7 } );
outputs["mode_map_nested"] = mode({ "a" : 7, "b" : [8,8,8,8,8,8], "c" : 9, "d": 9, "e": 9 });
# Interleave each result with its type name under the key suffixed "_type".
typed_outputs = {};
for (k, v in outputs) {
typed_outputs[k] = v;
typed_outputs[k."_type"] = typeof(v);
}
emit typed_outputs;
}

View file

@ -0,0 +1 @@
mlr --ofmtf 6 --ojson --from test/input/abixy head -n 0 then put -q -f test/input/test-moments.mlr

View file

@ -0,0 +1,26 @@
[
{
"a_count": 0,
"a_sum": 0,
"a_sum2": 0,
"a_sum3": 0,
"a_sum4": 0,
"a_mean": "",
"a_var": "",
"a_stddev": "",
"a_meaneb": "",
"a_skewness": "",
"a_kurtosis": "",
"m_count": 0,
"m_sum": 0,
"m_sum2": 0,
"m_sum3": 0,
"m_sum4": 0,
"m_mean": "",
"m_var": "",
"m_stddev": "",
"m_meaneb": "",
"m_skewness": "",
"m_kurtosis": ""
}
]

View file

@ -0,0 +1 @@
mlr --ofmtf 6 --ojson --from test/input/abixy head -n 1 then put -q -f test/input/test-moments.mlr

View file

@ -0,0 +1,26 @@
[
{
"a_count": 1,
"a_sum": 1,
"a_sum2": 1,
"a_sum3": 1,
"a_sum4": 1,
"a_mean": 1,
"a_var": "",
"a_stddev": "",
"a_meaneb": "",
"a_skewness": "",
"a_kurtosis": "",
"m_count": 1,
"m_sum": 1,
"m_sum2": 1,
"m_sum3": 1,
"m_sum4": 1,
"m_mean": 1,
"m_var": "",
"m_stddev": "",
"m_meaneb": "",
"m_skewness": "",
"m_kurtosis": ""
}
]

View file

@ -0,0 +1 @@
mlr --ofmtf 6 --ojson --from test/input/abixy head -n 2 then put -q -f test/input/test-moments.mlr

View file

@ -0,0 +1,26 @@
[
{
"a_count": 2,
"a_sum": 3,
"a_sum2": 5,
"a_sum3": 9,
"a_sum4": 17,
"a_mean": 1.500000,
"a_var": 0.500000,
"a_stddev": 0.707107,
"a_meaneb": 0.500000,
"a_skewness": 0.000000,
"a_kurtosis": -2.000000,
"m_count": 2,
"m_sum": 3,
"m_sum2": 5,
"m_sum3": 9,
"m_sum4": 17,
"m_mean": 1.500000,
"m_var": 0.500000,
"m_stddev": 0.707107,
"m_meaneb": 0.500000,
"m_skewness": 0.000000,
"m_kurtosis": -2.000000
}
]

View file

@ -0,0 +1 @@
mlr --ofmtf 6 --ojson --from test/input/abixy head -n 3 then put -q -f test/input/test-moments.mlr

View file

@ -0,0 +1,26 @@
[
{
"a_count": 3,
"a_sum": 6,
"a_sum2": 14,
"a_sum3": 36,
"a_sum4": 98,
"a_mean": 2,
"a_var": 1.000000,
"a_stddev": 1.000000,
"a_meaneb": 0.577350,
"a_skewness": 0.000000,
"a_kurtosis": -1.500000,
"m_count": 3,
"m_sum": 6,
"m_sum2": 14,
"m_sum3": 36,
"m_sum4": 98,
"m_mean": 2,
"m_var": 1.000000,
"m_stddev": 1.000000,
"m_meaneb": 0.577350,
"m_skewness": 0.000000,
"m_kurtosis": -1.500000
}
]

View file

@ -0,0 +1 @@
mlr --ofmtf 6 --ojson --from test/input/abixy head -n 4 then put -q -f test/input/test-moments.mlr

View file

@ -0,0 +1,26 @@
[
{
"a_count": 4,
"a_sum": 10,
"a_sum2": 30,
"a_sum3": 100,
"a_sum4": 354,
"a_mean": 2.500000,
"a_var": 1.666667,
"a_stddev": 1.290994,
"a_meaneb": 0.645497,
"a_skewness": 0.000000,
"a_kurtosis": -1.360000,
"m_count": 4,
"m_sum": 10,
"m_sum2": 30,
"m_sum3": 100,
"m_sum4": 354,
"m_mean": 2.500000,
"m_var": 1.666667,
"m_stddev": 1.290994,
"m_meaneb": 0.645497,
"m_skewness": 0.000000,
"m_kurtosis": -1.360000
}
]

View file

@ -0,0 +1 @@
mlr --ofmtf 6 --ojson --from test/input/abixy put -q -f test/input/test-moments.mlr

View file

@ -0,0 +1,26 @@
[
{
"a_count": 10,
"a_sum": 55,
"a_sum2": 385,
"a_sum3": 3025,
"a_sum4": 25333,
"a_mean": 5.500000,
"a_var": 9.166667,
"a_stddev": 3.027650,
"a_meaneb": 0.957427,
"a_skewness": 0.000000,
"a_kurtosis": -1.224242,
"m_count": 10,
"m_sum": 55,
"m_sum2": 385,
"m_sum3": 3025,
"m_sum4": 25333,
"m_mean": 5.500000,
"m_var": 9.166667,
"m_stddev": 3.027650,
"m_meaneb": 0.957427,
"m_skewness": 0.000000,
"m_kurtosis": -1.224242
}
]

View file

@ -0,0 +1 @@
mlr -n --ofmtf 6 --xtab put -f ${CASEDIR}/mlr

View file

@ -0,0 +1,20 @@
null_count_0 (error)
null_count_0_type error
null_count_null (error)
null_count_null_type error
null_count_empty_array 0
null_count_empty_array_type int
null_count_array_1 0
null_count_array_1_type int
null_count_array_2 0
null_count_array_2_type int
null_count_array_3 2
null_count_array_3_type int
null_count_empty_map 0
null_count_empty_map_type int
null_count_map_1 0
null_count_map_1_type int
null_count_map_2 0
null_count_map_2_type int
null_count_map_3 2
null_count_map_3_type int

View file

@ -0,0 +1,28 @@
# Regression-test driver for the null_count DSL function: evaluates it on
# non-collection, array, and map arguments, then emits every result together
# with its typeof so the golden output pins both value and type.
end {
outputs = {};
# Only empty string and JSON-null count as nulls
# Non-collection arguments (an int literal and JSON null).
outputs["null_count_0"] = null_count(0);
outputs["null_count_null"] = null_count(null);
# nonesuch is never assigned anywhere in this script.
# NOTE(review): the golden output has no null_count_nonesuch entry --
# presumably the assignment is skipped for an absent right-hand side; confirm.
outputs["null_count_nonesuch"] = null_count(nonesuch);
# Array arguments: empty, then arrays holding only non-null ints.
outputs["null_count_empty_array"] = null_count([]);
outputs["null_count_array_1"] = null_count([7]);
outputs["null_count_array_2"] = null_count([7,8]);
# Empty string, JSON null, and the unset nonesuch; per the comment above,
# only the first two are meant to be counted.
outputs["null_count_array_3"] = null_count(["",null,nonesuch]);
# Map arguments, mirroring the array cases above.
outputs["null_count_empty_map"] = null_count({});
outputs["null_count_map_1"] = null_count({ "a" : 7});
outputs["null_count_map_2"] = null_count({ "a" : 7, "b" : 8 });
outputs["null_count_map_3"] = null_count({ "a" : "", "b" : null, "c" : nonesuch });
# Interleave each result with its type name under the key suffixed "_type".
typed_outputs = {};
for (k, v in outputs) {
typed_outputs[k] = v;
typed_outputs[k."_type"] = typeof(v);
}
emit typed_outputs;
}

View file

@ -0,0 +1 @@
mlr --ofmtf 6 --ojson --zin --from test/input/medium.z head -n 0 then put -q -f test/input/test-percentiles.mlr -s field=a

View file

@ -0,0 +1,62 @@
[
{
"a_min": "",
"a_max": "",
"a_minlen": "",
"a_maxlen": "",
"a_median": "",
"a_ps": {
"0": "",
"1": "",
"10": "",
"25": "",
"50": "",
"75": "",
"90": "",
"99": "",
"100": ""
},
"a_psi": {
"0": "",
"1": "",
"10": "",
"25": "",
"50": "",
"75": "",
"90": "",
"99": "",
"100": ""
},
"a_psa": ["", "", "", "", "", "", "", "", ""],
"a_psia": ["", "", "", "", "", "", "", "", ""],
"m_min": "",
"m_max": "",
"m_minlen": "",
"m_maxlen": "",
"m_median": "",
"m_ps": {
"0": "",
"1": "",
"10": "",
"25": "",
"50": "",
"75": "",
"90": "",
"99": "",
"100": ""
},
"m_psi": {
"0": "",
"1": "",
"10": "",
"25": "",
"50": "",
"75": "",
"90": "",
"99": "",
"100": ""
},
"m_psa": ["", "", "", "", "", "", "", "", ""],
"m_psia": ["", "", "", "", "", "", "", "", ""]
}
]

View file

@ -0,0 +1 @@
mlr --ofmtf 6 --ojson --zin --from test/input/medium.z head -n 1 then put -q -f test/input/test-percentiles.mlr -s field=a

View file

@ -0,0 +1,62 @@
[
{
"a_min": "pan",
"a_max": "pan",
"a_minlen": 3,
"a_maxlen": 3,
"a_median": "pan",
"a_ps": {
"0": "pan",
"1": "pan",
"10": "pan",
"25": "pan",
"50": "pan",
"75": "pan",
"90": "pan",
"99": "pan",
"100": "pan"
},
"a_psi": {
"0": "pan",
"1": "pan",
"10": "pan",
"25": "pan",
"50": "pan",
"75": "pan",
"90": "pan",
"99": "pan",
"100": "pan"
},
"a_psa": ["pan", "pan", "pan", "pan", "pan", "pan", "pan", "pan", "pan"],
"a_psia": ["pan", "pan", "pan", "pan", "pan", "pan", "pan", "pan", "pan"],
"m_min": "pan",
"m_max": "pan",
"m_minlen": 3,
"m_maxlen": 3,
"m_median": "pan",
"m_ps": {
"0": "pan",
"1": "pan",
"10": "pan",
"25": "pan",
"50": "pan",
"75": "pan",
"90": "pan",
"99": "pan",
"100": "pan"
},
"m_psi": {
"0": "pan",
"1": "pan",
"10": "pan",
"25": "pan",
"50": "pan",
"75": "pan",
"90": "pan",
"99": "pan",
"100": "pan"
},
"m_psa": ["pan", "pan", "pan", "pan", "pan", "pan", "pan", "pan", "pan"],
"m_psia": ["pan", "pan", "pan", "pan", "pan", "pan", "pan", "pan", "pan"]
}
]

View file

@ -0,0 +1 @@
mlr --ofmtf 6 --ojson --zin --from test/input/medium.z head -n 2 then put -q -f test/input/test-percentiles.mlr -s field=a

View file

@ -0,0 +1,62 @@
[
{
"a_min": "eks",
"a_max": "pan",
"a_minlen": 3,
"a_maxlen": 3,
"a_median": "pan",
"a_ps": {
"0": "eks",
"1": "eks",
"10": "eks",
"25": "eks",
"50": "pan",
"75": "pan",
"90": "pan",
"99": "pan",
"100": "pan"
},
"a_psi": {
"0": (error),
"1": (error),
"10": (error),
"25": (error),
"50": (error),
"75": (error),
"90": (error),
"99": (error),
"100": "pan"
},
"a_psa": ["eks", "eks", "eks", "eks", "pan", "pan", "pan", "pan", "pan"],
"a_psia": [(error), (error), (error), (error), (error), (error), (error), (error), "pan"],
"m_min": "eks",
"m_max": "pan",
"m_minlen": 3,
"m_maxlen": 3,
"m_median": "pan",
"m_ps": {
"0": "eks",
"1": "eks",
"10": "eks",
"25": "eks",
"50": "pan",
"75": "pan",
"90": "pan",
"99": "pan",
"100": "pan"
},
"m_psi": {
"0": (error),
"1": (error),
"10": (error),
"25": (error),
"50": (error),
"75": (error),
"90": (error),
"99": (error),
"100": "pan"
},
"m_psa": ["eks", "eks", "eks", "eks", "pan", "pan", "pan", "pan", "pan"],
"m_psia": [(error), (error), (error), (error), (error), (error), (error), (error), "pan"]
}
]

View file

@ -0,0 +1 @@
mlr --ofmtf 6 --ojson --zin --from test/input/medium.z head -n 3 then put -q -f test/input/test-percentiles.mlr -s field=a

View file

@ -0,0 +1,62 @@
[
{
"a_min": "eks",
"a_max": "wye",
"a_minlen": 3,
"a_maxlen": 3,
"a_median": "pan",
"a_ps": {
"0": "eks",
"1": "eks",
"10": "eks",
"25": "eks",
"50": "pan",
"75": "wye",
"90": "wye",
"99": "wye",
"100": "wye"
},
"a_psi": {
"0": (error),
"1": (error),
"10": (error),
"25": (error),
"50": (error),
"75": (error),
"90": (error),
"99": (error),
"100": "wye"
},
"a_psa": ["eks", "eks", "eks", "eks", "pan", "wye", "wye", "wye", "wye"],
"a_psia": [(error), (error), (error), (error), (error), (error), (error), (error), "wye"],
"m_min": "eks",
"m_max": "wye",
"m_minlen": 3,
"m_maxlen": 3,
"m_median": "pan",
"m_ps": {
"0": "eks",
"1": "eks",
"10": "eks",
"25": "eks",
"50": "pan",
"75": "wye",
"90": "wye",
"99": "wye",
"100": "wye"
},
"m_psi": {
"0": (error),
"1": (error),
"10": (error),
"25": (error),
"50": (error),
"75": (error),
"90": (error),
"99": (error),
"100": "wye"
},
"m_psa": ["eks", "eks", "eks", "eks", "pan", "wye", "wye", "wye", "wye"],
"m_psia": [(error), (error), (error), (error), (error), (error), (error), (error), "wye"]
}
]

View file

@ -0,0 +1 @@
mlr --ofmtf 6 --ojson --zin --from test/input/medium.z head -n 4 then put -q -f test/input/test-percentiles.mlr -s field=a

View file

@ -0,0 +1,62 @@
[
{
"a_min": "eks",
"a_max": "wye",
"a_minlen": 3,
"a_maxlen": 3,
"a_median": "pan",
"a_ps": {
"0": "eks",
"1": "eks",
"10": "eks",
"25": "eks",
"50": "pan",
"75": "wye",
"90": "wye",
"99": "wye",
"100": "wye"
},
"a_psi": {
"0": (error),
"1": (error),
"10": (error),
"25": (error),
"50": (error),
"75": (error),
"90": (error),
"99": (error),
"100": "wye"
},
"a_psa": ["eks", "eks", "eks", "eks", "pan", "wye", "wye", "wye", "wye"],
"a_psia": [(error), (error), (error), (error), (error), (error), (error), (error), "wye"],
"m_min": "eks",
"m_max": "wye",
"m_minlen": 3,
"m_maxlen": 3,
"m_median": "pan",
"m_ps": {
"0": "eks",
"1": "eks",
"10": "eks",
"25": "eks",
"50": "pan",
"75": "wye",
"90": "wye",
"99": "wye",
"100": "wye"
},
"m_psi": {
"0": (error),
"1": (error),
"10": (error),
"25": (error),
"50": (error),
"75": (error),
"90": (error),
"99": (error),
"100": "wye"
},
"m_psa": ["eks", "eks", "eks", "eks", "pan", "wye", "wye", "wye", "wye"],
"m_psia": [(error), (error), (error), (error), (error), (error), (error), (error), "wye"]
}
]

View file

@ -0,0 +1 @@
mlr --ofmtf 6 --ojson --zin --from test/input/medium.z put -q -f test/input/test-percentiles.mlr -s field=a

View file

@ -0,0 +1,62 @@
[
{
"a_min": "eks",
"a_max": "zee",
"a_minlen": 3,
"a_maxlen": 3,
"a_median": "pan",
"a_ps": {
"0": "eks",
"1": "eks",
"10": "eks",
"25": "hat",
"50": "pan",
"75": "wye",
"90": "zee",
"99": "zee",
"100": "zee"
},
"a_psi": {
"0": (error),
"1": (error),
"10": (error),
"25": (error),
"50": (error),
"75": (error),
"90": (error),
"99": (error),
"100": "zee"
},
"a_psa": ["eks", "eks", "eks", "hat", "pan", "wye", "zee", "zee", "zee"],
"a_psia": [(error), (error), (error), (error), (error), (error), (error), (error), "zee"],
"m_min": "eks",
"m_max": "zee",
"m_minlen": 3,
"m_maxlen": 3,
"m_median": "pan",
"m_ps": {
"0": "eks",
"1": "eks",
"10": "eks",
"25": "hat",
"50": "pan",
"75": "wye",
"90": "zee",
"99": "zee",
"100": "zee"
},
"m_psi": {
"0": (error),
"1": (error),
"10": (error),
"25": (error),
"50": (error),
"75": (error),
"90": (error),
"99": (error),
"100": "zee"
},
"m_psa": ["eks", "eks", "eks", "hat", "pan", "wye", "zee", "zee", "zee"],
"m_psia": [(error), (error), (error), (error), (error), (error), (error), (error), "zee"]
}
]

View file

@ -0,0 +1 @@
mlr --ofmtf 6 --ojson --zin --from test/input/medium.z head -n 0 then put -q -f test/input/test-percentiles.mlr -s field=i

View file

@ -0,0 +1,62 @@
[
{
"a_min": "",
"a_max": "",
"a_minlen": "",
"a_maxlen": "",
"a_median": "",
"a_ps": {
"0": "",
"1": "",
"10": "",
"25": "",
"50": "",
"75": "",
"90": "",
"99": "",
"100": ""
},
"a_psi": {
"0": "",
"1": "",
"10": "",
"25": "",
"50": "",
"75": "",
"90": "",
"99": "",
"100": ""
},
"a_psa": ["", "", "", "", "", "", "", "", ""],
"a_psia": ["", "", "", "", "", "", "", "", ""],
"m_min": "",
"m_max": "",
"m_minlen": "",
"m_maxlen": "",
"m_median": "",
"m_ps": {
"0": "",
"1": "",
"10": "",
"25": "",
"50": "",
"75": "",
"90": "",
"99": "",
"100": ""
},
"m_psi": {
"0": "",
"1": "",
"10": "",
"25": "",
"50": "",
"75": "",
"90": "",
"99": "",
"100": ""
},
"m_psa": ["", "", "", "", "", "", "", "", ""],
"m_psia": ["", "", "", "", "", "", "", "", ""]
}
]

View file

@ -0,0 +1 @@
mlr --ofmtf 6 --ojson --zin --from test/input/medium.z head -n 1 then put -q -f test/input/test-percentiles.mlr -s field=i

View file

@ -0,0 +1,62 @@
[
{
"a_min": 1,
"a_max": 1,
"a_minlen": 1,
"a_maxlen": 1,
"a_median": 1,
"a_ps": {
"0": 1,
"1": 1,
"10": 1,
"25": 1,
"50": 1,
"75": 1,
"90": 1,
"99": 1,
"100": 1
},
"a_psi": {
"0": 1,
"1": 1,
"10": 1,
"25": 1,
"50": 1,
"75": 1,
"90": 1,
"99": 1,
"100": 1
},
"a_psa": [1, 1, 1, 1, 1, 1, 1, 1, 1],
"a_psia": [1, 1, 1, 1, 1, 1, 1, 1, 1],
"m_min": 1,
"m_max": 1,
"m_minlen": 1,
"m_maxlen": 1,
"m_median": 1,
"m_ps": {
"0": 1,
"1": 1,
"10": 1,
"25": 1,
"50": 1,
"75": 1,
"90": 1,
"99": 1,
"100": 1
},
"m_psi": {
"0": 1,
"1": 1,
"10": 1,
"25": 1,
"50": 1,
"75": 1,
"90": 1,
"99": 1,
"100": 1
},
"m_psa": [1, 1, 1, 1, 1, 1, 1, 1, 1],
"m_psia": [1, 1, 1, 1, 1, 1, 1, 1, 1]
}
]

View file

@ -0,0 +1 @@
mlr --ofmtf 6 --ojson --zin --from test/input/medium.z head -n 2 then put -q -f test/input/test-percentiles.mlr -s field=i

View file

@ -0,0 +1,62 @@
[
{
"a_min": 1,
"a_max": 2,
"a_minlen": 1,
"a_maxlen": 1,
"a_median": 2,
"a_ps": {
"0": 1,
"1": 1,
"10": 1,
"25": 1,
"50": 2,
"75": 2,
"90": 2,
"99": 2,
"100": 2
},
"a_psi": {
"0": 1.000000,
"1": 1.010000,
"10": 1.100000,
"25": 1.250000,
"50": 1.500000,
"75": 1.750000,
"90": 1.900000,
"99": 1.990000,
"100": 2
},
"a_psa": [1, 1, 1, 1, 2, 2, 2, 2, 2],
"a_psia": [1.000000, 1.010000, 1.100000, 1.250000, 1.500000, 1.750000, 1.900000, 1.990000, 2],
"m_min": 1,
"m_max": 2,
"m_minlen": 1,
"m_maxlen": 1,
"m_median": 2,
"m_ps": {
"0": 1,
"1": 1,
"10": 1,
"25": 1,
"50": 2,
"75": 2,
"90": 2,
"99": 2,
"100": 2
},
"m_psi": {
"0": 1.000000,
"1": 1.010000,
"10": 1.100000,
"25": 1.250000,
"50": 1.500000,
"75": 1.750000,
"90": 1.900000,
"99": 1.990000,
"100": 2
},
"m_psa": [1, 1, 1, 1, 2, 2, 2, 2, 2],
"m_psia": [1.000000, 1.010000, 1.100000, 1.250000, 1.500000, 1.750000, 1.900000, 1.990000, 2]
}
]

View file

@ -0,0 +1 @@
mlr --ofmtf 6 --ojson --zin --from test/input/medium.z head -n 3 then put -q -f test/input/test-percentiles.mlr -s field=i

View file

@ -0,0 +1,62 @@
[
{
"a_min": 1,
"a_max": 3,
"a_minlen": 1,
"a_maxlen": 1,
"a_median": 2,
"a_ps": {
"0": 1,
"1": 1,
"10": 1,
"25": 1,
"50": 2,
"75": 3,
"90": 3,
"99": 3,
"100": 3
},
"a_psi": {
"0": 1.000000,
"1": 1.020000,
"10": 1.200000,
"25": 1.500000,
"50": 2.000000,
"75": 2.500000,
"90": 2.800000,
"99": 2.980000,
"100": 3
},
"a_psa": [1, 1, 1, 1, 2, 3, 3, 3, 3],
"a_psia": [1.000000, 1.020000, 1.200000, 1.500000, 2.000000, 2.500000, 2.800000, 2.980000, 3],
"m_min": 1,
"m_max": 3,
"m_minlen": 1,
"m_maxlen": 1,
"m_median": 2,
"m_ps": {
"0": 1,
"1": 1,
"10": 1,
"25": 1,
"50": 2,
"75": 3,
"90": 3,
"99": 3,
"100": 3
},
"m_psi": {
"0": 1.000000,
"1": 1.020000,
"10": 1.200000,
"25": 1.500000,
"50": 2.000000,
"75": 2.500000,
"90": 2.800000,
"99": 2.980000,
"100": 3
},
"m_psa": [1, 1, 1, 1, 2, 3, 3, 3, 3],
"m_psia": [1.000000, 1.020000, 1.200000, 1.500000, 2.000000, 2.500000, 2.800000, 2.980000, 3]
}
]

View file

@ -0,0 +1 @@
mlr --ofmtf 6 --ojson --zin --from test/input/medium.z head -n 4 then put -q -f test/input/test-percentiles.mlr -s field=i

View file

@ -0,0 +1,62 @@
[
{
"a_min": 1,
"a_max": 4,
"a_minlen": 1,
"a_maxlen": 1,
"a_median": 3,
"a_ps": {
"0": 1,
"1": 1,
"10": 1,
"25": 2,
"50": 3,
"75": 4,
"90": 4,
"99": 4,
"100": 4
},
"a_psi": {
"0": 1.000000,
"1": 1.030000,
"10": 1.300000,
"25": 1.750000,
"50": 2.500000,
"75": 3.250000,
"90": 3.700000,
"99": 3.970000,
"100": 4
},
"a_psa": [1, 1, 1, 2, 3, 4, 4, 4, 4],
"a_psia": [1.000000, 1.030000, 1.300000, 1.750000, 2.500000, 3.250000, 3.700000, 3.970000, 4],
"m_min": 1,
"m_max": 4,
"m_minlen": 1,
"m_maxlen": 1,
"m_median": 3,
"m_ps": {
"0": 1,
"1": 1,
"10": 1,
"25": 2,
"50": 3,
"75": 4,
"90": 4,
"99": 4,
"100": 4
},
"m_psi": {
"0": 1.000000,
"1": 1.030000,
"10": 1.300000,
"25": 1.750000,
"50": 2.500000,
"75": 3.250000,
"90": 3.700000,
"99": 3.970000,
"100": 4
},
"m_psa": [1, 1, 1, 2, 3, 4, 4, 4, 4],
"m_psia": [1.000000, 1.030000, 1.300000, 1.750000, 2.500000, 3.250000, 3.700000, 3.970000, 4]
}
]

View file

@ -0,0 +1 @@
mlr --ofmtf 6 --ojson --zin --from test/input/medium.z put -q -f test/input/test-percentiles.mlr -s field=i

View file

@ -0,0 +1,62 @@
[
{
"a_min": 1,
"a_max": 10000,
"a_minlen": 1,
"a_maxlen": 5,
"a_median": 5001,
"a_ps": {
"0": 1,
"1": 101,
"10": 1001,
"25": 2501,
"50": 5001,
"75": 7501,
"90": 9001,
"99": 9901,
"100": 10000
},
"a_psi": {
"0": 1.000000,
"1": 100.990000,
"10": 1000.900000,
"25": 2500.750000,
"50": 5000.500000,
"75": 7500.250000,
"90": 9000.100000,
"99": 9900.010000,
"100": 10000
},
"a_psa": [1, 101, 1001, 2501, 5001, 7501, 9001, 9901, 10000],
"a_psia": [1.000000, 100.990000, 1000.900000, 2500.750000, 5000.500000, 7500.250000, 9000.100000, 9900.010000, 10000],
"m_min": 1,
"m_max": 10000,
"m_minlen": 1,
"m_maxlen": 5,
"m_median": 5001,
"m_ps": {
"0": 1,
"1": 101,
"10": 1001,
"25": 2501,
"50": 5001,
"75": 7501,
"90": 9001,
"99": 9901,
"100": 10000
},
"m_psi": {
"0": 1.000000,
"1": 100.990000,
"10": 1000.900000,
"25": 2500.750000,
"50": 5000.500000,
"75": 7500.250000,
"90": 9000.100000,
"99": 9900.010000,
"100": 10000
},
"m_psa": [1, 101, 1001, 2501, 5001, 7501, 9001, 9901, 10000],
"m_psia": [1.000000, 100.990000, 1000.900000, 2500.750000, 5000.500000, 7500.250000, 9000.100000, 9900.010000, 10000]
}
]

View file

@ -0,0 +1 @@
Coverage via unit-test framework, not regression-test framework

View file

@ -0,0 +1,39 @@
# Regression-test helper: collects one field from every record into both an
# array (@a) and a map (@m), then at end of stream emits a single record of
# summary statistics computed over each collection.
begin {
  # Name of the record field whose values are collected.
  # NOTE(review): test-percentiles.mlr expects @field to come from
  # `put -s field=...`; here it is assigned in the begin block instead.
  # Confirm whether callers are expected to be able to override it.
  @field = "i";
  @m = {};
  @a = [];
}

# Both collections are indexed by the record number NR.
@a[NR] = $[@field];
@m[NR] = $[@field];

end {
  # Work from local snapshots of the two accumulators.
  from_array = @a;
  from_map = @m;

  # Key order here is the field order of the emitted record: all array-based
  # results first, then the same statistics computed from the map.
  outputs = {
    "a_count":    count(from_array),
    "a_sum":      sum(from_array),
    "a_sum2":     sum2(from_array),
    "a_sum3":     sum3(from_array),
    "a_sum4":     sum4(from_array),
    "a_mean":     mean(from_array),
    "a_var":      variance(from_array),
    "a_stddev":   stddev(from_array),
    "a_meaneb":   meaneb(from_array),
    "a_skewness": skewness(from_array),
    "a_kurtosis": kurtosis(from_array),
    "m_count":    count(from_map),
    "m_sum":      sum(from_map),
    "m_sum2":     sum2(from_map),
    "m_sum3":     sum3(from_map),
    "m_sum4":     sum4(from_map),
    "m_mean":     mean(from_map),
    "m_var":      variance(from_map),
    "m_stddev":   stddev(from_map),
    "m_meaneb":   meaneb(from_map),
    "m_skewness": skewness(from_map),
    "m_kurtosis": kurtosis(from_map),
  };
  emit outputs;
}

View file

@ -0,0 +1,44 @@
# Regression-test helper: collects one field from every record into both an
# array (@a) and a map (@m), then at end of stream emits a single record with
# min/max, minlen/maxlen, median, and percentiles of each collection.
begin {
  # @field must be given by put -s field=namegoeshere in the script invocation.
  # This lets us test percentiles over various field names/types while re-using
  # this same script.
  @m = {};
  @a = [];
}

# Both collections are indexed by the record number NR.
@a[NR] = $[@field];
@m[NR] = $[@field];

end {
  # The same percentile list is used for every percentiles() call below.
  pcts = [0,1,10,25,50,75,90,99,100];

  # Option maps for the three non-default percentiles() variants exercised.
  opts_interp = {"interpolate_linearly": true};
  opts_array = {"output_array_not_map": true};
  opts_interp_array = {
    "interpolate_linearly": true,
    "output_array_not_map": true,
  };

  # Key order here is the field order of the emitted record: all array-based
  # results first, then the same functions applied to the map.
  outputs = {
    "a_min":    min(@a),
    "a_max":    max(@a),
    "a_minlen": minlen(@a),
    "a_maxlen": maxlen(@a),
    "a_median": median(@a),
    "a_ps":     percentiles(@a, pcts),
    "a_psi":    percentiles(@a, pcts, opts_interp),
    "a_psa":    percentiles(@a, pcts, opts_array),
    "a_psia":   percentiles(@a, pcts, opts_interp_array),
    "m_min":    min(@m),
    "m_max":    max(@m),
    "m_minlen": minlen(@m),
    "m_maxlen": maxlen(@m),
    "m_median": median(@m),
    "m_ps":     percentiles(@m, pcts),
    "m_psi":    percentiles(@m, pcts, opts_interp),
    "m_psa":    percentiles(@m, pcts, opts_array),
    "m_psia":   percentiles(@m, pcts, opts_interp_array),
  };
  emit outputs;
}