Rename internal regex functions (#1446)

This commit is contained in:
John Kerl 2023-12-17 12:46:28 -05:00 committed by GitHub
parent b5dbd7a751
commit 1ae670fd4a
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
14 changed files with 395 additions and 258 deletions

View file

@ -220,18 +220,19 @@ MILLER(1) MILLER(1)
is_numeric is_present is_string joink joinkv joinv json_parse json_stringify
kurtosis latin1_to_utf8 leafcount leftpad length localtime2gmt localtime2nsec
localtime2sec log log10 log1p logifit lstrip madd mapdiff mapexcept mapselect
mapsum max maxlen md5 mean meaneb median mexp min minlen mmul mode msub
nsec2gmt nsec2gmtdate nsec2localdate nsec2localtime null_count os percentile
percentiles pow qnorm reduce regextract regextract_or_else rightpad round
roundm rstrip sec2dhms sec2gmt sec2gmtdate sec2hms sec2localdate sec2localtime
select sgn sha1 sha256 sha512 sin sinh skewness sort sort_collection splita
splitax splitkv splitkvx splitnv splitnvx sqrt ssub stddev strfntime
strfntime_local strftime strftime_local string strip strlen strpntime
strpntime_local strptime strptime_local sub substr substr0 substr1 sum sum2
sum3 sum4 sysntime system systime systimeint tan tanh tolower toupper truncate
typeof unflatten unformat unformatx upntime uptime urand urand32 urandelement
urandint urandrange utf8_to_latin1 variance version ! != !=~ % & && * ** + - .
.* .+ .- ./ / // < << <= <=> == =~ > >= >> >>> ?: ?? ??? ^ ^^ | || ~
mapsum match matchx max maxlen md5 mean meaneb median mexp min minlen mmul
mode msub nsec2gmt nsec2gmtdate nsec2localdate nsec2localtime null_count os
percentile percentiles pow qnorm reduce regextract regextract_or_else rightpad
round roundm rstrip sec2dhms sec2gmt sec2gmtdate sec2hms sec2localdate
sec2localtime select sgn sha1 sha256 sha512 sin sinh skewness sort
sort_collection splita splitax splitkv splitkvx splitnv splitnvx sqrt ssub
stddev strfntime strfntime_local strftime strftime_local string strip strlen
strpntime strpntime_local strptime strptime_local sub substr substr0 substr1
sum sum2 sum3 sum4 sysntime system systime systimeint tan tanh tolower toupper
truncate typeof unflatten unformat unformatx upntime uptime urand urand32
urandelement urandint urandrange utf8_to_latin1 variance version ! != !=~ % &
&& * ** + - . .* .+ .- ./ / // < << <= <=> == =~ > >= >> >>> ?: ?? ??? ^ ^^ |
|| ~
1mCOMMENTS-IN-DATA FLAGS0m
Miller lets you put comments in your data, such as
@ -2650,6 +2651,16 @@ MILLER(1) MILLER(1)
1mmapsum0m
(class=collections #args=variadic) With 0 args, returns empty map. With >= 1 arg, returns a map with key-value pairs from all arguments. Rightmost collisions win, e.g. 'mapsum({1:2,3:4},{1:5})' is '{1:5,3:4}'.
1mmatch0m
(class=string #args=2) TODO: WRITE ME
Example:
TODO: WRITE ME
1mmatchx0m
(class=string #args=2) TODO: WRITE ME
Example:
TODO: WRITE ME
1mmax0m
(class=math #args=variadic) Max of n numbers; null loses. The min and max functions also recurse into arrays and maps, so they can be used to get min/max stats on array/map values.
@ -3649,5 +3660,5 @@ MILLER(1) MILLER(1)
2023-12-13 MILLER(1)
2023-12-16 MILLER(1)
</pre>

View file

@ -199,18 +199,19 @@ MILLER(1) MILLER(1)
is_numeric is_present is_string joink joinkv joinv json_parse json_stringify
kurtosis latin1_to_utf8 leafcount leftpad length localtime2gmt localtime2nsec
localtime2sec log log10 log1p logifit lstrip madd mapdiff mapexcept mapselect
mapsum max maxlen md5 mean meaneb median mexp min minlen mmul mode msub
nsec2gmt nsec2gmtdate nsec2localdate nsec2localtime null_count os percentile
percentiles pow qnorm reduce regextract regextract_or_else rightpad round
roundm rstrip sec2dhms sec2gmt sec2gmtdate sec2hms sec2localdate sec2localtime
select sgn sha1 sha256 sha512 sin sinh skewness sort sort_collection splita
splitax splitkv splitkvx splitnv splitnvx sqrt ssub stddev strfntime
strfntime_local strftime strftime_local string strip strlen strpntime
strpntime_local strptime strptime_local sub substr substr0 substr1 sum sum2
sum3 sum4 sysntime system systime systimeint tan tanh tolower toupper truncate
typeof unflatten unformat unformatx upntime uptime urand urand32 urandelement
urandint urandrange utf8_to_latin1 variance version ! != !=~ % & && * ** + - .
.* .+ .- ./ / // < << <= <=> == =~ > >= >> >>> ?: ?? ??? ^ ^^ | || ~
mapsum match matchx max maxlen md5 mean meaneb median mexp min minlen mmul
mode msub nsec2gmt nsec2gmtdate nsec2localdate nsec2localtime null_count os
percentile percentiles pow qnorm reduce regextract regextract_or_else rightpad
round roundm rstrip sec2dhms sec2gmt sec2gmtdate sec2hms sec2localdate
sec2localtime select sgn sha1 sha256 sha512 sin sinh skewness sort
sort_collection splita splitax splitkv splitkvx splitnv splitnvx sqrt ssub
stddev strfntime strfntime_local strftime strftime_local string strip strlen
strpntime strpntime_local strptime strptime_local sub substr substr0 substr1
sum sum2 sum3 sum4 sysntime system systime systimeint tan tanh tolower toupper
truncate typeof unflatten unformat unformatx upntime uptime urand urand32
urandelement urandint urandrange utf8_to_latin1 variance version ! != !=~ % &
&& * ** + - . .* .+ .- ./ / // < << <= <=> == =~ > >= >> >>> ?: ?? ??? ^ ^^ |
|| ~
1mCOMMENTS-IN-DATA FLAGS0m
Miller lets you put comments in your data, such as
@ -2629,6 +2630,16 @@ MILLER(1) MILLER(1)
1mmapsum0m
(class=collections #args=variadic) With 0 args, returns empty map. With >= 1 arg, returns a map with key-value pairs from all arguments. Rightmost collisions win, e.g. 'mapsum({1:2,3:4},{1:5})' is '{1:5,3:4}'.
1mmatch0m
(class=string #args=2) TODO: WRITE ME
Example:
TODO: WRITE ME
1mmatchx0m
(class=string #args=2) TODO: WRITE ME
Example:
TODO: WRITE ME
1mmax0m
(class=math #args=variadic) Max of n numbers; null loses. The min and max functions also recurse into arrays and maps, so they can be used to get min/max stats on array/map values.
@ -3628,4 +3639,4 @@ MILLER(1) MILLER(1)
2023-12-13 MILLER(1)
2023-12-16 MILLER(1)

View file

@ -75,7 +75,7 @@ is 2. Unary operators such as `!` and `~` show argument-count of 1; the ternary
* [**Higher-order-functions functions**](#higher-order-functions-functions): [any](#any), [apply](#apply), [every](#every), [fold](#fold), [reduce](#reduce), [select](#select), [sort](#sort).
* [**Math functions**](#math-functions): [abs](#abs), [acos](#acos), [acosh](#acosh), [asin](#asin), [asinh](#asinh), [atan](#atan), [atan2](#atan2), [atanh](#atanh), [cbrt](#cbrt), [ceil](#ceil), [cos](#cos), [cosh](#cosh), [erf](#erf), [erfc](#erfc), [exp](#exp), [expm1](#expm1), [floor](#floor), [invqnorm](#invqnorm), [log](#log), [log10](#log10), [log1p](#log1p), [logifit](#logifit), [max](#max), [min](#min), [qnorm](#qnorm), [round](#round), [roundm](#roundm), [sgn](#sgn), [sin](#sin), [sinh](#sinh), [sqrt](#sqrt), [tan](#tan), [tanh](#tanh), [urand](#urand), [urand32](#urand32), [urandelement](#urandelement), [urandint](#urandint), [urandrange](#urandrange).
* [**Stats functions**](#stats-functions): [antimode](#antimode), [count](#count), [distinct_count](#distinct_count), [kurtosis](#kurtosis), [maxlen](#maxlen), [mean](#mean), [meaneb](#meaneb), [median](#median), [minlen](#minlen), [mode](#mode), [null_count](#null_count), [percentile](#percentile), [percentiles](#percentiles), [skewness](#skewness), [sort_collection](#sort_collection), [stddev](#stddev), [sum](#sum), [sum2](#sum2), [sum3](#sum3), [sum4](#sum4), [variance](#variance).
* [**String functions**](#string-functions): [capitalize](#capitalize), [clean_whitespace](#clean_whitespace), [collapse_whitespace](#collapse_whitespace), [contains](#contains), [format](#format), [gssub](#gssub), [gsub](#gsub), [index](#index), [latin1_to_utf8](#latin1_to_utf8), [leftpad](#leftpad), [lstrip](#lstrip), [regextract](#regextract), [regextract_or_else](#regextract_or_else), [rightpad](#rightpad), [rstrip](#rstrip), [ssub](#ssub), [strip](#strip), [strlen](#strlen), [sub](#sub), [substr](#substr), [substr0](#substr0), [substr1](#substr1), [tolower](#tolower), [toupper](#toupper), [truncate](#truncate), [unformat](#unformat), [unformatx](#unformatx), [utf8_to_latin1](#utf8_to_latin1), [\.](#dot).
* [**String functions**](#string-functions): [capitalize](#capitalize), [clean_whitespace](#clean_whitespace), [collapse_whitespace](#collapse_whitespace), [contains](#contains), [format](#format), [gssub](#gssub), [gsub](#gsub), [index](#index), [latin1_to_utf8](#latin1_to_utf8), [leftpad](#leftpad), [lstrip](#lstrip), [match](#match), [matchx](#matchx), [regextract](#regextract), [regextract_or_else](#regextract_or_else), [rightpad](#rightpad), [rstrip](#rstrip), [ssub](#ssub), [strip](#strip), [strlen](#strlen), [sub](#sub), [substr](#substr), [substr0](#substr0), [substr1](#substr1), [tolower](#tolower), [toupper](#toupper), [truncate](#truncate), [unformat](#unformat), [unformatx](#unformatx), [utf8_to_latin1](#utf8_to_latin1), [\.](#dot).
* [**System functions**](#system-functions): [exec](#exec), [hostname](#hostname), [os](#os), [system](#system), [version](#version).
* [**Time functions**](#time-functions): [dhms2fsec](#dhms2fsec), [dhms2sec](#dhms2sec), [fsec2dhms](#fsec2dhms), [fsec2hms](#fsec2hms), [gmt2localtime](#gmt2localtime), [gmt2nsec](#gmt2nsec), [gmt2sec](#gmt2sec), [hms2fsec](#hms2fsec), [hms2sec](#hms2sec), [localtime2gmt](#localtime2gmt), [localtime2nsec](#localtime2nsec), [localtime2sec](#localtime2sec), [nsec2gmt](#nsec2gmt), [nsec2gmtdate](#nsec2gmtdate), [nsec2localdate](#nsec2localdate), [nsec2localtime](#nsec2localtime), [sec2dhms](#sec2dhms), [sec2gmt](#sec2gmt), [sec2gmtdate](#sec2gmtdate), [sec2hms](#sec2hms), [sec2localdate](#sec2localdate), [sec2localtime](#sec2localtime), [strfntime](#strfntime), [strfntime_local](#strfntime_local), [strftime](#strftime), [strftime_local](#strftime_local), [strpntime](#strpntime), [strpntime_local](#strpntime_local), [strptime](#strptime), [strptime_local](#strptime_local), [sysntime](#sysntime), [systime](#systime), [systimeint](#systimeint), [upntime](#upntime), [uptime](#uptime).
* [**Typing functions**](#typing-functions): [asserting_absent](#asserting_absent), [asserting_array](#asserting_array), [asserting_bool](#asserting_bool), [asserting_boolean](#asserting_boolean), [asserting_empty](#asserting_empty), [asserting_empty_map](#asserting_empty_map), [asserting_error](#asserting_error), [asserting_float](#asserting_float), [asserting_int](#asserting_int), [asserting_map](#asserting_map), [asserting_nonempty_map](#asserting_nonempty_map), [asserting_not_array](#asserting_not_array), [asserting_not_empty](#asserting_not_empty), [asserting_not_map](#asserting_not_map), [asserting_not_null](#asserting_not_null), [asserting_null](#asserting_null), [asserting_numeric](#asserting_numeric), [asserting_present](#asserting_present), [asserting_string](#asserting_string), [is_absent](#is_absent), [is_array](#is_array), [is_bool](#is_bool), [is_boolean](#is_boolean), [is_empty](#is_empty), [is_empty_map](#is_empty_map), [is_error](#is_error), [is_float](#is_float), [is_int](#is_int), [is_map](#is_map), [is_nan](#is_nan), [is_nonempty_map](#is_nonempty_map), [is_not_array](#is_not_array), [is_not_empty](#is_not_empty), [is_not_map](#is_not_map), [is_not_null](#is_not_null), [is_null](#is_null), [is_numeric](#is_numeric), [is_present](#is_present), [is_string](#is_string), [typeof](#typeof).
@ -1296,6 +1296,22 @@ lstrip (class=string #args=1) Strip leading whitespace from string.
</pre>
### match
<pre class="pre-non-highlight-non-pair">
match (class=string #args=2) TODO: WRITE ME
Example:
TODO: WRITE ME
</pre>
### matchx
<pre class="pre-non-highlight-non-pair">
matchx (class=string #args=2) TODO: WRITE ME
Example:
TODO: WRITE ME
</pre>
### regextract
<pre class="pre-non-highlight-non-pair">
regextract (class=string #args=2) Extracts a substring (the first, if there are multiple matches), matching a regular expression, from the input. Does not use capture groups; see also the =~ operator which does.

View file

@ -199,18 +199,19 @@ MILLER(1) MILLER(1)
is_numeric is_present is_string joink joinkv joinv json_parse json_stringify
kurtosis latin1_to_utf8 leafcount leftpad length localtime2gmt localtime2nsec
localtime2sec log log10 log1p logifit lstrip madd mapdiff mapexcept mapselect
mapsum max maxlen md5 mean meaneb median mexp min minlen mmul mode msub
nsec2gmt nsec2gmtdate nsec2localdate nsec2localtime null_count os percentile
percentiles pow qnorm reduce regextract regextract_or_else rightpad round
roundm rstrip sec2dhms sec2gmt sec2gmtdate sec2hms sec2localdate sec2localtime
select sgn sha1 sha256 sha512 sin sinh skewness sort sort_collection splita
splitax splitkv splitkvx splitnv splitnvx sqrt ssub stddev strfntime
strfntime_local strftime strftime_local string strip strlen strpntime
strpntime_local strptime strptime_local sub substr substr0 substr1 sum sum2
sum3 sum4 sysntime system systime systimeint tan tanh tolower toupper truncate
typeof unflatten unformat unformatx upntime uptime urand urand32 urandelement
urandint urandrange utf8_to_latin1 variance version ! != !=~ % & && * ** + - .
.* .+ .- ./ / // < << <= <=> == =~ > >= >> >>> ?: ?? ??? ^ ^^ | || ~
mapsum match matchx max maxlen md5 mean meaneb median mexp min minlen mmul
mode msub nsec2gmt nsec2gmtdate nsec2localdate nsec2localtime null_count os
percentile percentiles pow qnorm reduce regextract regextract_or_else rightpad
round roundm rstrip sec2dhms sec2gmt sec2gmtdate sec2hms sec2localdate
sec2localtime select sgn sha1 sha256 sha512 sin sinh skewness sort
sort_collection splita splitax splitkv splitkvx splitnv splitnvx sqrt ssub
stddev strfntime strfntime_local strftime strftime_local string strip strlen
strpntime strpntime_local strptime strptime_local sub substr substr0 substr1
sum sum2 sum3 sum4 sysntime system systime systimeint tan tanh tolower toupper
truncate typeof unflatten unformat unformatx upntime uptime urand urand32
urandelement urandint urandrange utf8_to_latin1 variance version ! != !=~ % &
&& * ** + - . .* .+ .- ./ / // < << <= <=> == =~ > >= >> >>> ?: ?? ??? ^ ^^ |
|| ~
1mCOMMENTS-IN-DATA FLAGS0m
Miller lets you put comments in your data, such as
@ -2629,6 +2630,16 @@ MILLER(1) MILLER(1)
1mmapsum0m
(class=collections #args=variadic) With 0 args, returns empty map. With >= 1 arg, returns a map with key-value pairs from all arguments. Rightmost collisions win, e.g. 'mapsum({1:2,3:4},{1:5})' is '{1:5,3:4}'.
1mmatch0m
(class=string #args=2) TODO: WRITE ME
Example:
TODO: WRITE ME
1mmatchx0m
(class=string #args=2) TODO: WRITE ME
Example:
TODO: WRITE ME
1mmax0m
(class=math #args=variadic) Max of n numbers; null loses. The min and max functions also recurse into arrays and maps, so they can be used to get min/max stats on array/map values.
@ -3628,4 +3639,4 @@ MILLER(1) MILLER(1)
2023-12-13 MILLER(1)
2023-12-16 MILLER(1)

View file

@ -2,12 +2,12 @@
.\" Title: mlr
.\" Author: [see the "AUTHOR" section]
.\" Generator: ./mkman.rb
.\" Date: 2023-12-13
.\" Date: 2023-12-16
.\" Manual: \ \&
.\" Source: \ \&
.\" Language: English
.\"
.TH "MILLER" "1" "2023-12-13" "\ \&" "\ \&"
.TH "MILLER" "1" "2023-12-16" "\ \&" "\ \&"
.\" -----------------------------------------------------------------
.\" * Portability definitions
.\" ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@ -246,18 +246,19 @@ is_nonempty_map is_not_array is_not_empty is_not_map is_not_null is_null
is_numeric is_present is_string joink joinkv joinv json_parse json_stringify
kurtosis latin1_to_utf8 leafcount leftpad length localtime2gmt localtime2nsec
localtime2sec log log10 log1p logifit lstrip madd mapdiff mapexcept mapselect
mapsum max maxlen md5 mean meaneb median mexp min minlen mmul mode msub
nsec2gmt nsec2gmtdate nsec2localdate nsec2localtime null_count os percentile
percentiles pow qnorm reduce regextract regextract_or_else rightpad round
roundm rstrip sec2dhms sec2gmt sec2gmtdate sec2hms sec2localdate sec2localtime
select sgn sha1 sha256 sha512 sin sinh skewness sort sort_collection splita
splitax splitkv splitkvx splitnv splitnvx sqrt ssub stddev strfntime
strfntime_local strftime strftime_local string strip strlen strpntime
strpntime_local strptime strptime_local sub substr substr0 substr1 sum sum2
sum3 sum4 sysntime system systime systimeint tan tanh tolower toupper truncate
typeof unflatten unformat unformatx upntime uptime urand urand32 urandelement
urandint urandrange utf8_to_latin1 variance version ! != !=~ % & && * ** + - .
\&.* .+ .- ./ / // < << <= <=> == =~ > >= >> >>> ?: ?? ??? ^ ^^ | || ~
mapsum match matchx max maxlen md5 mean meaneb median mexp min minlen mmul
mode msub nsec2gmt nsec2gmtdate nsec2localdate nsec2localtime null_count os
percentile percentiles pow qnorm reduce regextract regextract_or_else rightpad
round roundm rstrip sec2dhms sec2gmt sec2gmtdate sec2hms sec2localdate
sec2localtime select sgn sha1 sha256 sha512 sin sinh skewness sort
sort_collection splita splitax splitkv splitkvx splitnv splitnvx sqrt ssub
stddev strfntime strfntime_local strftime strftime_local string strip strlen
strpntime strpntime_local strptime strptime_local sub substr substr0 substr1
sum sum2 sum3 sum4 sysntime system systime systimeint tan tanh tolower toupper
truncate typeof unflatten unformat unformatx upntime uptime urand urand32
urandelement urandint urandrange utf8_to_latin1 variance version ! != !=~ % &
&& * ** + - . .* .+ .- ./ / // < << <= <=> == =~ > >= >> >>> ?: ?? ??? ^ ^^ |
|| ~
.fi
.if n \{\
.RE
@ -3938,6 +3939,28 @@ localtime2sec("2001-02-03 04:05:06", "Asia/Istanbul") = 981165906"
.fi
.if n \{\
.RE
.SS "match"
.if n \{\
.RS 0
.\}
.nf
(class=string #args=2) TODO: WRITE ME
Example:
TODO: WRITE ME
.fi
.if n \{\
.RE
.SS "matchx"
.if n \{\
.RS 0
.\}
.nf
(class=string #args=2) TODO: WRITE ME
Example:
TODO: WRITE ME
.fi
.if n \{\
.RE
.SS "max"
.if n \{\
.RS 0

View file

@ -81,7 +81,7 @@ func BIF_sub(input1, input2, input3 *mlrval.Mlrval) *mlrval.Mlrval {
sregex := input2.AcquireStringValue()
replacement := input3.AcquireStringValue()
stringOutput := lib.RegexSub(input, sregex, replacement)
stringOutput := lib.RegexStringSub(input, sregex, replacement)
return mlrval.FromString(stringOutput)
}
@ -111,7 +111,7 @@ func BIF_gsub(input1, input2, input3 *mlrval.Mlrval) *mlrval.Mlrval {
sregex := input2.AcquireStringValue()
replacement := input3.AcquireStringValue()
stringOutput := lib.RegexGsub(input, sregex, replacement)
stringOutput := lib.RegexStringGsub(input, sregex, replacement)
return mlrval.FromString(stringOutput)
}
@ -129,7 +129,7 @@ func BIF_string_matches_regexp(input1, input2 *mlrval.Mlrval) (retval *mlrval.Ml
return mlrval.FromNotStringError("=~", input2), nil
}
boolOutput, captures := lib.RegexMatches(input1string, input2.AcquireStringValue())
boolOutput, captures := lib.RegexStringMatchWithCaptures(input1string, input2.AcquireStringValue())
return mlrval.FromBool(boolOutput), captures
}

View file

@ -266,7 +266,7 @@ func (root *RootNode) BuildStringLiteralNode(literal string) IEvaluable {
// RegexLiteralNode. See also https://github.com/johnkerl/miller/issues/297.
literal = lib.UnbackslashStringLiteral(literal)
hasCaptures, replacementCaptureMatrix := lib.RegexReplacementHasCaptures(literal)
hasCaptures, replacementCaptureMatrix := lib.ReplacementHasCaptures(literal)
if !hasCaptures {
return &StringLiteralNode{
literal: mlrval.FromString(literal),

View file

@ -158,7 +158,7 @@ type tIPSRegexSplitter struct {
}
func (s *tIPSRegexSplitter) Split(input string) []string {
return lib.RegexSplitString(s.ipsRegex, input, 2)
return lib.RegexCompiledSplitString(s.ipsRegex, input, 2)
}
// IFieldSplitter splits a string into pieces, e.g. for IFS.
@ -193,5 +193,5 @@ type tIFSRegexSplitter struct {
}
func (s *tIFSRegexSplitter) Split(input string) []string {
return lib.RegexSplitString(s.ifsRegex, input, -1)
return lib.RegexCompiledSplitString(s.ifsRegex, input, -1)
}

View file

@ -304,7 +304,7 @@ type tXTABIPSSplitter struct {
// which we need to produce just a pair of items -- a key and a value -- delimited by one or more
// IPS. For example, with IPS being a space, in 'abc 123' we need to get key 'abc' and value
// '123'; for 'abc 123 456' we need key 'abc' and value '123 456'. It's super-elegant to simply
// regex-split the line like 'kv = lib.RegexSplitString(reader.readerOptions.IPSRegex, line, 2)' --
// regex-split the line like 'kv = lib.RegexCompiledSplitString(reader.readerOptions.IPSRegex, line, 2)' --
// however, that's 3x slower than the current implementation. It turns out regexes are great
// but we should use them only when we must, since they are expensive.
func (s *tXTABIPSSplitter) Split(input string) (key, value string, err error) {
@ -358,7 +358,7 @@ type tXTABIPSRegexSplitter struct {
}
func (s *tXTABIPSRegexSplitter) Split(input string) (key, value string, err error) {
kv := lib.RegexSplitString(s.ipsRegex, input, 2)
kv := lib.RegexCompiledSplitString(s.ipsRegex, input, 2)
if len(kv) == 0 {
return "", "", fmt.Errorf("internal coding error in XTAB reader")
} else if len(kv) == 1 {

View file

@ -1,5 +1,5 @@
// ================================================================
// Support for regexes in Miller.
// Support for regular expressions in Miller.
//
// * By and large we use the Go library.
//
@ -13,17 +13,24 @@
// $y = "\2:\1";
// }
// where the '=~' sets the captures and the "\2:\1" uses them. (Note that
// https://github.com/johnkerl/miller/issues/388 has a better suggestion
// which would make the captures explicit as variables, rather than implicit
// within CST state -- regardless, the current syntax will still be supported
// for backward compatibility and so is here to stay.) Here we make use of Go
// regexp-library functions to write to, and then later interpolate from, a
// captures array which is stored within CST state. (See the `runtime.State`
// object.)
// https://github.com/johnkerl/miller/issues/388 has a better suggestion which would make the
// captures explicit as variables, rather than implicit within CST state: this is implemented by
// the `match` and `matchx` DSL functions. Regardless, the `=~` syntax will still be supported
// for backward compatibility and so is here to stay.) Here we make use of Go regexp-library
// functions to write to, and then later interpolate from, a captures array which is stored within
// CST state. (See the `runtime.State` object.)
//
// * "\0" is for a full match; "\1" .. "\9" are for submatch captures. E.g.
// if $x is "foobarbaz" and the regex is "foo(.)(..)baz", then "\0" is
// "foobarbaz", "\1" is "b", "\2" is "ar", and "\3".."\9" are "".
//
// * Naming:
//
// o "regexp" and "Regexp" are used for the Go library and its data structure, respectively;
//
// o "regex" is used for regular-expression strings following Miller's idiosyncratic syntax and
// semantics as described above.
//
// ================================================================
package lib
@ -34,6 +41,7 @@ import (
"os"
"regexp"
"strings"
"sync"
)
// captureDetector is used to see if a string literal interpolates previous
@ -44,20 +52,54 @@ var captureDetector = regexp.MustCompile(`\\[0-9]`)
// "\2:\1" so they don't need to be recomputed on every record.
var captureSplitter = regexp.MustCompile(`(\\[0-9])`)
// CompileMillerRegex wraps Go regex-compile with some Miller-specific syntax
// which predate the port of Miller from C to Go. Miller regexes use a final
// 'i' to indicate case-insensitivity; Go regexes use an initial "(?i)".
// See regexpCompileCached
var regexpCache map[string]*regexp.Regexp
const cacheMaxSize = 1000
var cacheMutex sync.Mutex
// regexpCompileCached keeps a cache of compiled regexes, so that the caller has the flexibility to
// only pass in strings while getting the benefits of compilation avoidance.
//
// (See also mlr.bnf where we specify which things can be backslash-escaped
// without a syntax error at parse time.)
// Regarding cache size: in nominal use, regexp strings are within Miller DSL code statements, and
// there will be a handful. These will all get re-used after their first application, and the cache
// will remain bounded by the size of the user's DSL code. However, it is possible to have regex
// strings contained within Miller record-field data.
//
// * If the regex_string is of the form a.*b, compiles it case-sensisitively.
// * If the regex_string is of the form "a.*b", compiles a.*b case-sensisitively.
// We could solve this by using an LRU cache. However, for simplicity, we limit the number of
// cached compiles, and for any extras that appear during record processing, we simply recompile
// each time.
func regexpCompileCached(s string) (*regexp.Regexp, error) {
if len(regexpCache) > cacheMaxSize {
return regexp.Compile(s)
}
r, err := regexp.Compile(s)
if err == nil {
cacheMutex.Lock()
if regexpCache == nil {
regexpCache = make(map[string]*regexp.Regexp)
}
regexpCache[s] = r
cacheMutex.Unlock()
}
return r, err
}
// CompileMillerRegex wraps Go regex-compile with some Miller-specific syntax which predates the
// port of Miller from C to Go. Miller regexes use a final 'i' to indicate case-insensitivity; Go
// regexes use an initial "(?i)".
//
// (See also mlr.bnf where we specify which things can be backslash-escaped without a syntax error
// at parse time.)
//
// * If the regex_string is of the form a.*b, compiles it case-sensitively.
// * If the regex_string is of the form "a.*b", compiles a.*b case-sensitively.
// * If the regex_string is of the form "a.*b"i, compiles a.*b case-insensitively.
func CompileMillerRegex(regexString string) (*regexp.Regexp, error) {
n := len(regexString)
if n < 2 {
return regexp.Compile(regexString)
return regexpCompileCached(regexString)
}
// TODO: rethink this. This will strip out things people have entered, e.g. "\"...\"".
@ -68,20 +110,20 @@ func CompileMillerRegex(regexString string) (*regexp.Regexp, error) {
// literals) and from verbs (like cut -r or having-fields).
if strings.HasPrefix(regexString, "\"") && strings.HasSuffix(regexString, "\"") {
return regexp.Compile(regexString[1 : n-1])
return regexpCompileCached(regexString[1 : n-1])
}
if strings.HasPrefix(regexString, "/") && strings.HasSuffix(regexString, "/") {
return regexp.Compile(regexString[1 : n-1])
return regexpCompileCached(regexString[1 : n-1])
}
if strings.HasPrefix(regexString, "\"") && strings.HasSuffix(regexString, "\"i") {
return regexp.Compile("(?i)" + regexString[1:n-2])
return regexpCompileCached("(?i)" + regexString[1:n-2])
}
if strings.HasPrefix(regexString, "/") && strings.HasSuffix(regexString, "/i") {
return regexp.Compile("(?i)" + regexString[1:n-2])
return regexpCompileCached("(?i)" + regexString[1:n-2])
}
return regexp.Compile(regexString)
return regexpCompileCached(regexString)
}
// CompileMillerRegexOrDie wraps CompileMillerRegex. Usually in Go we want to
@ -110,7 +152,7 @@ func CompileMillerRegexesOrDie(regexStrings []string) []*regexp.Regexp {
// In Go as in all languages I'm aware of with a string-split, "a,b,c" splits
// on "," to ["a", "b", "c"] and "a" splits to ["a"], both of which are fine --
// but "" splits to [""] when I wish it were []. This function does the latter.
func RegexSplitString(regex *regexp.Regexp, input string, n int) []string {
func RegexCompiledSplitString(regex *regexp.Regexp, input string, n int) []string {
if input == "" {
return make([]string, 0)
} else {
@ -118,193 +160,42 @@ func RegexSplitString(regex *regexp.Regexp, input string, n int) []string {
}
}
// MakeEmptyRegexCaptures supplies the captures value used as initial CST state
// when the DSL expression begins executing for the current record. A previous
// record's '$x =~ "(..)_(...)"' may have set "\1" and "\2"; each new record
// nonetheless starts from a clean slate, which is represented here as a nil
// slice.
func MakeEmptyRegexCaptures() []string {
	var noCaptures []string
	return noCaptures
}
// RegexReplacementHasCaptures lets the CST builder distinguish a plain string
// literal such as "foo bar" from one such as "foo \1 bar" which refers to
// captures. In the latter case the second return value is the offsets matrix
// locating each "\0".."\9" within the literal, so the caller can retain it
// rather than recomputing it; in the former case the matrix is nil.
func RegexReplacementHasCaptures(
	replacement string,
) (
	hasCaptures bool,
	matrix [][]int,
) {
	if !captureDetector.MatchString(replacement) {
		return false, nil
	}
	return true, captureSplitter.FindAllSubmatchIndex([]byte(replacement), -1)
}
// RegexMatches implements the =~ DSL operator. The captures are stored in DSL
// state and may be used by a DSL statement after the =~. For example, in
//
// sub($a, "(..)_(...)", "\1:\2")
//
// the replacement string is an argument to sub and therefore the captures are
// confined to the implementation of the sub function. Similarly for gsub. But
// for the match operator, people can do
//
// if ($x =~ "(..)_(...)") {
// ... other lines of code ...
// $y = "\2:\1"
// }
//
// and the =~ callsite doesn't know if captures will be used or not. So,
// RegexMatches always returns the captures array. It is stored within the CST
// state.
func RegexMatches(
input string,
sregex string,
) (
matches bool,
capturesOneUp []string,
) {
regex := CompileMillerRegexOrDie(sregex)
return RegexMatchesCompiled(input, regex)
}
// RegexMatchesCompiled is the implementation for the =~ operator. Without
// Miller-style regex captures this would be a simple one-line
// regex.MatchString(input). However, we return the captures array for the
// benefit of subsequent references to "\0".."\9".
//
// The boolean return value is true if the input matches the regex. The
// returned slice always has length 10: slot 0 is for "\0" (the full match) and
// slots 1..9 are for "\1".."\9". Slots with no corresponding capture, or whose
// capture group did not participate in the match, are "". On no match, all
// slots are "".
//
// If there are multiple matches -- e.g. input is
//
//	"...ab_cde...fg_hij..."
//
// with regex
//
//	"(..)_(...)"
//
// -- then only the first match is considered: the captures array will map
// "\1" to "ab" and "\2" to "cde".
func RegexMatchesCompiled(
	input string,
	regex *regexp.Regexp,
) (bool, []string) {
	// "\0" .. "\9"
	captures := make([]string, 10)

	// Only the first match is ever used, so ask for just that one rather than
	// for all matches. For the example above the returned offsets are
	//
	//   []int{3, 9, 3, 5, 6, 9}
	//
	// * 3-9 is for the entire match "ab_cde"
	// * 3-5 is for the first capture "ab"
	// * 6-9 is for the second capture "cde"
	row := regex.FindStringSubmatchIndex(input)
	if row == nil {
		return false, captures
	}

	for si, di := 0, 0; si+1 < len(row) && di <= 9; si, di = si+2, di+1 {
		start, end := row[si], row[si+1]
		// Negative offsets mean this capture group did not participate in the match.
		if start >= 0 && end >= 0 {
			captures[di] = input[start:end]
		}
	}

	return true, captures
}
// InterpolateCaptures substitutes previously saved regex captures into a
// string containing "\0".."\9" references. If either replacementMatrix or
// captures is nil, the replacement string is returned unmodified.
//
// Worked example:
//   - Input $x is "ab_cde"
//   - DSL expression
//     if ($x =~ "(..)_(...)") {
//     ... other lines of code ...
//     $y = "\2:\1";
//     }
//   - InterpolateCaptures is invoked when "\2:\1" is evaluated
//   - replacementString is "\2:\1"
//   - replacementMatrix holds the precomputed offsets of the "\2" and "\1"
//     substrings within "\2:\1"
//   - captures has "ab_cde" in slot 0 (for "\0"), "ab" in slot 1 (for "\1"),
//     "cde" in slot 2 (for "\2"), and "" in slots 3-9
//   - the result is "cde:ab"
func InterpolateCaptures(
	replacementString string,
	replacementMatrix [][]int,
	captures []string,
) string {
	if captures == nil || replacementMatrix == nil {
		return replacementString
	}

	var out bytes.Buffer
	cursor := 0

	for _, offsets := range replacementMatrix {
		refStart := offsets[0]

		// Literal text lying between the previous "\d" reference and this one.
		out.WriteString(replacementString[cursor:refStart])

		// The byte after the backslash is the digit: map '0'..'9' to 0..9.
		slot := replacementString[refStart+1] - '0'
		out.WriteString(captures[slot])

		cursor = offsets[1]
	}

	// Literal text after the last "\d" reference.
	out.WriteString(replacementString[cursor:])

	return out.String()
}
// RegexSub implements the sub DSL function.
func RegexSub(
// RegexStringSub implements the sub DSL function.
func RegexStringSub(
input string,
sregex string,
replacement string,
) string {
regex := CompileMillerRegexOrDie(sregex)
_, replacementCaptureMatrix := RegexReplacementHasCaptures(replacement)
return RegexSubCompiled(input, regex, replacement, replacementCaptureMatrix)
_, replacementCaptureMatrix := ReplacementHasCaptures(replacement)
return RegexCompiledSub(input, regex, replacement, replacementCaptureMatrix)
}
// RegexSubCompiled is the same as RegexSub but with compiled regex and
// RegexCompiledSub is the same as RegexStringSub but with compiled regex and
// replacement strings.
func RegexSubCompiled(
func RegexCompiledSub(
input string,
regex *regexp.Regexp,
replacement string,
replacementCaptureMatrix [][]int,
) string {
return regexSubGsubCompiled(input, regex, replacement, replacementCaptureMatrix, true)
return regexCompiledSubOrGsub(input, regex, replacement, replacementCaptureMatrix, true)
}
// RegexGsub implements the gsub DSL function.
func RegexGsub(
// RegexStringGsub implements the `gsub` DSL function.
func RegexStringGsub(
input string,
sregex string,
replacement string,
) string {
regex := CompileMillerRegexOrDie(sregex)
_, replacementCaptureMatrix := RegexReplacementHasCaptures(replacement)
return regexSubGsubCompiled(input, regex, replacement, replacementCaptureMatrix, false)
_, replacementCaptureMatrix := ReplacementHasCaptures(replacement)
return regexCompiledSubOrGsub(input, regex, replacement, replacementCaptureMatrix, false)
}
// regexSubGsubCompiled is the implementation for sub/gsub with compiled regex
// regexCompiledSubOrGsub is the implementation for `sub`/`gsub` with compiled regex
// and replacement strings.
func regexSubGsubCompiled(
func regexCompiledSubOrGsub(
input string,
regex *regexp.Regexp,
replacement string,
@ -384,3 +275,177 @@ func regexSubGsubCompiled(
buffer.WriteString(input[nonMatchStartIndex:])
return buffer.String()
}
// RegexStringMatchSimple is for simple boolean return without any substring captures.
func RegexStringMatchSimple(
input string,
sregex string,
) bool {
regex := CompileMillerRegexOrDie(sregex)
return RegexCompiledMatchSimple(input, regex)
}
// RegexCompiledMatchSimple reports whether input contains a match for the
// already-compiled regex. It is for a simple boolean return without any
// substring captures.
func RegexCompiledMatchSimple(
	input string,
	regex *regexp.Regexp,
) bool {
	// MatchString operates on the string directly, so the input does not have
	// to be copied into a []byte on every call.
	return regex.MatchString(input)
}
// RegexStringMatchWithCaptures implements the =~ DSL operator. The captures are stored in DSL
// state and may be used by a DSL statement after the =~. For example, in
//
// sub($a, "(..)_(...)", "\1:\2")
//
// the replacement string is an argument to sub and therefore the captures are
// confined to the implementation of the sub function. Similarly for gsub. But
// for the match operator, people can do
//
// if ($x =~ "(..)_(...)") {
// ... other lines of code ...
// $y = "\2:\1"
// }
//
// and the =~ callsite doesn't know if captures will be used or not. So,
// RegexStringMatchWithCaptures always returns the captures array. It is stored within the CST
// state.
func RegexStringMatchWithCaptures(
input string,
sregex string,
) (
matches bool,
capturesOneUp []string,
) {
regex := CompileMillerRegexOrDie(sregex)
return RegexCompiledMatchWithCaptures(input, regex)
}
// RegexCompiledMatchWithCaptures is the implementation for the =~ operator.
// Without Miller-style regex captures this would be a simple one-line
// regex.MatchString(input). However, we return the captures array for the
// benefit of subsequent references to "\0".."\9".
//
// The boolean result reports whether input matched regex. The returned slice
// always has length 10: slot 0 is the entire match ("\0") and slots 1-9 are
// the first nine capture groups ("\1".."\9"). Slots for groups that did not
// participate in the match, or that the regex does not have, are "". On a
// non-match every slot is "".
func RegexCompiledMatchWithCaptures(
	input string,
	regex *regexp.Regexp,
) (bool, []string) {
	// "\0" .. "\9", all initially "".
	captures := make([]string, 10)

	// If there are multiple matches -- e.g. input is
	//
	//   "...ab_cde...fg_hij..."
	//
	// with regex
	//
	//   "(..)_(...)"
	//
	// -- then we only consider the first match: boolean return value is true
	// (the input string matched the regex), and the captures array will map
	// "\1" to "ab" and "\2" to "cde". So we ask only for the leftmost match
	// rather than collecting every match and discarding all but the first.
	//
	// For the example above the returned index row is
	//
	//   []int{3, 9, 3, 5, 6, 9}
	//
	// * 3-9 is for the entire match "ab_cde"
	// * 3-5 is for the first capture "ab"
	// * 6-9 is for the second capture "cde"
	row := regex.FindStringSubmatchIndex(input)
	if row == nil {
		return false, captures
	}

	// Walk the (start, end) pairs, stopping after slot 9 even if the regex has
	// more capture groups than that.
	for di, si := 0, 0; si+1 < len(row) && di <= 9; di, si = di+1, si+2 {
		start, end := row[si], row[si+1]
		// A group that did not participate in the match is reported as -1, -1;
		// its slot stays "".
		if start >= 0 && end >= 0 {
			captures[di] = input[start:end]
		}
	}

	return true, captures
}
// MakeEmptyCaptures supplies the initial captures value for CST state at the
// start of executing the DSL expression for the current record. Even if
// '$x =~ "(..)_(...)"' set "\1" and "\2" on the previous record, processing
// of the current record needs to start with a clean slate. This is in support
// of CST state, which `=~` semantics requires.
//
// The value returned is a nil slice.
func MakeEmptyCaptures() []string {
	var noCaptures []string
	return noCaptures
}
// ReplacementHasCaptures is used by the CST builder to tell whether a string
// literal is like "foo bar" or like "foo \1 bar". In the latter case the
// caller needs to retain the compiled offsets-matrix information, which is
// returned as matrix. This is in support of CST state, which `=~` semantics
// requires.
//
// When captureDetector does not match the replacement, the result is
// (false, nil). Otherwise the result is true together with the index matrix
// produced by captureSplitter over the replacement.
func ReplacementHasCaptures(
	replacement string,
) (
	hasCaptures bool,
	matrix [][]int,
) {
	if !captureDetector.MatchString(replacement) {
		return false, nil
	}
	return true, captureSplitter.FindAllSubmatchIndex([]byte(replacement), -1)
}
// InterpolateCaptures replaces each "\0".."\9" reference in replacementString
// with the corresponding entry of captures. Example:
//
//   - Input $x is "ab_cde"
//
//   - DSL expression
//
//     if ($x =~ "(..)_(...)") {
//     ... other lines of code ...
//     $y = "\2:\1";
//     }
//
//   - InterpolateCaptures is used on the evaluation of "\2:\1"
//
//   - replacementString is "\2:\1"
//
//   - replacementMatrix contains precomputed/cached offsets for the "\2" and
//     "\1" substrings within "\2:\1"
//
//   - captures has slot 0 being "ab_cde" (for "\0"), slot 1 being "ab" (for
//     "\1"), slot 2 being "cde" (for "\2"), and slots 3-9 being "".
//
// If replacementMatrix or captures is nil, replacementString is returned
// unmodified. A reference to a slot beyond the end of captures is replaced
// with "", the same as an empty slot.
func InterpolateCaptures(
	replacementString string,
	replacementMatrix [][]int,
	captures []string,
) string {
	if replacementMatrix == nil || captures == nil {
		return replacementString
	}

	var buffer bytes.Buffer
	nonMatchStartIndex := 0

	for _, row := range replacementMatrix {
		start, end := row[0], row[1]

		// Copy the literal text preceding this "\N" reference.
		buffer.WriteString(replacementString[nonMatchStartIndex:start])

		// Map "\0".."\9" to integer index 0..9. The bounds check keeps a short
		// captures slice from causing an index-out-of-range panic; a missing
		// slot contributes nothing, just as an empty one would.
		index := int(replacementString[start+1] - '0')
		if index < len(captures) {
			buffer.WriteString(captures[index])
		}

		nonMatchStartIndex = end
	}

	// Copy the literal text following the last reference.
	buffer.WriteString(replacementString[nonMatchStartIndex:])

	return buffer.String()
}

View file

@ -88,7 +88,7 @@ var dataForMatches = []tDataForMatches{
func TestRegexReplacementHasCaptures(t *testing.T) {
for i, entry := range dataForHasCaptures {
actualHasCaptures, actualMatrix := RegexReplacementHasCaptures(entry.replacement)
actualHasCaptures, actualMatrix := ReplacementHasCaptures(entry.replacement)
if actualHasCaptures != entry.expectedHasCaptures {
t.Fatalf("case %d replacement \"%s\" expected %v got %v\n",
i, entry.replacement, entry.expectedHasCaptures, actualHasCaptures,
@ -104,7 +104,7 @@ func TestRegexReplacementHasCaptures(t *testing.T) {
func TestRegexSub(t *testing.T) {
for i, entry := range dataForSub {
actualOutput := RegexSub(entry.input, entry.sregex, entry.replacement)
actualOutput := RegexStringSub(entry.input, entry.sregex, entry.replacement)
if actualOutput != entry.expectedOutput {
t.Fatalf("case %d input \"%s\" sregex \"%s\" replacement \"%s\" expected \"%s\" got \"%s\"\n",
i, entry.input, entry.sregex, entry.replacement, entry.expectedOutput, actualOutput,
@ -115,7 +115,7 @@ func TestRegexSub(t *testing.T) {
func TestRegexGsub(t *testing.T) {
for i, entry := range dataForGsub {
actualOutput := RegexGsub(entry.input, entry.sregex, entry.replacement)
actualOutput := RegexStringGsub(entry.input, entry.sregex, entry.replacement)
if actualOutput != entry.expectedOutput {
t.Fatalf("case %d input \"%s\" sregex \"%s\" replacement \"%s\" expected \"%s\" got \"%s\"\n",
i, entry.input, entry.sregex, entry.replacement, entry.expectedOutput, actualOutput,
@ -126,7 +126,7 @@ func TestRegexGsub(t *testing.T) {
func TestRegexMatches(t *testing.T) {
for i, entry := range dataForMatches {
actualOutput, actualCaptures := RegexMatches(entry.input, entry.sregex)
actualOutput, actualCaptures := RegexStringMatchWithCaptures(entry.input, entry.sregex)
if actualOutput != entry.expectedOutput {
t.Fatalf("case %d input \"%s\" sregex \"%s\" expected %v got %v\n",
i, entry.input, entry.sregex, entry.expectedOutput, actualOutput,

View file

@ -43,8 +43,8 @@ func NewEmptyState(options *cli.TOptions, strictMode bool) *State {
// OutputRecordsAndContexts is assigned after construction
// See lib.MakeEmptyRegexCaptures for context.
RegexCaptures: lib.MakeEmptyRegexCaptures(),
// See lib.MakeEmptyCaptures for context.
RegexCaptures: lib.MakeEmptyCaptures(),
Options: options,
StrictMode: strictMode,
@ -57,5 +57,5 @@ func (state *State) Update(
) {
state.Inrec = inrec
state.Context = context
state.RegexCaptures = lib.MakeEmptyRegexCaptures()
state.RegexCaptures = lib.MakeEmptyCaptures()
}

View file

@ -479,7 +479,7 @@ func (tr *TransformerMergeFields) transformByCollapsing(
matched = valueFieldNameRegex.MatchString(pe.Key)
if matched {
// TODO: comment re matrix
shortName = lib.RegexSubCompiled(valueFieldName, valueFieldNameRegex, "", nil)
shortName = lib.RegexCompiledSub(valueFieldName, valueFieldNameRegex, "", nil)
break
}
}

View file

@ -169,7 +169,7 @@ func NewTransformerRename(
regexString := pe.Key
regex := lib.CompileMillerRegexOrDie(regexString)
replacement := pe.Value.(string)
_, replacementCaptureMatrix := lib.RegexReplacementHasCaptures(replacement)
_, replacementCaptureMatrix := lib.ReplacementHasCaptures(replacement)
regexAndReplacement := tRegexAndReplacement{
regex: regex,
replacement: replacement,
@ -241,7 +241,7 @@ func (tr *TransformerRename) transformWithRegexes(
inrec.Rename(oldName, newName)
}
} else {
newName := lib.RegexSubCompiled(oldName, regex, replacement, replacementCaptureMatrix)
newName := lib.RegexCompiledSub(oldName, regex, replacement, replacementCaptureMatrix)
if newName != oldName {
inrec.Rename(oldName, newName)
}