mirror of
https://github.com/johnkerl/miller.git
synced 2026-01-23 02:14:13 +00:00
Rename internal regex functions (#1446)
This commit is contained in:
parent
b5dbd7a751
commit
1ae670fd4a
14 changed files with 395 additions and 258 deletions
|
|
@ -220,18 +220,19 @@ MILLER(1) MILLER(1)
|
|||
is_numeric is_present is_string joink joinkv joinv json_parse json_stringify
|
||||
kurtosis latin1_to_utf8 leafcount leftpad length localtime2gmt localtime2nsec
|
||||
localtime2sec log log10 log1p logifit lstrip madd mapdiff mapexcept mapselect
|
||||
mapsum max maxlen md5 mean meaneb median mexp min minlen mmul mode msub
|
||||
nsec2gmt nsec2gmtdate nsec2localdate nsec2localtime null_count os percentile
|
||||
percentiles pow qnorm reduce regextract regextract_or_else rightpad round
|
||||
roundm rstrip sec2dhms sec2gmt sec2gmtdate sec2hms sec2localdate sec2localtime
|
||||
select sgn sha1 sha256 sha512 sin sinh skewness sort sort_collection splita
|
||||
splitax splitkv splitkvx splitnv splitnvx sqrt ssub stddev strfntime
|
||||
strfntime_local strftime strftime_local string strip strlen strpntime
|
||||
strpntime_local strptime strptime_local sub substr substr0 substr1 sum sum2
|
||||
sum3 sum4 sysntime system systime systimeint tan tanh tolower toupper truncate
|
||||
typeof unflatten unformat unformatx upntime uptime urand urand32 urandelement
|
||||
urandint urandrange utf8_to_latin1 variance version ! != !=~ % & && * ** + - .
|
||||
.* .+ .- ./ / // < << <= <=> == =~ > >= >> >>> ?: ?? ??? ^ ^^ | || ~
|
||||
mapsum match matchx max maxlen md5 mean meaneb median mexp min minlen mmul
|
||||
mode msub nsec2gmt nsec2gmtdate nsec2localdate nsec2localtime null_count os
|
||||
percentile percentiles pow qnorm reduce regextract regextract_or_else rightpad
|
||||
round roundm rstrip sec2dhms sec2gmt sec2gmtdate sec2hms sec2localdate
|
||||
sec2localtime select sgn sha1 sha256 sha512 sin sinh skewness sort
|
||||
sort_collection splita splitax splitkv splitkvx splitnv splitnvx sqrt ssub
|
||||
stddev strfntime strfntime_local strftime strftime_local string strip strlen
|
||||
strpntime strpntime_local strptime strptime_local sub substr substr0 substr1
|
||||
sum sum2 sum3 sum4 sysntime system systime systimeint tan tanh tolower toupper
|
||||
truncate typeof unflatten unformat unformatx upntime uptime urand urand32
|
||||
urandelement urandint urandrange utf8_to_latin1 variance version ! != !=~ % &
|
||||
&& * ** + - . .* .+ .- ./ / // < << <= <=> == =~ > >= >> >>> ?: ?? ??? ^ ^^ |
|
||||
|| ~
|
||||
|
||||
1mCOMMENTS-IN-DATA FLAGS0m
|
||||
Miller lets you put comments in your data, such as
|
||||
|
|
@ -2650,6 +2651,16 @@ MILLER(1) MILLER(1)
|
|||
1mmapsum0m
|
||||
(class=collections #args=variadic) With 0 args, returns empty map. With >= 1 arg, returns a map with key-value pairs from all arguments. Rightmost collisions win, e.g. 'mapsum({1:2,3:4},{1:5})' is '{1:5,3:4}'.
|
||||
|
||||
1mmatch0m
|
||||
(class=string #args=2) TODO: WRITE ME
|
||||
Example:
|
||||
TODO: WRITE ME
|
||||
|
||||
1mmatchx0m
|
||||
(class=string #args=2) TODO: WRITE ME
|
||||
Example:
|
||||
TODO: WRITE ME
|
||||
|
||||
1mmax0m
|
||||
(class=math #args=variadic) Max of n numbers; null loses. The min and max functions also recurse into arrays and maps, so they can be used to get min/max stats on array/map values.
|
||||
|
||||
|
|
@ -3649,5 +3660,5 @@ MILLER(1) MILLER(1)
|
|||
|
||||
|
||||
|
||||
2023-12-13 MILLER(1)
|
||||
2023-12-16 MILLER(1)
|
||||
</pre>
|
||||
|
|
|
|||
|
|
@ -199,18 +199,19 @@ MILLER(1) MILLER(1)
|
|||
is_numeric is_present is_string joink joinkv joinv json_parse json_stringify
|
||||
kurtosis latin1_to_utf8 leafcount leftpad length localtime2gmt localtime2nsec
|
||||
localtime2sec log log10 log1p logifit lstrip madd mapdiff mapexcept mapselect
|
||||
mapsum max maxlen md5 mean meaneb median mexp min minlen mmul mode msub
|
||||
nsec2gmt nsec2gmtdate nsec2localdate nsec2localtime null_count os percentile
|
||||
percentiles pow qnorm reduce regextract regextract_or_else rightpad round
|
||||
roundm rstrip sec2dhms sec2gmt sec2gmtdate sec2hms sec2localdate sec2localtime
|
||||
select sgn sha1 sha256 sha512 sin sinh skewness sort sort_collection splita
|
||||
splitax splitkv splitkvx splitnv splitnvx sqrt ssub stddev strfntime
|
||||
strfntime_local strftime strftime_local string strip strlen strpntime
|
||||
strpntime_local strptime strptime_local sub substr substr0 substr1 sum sum2
|
||||
sum3 sum4 sysntime system systime systimeint tan tanh tolower toupper truncate
|
||||
typeof unflatten unformat unformatx upntime uptime urand urand32 urandelement
|
||||
urandint urandrange utf8_to_latin1 variance version ! != !=~ % & && * ** + - .
|
||||
.* .+ .- ./ / // < << <= <=> == =~ > >= >> >>> ?: ?? ??? ^ ^^ | || ~
|
||||
mapsum match matchx max maxlen md5 mean meaneb median mexp min minlen mmul
|
||||
mode msub nsec2gmt nsec2gmtdate nsec2localdate nsec2localtime null_count os
|
||||
percentile percentiles pow qnorm reduce regextract regextract_or_else rightpad
|
||||
round roundm rstrip sec2dhms sec2gmt sec2gmtdate sec2hms sec2localdate
|
||||
sec2localtime select sgn sha1 sha256 sha512 sin sinh skewness sort
|
||||
sort_collection splita splitax splitkv splitkvx splitnv splitnvx sqrt ssub
|
||||
stddev strfntime strfntime_local strftime strftime_local string strip strlen
|
||||
strpntime strpntime_local strptime strptime_local sub substr substr0 substr1
|
||||
sum sum2 sum3 sum4 sysntime system systime systimeint tan tanh tolower toupper
|
||||
truncate typeof unflatten unformat unformatx upntime uptime urand urand32
|
||||
urandelement urandint urandrange utf8_to_latin1 variance version ! != !=~ % &
|
||||
&& * ** + - . .* .+ .- ./ / // < << <= <=> == =~ > >= >> >>> ?: ?? ??? ^ ^^ |
|
||||
|| ~
|
||||
|
||||
1mCOMMENTS-IN-DATA FLAGS0m
|
||||
Miller lets you put comments in your data, such as
|
||||
|
|
@ -2629,6 +2630,16 @@ MILLER(1) MILLER(1)
|
|||
1mmapsum0m
|
||||
(class=collections #args=variadic) With 0 args, returns empty map. With >= 1 arg, returns a map with key-value pairs from all arguments. Rightmost collisions win, e.g. 'mapsum({1:2,3:4},{1:5})' is '{1:5,3:4}'.
|
||||
|
||||
1mmatch0m
|
||||
(class=string #args=2) TODO: WRITE ME
|
||||
Example:
|
||||
TODO: WRITE ME
|
||||
|
||||
1mmatchx0m
|
||||
(class=string #args=2) TODO: WRITE ME
|
||||
Example:
|
||||
TODO: WRITE ME
|
||||
|
||||
1mmax0m
|
||||
(class=math #args=variadic) Max of n numbers; null loses. The min and max functions also recurse into arrays and maps, so they can be used to get min/max stats on array/map values.
|
||||
|
||||
|
|
@ -3628,4 +3639,4 @@ MILLER(1) MILLER(1)
|
|||
|
||||
|
||||
|
||||
2023-12-13 MILLER(1)
|
||||
2023-12-16 MILLER(1)
|
||||
|
|
|
|||
|
|
@ -75,7 +75,7 @@ is 2. Unary operators such as `!` and `~` show argument-count of 1; the ternary
|
|||
* [**Higher-order-functions functions**](#higher-order-functions-functions): [any](#any), [apply](#apply), [every](#every), [fold](#fold), [reduce](#reduce), [select](#select), [sort](#sort).
|
||||
* [**Math functions**](#math-functions): [abs](#abs), [acos](#acos), [acosh](#acosh), [asin](#asin), [asinh](#asinh), [atan](#atan), [atan2](#atan2), [atanh](#atanh), [cbrt](#cbrt), [ceil](#ceil), [cos](#cos), [cosh](#cosh), [erf](#erf), [erfc](#erfc), [exp](#exp), [expm1](#expm1), [floor](#floor), [invqnorm](#invqnorm), [log](#log), [log10](#log10), [log1p](#log1p), [logifit](#logifit), [max](#max), [min](#min), [qnorm](#qnorm), [round](#round), [roundm](#roundm), [sgn](#sgn), [sin](#sin), [sinh](#sinh), [sqrt](#sqrt), [tan](#tan), [tanh](#tanh), [urand](#urand), [urand32](#urand32), [urandelement](#urandelement), [urandint](#urandint), [urandrange](#urandrange).
|
||||
* [**Stats functions**](#stats-functions): [antimode](#antimode), [count](#count), [distinct_count](#distinct_count), [kurtosis](#kurtosis), [maxlen](#maxlen), [mean](#mean), [meaneb](#meaneb), [median](#median), [minlen](#minlen), [mode](#mode), [null_count](#null_count), [percentile](#percentile), [percentiles](#percentiles), [skewness](#skewness), [sort_collection](#sort_collection), [stddev](#stddev), [sum](#sum), [sum2](#sum2), [sum3](#sum3), [sum4](#sum4), [variance](#variance).
|
||||
* [**String functions**](#string-functions): [capitalize](#capitalize), [clean_whitespace](#clean_whitespace), [collapse_whitespace](#collapse_whitespace), [contains](#contains), [format](#format), [gssub](#gssub), [gsub](#gsub), [index](#index), [latin1_to_utf8](#latin1_to_utf8), [leftpad](#leftpad), [lstrip](#lstrip), [regextract](#regextract), [regextract_or_else](#regextract_or_else), [rightpad](#rightpad), [rstrip](#rstrip), [ssub](#ssub), [strip](#strip), [strlen](#strlen), [sub](#sub), [substr](#substr), [substr0](#substr0), [substr1](#substr1), [tolower](#tolower), [toupper](#toupper), [truncate](#truncate), [unformat](#unformat), [unformatx](#unformatx), [utf8_to_latin1](#utf8_to_latin1), [\.](#dot).
|
||||
* [**String functions**](#string-functions): [capitalize](#capitalize), [clean_whitespace](#clean_whitespace), [collapse_whitespace](#collapse_whitespace), [contains](#contains), [format](#format), [gssub](#gssub), [gsub](#gsub), [index](#index), [latin1_to_utf8](#latin1_to_utf8), [leftpad](#leftpad), [lstrip](#lstrip), [match](#match), [matchx](#matchx), [regextract](#regextract), [regextract_or_else](#regextract_or_else), [rightpad](#rightpad), [rstrip](#rstrip), [ssub](#ssub), [strip](#strip), [strlen](#strlen), [sub](#sub), [substr](#substr), [substr0](#substr0), [substr1](#substr1), [tolower](#tolower), [toupper](#toupper), [truncate](#truncate), [unformat](#unformat), [unformatx](#unformatx), [utf8_to_latin1](#utf8_to_latin1), [\.](#dot).
|
||||
* [**System functions**](#system-functions): [exec](#exec), [hostname](#hostname), [os](#os), [system](#system), [version](#version).
|
||||
* [**Time functions**](#time-functions): [dhms2fsec](#dhms2fsec), [dhms2sec](#dhms2sec), [fsec2dhms](#fsec2dhms), [fsec2hms](#fsec2hms), [gmt2localtime](#gmt2localtime), [gmt2nsec](#gmt2nsec), [gmt2sec](#gmt2sec), [hms2fsec](#hms2fsec), [hms2sec](#hms2sec), [localtime2gmt](#localtime2gmt), [localtime2nsec](#localtime2nsec), [localtime2sec](#localtime2sec), [nsec2gmt](#nsec2gmt), [nsec2gmtdate](#nsec2gmtdate), [nsec2localdate](#nsec2localdate), [nsec2localtime](#nsec2localtime), [sec2dhms](#sec2dhms), [sec2gmt](#sec2gmt), [sec2gmtdate](#sec2gmtdate), [sec2hms](#sec2hms), [sec2localdate](#sec2localdate), [sec2localtime](#sec2localtime), [strfntime](#strfntime), [strfntime_local](#strfntime_local), [strftime](#strftime), [strftime_local](#strftime_local), [strpntime](#strpntime), [strpntime_local](#strpntime_local), [strptime](#strptime), [strptime_local](#strptime_local), [sysntime](#sysntime), [systime](#systime), [systimeint](#systimeint), [upntime](#upntime), [uptime](#uptime).
|
||||
* [**Typing functions**](#typing-functions): [asserting_absent](#asserting_absent), [asserting_array](#asserting_array), [asserting_bool](#asserting_bool), [asserting_boolean](#asserting_boolean), [asserting_empty](#asserting_empty), [asserting_empty_map](#asserting_empty_map), [asserting_error](#asserting_error), [asserting_float](#asserting_float), [asserting_int](#asserting_int), [asserting_map](#asserting_map), [asserting_nonempty_map](#asserting_nonempty_map), [asserting_not_array](#asserting_not_array), [asserting_not_empty](#asserting_not_empty), [asserting_not_map](#asserting_not_map), [asserting_not_null](#asserting_not_null), [asserting_null](#asserting_null), [asserting_numeric](#asserting_numeric), [asserting_present](#asserting_present), [asserting_string](#asserting_string), [is_absent](#is_absent), [is_array](#is_array), [is_bool](#is_bool), [is_boolean](#is_boolean), [is_empty](#is_empty), [is_empty_map](#is_empty_map), [is_error](#is_error), [is_float](#is_float), [is_int](#is_int), [is_map](#is_map), [is_nan](#is_nan), [is_nonempty_map](#is_nonempty_map), [is_not_array](#is_not_array), [is_not_empty](#is_not_empty), [is_not_map](#is_not_map), [is_not_null](#is_not_null), [is_null](#is_null), [is_numeric](#is_numeric), [is_present](#is_present), [is_string](#is_string), [typeof](#typeof).
|
||||
|
|
@ -1296,6 +1296,22 @@ lstrip (class=string #args=1) Strip leading whitespace from string.
|
|||
</pre>
|
||||
|
||||
|
||||
### match
|
||||
<pre class="pre-non-highlight-non-pair">
|
||||
match (class=string #args=2) TODO: WRITE ME
|
||||
Example:
|
||||
TODO: WRITE ME
|
||||
</pre>
|
||||
|
||||
|
||||
### matchx
|
||||
<pre class="pre-non-highlight-non-pair">
|
||||
matchx (class=string #args=2) TODO: WRITE ME
|
||||
Example:
|
||||
TODO: WRITE ME
|
||||
</pre>
|
||||
|
||||
|
||||
### regextract
|
||||
<pre class="pre-non-highlight-non-pair">
|
||||
regextract (class=string #args=2) Extracts a substring (the first, if there are multiple matches), matching a regular expression, from the input. Does not use capture groups; see also the =~ operator which does.
|
||||
|
|
|
|||
|
|
@ -199,18 +199,19 @@ MILLER(1) MILLER(1)
|
|||
is_numeric is_present is_string joink joinkv joinv json_parse json_stringify
|
||||
kurtosis latin1_to_utf8 leafcount leftpad length localtime2gmt localtime2nsec
|
||||
localtime2sec log log10 log1p logifit lstrip madd mapdiff mapexcept mapselect
|
||||
mapsum max maxlen md5 mean meaneb median mexp min minlen mmul mode msub
|
||||
nsec2gmt nsec2gmtdate nsec2localdate nsec2localtime null_count os percentile
|
||||
percentiles pow qnorm reduce regextract regextract_or_else rightpad round
|
||||
roundm rstrip sec2dhms sec2gmt sec2gmtdate sec2hms sec2localdate sec2localtime
|
||||
select sgn sha1 sha256 sha512 sin sinh skewness sort sort_collection splita
|
||||
splitax splitkv splitkvx splitnv splitnvx sqrt ssub stddev strfntime
|
||||
strfntime_local strftime strftime_local string strip strlen strpntime
|
||||
strpntime_local strptime strptime_local sub substr substr0 substr1 sum sum2
|
||||
sum3 sum4 sysntime system systime systimeint tan tanh tolower toupper truncate
|
||||
typeof unflatten unformat unformatx upntime uptime urand urand32 urandelement
|
||||
urandint urandrange utf8_to_latin1 variance version ! != !=~ % & && * ** + - .
|
||||
.* .+ .- ./ / // < << <= <=> == =~ > >= >> >>> ?: ?? ??? ^ ^^ | || ~
|
||||
mapsum match matchx max maxlen md5 mean meaneb median mexp min minlen mmul
|
||||
mode msub nsec2gmt nsec2gmtdate nsec2localdate nsec2localtime null_count os
|
||||
percentile percentiles pow qnorm reduce regextract regextract_or_else rightpad
|
||||
round roundm rstrip sec2dhms sec2gmt sec2gmtdate sec2hms sec2localdate
|
||||
sec2localtime select sgn sha1 sha256 sha512 sin sinh skewness sort
|
||||
sort_collection splita splitax splitkv splitkvx splitnv splitnvx sqrt ssub
|
||||
stddev strfntime strfntime_local strftime strftime_local string strip strlen
|
||||
strpntime strpntime_local strptime strptime_local sub substr substr0 substr1
|
||||
sum sum2 sum3 sum4 sysntime system systime systimeint tan tanh tolower toupper
|
||||
truncate typeof unflatten unformat unformatx upntime uptime urand urand32
|
||||
urandelement urandint urandrange utf8_to_latin1 variance version ! != !=~ % &
|
||||
&& * ** + - . .* .+ .- ./ / // < << <= <=> == =~ > >= >> >>> ?: ?? ??? ^ ^^ |
|
||||
|| ~
|
||||
|
||||
1mCOMMENTS-IN-DATA FLAGS0m
|
||||
Miller lets you put comments in your data, such as
|
||||
|
|
@ -2629,6 +2630,16 @@ MILLER(1) MILLER(1)
|
|||
1mmapsum0m
|
||||
(class=collections #args=variadic) With 0 args, returns empty map. With >= 1 arg, returns a map with key-value pairs from all arguments. Rightmost collisions win, e.g. 'mapsum({1:2,3:4},{1:5})' is '{1:5,3:4}'.
|
||||
|
||||
1mmatch0m
|
||||
(class=string #args=2) TODO: WRITE ME
|
||||
Example:
|
||||
TODO: WRITE ME
|
||||
|
||||
1mmatchx0m
|
||||
(class=string #args=2) TODO: WRITE ME
|
||||
Example:
|
||||
TODO: WRITE ME
|
||||
|
||||
1mmax0m
|
||||
(class=math #args=variadic) Max of n numbers; null loses. The min and max functions also recurse into arrays and maps, so they can be used to get min/max stats on array/map values.
|
||||
|
||||
|
|
@ -3628,4 +3639,4 @@ MILLER(1) MILLER(1)
|
|||
|
||||
|
||||
|
||||
2023-12-13 MILLER(1)
|
||||
2023-12-16 MILLER(1)
|
||||
|
|
|
|||
51
man/mlr.1
51
man/mlr.1
|
|
@ -2,12 +2,12 @@
|
|||
.\" Title: mlr
|
||||
.\" Author: [see the "AUTHOR" section]
|
||||
.\" Generator: ./mkman.rb
|
||||
.\" Date: 2023-12-13
|
||||
.\" Date: 2023-12-16
|
||||
.\" Manual: \ \&
|
||||
.\" Source: \ \&
|
||||
.\" Language: English
|
||||
.\"
|
||||
.TH "MILLER" "1" "2023-12-13" "\ \&" "\ \&"
|
||||
.TH "MILLER" "1" "2023-12-16" "\ \&" "\ \&"
|
||||
.\" -----------------------------------------------------------------
|
||||
.\" * Portability definitions
|
||||
.\" ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
|
@ -246,18 +246,19 @@ is_nonempty_map is_not_array is_not_empty is_not_map is_not_null is_null
|
|||
is_numeric is_present is_string joink joinkv joinv json_parse json_stringify
|
||||
kurtosis latin1_to_utf8 leafcount leftpad length localtime2gmt localtime2nsec
|
||||
localtime2sec log log10 log1p logifit lstrip madd mapdiff mapexcept mapselect
|
||||
mapsum max maxlen md5 mean meaneb median mexp min minlen mmul mode msub
|
||||
nsec2gmt nsec2gmtdate nsec2localdate nsec2localtime null_count os percentile
|
||||
percentiles pow qnorm reduce regextract regextract_or_else rightpad round
|
||||
roundm rstrip sec2dhms sec2gmt sec2gmtdate sec2hms sec2localdate sec2localtime
|
||||
select sgn sha1 sha256 sha512 sin sinh skewness sort sort_collection splita
|
||||
splitax splitkv splitkvx splitnv splitnvx sqrt ssub stddev strfntime
|
||||
strfntime_local strftime strftime_local string strip strlen strpntime
|
||||
strpntime_local strptime strptime_local sub substr substr0 substr1 sum sum2
|
||||
sum3 sum4 sysntime system systime systimeint tan tanh tolower toupper truncate
|
||||
typeof unflatten unformat unformatx upntime uptime urand urand32 urandelement
|
||||
urandint urandrange utf8_to_latin1 variance version ! != !=~ % & && * ** + - .
|
||||
\&.* .+ .- ./ / // < << <= <=> == =~ > >= >> >>> ?: ?? ??? ^ ^^ | || ~
|
||||
mapsum match matchx max maxlen md5 mean meaneb median mexp min minlen mmul
|
||||
mode msub nsec2gmt nsec2gmtdate nsec2localdate nsec2localtime null_count os
|
||||
percentile percentiles pow qnorm reduce regextract regextract_or_else rightpad
|
||||
round roundm rstrip sec2dhms sec2gmt sec2gmtdate sec2hms sec2localdate
|
||||
sec2localtime select sgn sha1 sha256 sha512 sin sinh skewness sort
|
||||
sort_collection splita splitax splitkv splitkvx splitnv splitnvx sqrt ssub
|
||||
stddev strfntime strfntime_local strftime strftime_local string strip strlen
|
||||
strpntime strpntime_local strptime strptime_local sub substr substr0 substr1
|
||||
sum sum2 sum3 sum4 sysntime system systime systimeint tan tanh tolower toupper
|
||||
truncate typeof unflatten unformat unformatx upntime uptime urand urand32
|
||||
urandelement urandint urandrange utf8_to_latin1 variance version ! != !=~ % &
|
||||
&& * ** + - . .* .+ .- ./ / // < << <= <=> == =~ > >= >> >>> ?: ?? ??? ^ ^^ |
|
||||
|| ~
|
||||
.fi
|
||||
.if n \{\
|
||||
.RE
|
||||
|
|
@ -3938,6 +3939,28 @@ localtime2sec("2001-02-03 04:05:06", "Asia/Istanbul") = 981165906"
|
|||
.fi
|
||||
.if n \{\
|
||||
.RE
|
||||
.SS "match"
|
||||
.if n \{\
|
||||
.RS 0
|
||||
.\}
|
||||
.nf
|
||||
(class=string #args=2) TODO: WRITE ME
|
||||
Example:
|
||||
TODO: WRITE ME
|
||||
.fi
|
||||
.if n \{\
|
||||
.RE
|
||||
.SS "matchx"
|
||||
.if n \{\
|
||||
.RS 0
|
||||
.\}
|
||||
.nf
|
||||
(class=string #args=2) TODO: WRITE ME
|
||||
Example:
|
||||
TODO: WRITE ME
|
||||
.fi
|
||||
.if n \{\
|
||||
.RE
|
||||
.SS "max"
|
||||
.if n \{\
|
||||
.RS 0
|
||||
|
|
|
|||
|
|
@ -81,7 +81,7 @@ func BIF_sub(input1, input2, input3 *mlrval.Mlrval) *mlrval.Mlrval {
|
|||
sregex := input2.AcquireStringValue()
|
||||
replacement := input3.AcquireStringValue()
|
||||
|
||||
stringOutput := lib.RegexSub(input, sregex, replacement)
|
||||
stringOutput := lib.RegexStringSub(input, sregex, replacement)
|
||||
return mlrval.FromString(stringOutput)
|
||||
}
|
||||
|
||||
|
|
@ -111,7 +111,7 @@ func BIF_gsub(input1, input2, input3 *mlrval.Mlrval) *mlrval.Mlrval {
|
|||
sregex := input2.AcquireStringValue()
|
||||
replacement := input3.AcquireStringValue()
|
||||
|
||||
stringOutput := lib.RegexGsub(input, sregex, replacement)
|
||||
stringOutput := lib.RegexStringGsub(input, sregex, replacement)
|
||||
return mlrval.FromString(stringOutput)
|
||||
}
|
||||
|
||||
|
|
@ -129,7 +129,7 @@ func BIF_string_matches_regexp(input1, input2 *mlrval.Mlrval) (retval *mlrval.Ml
|
|||
return mlrval.FromNotStringError("=~", input2), nil
|
||||
}
|
||||
|
||||
boolOutput, captures := lib.RegexMatches(input1string, input2.AcquireStringValue())
|
||||
boolOutput, captures := lib.RegexStringMatchWithCaptures(input1string, input2.AcquireStringValue())
|
||||
return mlrval.FromBool(boolOutput), captures
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -266,7 +266,7 @@ func (root *RootNode) BuildStringLiteralNode(literal string) IEvaluable {
|
|||
// RegexLiteralNode. See also https://github.com/johnkerl/miller/issues/297.
|
||||
literal = lib.UnbackslashStringLiteral(literal)
|
||||
|
||||
hasCaptures, replacementCaptureMatrix := lib.RegexReplacementHasCaptures(literal)
|
||||
hasCaptures, replacementCaptureMatrix := lib.ReplacementHasCaptures(literal)
|
||||
if !hasCaptures {
|
||||
return &StringLiteralNode{
|
||||
literal: mlrval.FromString(literal),
|
||||
|
|
|
|||
|
|
@ -158,7 +158,7 @@ type tIPSRegexSplitter struct {
|
|||
}
|
||||
|
||||
func (s *tIPSRegexSplitter) Split(input string) []string {
|
||||
return lib.RegexSplitString(s.ipsRegex, input, 2)
|
||||
return lib.RegexCompiledSplitString(s.ipsRegex, input, 2)
|
||||
}
|
||||
|
||||
// IFieldSplitter splits a string into pieces, e.g. for IFS.
|
||||
|
|
@ -193,5 +193,5 @@ type tIFSRegexSplitter struct {
|
|||
}
|
||||
|
||||
func (s *tIFSRegexSplitter) Split(input string) []string {
|
||||
return lib.RegexSplitString(s.ifsRegex, input, -1)
|
||||
return lib.RegexCompiledSplitString(s.ifsRegex, input, -1)
|
||||
}
|
||||
|
|
|
|||
|
|
@ -304,7 +304,7 @@ type tXTABIPSSplitter struct {
|
|||
// which we need to produce just a pair of items -- a key and a value -- delimited by one or more
|
||||
// IPS. For example, with IPS being a space, in 'abc 123' we need to get key 'abc' and value
|
||||
// '123'; for 'abc 123 456' we need key 'abc' and value '123 456'. It's super-elegant to simply
|
||||
// regex-split the line like 'kv = lib.RegexSplitString(reader.readerOptions.IPSRegex, line, 2)' --
|
||||
// regex-split the line like 'kv = lib.RegexCompiledSplitString(reader.readerOptions.IPSRegex, line, 2)' --
|
||||
// however, that's 3x slower than the current implementation. It turns out regexes are great
|
||||
// but we should use them only when we must, since they are expensive.
|
||||
func (s *tXTABIPSSplitter) Split(input string) (key, value string, err error) {
|
||||
|
|
@ -358,7 +358,7 @@ type tXTABIPSRegexSplitter struct {
|
|||
}
|
||||
|
||||
func (s *tXTABIPSRegexSplitter) Split(input string) (key, value string, err error) {
|
||||
kv := lib.RegexSplitString(s.ipsRegex, input, 2)
|
||||
kv := lib.RegexCompiledSplitString(s.ipsRegex, input, 2)
|
||||
if len(kv) == 0 {
|
||||
return "", "", fmt.Errorf("internal coding error in XTAB reader")
|
||||
} else if len(kv) == 1 {
|
||||
|
|
|
|||
437
pkg/lib/regex.go
437
pkg/lib/regex.go
|
|
@ -1,5 +1,5 @@
|
|||
// ================================================================
|
||||
// Support for regexes in Miller.
|
||||
// Support for regular expressions in Miller.
|
||||
//
|
||||
// * By and large we use the Go library.
|
||||
//
|
||||
|
|
@ -13,17 +13,24 @@
|
|||
// $y = "\2:\1";
|
||||
// }
|
||||
// where the '=~' sets the captures and the "\2:\1" uses them. (Note that
|
||||
// https://github.com/johnkerl/miller/issues/388 has a better suggestion
|
||||
// which would make the captures explicit as variables, rather than implicit
|
||||
// within CST state -- regardless, the current syntax will still be supported
|
||||
// for backward compatibility and so is here to stay.) Here we make use of Go
|
||||
// regexp-library functions to write to, and then later interpolate from, a
|
||||
// captures array which is stored within CST state. (See the `runtime.State`
|
||||
// object.)
|
||||
// https://github.com/johnkerl/miller/issues/388 has a better suggestion which would make the
|
||||
// captures explicit as variables, rather than implicit within CST state: this is implemented by
|
||||
// the `match` and `matchx` DSL functions. Regardless, the `=~` syntax will still be supported
|
||||
// for backward compatibility and so is here to stay.) Here we make use of Go regexp-library
|
||||
// functions to write to, and then later interpolate from, a captures array which is stored within
|
||||
// CST state. (See the `runtime.State` object.)
|
||||
//
|
||||
// * "\0" is for a full match; "\1" .. "\9" are for submatch captures. E.g.
|
||||
// if $x is "foobarbaz" and the regex is "foo(.)(..)baz", then "\0" is
|
||||
// "foobarbaz", "\1" is "b", "\2" is "ar", and "\3".."\9" are "".
|
||||
//
|
||||
// * Naming:
|
||||
//
|
||||
// o "regexp" and "Regexp" are used for the Go library and its data structure, respectively;
|
||||
//
|
||||
// o "regex" is used for regular-expression strings following Miller's idiosyncratic syntax and
|
||||
// semantics as described above.
|
||||
//
|
||||
// ================================================================
|
||||
|
||||
package lib
|
||||
|
|
@ -34,6 +41,7 @@ import (
|
|||
"os"
|
||||
"regexp"
|
||||
"strings"
|
||||
"sync"
|
||||
)
|
||||
|
||||
// captureDetector is used to see if a string literal interpolates previous
|
||||
|
|
@ -44,20 +52,54 @@ var captureDetector = regexp.MustCompile(`\\[0-9]`)
|
|||
// "\2:\1" so they don't need to be recomputed on every record.
|
||||
var captureSplitter = regexp.MustCompile(`(\\[0-9])`)
|
||||
|
||||
// CompileMillerRegex wraps Go regex-compile with some Miller-specific syntax
|
||||
// which predate the port of Miller from C to Go. Miller regexes use a final
|
||||
// 'i' to indicate case-insensitivity; Go regexes use an initial "(?i)".
|
||||
// regexpCache maps regex source strings to their compiled form. It is allocated lazily by
// regexpCompileCached, and every access to it must be made while holding cacheMutex.
var regexpCache map[string]*regexp.Regexp

// cacheMaxSize is the maximum number of compiled regexes retained in regexpCache.
const cacheMaxSize = 1000

// cacheMutex guards regexpCache.
var cacheMutex sync.Mutex

// regexpCompileCached keeps a cache of compiled regexes, so that the caller has the flexibility to
// only pass in strings while getting the benefits of compilation avoidance.
//
// Regarding cache size: in nominal use, regexp strings are within Miller DSL code statements, and
// there will be a handful. These will all get re-used after their first application, and the cache
// will remain bounded by the size of the user's DSL code. However, it is possible to have regex
// strings contained within Miller record-field data.
//
// We could solve this by using an LRU cache. However, for simplicity, we limit the number of
// cached compiles, and for any extras that appear during record processing, we simply recompile
// each time.
//
// It returns the compiled regexp for s, or a nil regexp and the error from regexp.Compile if s
// does not compile. Failed compiles are never cached.
func regexpCompileCached(s string) (*regexp.Regexp, error) {
	// Look in the cache first: this is the whole point of the cache. Reading from a nil map is
	// legal in Go, so no allocation check is needed on the lookup path.
	cacheMutex.Lock()
	cached, found := regexpCache[s]
	cacheMutex.Unlock()
	if found {
		return cached, nil
	}

	// Compile outside the lock so that a slow compile does not block other callers.
	compiled, err := regexp.Compile(s)
	if err != nil {
		return nil, err
	}

	// The size check is done under the lock, since len() on a map which is concurrently being
	// written to is a data race. Once the cache is full, new regexes are simply not retained.
	cacheMutex.Lock()
	defer cacheMutex.Unlock()
	if len(regexpCache) < cacheMaxSize {
		if regexpCache == nil {
			regexpCache = make(map[string]*regexp.Regexp)
		}
		regexpCache[s] = compiled
	}
	return compiled, nil
}
|
||||
|
||||
// CompileMillerRegex wraps Go regex-compile with some Miller-specific syntax which predates the
|
||||
// port of Miller from C to Go. Miller regexes use a final 'i' to indicate case-insensitivity; Go
|
||||
// regexes use an initial "(?i)".
|
||||
//
|
||||
// (See also mlr.bnf where we specify which things can be backslash-escaped without a syntax error
|
||||
// at parse time.)
|
||||
//
|
||||
// * If the regex_string is of the form a.*b, compiles it case-sensitively.
|
||||
// * If the regex_string is of the form "a.*b", compiles a.*b case-sensitively.
|
||||
// * If the regex_string is of the form "a.*b"i, compiles a.*b case-insensitively.
|
||||
func CompileMillerRegex(regexString string) (*regexp.Regexp, error) {
|
||||
n := len(regexString)
|
||||
if n < 2 {
|
||||
return regexp.Compile(regexString)
|
||||
return regexpCompileCached(regexString)
|
||||
}
|
||||
|
||||
// TODO: rethink this. This will strip out things people have entered, e.g. "\"...\"".
|
||||
|
|
@ -68,20 +110,20 @@ func CompileMillerRegex(regexString string) (*regexp.Regexp, error) {
|
|||
// literals) and from verbs (like cut -r or having-fields).
|
||||
|
||||
if strings.HasPrefix(regexString, "\"") && strings.HasSuffix(regexString, "\"") {
|
||||
return regexp.Compile(regexString[1 : n-1])
|
||||
return regexpCompileCached(regexString[1 : n-1])
|
||||
}
|
||||
if strings.HasPrefix(regexString, "/") && strings.HasSuffix(regexString, "/") {
|
||||
return regexp.Compile(regexString[1 : n-1])
|
||||
return regexpCompileCached(regexString[1 : n-1])
|
||||
}
|
||||
|
||||
if strings.HasPrefix(regexString, "\"") && strings.HasSuffix(regexString, "\"i") {
|
||||
return regexp.Compile("(?i)" + regexString[1:n-2])
|
||||
return regexpCompileCached("(?i)" + regexString[1:n-2])
|
||||
}
|
||||
if strings.HasPrefix(regexString, "/") && strings.HasSuffix(regexString, "/i") {
|
||||
return regexp.Compile("(?i)" + regexString[1:n-2])
|
||||
return regexpCompileCached("(?i)" + regexString[1:n-2])
|
||||
}
|
||||
|
||||
return regexp.Compile(regexString)
|
||||
return regexpCompileCached(regexString)
|
||||
}
|
||||
|
||||
// CompileMillerRegexOrDie wraps CompileMillerRegex. Usually in Go we want to
|
||||
|
|
@ -110,7 +152,7 @@ func CompileMillerRegexesOrDie(regexStrings []string) []*regexp.Regexp {
|
|||
// In Go as in all languages I'm aware of with a string-split, "a,b,c" splits
|
||||
// on "," to ["a", "b", "c"] and "a" splits to ["a"], both of which are fine --
|
||||
// but "" splits to [""] when I wish it were []. This function does the latter.
|
||||
func RegexSplitString(regex *regexp.Regexp, input string, n int) []string {
|
||||
func RegexCompiledSplitString(regex *regexp.Regexp, input string, n int) []string {
|
||||
if input == "" {
|
||||
return make([]string, 0)
|
||||
} else {
|
||||
|
|
@ -118,193 +160,42 @@ func RegexSplitString(regex *regexp.Regexp, input string, n int) []string {
|
|||
}
|
||||
}
|
||||
|
||||
// MakeEmptyRegexCaptures is for initial CST state at the start of executing
|
||||
// the DSL expression for the current record. Even if '$x =~ "(..)_(...)"' set
|
||||
// "\1" and "\2" on the previous record, at start of processing for the current
|
||||
// record we need to start with a clean slate.
|
||||
func MakeEmptyRegexCaptures() []string {
|
||||
return nil
|
||||
}
|
||||
|
||||
// RegexReplacementHasCaptures is used by the CST builder to see if
|
||||
// string-literal is like "foo bar" or "foo \1 bar" -- in the latter case it
|
||||
// needs to retain the compiled offsets-matrix information.
|
||||
func RegexReplacementHasCaptures(
|
||||
replacement string,
|
||||
) (
|
||||
hasCaptures bool,
|
||||
matrix [][]int,
|
||||
) {
|
||||
if captureDetector.MatchString(replacement) {
|
||||
return true, captureSplitter.FindAllSubmatchIndex([]byte(replacement), -1)
|
||||
} else {
|
||||
return false, nil
|
||||
}
|
||||
}
|
||||
|
||||
// RegexMatches implements the =~ DSL operator. The captures are stored in DSL
|
||||
// state and may be used by a DSL statement after the =~. For example, in
|
||||
//
|
||||
// sub($a, "(..)_(...)", "\1:\2")
|
||||
//
|
||||
// the replacement string is an argument to sub and therefore the captures are
|
||||
// confined to the implementation of the sub function. Similarly for gsub. But
|
||||
// for the match operator, people can do
|
||||
//
|
||||
// if ($x =~ "(..)_(...)") {
|
||||
// ... other lines of code ...
|
||||
// $y = "\2:\1"
|
||||
// }
|
||||
//
|
||||
// and the =~ callsite doesn't know if captures will be used or not. So,
|
||||
// RegexMatches always returns the captures array. It is stored within the CST
|
||||
// state.
|
||||
func RegexMatches(
|
||||
input string,
|
||||
sregex string,
|
||||
) (
|
||||
matches bool,
|
||||
capturesOneUp []string,
|
||||
) {
|
||||
regex := CompileMillerRegexOrDie(sregex)
|
||||
return RegexMatchesCompiled(input, regex)
|
||||
}
|
||||
|
||||
// RegexMatchesCompiled is the implementation for the =~ operator. Without
|
||||
// Miller-style regex captures this would a simple one-line
|
||||
// regex.MatchString(input). However, we return the captures array for the
|
||||
// benefit of subsequent references to "\0".."\9".
|
||||
func RegexMatchesCompiled(
|
||||
input string,
|
||||
regex *regexp.Regexp,
|
||||
) (bool, []string) {
|
||||
matrix := regex.FindAllSubmatchIndex([]byte(input), -1)
|
||||
if matrix == nil || len(matrix) == 0 {
|
||||
// Set all captures to ""
|
||||
return false, make([]string, 10)
|
||||
}
|
||||
|
||||
// "\0" .. "\9"
|
||||
captures := make([]string, 10)
|
||||
|
||||
// If there are multiple matches -- e.g. input is
|
||||
//
|
||||
// "...ab_cde...fg_hij..."
|
||||
//
|
||||
// with regex
|
||||
//
|
||||
// "(..)_(...)"
|
||||
//
|
||||
// -- then we only consider the first match: boolean return value is true
|
||||
// (the input string matched the regex), and the captures array will map
|
||||
// "\1" to "ab" and "\2" to "cde".
|
||||
row := matrix[0]
|
||||
n := len(row)
|
||||
|
||||
// Example return value from FindAllSubmatchIndex with input
|
||||
// "...ab_cde...fg_hij..." and regex "(..)_(...)":
|
||||
//
|
||||
// Matrix is [][]int{
|
||||
// []int{3, 9, 3, 5, 6, 9},
|
||||
// []int{12, 18, 12, 14, 15, 18},
|
||||
// }
|
||||
//
|
||||
// As noted above we look at only the first row.
|
||||
//
|
||||
// * 3-9 is for the entire match "ab_cde"
|
||||
// * 3-5 is for the first capture "ab"
|
||||
// * 6-9 is for the second capture "cde"
|
||||
|
||||
di := 0
|
||||
for si := 0; si < n && di <= 9; si += 2 {
|
||||
start := row[si]
|
||||
end := row[si+1]
|
||||
if start >= 0 && end >= 0 {
|
||||
captures[di] = input[start:end]
|
||||
}
|
||||
di += 1
|
||||
}
|
||||
|
||||
return true, captures
|
||||
}
|
||||
|
||||
// InterpolateCaptures example:
|
||||
// - Input $x is "ab_cde"
|
||||
// - DSL expression
|
||||
// if ($x =~ "(..)_(...)") {
|
||||
// ... other lines of code ...
|
||||
// $y = "\2:\1";
|
||||
// }
|
||||
// - InterpolateCaptures is used on the evaluation of "\2:\1"
|
||||
// - replacementString is "\2:\1"
|
||||
// - replacementMatrix contains precomputed/cached offsets for the "\2" and
|
||||
// "\1" substrings within "\2:\1"
|
||||
// - captures has slot 0 being "ab_cde" (for "\0"), slot 1 being "ab" (for "\1"),
|
||||
// slot 2 being "cde" (for "\2"), and slots 3-9 being "".
|
||||
func InterpolateCaptures(
|
||||
replacementString string,
|
||||
replacementMatrix [][]int,
|
||||
captures []string,
|
||||
) string {
|
||||
if replacementMatrix == nil || captures == nil {
|
||||
return replacementString
|
||||
}
|
||||
var buffer bytes.Buffer
|
||||
|
||||
nonMatchStartIndex := 0
|
||||
|
||||
for _, row := range replacementMatrix {
|
||||
start := row[0]
|
||||
buffer.WriteString(replacementString[nonMatchStartIndex:row[0]])
|
||||
|
||||
// Map "\0".."\9" to integer index 0..9
|
||||
index := replacementString[start+1] - '0'
|
||||
buffer.WriteString(captures[index])
|
||||
|
||||
nonMatchStartIndex = row[1]
|
||||
}
|
||||
|
||||
buffer.WriteString(replacementString[nonMatchStartIndex:])
|
||||
|
||||
return buffer.String()
|
||||
}
|
||||
|
||||
// RegexSub implements the sub DSL function.
|
||||
func RegexSub(
|
||||
// RegexStringSub implements the sub DSL function.
|
||||
func RegexStringSub(
|
||||
input string,
|
||||
sregex string,
|
||||
replacement string,
|
||||
) string {
|
||||
regex := CompileMillerRegexOrDie(sregex)
|
||||
_, replacementCaptureMatrix := RegexReplacementHasCaptures(replacement)
|
||||
return RegexSubCompiled(input, regex, replacement, replacementCaptureMatrix)
|
||||
_, replacementCaptureMatrix := ReplacementHasCaptures(replacement)
|
||||
return RegexCompiledSub(input, regex, replacement, replacementCaptureMatrix)
|
||||
}
|
||||
|
||||
// RegexSubCompiled is the same as RegexSub but with compiled regex and
|
||||
// RegexCompiledSub is the same as RegexStringSub but with compiled regex and
|
||||
// replacement strings.
|
||||
func RegexSubCompiled(
|
||||
func RegexCompiledSub(
|
||||
input string,
|
||||
regex *regexp.Regexp,
|
||||
replacement string,
|
||||
replacementCaptureMatrix [][]int,
|
||||
) string {
|
||||
return regexSubGsubCompiled(input, regex, replacement, replacementCaptureMatrix, true)
|
||||
return regexCompiledSubOrGsub(input, regex, replacement, replacementCaptureMatrix, true)
|
||||
}
|
||||
|
||||
// RegexGsub implements the gsub DSL function.
|
||||
func RegexGsub(
|
||||
// RegexStringGsub implements the `gsub` DSL function.
|
||||
func RegexStringGsub(
|
||||
input string,
|
||||
sregex string,
|
||||
replacement string,
|
||||
) string {
|
||||
regex := CompileMillerRegexOrDie(sregex)
|
||||
_, replacementCaptureMatrix := RegexReplacementHasCaptures(replacement)
|
||||
return regexSubGsubCompiled(input, regex, replacement, replacementCaptureMatrix, false)
|
||||
_, replacementCaptureMatrix := ReplacementHasCaptures(replacement)
|
||||
return regexCompiledSubOrGsub(input, regex, replacement, replacementCaptureMatrix, false)
|
||||
}
|
||||
|
||||
// regexSubGsubCompiled is the implementation for sub/gsub with compilex regex
|
||||
// regexCompiledSubOrGsub is the implementation for `sub`/`gsub` with compiled regex
|
||||
// and replacement strings.
|
||||
func regexSubGsubCompiled(
|
||||
func regexCompiledSubOrGsub(
|
||||
input string,
|
||||
regex *regexp.Regexp,
|
||||
replacement string,
|
||||
|
|
@ -384,3 +275,177 @@ func regexSubGsubCompiled(
|
|||
buffer.WriteString(input[nonMatchStartIndex:])
|
||||
return buffer.String()
|
||||
}
|
||||
|
||||
// RegexStringMatchSimple is for simple boolean return without any substring captures.
|
||||
func RegexStringMatchSimple(
|
||||
input string,
|
||||
sregex string,
|
||||
) bool {
|
||||
regex := CompileMillerRegexOrDie(sregex)
|
||||
return RegexCompiledMatchSimple(input, regex)
|
||||
}
|
||||
|
||||
// RegexCompiledMatchSimple reports whether input contains any match of the
// already-compiled regex. It is for callers that need only a boolean result
// and no substring captures.
func RegexCompiledMatchSimple(
	input string,
	regex *regexp.Regexp,
) bool {
	// MatchString operates on the string directly, so the input is not copied
	// into a byte slice on every call.
	return regex.MatchString(input)
}
|
||||
|
||||
// RegexStringMatchWithCaptures implements the =~ DSL operator. The captures are stored in DSL
|
||||
// state and may be used by a DSL statement after the =~. For example, in
|
||||
//
|
||||
// sub($a, "(..)_(...)", "\1:\2")
|
||||
//
|
||||
// the replacement string is an argument to sub and therefore the captures are
|
||||
// confined to the implementation of the sub function. Similarly for gsub. But
|
||||
// for the match operator, people can do
|
||||
//
|
||||
// if ($x =~ "(..)_(...)") {
|
||||
// ... other lines of code ...
|
||||
// $y = "\2:\1"
|
||||
// }
|
||||
//
|
||||
// and the =~ callsite doesn't know if captures will be used or not. So,
|
||||
// RegexStringMatchWithCaptures always returns the captures array. It is stored within the CST
|
||||
// state.
|
||||
func RegexStringMatchWithCaptures(
|
||||
input string,
|
||||
sregex string,
|
||||
) (
|
||||
matches bool,
|
||||
capturesOneUp []string,
|
||||
) {
|
||||
regex := CompileMillerRegexOrDie(sregex)
|
||||
return RegexCompiledMatchWithCaptures(input, regex)
|
||||
}
|
||||
|
||||
// RegexCompiledMatchWithCaptures is the implementation for the =~ operator.
// Without Miller-style regex captures this would be a simple one-line
// regex.MatchString(input). However, we return the captures array for the
// benefit of subsequent references to "\0".."\9".
//
// The boolean return value is true if the input matched the regex. The
// returned slice always has length 10: slot 0 is the entire match ("\0") and
// slots 1-9 are the first nine capture groups ("\1".."\9"). Slots for groups
// that did not participate in the match, or that the regex does not have, are
// "". When there is no match at all, every slot is "".
func RegexCompiledMatchWithCaptures(
	input string,
	regex *regexp.Regexp,
) (bool, []string) {
	// "\0" .. "\9", all initially "".
	captures := make([]string, 10)

	// If there are multiple matches -- e.g. input is
	//
	//   "...ab_cde...fg_hij..."
	//
	// with regex
	//
	//   "(..)_(...)"
	//
	// -- then we only consider the first match: the captures array will map
	// "\1" to "ab" and "\2" to "cde". So we ask the regex library for the
	// first match only, rather than computing all of them and discarding the
	// rest.
	//
	// For that example the returned index row is
	//
	//   []int{3, 9, 3, 5, 6, 9}
	//
	// * 3-9 is for the entire match "ab_cde"
	// * 3-5 is for the first capture "ab"
	// * 6-9 is for the second capture "cde"
	row := regex.FindStringSubmatchIndex(input)
	if row == nil {
		return false, captures
	}

	// The row holds (start, end) pairs; pair number di feeds capture slot di.
	// Stop at whichever runs out first: the ten slots or the pairs.
	for di := 0; di < len(captures) && 2*di+1 < len(row); di++ {
		start, end := row[2*di], row[2*di+1]
		// A group which did not participate in the match has negative
		// offsets; its slot stays "".
		if start >= 0 && end >= 0 {
			captures[di] = input[start:end]
		}
	}

	return true, captures
}
|
||||
|
||||
// MakeEmptyCaptures is for initial CST state at the start of executing the DSL expression for the
|
||||
// current record. Even if '$x =~ "(..)_(...)"' set "\1" and "\2" on the previous record, at start
|
||||
// of processing for the current record we need to start with a clean slate. This is in support of
|
||||
// CST state, which `=~` semantics requires.
|
||||
func MakeEmptyCaptures() []string {
|
||||
return nil
|
||||
}
|
||||
|
||||
// ReplacementHasCaptures is used by the CST builder to see if string-literal is like "foo bar" or
|
||||
// "foo \1 bar" -- in the latter case it needs to retain the compiled offsets-matrix information.
|
||||
// This is in support of CST state, which `=~` semantics requires.
|
||||
func ReplacementHasCaptures(
|
||||
replacement string,
|
||||
) (
|
||||
hasCaptures bool,
|
||||
matrix [][]int,
|
||||
) {
|
||||
if captureDetector.MatchString(replacement) {
|
||||
return true, captureSplitter.FindAllSubmatchIndex([]byte(replacement), -1)
|
||||
} else {
|
||||
return false, nil
|
||||
}
|
||||
}
|
||||
|
||||
// InterpolateCaptures replaces each "\0".."\9" reference in replacementString
// with the corresponding entry of captures.
//
// Example:
//
//   - Input $x is "ab_cde"
//
//   - DSL expression
//     if ($x =~ "(..)_(...)") {
//     ... other lines of code ...
//     $y = "\2:\1";
//     }
//
//   - InterpolateCaptures is used on the evaluation of "\2:\1"
//
//   - replacementString is "\2:\1"
//
//   - replacementMatrix contains precomputed/cached offsets for the "\2" and
//     "\1" substrings within "\2:\1"
//
//   - captures has slot 0 being "ab_cde" (for "\0"), slot 1 being "ab" (for "\1"),
//     slot 2 being "cde" (for "\2"), and slots 3-9 being "".
//
// If replacementMatrix or captures is nil, replacementString is returned
// unmodified. Each row of replacementMatrix is read as a (start, end) byte
// range within replacementString, with the capture digit at start+1. A
// reference to a slot which captures does not have is interpolated as "".
func InterpolateCaptures(
	replacementString string,
	replacementMatrix [][]int,
	captures []string,
) string {
	if replacementMatrix == nil || captures == nil {
		return replacementString
	}
	var buffer bytes.Buffer

	nonMatchStartIndex := 0

	for _, row := range replacementMatrix {
		start, end := row[0], row[1]

		// Copy the literal text preceding this "\N" reference.
		buffer.WriteString(replacementString[nonMatchStartIndex:start])

		// Map "\0".."\9" to integer index 0..9. The subtraction is done on
		// bytes, so a non-digit character yields an out-of-range index, which
		// the bounds check below treats like a missing slot.
		index := int(replacementString[start+1] - '0')
		if index < len(captures) {
			buffer.WriteString(captures[index])
		}

		nonMatchStartIndex = end
	}

	// Copy whatever literal text follows the last reference.
	buffer.WriteString(replacementString[nonMatchStartIndex:])

	return buffer.String()
}
|
||||
|
|
|
|||
|
|
@ -88,7 +88,7 @@ var dataForMatches = []tDataForMatches{
|
|||
|
||||
func TestRegexReplacementHasCaptures(t *testing.T) {
|
||||
for i, entry := range dataForHasCaptures {
|
||||
actualHasCaptures, actualMatrix := RegexReplacementHasCaptures(entry.replacement)
|
||||
actualHasCaptures, actualMatrix := ReplacementHasCaptures(entry.replacement)
|
||||
if actualHasCaptures != entry.expectedHasCaptures {
|
||||
t.Fatalf("case %d replacement \"%s\" expected %v got %v\n",
|
||||
i, entry.replacement, entry.expectedHasCaptures, actualHasCaptures,
|
||||
|
|
@ -104,7 +104,7 @@ func TestRegexReplacementHasCaptures(t *testing.T) {
|
|||
|
||||
func TestRegexSub(t *testing.T) {
|
||||
for i, entry := range dataForSub {
|
||||
actualOutput := RegexSub(entry.input, entry.sregex, entry.replacement)
|
||||
actualOutput := RegexStringSub(entry.input, entry.sregex, entry.replacement)
|
||||
if actualOutput != entry.expectedOutput {
|
||||
t.Fatalf("case %d input \"%s\" sregex \"%s\" replacement \"%s\" expected \"%s\" got \"%s\"\n",
|
||||
i, entry.input, entry.sregex, entry.replacement, entry.expectedOutput, actualOutput,
|
||||
|
|
@ -115,7 +115,7 @@ func TestRegexSub(t *testing.T) {
|
|||
|
||||
func TestRegexGsub(t *testing.T) {
|
||||
for i, entry := range dataForGsub {
|
||||
actualOutput := RegexGsub(entry.input, entry.sregex, entry.replacement)
|
||||
actualOutput := RegexStringGsub(entry.input, entry.sregex, entry.replacement)
|
||||
if actualOutput != entry.expectedOutput {
|
||||
t.Fatalf("case %d input \"%s\" sregex \"%s\" replacement \"%s\" expected \"%s\" got \"%s\"\n",
|
||||
i, entry.input, entry.sregex, entry.replacement, entry.expectedOutput, actualOutput,
|
||||
|
|
@ -126,7 +126,7 @@ func TestRegexGsub(t *testing.T) {
|
|||
|
||||
func TestRegexMatches(t *testing.T) {
|
||||
for i, entry := range dataForMatches {
|
||||
actualOutput, actualCaptures := RegexMatches(entry.input, entry.sregex)
|
||||
actualOutput, actualCaptures := RegexStringMatchWithCaptures(entry.input, entry.sregex)
|
||||
if actualOutput != entry.expectedOutput {
|
||||
t.Fatalf("case %d input \"%s\" sregex \"%s\" expected %v got %v\n",
|
||||
i, entry.input, entry.sregex, entry.expectedOutput, actualOutput,
|
||||
|
|
|
|||
|
|
@ -43,8 +43,8 @@ func NewEmptyState(options *cli.TOptions, strictMode bool) *State {
|
|||
|
||||
// OutputRecordsAndContexts is assigned after construction
|
||||
|
||||
// See lib.MakeEmptyRegexCaptures for context.
|
||||
RegexCaptures: lib.MakeEmptyRegexCaptures(),
|
||||
// See lib.MakeEmptyCaptures for context.
|
||||
RegexCaptures: lib.MakeEmptyCaptures(),
|
||||
Options: options,
|
||||
|
||||
StrictMode: strictMode,
|
||||
|
|
@ -57,5 +57,5 @@ func (state *State) Update(
|
|||
) {
|
||||
state.Inrec = inrec
|
||||
state.Context = context
|
||||
state.RegexCaptures = lib.MakeEmptyRegexCaptures()
|
||||
state.RegexCaptures = lib.MakeEmptyCaptures()
|
||||
}
|
||||
|
|
|
|||
|
|
@ -479,7 +479,7 @@ func (tr *TransformerMergeFields) transformByCollapsing(
|
|||
matched = valueFieldNameRegex.MatchString(pe.Key)
|
||||
if matched {
|
||||
// TODO: comment re matrix
|
||||
shortName = lib.RegexSubCompiled(valueFieldName, valueFieldNameRegex, "", nil)
|
||||
shortName = lib.RegexCompiledSub(valueFieldName, valueFieldNameRegex, "", nil)
|
||||
break
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -169,7 +169,7 @@ func NewTransformerRename(
|
|||
regexString := pe.Key
|
||||
regex := lib.CompileMillerRegexOrDie(regexString)
|
||||
replacement := pe.Value.(string)
|
||||
_, replacementCaptureMatrix := lib.RegexReplacementHasCaptures(replacement)
|
||||
_, replacementCaptureMatrix := lib.ReplacementHasCaptures(replacement)
|
||||
regexAndReplacement := tRegexAndReplacement{
|
||||
regex: regex,
|
||||
replacement: replacement,
|
||||
|
|
@ -241,7 +241,7 @@ func (tr *TransformerRename) transformWithRegexes(
|
|||
inrec.Rename(oldName, newName)
|
||||
}
|
||||
} else {
|
||||
newName := lib.RegexSubCompiled(oldName, regex, replacement, replacementCaptureMatrix)
|
||||
newName := lib.RegexCompiledSub(oldName, regex, replacement, replacementCaptureMatrix)
|
||||
if newName != oldName {
|
||||
inrec.Rename(oldName, newName)
|
||||
}
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue