miller/pkg/lib/regex.go
John Kerl b4ff26a7d0
Static-check fixes from @lespea #1657, batch 6/n (#1708)
* Static-check fixes from @lespea #1657, batch 2/n

* Static-check fixes from @lespea #1657, batch 3/n

* Static-check fixes from @lespea #1657, batch 4/n

* Static-check fixes from @lespea #1657, batch 5/n

* Static-check fixes from @lespea #1657, batch 6/n
2024-10-27 12:06:17 -04:00

528 lines
16 KiB
Go

// ================================================================
// Support for regular expressions in Miller.
//
// * By and large we use the Go library.
//
// * There is (for historical reasons) a DSL syntax "[a-z]"i (note the trailing i)
// for case-insensitive regular expressions which we map into Go syntax for
// regex-compilation.
//
// * Also for historical reasons, we allow things like
// if ($x =~ "(..)_(...)") {
// ... other lines of code ...
// $y = "\2:\1";
// }
// where the '=~' sets the captures and the "\2:\1" uses them. (Note that
// https://github.com/johnkerl/miller/issues/388 has a better suggestion which would make the
// captures explicit as variables, rather than implicit within CST state: this is implemented by
// the `strmatch` and `strmatchx` DSL functions. Regardless, the `=~` syntax will still be supported
// for backward compatibility and so is here to stay.) Here we make use of Go regexp-library
// functions to write to, and then later interpolate from, a captures array which is stored within
// CST state. (See the `runtime.State` object.)
//
// * "\0" is for a full match; "\1" .. "\9" are for submatch cqptures. E.g.
// if $x is "foobarbaz" and the regex is "foo(.)(..)baz", then "\0" is
// "foobarbaz", "\1" is "b", "\2" is "ar", and "\3".."\9" are "".
//
// * Naming:
//
// o "regexp" and "Regexp" are used for the Go library and its data structure, respectively;
//
// o "regex" is used for regular-expression strings following Miller's idiosyncratic syntax and
// semantics as described above.
//
// ================================================================
package lib
import (
"bytes"
"fmt"
"os"
"regexp"
"strings"
"sync"
)
// captureDetector is used to see if a string literal interpolates previous
// captures (like "\2:\1") or not (like "2:1").
var captureDetector = regexp.MustCompile(`\\[0-9]`)
// captureSplitter is used to precompute an offsets matrix for strings like
// "\2:\1" so they don't need to be recomputed on every record.
var captureSplitter = regexp.MustCompile(`(\\[0-9])`)
// See regexpCompileCached
var regexpCache map[string]*regexp.Regexp
const cacheMaxSize = 1000
var cacheMutex sync.Mutex
// regexpCompileCached keeps a cache of compiled regexes, so that the caller has the flexibility to
// only pass in strings while getting the benefits of compilation avoidance.
//
// Regarding cache size: in nominal use, regexp strings are within Miller DSL code statements, and
// there will be a handful. These will all get re-used after their first application, and the cache
// will remain bounded by the size of the user's DSL code. However, it is possible to have regex
// strings contained within Miller record-field data.
//
// We could solve this by using an LRU cache. However, for simplicity, we limit the number of
// cached compiles, and for any extras that appear during record processing, we simply recompile
// each time.
func regexpCompileCached(s string) (*regexp.Regexp, error) {
if len(regexpCache) > cacheMaxSize {
return regexp.Compile(s)
}
r, err := regexp.Compile(s)
if err == nil {
cacheMutex.Lock()
if regexpCache == nil {
regexpCache = make(map[string]*regexp.Regexp)
}
regexpCache[s] = r
cacheMutex.Unlock()
}
return r, err
}
// CompileMillerRegex wraps Go regex-compile with some Miller-specific syntax which predates the
// port of Miller from C to Go. Miller regexes use a final 'i' to indicate case-insensitivity; Go
// regexes use an initial "(?i)".
//
// (See also mlr.bnf where we specify which things can be backslash-escaped without a syntax error
// at parse time.)
//
// * If the regex_string is of the form a.*b, compiles it case-sensitively.
// * If the regex_string is of the form "a.*b", compiles a.*b case-sensitively.
// * If the regex_string is of the form "a.*b"i, compiles a.*b case-insensitively.
func CompileMillerRegex(regexString string) (*regexp.Regexp, error) {
n := len(regexString)
if n < 2 {
return regexpCompileCached(regexString)
}
// TODO: rethink this. This will strip out things people have entered, e.g. "\"...\"".
// The parser-to-AST will have stripped the outer and we'll strip the inner and the
// user's intent will be lost.
//
// TODO: make separate functions for calling from parser-to-AST (string
// literals) and from verbs (like cut -r or having-fields).
if strings.HasPrefix(regexString, "\"") && strings.HasSuffix(regexString, "\"") {
return regexpCompileCached(regexString[1 : n-1])
}
if strings.HasPrefix(regexString, "/") && strings.HasSuffix(regexString, "/") {
return regexpCompileCached(regexString[1 : n-1])
}
if strings.HasPrefix(regexString, "\"") && strings.HasSuffix(regexString, "\"i") {
return regexpCompileCached("(?i)" + regexString[1:n-2])
}
if strings.HasPrefix(regexString, "/") && strings.HasSuffix(regexString, "/i") {
return regexpCompileCached("(?i)" + regexString[1:n-2])
}
return regexpCompileCached(regexString)
}
// CompileMillerRegexOrDie wraps CompileMillerRegex. Usually in Go we want to
// return a second error argument rather than fataling. However, if there's a
// malformed regex we really cannot continue so it's simpler to just fatal.
func CompileMillerRegexOrDie(regexString string) *regexp.Regexp {
regex, err := CompileMillerRegex(regexString)
if err != nil {
fmt.Fprint(os.Stderr, err)
os.Exit(1)
}
return regex
}
// CompileMillerRegexesOrDie is a convenenience looper over CompileMillerRegexOrDie.
func CompileMillerRegexesOrDie(regexStrings []string) []*regexp.Regexp {
regexes := make([]*regexp.Regexp, len(regexStrings))
for i, regexString := range regexStrings {
regexes[i] = CompileMillerRegexOrDie(regexString)
}
return regexes
}
// In Go as in all languages I'm aware of with a string-split, "a,b,c" splits
// on "," to ["a", "b", "c" and "a" splits to ["a"], both of which are fine --
// but "" splits to [""] when I wish it were []. This function does the latter.
func RegexCompiledSplitString(regex *regexp.Regexp, input string, n int) []string {
if input == "" {
return make([]string, 0)
} else {
return regex.Split(input, n)
}
}
// RegexStringSub implements the sub DSL function.
func RegexStringSub(
input string,
sregex string,
replacement string,
) string {
regex := CompileMillerRegexOrDie(sregex)
_, replacementCaptureMatrix := ReplacementHasCaptures(replacement)
return RegexCompiledSub(input, regex, replacement, replacementCaptureMatrix)
}
// RegexCompiledSub is the same as RegexStringSub but with compiled regex and
// replacement strings.
func RegexCompiledSub(
input string,
regex *regexp.Regexp,
replacement string,
replacementCaptureMatrix [][]int,
) string {
return regexCompiledSubOrGsub(input, regex, replacement, replacementCaptureMatrix, true)
}
// RegexStringGsub implements the `gsub` DSL function.
func RegexStringGsub(
input string,
sregex string,
replacement string,
) string {
regex := CompileMillerRegexOrDie(sregex)
_, replacementCaptureMatrix := ReplacementHasCaptures(replacement)
return regexCompiledSubOrGsub(input, regex, replacement, replacementCaptureMatrix, false)
}
// regexCompiledSubOrGsub is the implementation for `sub`/`gsub` with compilex regex
// and replacement strings.
func regexCompiledSubOrGsub(
input string,
regex *regexp.Regexp,
replacement string,
replacementCaptureMatrix [][]int,
breakOnFirst bool,
) string {
matrix := regex.FindAllStringSubmatchIndex(input, -1)
if len(matrix) == 0 {
return input
}
// Example return value from FindAllSubmatchIndex with input
// "...ab_cde...fg_hij..." and regex "(..)_(...)":
//
// Matrix is [][]int{
// []int{3, 9, 3, 5, 6, 9},
// []int{12, 18, 12, 14, 15, 18},
// }
//
// * 3-9 is for the entire match "ab_cde"
// * 3-5 is for the first capture "ab"
// * 6-9 is for the second capture "cde"
//
// * 12-18 is for the entire match "fg_hij"
// * 12-14 is for the first capture "fg"
// * 15-18 is for the second capture "hij"
var buffer bytes.Buffer
nonMatchStartIndex := 0
for _, row := range matrix {
buffer.WriteString(input[nonMatchStartIndex:row[0]])
// "\0" .. "\9"
captures := make([]string, 10)
di := 0
n := len(row)
for si := 0; si < n && di <= 9; si += 2 {
start := row[si]
end := row[si+1]
if start >= 0 && end >= 0 {
captures[di] = input[start:end]
}
di += 1
}
// If the replacement had no captures, e.g. "xyz", we would insert it
//
// "..." -> "..."
// "ab_cde" -> "xyz" --- here
// "..." -> "..."
// "fg_hij" -> "xyz" --- and here
// "..." -> "..."
//
// using buffer.WriteString(replacement). However, this function exists
// to handle the case when the replacement string has captures like
// "\2:\1", so we need to produce
//
// "..." -> "..."
// "ab_cde" -> "cde:ab" --- here
// "..." -> "..."
// "fg_hij" -> "hij:fg" --- and here
// "..." -> "..."
updatedReplacement := InterpolateCaptures(
replacement,
replacementCaptureMatrix,
captures,
)
buffer.WriteString(updatedReplacement)
nonMatchStartIndex = row[1]
if breakOnFirst {
break
}
}
buffer.WriteString(input[nonMatchStartIndex:])
return buffer.String()
}
// RegexStringMatchSimple is for simple boolean return without any substring captures.
func RegexStringMatchSimple(
input string,
sregex string,
) bool {
regex := CompileMillerRegexOrDie(sregex)
return RegexCompiledMatchSimple(input, regex)
}
// RegexCompiledMatchSimple is for simple boolean return without any substring captures.
func RegexCompiledMatchSimple(
input string,
regex *regexp.Regexp,
) bool {
return regex.MatchString(input)
}
// RegexStringMatchWithMapResults implements much of the `strmatchx` DSL function. This returns
// captures via return values. This is distinct from RegexStringMatchWithCaptures which is for the
// `=~` DSL operator.
func RegexStringMatchWithMapResults(
input string,
sregex string,
) (
matches bool,
captures []string,
starts []int,
ends []int,
) {
regex := CompileMillerRegexOrDie(sregex)
return RegexCompiledMatchWithMapResults(input, regex)
}
// RegexCompiledMatchWithMapResults does the work for RegexStringMatchWithMapResults once
// a compiled regexp is available. Array slot 0 is for the full match; slots 1 and up
// are for the capture-matches such as "\([0-9]+\):\([a-z]+\)".
func RegexCompiledMatchWithMapResults(
input string,
regex *regexp.Regexp,
) (bool, []string, []int, []int) {
captures := make([]string, 0, 10)
starts := make([]int, 0, 10)
ends := make([]int, 0, 10)
matrix := regex.FindAllStringSubmatchIndex(input, -1)
if len(matrix) == 0 {
return false, captures, starts, ends
}
// If there are multiple matches -- e.g. input is
//
// "...ab_cde...fg_hij..."
//
// with regex
//
// "(..)_(...)"
//
// -- then we only consider the first match: boolean return value is true
// (the input string matched the regex), and the captures array will map
// slot 1 to "ab" and slot 2 to "cde".
row := matrix[0]
n := len(row)
// Example return value from FindAllSubmatchIndex with input
// "...ab_cde...fg_hij..." and regex "(..)_(...)":
//
// Matrix is [][]int{
// []int{3, 9, 3, 5, 6, 9},
// []int{12, 18, 12, 14, 15, 18},
// }
//
// As noted above we look at only the first row.
//
// * 3-9 is for the entire match "ab_cde"
// * 3-5 is for the first capture "ab"
// * 6-9 is for the second capture "cde"
for si := 0; si < n; si += 2 {
start := row[si]
end := row[si+1]
if start >= 0 && end >= 0 {
captures = append(captures, input[start:end])
starts = append(starts, start+1)
ends = append(ends, end)
} else {
captures = append(captures, "")
starts = append(starts, -1)
ends = append(ends, -1)
}
}
return true, captures, starts, ends
}
// RegexStringMatchWithCaptures implements the =~ DSL operator. The captures are stored in DSL
// state and may be used by a DSL statement after the =~. For example, in
//
// sub($a, "(..)_(...)", "\1:\2")
//
// the replacement string is an argument to sub and therefore the captures are
// confined to the implementation of the sub function. Similarly for gsub. But
// for the match operator, people can do
//
// if ($x =~ "(..)_(...)") {
// ... other lines of code ...
// $y = "\2:\1"
// }
//
// and the =~ callsite doesn't know if captures will be used or not. So,
// RegexStringMatchWithCaptures always returns the captures array. It is stored within the CST
// state.
func RegexStringMatchWithCaptures(
input string,
sregex string,
) (
matches bool,
capturesOneUp []string,
) {
regex := CompileMillerRegexOrDie(sregex)
return RegexCompiledMatchWithCaptures(input, regex)
}
// RegexCompiledMatchWithCaptures is the implementation for the =~ operator. Without
// Miller-style regex captures this would a simple one-line
// regex.MatchString(input). However, we return the captures array for the
// benefit of subsequent references to "\0".."\9".
func RegexCompiledMatchWithCaptures(
input string,
regex *regexp.Regexp,
) (bool, []string) {
matrix := regex.FindAllStringSubmatchIndex(input, -1)
if len(matrix) == 0 {
// Set all captures to ""
return false, make([]string, 10)
}
// "\0" .. "\9"
captures := make([]string, 10)
// If there are multiple matches -- e.g. input is
//
// "...ab_cde...fg_hij..."
//
// with regex
//
// "(..)_(...)"
//
// -- then we only consider the first match: boolean return value is true
// (the input string matched the regex), and the captures array will map
// "\1" to "ab" and "\2" to "cde".
row := matrix[0]
n := len(row)
// Example return value from FindAllSubmatchIndex with input
// "...ab_cde...fg_hij..." and regex "(..)_(...)":
//
// Matrix is [][]int{
// []int{3, 9, 3, 5, 6, 9},
// []int{12, 18, 12, 14, 15, 18},
// }
//
// As noted above we look at only the first row.
//
// * 3-9 is for the entire match "ab_cde"
// * 3-5 is for the first capture "ab"
// * 6-9 is for the second capture "cde"
di := 0
for si := 0; si < n && di <= 9; si += 2 {
start := row[si]
end := row[si+1]
if start >= 0 && end >= 0 {
captures[di] = input[start:end]
}
di += 1
}
return true, captures
}
// MakeEmptyCaptures is for initial CST state at the start of executing the DSL expression for the
// current record. Even if '$x =~ "(..)_(...)" set "\1" and "\2" on the previous record, at start
// of processing for the current record we need to start with a clean slate. This is in support of
// CST state, which `=~` semantics requires.
func MakeEmptyCaptures() []string {
return nil
}
// ReplacementHasCaptures is used by the CST builder to see if string-literal is like "foo bar" or
// "foo \1 bar" -- in the latter case it needs to retain the compiled offsets-matrix information.
// This is in support of CST state, which `=~` semantics requires.
func ReplacementHasCaptures(
replacement string,
) (
hasCaptures bool,
matrix [][]int,
) {
if captureDetector.MatchString(replacement) {
return true, captureSplitter.FindAllStringSubmatchIndex(replacement, -1)
} else {
return false, nil
}
}
// InterpolateCaptures example:
//
// * Input $x is "ab_cde"
//
// - DSL expression
// if ($x =~ "(..)_(...)") {
// ... other lines of code ...
// $y = "\2:\1";
// }
//
// * InterpolateCaptures is used on the evaluation of "\2:\1"
//
// * replacementString is "\2:\1"
//
// - replacementMatrix contains precomputed/cached offsets for the "\2" and
// "\1" substrings within "\2:\1"
//
// - captures has slot 0 being "ab_cde" (for "\0"), slot 1 being "ab" (for "\1"),
// slot 2 being "cde" (for "\2"), and slots 3-9 being "".
func InterpolateCaptures(
replacementString string,
replacementMatrix [][]int,
captures []string,
) string {
if replacementMatrix == nil || captures == nil {
return replacementString
}
var buffer bytes.Buffer
nonMatchStartIndex := 0
for _, row := range replacementMatrix {
start := row[0]
buffer.WriteString(replacementString[nonMatchStartIndex:row[0]])
// Map "\0".."\9" to integer index 0..9
index := replacementString[start+1] - '0'
buffer.WriteString(captures[index])
nonMatchStartIndex = row[1]
}
buffer.WriteString(replacementString[nonMatchStartIndex:])
return buffer.String()
}