miller/pkg/bifs/strings.go
Adam Lesperance 085e831668
The package version must match the major tag version (#1654)
* Update package version

* Update makefile targets

* Update readme packages

* Remaining old packages via rg/sd
2024-09-20 12:10:11 -04:00

602 lines
18 KiB
Go

package bifs
import (
"bytes"
"fmt"
"regexp"
"strconv"
"strings"
"github.com/johnkerl/miller/v6/pkg/lib"
"github.com/johnkerl/miller/v6/pkg/mlrval"
)
// ================================================================
// BIF_strlen returns the length of its argument as computed by
// lib.UTF8Strlen. An argument that is neither a string nor void produces a
// unary type error.
func BIF_strlen(input1 *mlrval.Mlrval) *mlrval.Mlrval {
	if input1.IsStringOrVoid() {
		return mlrval.FromInt(lib.UTF8Strlen(input1.AcquireStringValue()))
	}
	return mlrval.FromTypeErrorUnary("strlen", input1)
}
// ================================================================
// BIF_string renders its argument with String() and hands the resulting text
// to mlrval.FromString.
func BIF_string(input1 *mlrval.Mlrval) *mlrval.Mlrval {
	text := input1.String()
	return mlrval.FromString(text)
}
// ================================================================
// Dot operator, with loose typecasting.
//
// For most operations, I don't like loose typecasting -- for example, in PHP
// "10" + 2 is the number 12 and in JavaScript it's the string "102", and I
// find both of those horrid and error-prone. In Miller, "10"+2 is MT_ERROR, by
// design, unless intentional casting is done like '$x=int("10")+2'.
//
// However, for dotting, in practice I tipped over and allowed dotting of
// strings and ints: so while "10" + 2 is an error in Miller, '"10". 2' is
// "102". Unlike with "+", with "." there is no ambiguity about what the output
// should be: always the string concatenation of the string representations of
// the two arguments. So, we do the string-cast for the user.
func dot_s_xx(input1, input2 *mlrval.Mlrval) *mlrval.Mlrval {
	// Both operands are rendered through String() and joined left to right.
	left := input1.String()
	right := input2.String()
	return mlrval.FromString(left + right)
}
// dot_te is the disposition-table entry for operand-type pairs that the dot
// operator rejects: it returns a binary type error naming the "." operator.
func dot_te(input1, input2 *mlrval.Mlrval) *mlrval.Mlrval {
	const opName = "."
	return mlrval.FromTypeErrorBinary(opName, input1, input2)
}
// dot_dispositions is the dispatch table for the dot operator. BIF_dot indexes
// it first by the type of the left operand (row) and then by the type of the
// right operand (column), and calls the function stored in that cell. The row
// and column labels in the comments below name the operand types.
var dot_dispositions = [mlrval.MT_DIM][mlrval.MT_DIM]BinaryFunc{
// . INT FLOAT BOOL VOID STRING ARRAY MAP FUNC ERROR NULL ABSENT
/*INT */ {dot_s_xx, dot_s_xx, dot_s_xx, _s1__, dot_s_xx, dot_te, dot_te, dot_te, dot_te, _1___, _s1__},
/*FLOAT */ {dot_s_xx, dot_s_xx, dot_s_xx, _s1__, dot_s_xx, dot_te, dot_te, dot_te, dot_te, _1___, _s1__},
/*BOOL */ {dot_s_xx, dot_s_xx, dot_s_xx, _s1__, dot_s_xx, dot_te, dot_te, dot_te, dot_te, _1___, _s1__},
/*VOID */ {_s2__, _s2__, _s2__, _void, _2___, _absn, _absn, dot_te, dot_te, _void, _void},
/*STRING */ {dot_s_xx, dot_s_xx, dot_s_xx, _1___, dot_s_xx, dot_te, dot_te, dot_te, dot_te, _1___, _1___},
/*ARRAY */ {dot_te, dot_te, dot_te, dot_te, dot_te, dot_te, dot_te, dot_te, dot_te, dot_te, dot_te},
/*MAP */ {dot_te, dot_te, dot_te, dot_te, dot_te, dot_te, dot_te, dot_te, dot_te, dot_te, dot_te},
/*FUNC */ {dot_te, dot_te, dot_te, dot_te, dot_te, dot_te, dot_te, dot_te, dot_te, dot_te, dot_te},
/*ERROR */ {dot_te, dot_te, dot_te, dot_te, dot_te, _absn, _absn, dot_te, dot_te, dot_te, dot_te},
/*NULL */ {_s2__, _s2__, _s2__, _void, _2___, _absn, _absn, dot_te, dot_te, _null, _null},
/*ABSENT */ {_s2__, _s2__, _s2__, _void, _2___, _absn, _absn, dot_te, dot_te, _null, _absn},
}
// BIF_dot implements the dot operator: it looks up the handler registered in
// dot_dispositions for the (input1 type, input2 type) pair and invokes it.
func BIF_dot(input1, input2 *mlrval.Mlrval) *mlrval.Mlrval {
	handler := dot_dispositions[input1.Type()][input2.Type()]
	return handler(input1, input2)
}
// ================================================================
// substr1(s,m,n) gives substring of s from 1-up position m to n inclusive.
// Negative indices -len .. -1 alias to 0 .. len-1.
// An absent input is returned as absent and an error input becomes a type
// error; any other input is sliced by runes (not bytes) on its String() form.
func BIF_substr_1_up(input1, input2, input3 *mlrval.Mlrval) *mlrval.Mlrval {
	switch {
	case input1.IsAbsent():
		return mlrval.ABSENT
	case input1.IsError():
		return mlrval.FromTypeErrorUnary("substr1", input1)
	}

	// Work on runes so that positions count characters rather than bytes.
	chars := []rune(input1.String())

	// MillerSliceAccess resolves the two user-supplied indices against the
	// rune count. The trailing false is presumably its zero-up flag; the
	// substr0 variant passes true.
	isEmpty, errOrAbsent, lo, hi := MillerSliceAccess(input2, input3, len(chars), false)
	if isEmpty {
		return mlrval.VOID
	}
	if errOrAbsent != nil {
		return errOrAbsent
	}

	// Go excludes the upper bound of a slice expression, whereas the upper
	// index here is inclusive, hence the +1.
	return mlrval.FromString(string(chars[lo : hi+1]))
}
// ================================================================
// substr0(s,m,n) gives substring of s from 0-up position m to n inclusive.
// Negative indices -len .. -1 alias to 0 .. len-1.
// An absent input is returned as absent and an error input becomes a type
// error; any other input is sliced by runes (not bytes) on its String() form.
func BIF_substr_0_up(input1, input2, input3 *mlrval.Mlrval) *mlrval.Mlrval {
	switch {
	case input1.IsAbsent():
		return mlrval.ABSENT
	case input1.IsError():
		return mlrval.FromTypeErrorUnary("substr0", input1)
	}

	// Work on runes so that positions count characters rather than bytes.
	chars := []rune(input1.String())

	// MillerSliceAccess resolves the two user-supplied indices against the
	// rune count. The trailing true is presumably its zero-up flag; the
	// substr1 variant passes false.
	isEmpty, errOrAbsent, lo, hi := MillerSliceAccess(input2, input3, len(chars), true)
	if isEmpty {
		return mlrval.VOID
	}
	if errOrAbsent != nil {
		return errOrAbsent
	}

	// Go excludes the upper bound of a slice expression, whereas the upper
	// index here is inclusive, hence the +1.
	return mlrval.FromString(string(chars[lo : hi+1]))
}
// ================================================================
// index(string, substring) returns the index of substring within string (if found), or -1 if not
// found.
// The reported position is 1-up and counted via lib.UTF8Strlen rather than in
// bytes. An absent first argument is returned as absent; an error first
// argument becomes a type error.
func BIF_index(input1, input2 *mlrval.Mlrval) *mlrval.Mlrval {
	if input1.IsAbsent() {
		return mlrval.ABSENT
	}
	if input1.IsError() {
		return mlrval.FromTypeErrorUnary("index", input1)
	}

	haystack := input1.String()
	needle := input2.String()

	// strings.Index reports a byte offset, or -1 when there is no match.
	byteOffset := strings.Index(haystack, needle)
	if byteOffset >= 0 {
		// Measure the text preceding the match with lib.UTF8Strlen, then
		// shift from Go's 0-up convention to Miller's 1-up convention.
		return mlrval.FromInt(lib.UTF8Strlen(haystack[:byteOffset]) + 1)
	}
	return mlrval.FromInt(int64(byteOffset))
}
// ================================================================
// contains(string, substring) returns true if string contains substring, else false.
// Both arguments are compared via their String() forms. An absent first
// argument is returned as absent; an error first argument is returned as-is.
func BIF_contains(input1, input2 *mlrval.Mlrval) *mlrval.Mlrval {
	switch {
	case input1.IsAbsent():
		return mlrval.ABSENT
	case input1.IsError():
		return input1
	}
	haystack := input1.String()
	needle := input2.String()
	return mlrval.FromBool(strings.Contains(haystack, needle))
}
// ================================================================
// BIF_truncate shortens input1 to at most input2 characters, counting runes
// rather than bytes. Error or absent arguments are returned as-is (input1 is
// checked first). A non-string/non-void input1, or an input2 that is not a
// non-negative integer, yields a type error. An input that already fits is
// returned unchanged.
func BIF_truncate(input1, input2 *mlrval.Mlrval) *mlrval.Mlrval {
	switch {
	case input1.IsErrorOrAbsent():
		return input1
	case input2.IsErrorOrAbsent():
		return input2
	case !input1.IsStringOrVoid():
		return mlrval.FromTypeErrorUnary("truncate", input1)
	case !input2.IsInt() || input2.AcquireIntValue() < 0:
		return mlrval.FromTypeErrorUnary("truncate", input2)
	}

	// Convert to runes so that the cut never lands inside a multi-byte
	// character.
	chars := []rune(input1.AcquireStringValue())
	limit := int(input2.AcquireIntValue())
	if len(chars) > limit {
		return mlrval.FromString(string(chars[:limit]))
	}
	return input1
}
// ================================================================
// BIF_leftpad prepends whole copies of the pad string (input3) to the String()
// form of input1 for as long as the result, measured with lib.UTF8Strlen,
// stays within the target length (input2). The pad string is never split, so
// the output may be shorter than the target when the pad is longer than one
// character.
//
// Error or absent arguments are returned as-is, checked in argument order. A
// non-integer target length yields a type error. An empty pad string cannot
// lengthen the output, so the input text is returned unpadded.
func BIF_leftpad(input1, input2, input3 *mlrval.Mlrval) *mlrval.Mlrval {
	if input1.IsErrorOrAbsent() {
		return input1
	}
	if input2.IsErrorOrAbsent() {
		return input2
	}
	if input3.IsErrorOrAbsent() {
		return input3
	}
	if !input2.IsInt() {
		return mlrval.FromTypeErrorUnary("leftpad", input2)
	}

	inputString := input1.String()
	padString := input3.String()
	inputLength := lib.UTF8Strlen(inputString)
	padLength := lib.UTF8Strlen(padString)

	// With a zero-length pad the loop below would make no progress and would
	// spin forever whenever the target is at least the input length.
	if padLength <= 0 {
		return mlrval.FromString(inputString)
	}

	targetLength := input2.AcquireIntValue()
	outputLength := inputLength

	var buffer bytes.Buffer
	for outputLength+padLength <= targetLength {
		buffer.WriteString(padString)
		outputLength += padLength
	}
	buffer.WriteString(inputString)
	return mlrval.FromString(buffer.String())
}
// BIF_rightpad appends whole copies of the pad string (input3) to the String()
// form of input1 for as long as the result, measured with lib.UTF8Strlen,
// stays within the target length (input2). The pad string is never split, so
// the output may be shorter than the target when the pad is longer than one
// character.
//
// Error or absent arguments are returned as-is, checked in argument order. A
// non-integer target length yields a type error. An empty pad string cannot
// lengthen the output, so the input text is returned unpadded.
func BIF_rightpad(input1, input2, input3 *mlrval.Mlrval) *mlrval.Mlrval {
	if input1.IsErrorOrAbsent() {
		return input1
	}
	if input2.IsErrorOrAbsent() {
		return input2
	}
	if input3.IsErrorOrAbsent() {
		return input3
	}
	if !input2.IsInt() {
		return mlrval.FromTypeErrorUnary("rightpad", input2)
	}

	inputString := input1.String()
	padString := input3.String()
	inputLength := lib.UTF8Strlen(inputString)
	padLength := lib.UTF8Strlen(padString)

	// With a zero-length pad the loop below would make no progress and would
	// spin forever whenever the target is at least the input length.
	if padLength <= 0 {
		return mlrval.FromString(inputString)
	}

	targetLength := input2.AcquireIntValue()
	outputLength := inputLength

	var buffer bytes.Buffer
	buffer.WriteString(inputString)
	for outputLength+padLength <= targetLength {
		buffer.WriteString(padString)
		outputLength += padLength
	}
	return mlrval.FromString(buffer.String())
}
// ================================================================
// BIF_lstrip removes leading spaces and tabs from a string argument.
// Non-string arguments are returned unchanged.
func BIF_lstrip(input1 *mlrval.Mlrval) *mlrval.Mlrval {
	if !input1.IsString() {
		return input1
	}
	trimmed := strings.TrimLeft(input1.AcquireStringValue(), " \t")
	return mlrval.FromString(trimmed)
}
// BIF_rstrip removes trailing spaces and tabs from a string argument.
// Non-string arguments are returned unchanged.
func BIF_rstrip(input1 *mlrval.Mlrval) *mlrval.Mlrval {
	if !input1.IsString() {
		return input1
	}
	trimmed := strings.TrimRight(input1.AcquireStringValue(), " \t")
	return mlrval.FromString(trimmed)
}
// BIF_strip removes leading and trailing spaces and tabs from a string
// argument. Non-string arguments are returned unchanged.
func BIF_strip(input1 *mlrval.Mlrval) *mlrval.Mlrval {
	if !input1.IsString() {
		return input1
	}
	trimmed := strings.Trim(input1.AcquireStringValue(), " \t")
	return mlrval.FromString(trimmed)
}
// ----------------------------------------------------------------
// BIF_collapse_whitespace delegates to BIF_collapse_whitespace_regexp using
// the package-level _whitespace_regexp pattern.
func BIF_collapse_whitespace(input1 *mlrval.Mlrval) *mlrval.Mlrval {
	pattern := _whitespace_regexp
	return BIF_collapse_whitespace_regexp(input1, pattern)
}
// BIF_collapse_whitespace_regexp replaces every match of whitespaceRegexp in a
// string argument with a single space. Non-string arguments are returned
// unchanged.
func BIF_collapse_whitespace_regexp(input1 *mlrval.Mlrval, whitespaceRegexp *regexp.Regexp) *mlrval.Mlrval {
	if !input1.IsString() {
		return input1
	}
	collapsed := whitespaceRegexp.ReplaceAllString(input1.AcquireStringValue(), " ")
	return mlrval.FromString(collapsed)
}
// _whitespace_regexp matches a run of one or more whitespace characters. It is
// compiled once at package initialization and used by BIF_collapse_whitespace
// and BIF_clean_whitespace.
var _whitespace_regexp = regexp.MustCompile(`\s+`)
// ================================================================
// BIF_toupper upper-cases a string argument. Every other argument, void
// included, is returned unchanged.
func BIF_toupper(input1 *mlrval.Mlrval) *mlrval.Mlrval {
	if !input1.IsString() {
		return input1
	}
	return mlrval.FromString(strings.ToUpper(input1.AcquireStringValue()))
}
// BIF_tolower lower-cases a string argument. Every other argument, void
// included, is returned unchanged.
func BIF_tolower(input1 *mlrval.Mlrval) *mlrval.Mlrval {
	if !input1.IsString() {
		return input1
	}
	return mlrval.FromString(strings.ToLower(input1.AcquireStringValue()))
}
// BIF_capitalize upper-cases the first rune of a string argument and leaves
// the remainder untouched. Non-string arguments and the empty string are
// returned unchanged.
func BIF_capitalize(input1 *mlrval.Mlrval) *mlrval.Mlrval {
	if !input1.IsString() {
		return input1
	}
	text := input1.AcquireStringValue()
	if text == "" {
		return input1
	}

	// Split on runes so a multi-byte first character is handled as a unit.
	chars := []rune(text)
	head := strings.ToUpper(string(chars[0]))
	tail := string(chars[1:])
	return mlrval.FromString(head + tail)
}
// ----------------------------------------------------------------
// BIF_clean_whitespace collapses whitespace runs with
// BIF_collapse_whitespace_regexp, strips the ends with BIF_strip, and then
// re-infers the type of the resulting text via mlrval.FromInferredType.
func BIF_clean_whitespace(input1 *mlrval.Mlrval) *mlrval.Mlrval {
	collapsed := BIF_collapse_whitespace_regexp(input1, _whitespace_regexp)
	stripped := BIF_strip(collapsed)
	return mlrval.FromInferredType(stripped.String())
}
// ================================================================
// BIF_format interpolates arguments into a format string. mlrvals[0] is the
// format string and each "{}" in it is replaced, in order, by the String()
// form of the next argument.
//
// Example: format("{}:{}", 8, 9) splits the format string into the pieces
// "", ":" and "", and emits piece 0, argument 1, piece 1, argument 2, piece 2.
//
// With no arguments at all the result is void. A non-string format yields a
// type error. Placeholders with no matching argument emit nothing, and
// arguments beyond the last placeholder are ignored.
func BIF_format(mlrvals []*mlrval.Mlrval) *mlrval.Mlrval {
	if len(mlrvals) == 0 {
		return mlrval.VOID
	}
	template, isString := mlrvals[0].GetStringValue()
	if !isString {
		return mlrval.FromTypeErrorUnary("format", mlrvals[0])
	}

	var out bytes.Buffer
	argCount := len(mlrvals)
	for slot, literal := range lib.SplitString(template, "{}") {
		// Piece number k (k >= 1) is preceded by argument number k, when
		// that argument was supplied.
		if slot > 0 && slot < argCount {
			out.WriteString(mlrvals[slot].String())
		}
		out.WriteString(literal)
	}
	return mlrval.FromString(out.String())
}
// unformat("{}:{}:{}", "1:2:3") gives [1, 2, 3]
// unformat("{}h{}m{}s", "3h47m22s") gives [3, 47, 22]
func BIF_unformat(input1, input2 *mlrval.Mlrval) *mlrval.Mlrval {
	// Extracted pieces go through type inference; BIF_unformatx is the
	// variant that keeps them as strings.
	const inferTypes = true
	return bif_unformat_aux(input1, input2, inferTypes)
}
// BIF_unformatx behaves like BIF_unformat except that the extracted pieces are
// kept as strings instead of being type-inferred.
func BIF_unformatx(input1, input2 *mlrval.Mlrval) *mlrval.Mlrval {
	const inferTypes = false
	return bif_unformat_aux(input1, input2, inferTypes)
}
// bif_unformat_aux is the shared implementation of BIF_unformat and
// BIF_unformatx. input1 is a template containing "{}" placeholders and input2
// is the text to take apart. The result is an array holding the text matched
// by each placeholder, in order.
//
// Example: template "{}h{}m{}s" splits into the literal pieces "", "h", "m"
// and "s". For the input "12h34m56s" the text before each of "h", "m" and "s"
// is captured in turn.
//
// When inferTypes is true each captured piece goes through
// mlrval.FromInferredType; otherwise it is stored with mlrval.FromString.
// Non-string arguments yield a type error. Input that lacks the template's
// leading literal, or in which a later literal cannot be found, yields an
// error value.
func bif_unformat_aux(input1, input2 *mlrval.Mlrval, inferTypes bool) *mlrval.Mlrval {
	template, templateIsString := input1.GetStringValue()
	if !templateIsString {
		return mlrval.FromTypeErrorUnary("unformat", input1)
	}
	input, inputIsString := input2.GetStringValue()
	if !inputIsString {
		return mlrval.FromTypeErrorUnary("unformat", input2)
	}

	// Builds the error value used by both failure sites below.
	mismatch := func(rest, wanted string) *mlrval.Mlrval {
		return mlrval.FromError(
			fmt.Errorf(
				"unformat(\"%s\", \"%s\"): component \"%s\" lacks prefix \"%s\"",
				input1.OriginalString(),
				input2.OriginalString(),
				rest,
				wanted,
			),
		)
	}

	literals := strings.Split(template, "{}")
	fields := mlrval.FromEmptyArray()

	// The text before the first placeholder must match the input verbatim.
	leading := literals[0]
	rest := input
	if !strings.HasPrefix(rest, leading) {
		return mismatch(rest, leading)
	}
	rest = rest[len(leading):]

	// Every remaining literal ends one placeholder's capture.
	separators := literals[1:]
	last := len(separators) - 1
	for i, sep := range separators {
		// A trailing empty literal means the template ends in a placeholder.
		// strings.Index with an empty needle would match at offset 0, so in
		// that case capture everything that is left instead.
		cut := len(rest)
		if i != last || sep != "" {
			cut = strings.Index(rest, sep)
			if cut < 0 {
				return mismatch(rest, sep)
			}
		}

		piece := rest[:cut]
		rest = rest[cut+len(sep):]

		if inferTypes {
			fields.ArrayAppend(mlrval.FromInferredType(piece))
		} else {
			fields.ArrayAppend(mlrval.FromString(piece))
		}
	}
	return fields
}
// ================================================================
// BIF_hexfmt renders an integer argument as "0x" followed by lower-case
// hexadecimal digits. The value is converted to uint64 before formatting.
// Non-integer arguments are returned unchanged.
func BIF_hexfmt(input1 *mlrval.Mlrval) *mlrval.Mlrval {
	if !input1.IsInt() {
		return input1
	}
	bits := uint64(input1.AcquireIntValue())
	return mlrval.FromString("0x" + strconv.FormatUint(bits, 16))
}
// ----------------------------------------------------------------
// fmtnum_is is the fmtnum_dispositions entry for an INT value with a STRING
// format. It obtains a formatter for the format string via
// mlrval.GetFormatter and applies it to input1. A failure to obtain the
// formatter is returned as an error value.
func fmtnum_is(input1, input2 *mlrval.Mlrval) *mlrval.Mlrval {
	if input2.IsString() {
		formatter, err := mlrval.GetFormatter(input2.AcquireStringValue())
		if err != nil {
			return mlrval.FromError(err)
		}
		return formatter.Format(input1)
	}
	return mlrval.FromTypeErrorUnary("fmtnum", input2)
}
// fmtnum_fs is the fmtnum_dispositions entry for a FLOAT value with a STRING
// format. It obtains a formatter for the format string via
// mlrval.GetFormatter and applies it to input1. A failure to obtain the
// formatter is returned as an error value.
func fmtnum_fs(input1, input2 *mlrval.Mlrval) *mlrval.Mlrval {
	if input2.IsString() {
		formatter, err := mlrval.GetFormatter(input2.AcquireStringValue())
		if err != nil {
			return mlrval.FromError(err)
		}
		return formatter.Format(input1)
	}
	return mlrval.FromTypeErrorUnary("fmtnum", input2)
}
// fmtnum_bs is the fmtnum_dispositions entry for a BOOL value with a STRING
// format. The boolean is first converted to an integer with lib.BoolToInt and
// that integer is then formatted. A failure to obtain the formatter is
// returned as an error value.
func fmtnum_bs(input1, input2 *mlrval.Mlrval) *mlrval.Mlrval {
	if input2.IsString() {
		formatter, err := mlrval.GetFormatter(input2.AcquireStringValue())
		if err != nil {
			return mlrval.FromError(err)
		}
		asInt := lib.BoolToInt(input1.AcquireBoolValue())
		return formatter.Format(mlrval.FromInt(asInt))
	}
	return mlrval.FromTypeErrorUnary("fmtnum", input2)
}
// fmtnum_te is the fmtnum_dispositions entry for argument-type pairs that
// fmtnum rejects: it returns a binary type error naming "fmtnum".
func fmtnum_te(input1, input2 *mlrval.Mlrval) *mlrval.Mlrval {
	const funcName = "fmtnum"
	return mlrval.FromTypeErrorBinary(funcName, input1, input2)
}
// fmtnum_dispositions is the dispatch table shared by BIF_fmtnum and
// BIF_fmtifnum. It is indexed first by the type of the value being formatted
// (row) and then by the type of the format argument (column); the function in
// the selected cell is called with both arguments. The row and column labels
// in the comments below name the argument types.
var fmtnum_dispositions = [mlrval.MT_DIM][mlrval.MT_DIM]BinaryFunc{
// . INT FLOAT BOOL VOID STRING ARRAY MAP FUNC ERROR NULL ABSENT
/*INT */ {fmtnum_te, fmtnum_te, fmtnum_te, fmtnum_te, fmtnum_is, fmtnum_te, fmtnum_te, fmtnum_te, fmtnum_te, fmtnum_te, _absn},
/*FLOAT */ {fmtnum_te, fmtnum_te, fmtnum_te, fmtnum_te, fmtnum_fs, fmtnum_te, fmtnum_te, fmtnum_te, fmtnum_te, fmtnum_te, _absn},
/*BOOL */ {fmtnum_te, fmtnum_te, fmtnum_te, fmtnum_te, fmtnum_bs, fmtnum_te, fmtnum_te, fmtnum_te, fmtnum_te, fmtnum_te, _absn},
/*VOID */ {fmtnum_te, fmtnum_te, fmtnum_te, fmtnum_te, fmtnum_te, fmtnum_te, fmtnum_te, fmtnum_te, fmtnum_te, fmtnum_te, _absn},
/*STRING */ {fmtnum_te, fmtnum_te, fmtnum_te, fmtnum_te, fmtnum_te, fmtnum_te, fmtnum_te, fmtnum_te, fmtnum_te, fmtnum_te, _absn},
/*ARRAY */ {fmtnum_te, fmtnum_te, fmtnum_te, fmtnum_te, fmtnum_te, fmtnum_te, fmtnum_te, fmtnum_te, fmtnum_te, fmtnum_te, _absn},
/*MAP */ {fmtnum_te, fmtnum_te, fmtnum_te, fmtnum_te, fmtnum_te, fmtnum_te, fmtnum_te, fmtnum_te, fmtnum_te, fmtnum_te, _absn},
/*FUNC */ {fmtnum_te, fmtnum_te, fmtnum_te, fmtnum_te, fmtnum_te, fmtnum_te, fmtnum_te, fmtnum_te, fmtnum_te, fmtnum_te, fmtnum_te},
/*ERROR */ {fmtnum_te, fmtnum_te, fmtnum_te, fmtnum_te, fmtnum_te, fmtnum_te, fmtnum_te, fmtnum_te, fmtnum_te, fmtnum_te, fmtnum_te},
/*NULL */ {fmtnum_te, fmtnum_te, fmtnum_te, fmtnum_te, fmtnum_te, fmtnum_te, fmtnum_te, fmtnum_te, fmtnum_te, fmtnum_te, _absn},
/*ABSENT */ {_absn, _absn, fmtnum_te, _absn, _absn, fmtnum_te, fmtnum_te, fmtnum_te, fmtnum_te, _absn, _absn},
}
// BIF_fmtnum formats input1 according to the format in input2. Arrays and maps
// are handed to recurseBinaryFuncOnInput1 together with BIF_fmtnum itself
// (presumably to format each element; confirm against that helper). All other
// inputs are dispatched through fmtnum_dispositions by argument type.
func BIF_fmtnum(input1, input2 *mlrval.Mlrval) *mlrval.Mlrval {
	if input1.IsArray() || input1.IsMap() {
		return recurseBinaryFuncOnInput1(BIF_fmtnum, input1, input2)
	}
	handler := fmtnum_dispositions[input1.Type()][input2.Type()]
	return handler(input1, input2)
}
// BIF_fmtifnum is the lenient counterpart of BIF_fmtnum: it dispatches through
// the same fmtnum_dispositions table, but when the result is an error it
// returns input1 unchanged instead. Arrays and maps are handed to
// recurseBinaryFuncOnInput1 together with BIF_fmtifnum itself.
func BIF_fmtifnum(input1, input2 *mlrval.Mlrval) *mlrval.Mlrval {
	if input1.IsArray() || input1.IsMap() {
		return recurseBinaryFuncOnInput1(BIF_fmtifnum, input1, input2)
	}
	formatted := fmtnum_dispositions[input1.Type()][input2.Type()](input1, input2)
	if formatted.IsError() {
		return input1
	}
	return formatted
}
// BIF_latin1_to_utf8 converts a string argument with lib.TryLatin1ToUTF8.
// Arrays and maps are handed to recurseUnaryFuncOnInput1 together with this
// function. Any other non-string argument is returned unchanged.
func BIF_latin1_to_utf8(input1 *mlrval.Mlrval) *mlrval.Mlrval {
	switch {
	case input1.IsArray() || input1.IsMap():
		return recurseUnaryFuncOnInput1(BIF_latin1_to_utf8, input1)
	case input1.IsString():
		converted, err := lib.TryLatin1ToUTF8(input1.String())
		if err != nil {
			// Somewhat arbitrary design decision: a failed conversion is
			// reported as an error rather than passing the input through.
			return mlrval.FromError(err)
		}
		return mlrval.FromString(converted)
	default:
		return input1
	}
}
// BIF_utf8_to_latin1 converts a string argument with lib.TryUTF8ToLatin1.
// Arrays and maps are handed to recurseUnaryFuncOnInput1 together with this
// function. Any other non-string argument is returned unchanged.
func BIF_utf8_to_latin1(input1 *mlrval.Mlrval) *mlrval.Mlrval {
	switch {
	case input1.IsArray() || input1.IsMap():
		return recurseUnaryFuncOnInput1(BIF_utf8_to_latin1, input1)
	case input1.IsString():
		converted, err := lib.TryUTF8ToLatin1(input1.String())
		if err != nil {
			// Somewhat arbitrary design decision: a failed conversion is
			// reported as an error rather than passing the input through.
			return mlrval.FromError(err)
		}
		return mlrval.FromString(converted)
	default:
		return input1
	}
}