miller/pkg/bifs/strings.go

package bifs

import (
	"bytes"
	"fmt"
	"regexp"
	"strconv"
	"strings"

	"github.com/johnkerl/miller/v6/pkg/lib"
	"github.com/johnkerl/miller/v6/pkg/mlrval"
)

// ================================================================
// BIF_strlen returns the length of a string-or-void input as computed by
// lib.UTF8Strlen. Any other input type yields a "strlen" type error.
func BIF_strlen(input1 *mlrval.Mlrval) *mlrval.Mlrval {
	if input1.IsStringOrVoid() {
		length := lib.UTF8Strlen(input1.AcquireStringValue())
		return mlrval.FromInt(length)
	}
	return mlrval.FromTypeErrorUnary("strlen", input1)
}

// ================================================================
// BIF_string passes the String() form of its argument to mlrval.FromString
// and returns the result.
func BIF_string(input1 *mlrval.Mlrval) *mlrval.Mlrval {
	text := input1.String()
	return mlrval.FromString(text)
}

// ================================================================
// Dot operator, with loose typecasting.
//
// For most operations, I don't like loose typecasting -- for example, in PHP
// "10" + 2 is the number 12 and in JavaScript it's the string "102", and I
// find both of those horrid and error-prone. In Miller, "10"+2 is MT_ERROR, by
// design, unless intentional casting is done like '$x=int("10")+2'.
//
// However, for dotting, in practice I tipped over and allowed dotting of
// strings and ints: so while "10" + 2 is an error in Miller, '"10". 2' is
// "102". Unlike with "+", with "." there is no ambiguity about what the output
// should be: always the string concatenation of the string representations of
// the two arguments. So, we do the string-cast for the user.

// dot_s_xx is the concatenating disposition for the dot operator: it joins
// the String() forms of both operands, left then right.
func dot_s_xx(input1, input2 *mlrval.Mlrval) *mlrval.Mlrval {
	left := input1.String()
	right := input2.String()
	return mlrval.FromString(left + right)
}

// dot_te is the type-error disposition for the dot operator; it reports both
// operands under the operator name ".".
func dot_te(input1, input2 *mlrval.Mlrval) *mlrval.Mlrval {
	typeError := mlrval.FromTypeErrorBinary(".", input1, input2)
	return typeError
}

// dot_dispositions is the dispatch matrix for the dot operator. BIF_dot
// indexes it as [type of left operand][type of right operand] and calls the
// selected function with both operands. Rows are labelled by the left
// operand's type and columns by the right operand's type, per the header
// comment below. dot_s_xx concatenates and dot_te reports a type error; the
// remaining entries (_s1__, _s2__, _1___, _2___, _void, _absn, _null) are
// shared disposition helpers declared elsewhere in the package.
var dot_dispositions = [mlrval.MT_DIM][mlrval.MT_DIM]BinaryFunc{
	//       .  INT       FLOAT     BOOL      VOID   STRING    ARRAY  MAP    FUNC    ERROR   NULL   ABSENT
	/*INT    */ {dot_s_xx, dot_s_xx, dot_s_xx, _s1__, dot_s_xx, dot_te, dot_te, dot_te, dot_te, _1___, _s1__},
	/*FLOAT  */ {dot_s_xx, dot_s_xx, dot_s_xx, _s1__, dot_s_xx, dot_te, dot_te, dot_te, dot_te, _1___, _s1__},
	/*BOOL   */ {dot_s_xx, dot_s_xx, dot_s_xx, _s1__, dot_s_xx, dot_te, dot_te, dot_te, dot_te, _1___, _s1__},
	/*VOID   */ {_s2__, _s2__, _s2__, _void, _2___, _absn, _absn, dot_te, dot_te, _void, _void},
	/*STRING */ {dot_s_xx, dot_s_xx, dot_s_xx, _1___, dot_s_xx, dot_te, dot_te, dot_te, dot_te, _1___, _1___},
	/*ARRAY  */ {dot_te, dot_te, dot_te, dot_te, dot_te, dot_te, dot_te, dot_te, dot_te, dot_te, dot_te},
	/*MAP    */ {dot_te, dot_te, dot_te, dot_te, dot_te, dot_te, dot_te, dot_te, dot_te, dot_te, dot_te},
	/*FUNC   */ {dot_te, dot_te, dot_te, dot_te, dot_te, dot_te, dot_te, dot_te, dot_te, dot_te, dot_te},
	/*ERROR  */ {dot_te, dot_te, dot_te, dot_te, dot_te, _absn, _absn, dot_te, dot_te, dot_te, dot_te},
	/*NULL   */ {_s2__, _s2__, _s2__, _void, _2___, _absn, _absn, dot_te, dot_te, _null, _null},
	/*ABSENT */ {_s2__, _s2__, _s2__, _void, _2___, _absn, _absn, dot_te, dot_te, _null, _absn},
}

// BIF_dot implements the dot operator by looking up the handler for the
// operands' type pair in dot_dispositions and invoking it.
func BIF_dot(input1, input2 *mlrval.Mlrval) *mlrval.Mlrval {
	handler := dot_dispositions[input1.Type()][input2.Type()]
	return handler(input1, input2)
}

// ================================================================
// substr1(s,m,n) gives substring of s from 1-up position m to n inclusive.
// Negative indices -len .. -1 alias to 0 .. len-1.

// BIF_substr_1_up implements substr1. input1 is the value to slice (its
// String() form is used), and input2/input3 are the lower and upper indices,
// which are resolved by MillerSliceAccess in its 1-up mode.
//
// Returns ABSENT for absent input1, a "substr1" type error for error input1,
// VOID when MillerSliceAccess reports an empty slice, and whatever
// absent-or-error value MillerSliceAccess reports otherwise.
func BIF_substr_1_up(input1, input2, input3 *mlrval.Mlrval) *mlrval.Mlrval {
	switch {
	case input1.IsAbsent():
		return mlrval.ABSENT
	case input1.IsError():
		return mlrval.FromTypeErrorUnary("substr1", input1)
	}

	// Slice by rune rather than by byte so that multi-byte UTF-8 characters
	// each count as a single position.
	chars := []rune(input1.String())

	isEmpty, errOrAbsent, lo, hi := MillerSliceAccess(input2, input3, len(chars), false)
	if isEmpty {
		return mlrval.VOID
	}
	if errOrAbsent != nil {
		return errOrAbsent
	}

	// Go's upper slice bound is exclusive while the resolved upper index is
	// inclusive, hence the +1.
	return mlrval.FromString(string(chars[lo : hi+1]))
}

// ================================================================
// substr0(s,m,n) gives substring of s from 0-up position m to n inclusive.
// Negative indices -len .. -1 alias to 0 .. len-1.

// BIF_substr_0_up implements substr0. input1 is the value to slice (its
// String() form is used), and input2/input3 are the lower and upper indices,
// which are resolved by MillerSliceAccess in its 0-up mode.
//
// Returns ABSENT for absent input1, a "substr0" type error for error input1,
// VOID when MillerSliceAccess reports an empty slice, and whatever
// absent-or-error value MillerSliceAccess reports otherwise.
func BIF_substr_0_up(input1, input2, input3 *mlrval.Mlrval) *mlrval.Mlrval {
	switch {
	case input1.IsAbsent():
		return mlrval.ABSENT
	case input1.IsError():
		return mlrval.FromTypeErrorUnary("substr0", input1)
	}

	// Slice by rune rather than by byte so that multi-byte UTF-8 characters
	// each count as a single position.
	chars := []rune(input1.String())

	isEmpty, errOrAbsent, lo, hi := MillerSliceAccess(input2, input3, len(chars), true)
	if isEmpty {
		return mlrval.VOID
	}
	if errOrAbsent != nil {
		return errOrAbsent
	}

	// Go's upper slice bound is exclusive while the resolved upper index is
	// inclusive, hence the +1.
	return mlrval.FromString(string(chars[lo : hi+1]))
}

// ================================================================
// index(string, substring) returns the 1-up index of substring within string (if found), or -1 if
// not found. The index counts UTF-8 characters rather than bytes.

// BIF_index locates the String() form of input2 within the String() form of
// input1. It returns -1 when there is no match, otherwise the 1-up position
// of the match measured in characters (via lib.UTF8Strlen) rather than bytes.
// Absent input1 yields ABSENT; error input1 yields an "index" type error.
func BIF_index(input1, input2 *mlrval.Mlrval) *mlrval.Mlrval {
	switch {
	case input1.IsAbsent():
		return mlrval.ABSENT
	case input1.IsError():
		return mlrval.FromTypeErrorUnary("index", input1)
	}

	haystack := input1.String()
	needle := input2.String()

	// strings.Index reports a byte offset, which must be converted to a
	// character count before it is handed back.
	byteOffset := strings.Index(haystack, needle)
	if byteOffset < 0 {
		return mlrval.FromInt(int64(byteOffset))
	}

	// The number of characters preceding the match is its 0-up position;
	// add one since Miller indices are 1-up.
	charsBefore := lib.UTF8Strlen(haystack[:byteOffset])
	return mlrval.FromInt(charsBefore + 1)
}

// ================================================================
// contains(string, substring) returns true if string contains substring, else false.

// BIF_contains reports, as a boolean Mlrval, whether the String() form of
// input1 contains the String() form of input2. Absent input1 yields ABSENT,
// and an error input1 is returned unchanged.
func BIF_contains(input1, input2 *mlrval.Mlrval) *mlrval.Mlrval {
	switch {
	case input1.IsAbsent():
		return mlrval.ABSENT
	case input1.IsError():
		return input1
	}

	haystack := input1.String()
	needle := input2.String()
	return mlrval.FromBool(strings.Contains(haystack, needle))
}

// ================================================================
// BIF_truncate shortens input1 to at most input2 characters.
//
// Error or absent arguments are returned as-is (input1 is checked first).
// input1 must be string-or-void and input2 a non-negative int, else a
// "truncate" type error on the offending argument is returned. When input1
// is already short enough, it is returned unchanged.
func BIF_truncate(input1, input2 *mlrval.Mlrval) *mlrval.Mlrval {
	switch {
	case input1.IsErrorOrAbsent():
		return input1
	case input2.IsErrorOrAbsent():
		return input2
	case !input1.IsStringOrVoid():
		return mlrval.FromTypeErrorUnary("truncate", input1)
	case !input2.IsInt():
		return mlrval.FromTypeErrorUnary("truncate", input2)
	}

	limit := input2.AcquireIntValue()
	if limit < 0 {
		return mlrval.FromTypeErrorUnary("truncate", input2)
	}

	// Count and cut by rune, not by byte, so multi-byte UTF-8 characters are
	// never split.
	chars := []rune(input1.AcquireStringValue())
	if len(chars) <= int(limit) {
		return input1
	}
	return mlrval.FromString(string(chars[:int(limit)]))
}

// ================================================================
// BIF_leftpad prepends whole copies of the pad string (input3) to the
// String() form of input1 for as long as the padded length, measured with
// lib.UTF8Strlen, does not exceed the target length (input2). A partial copy
// of the pad is never written, so with a multi-character pad the result can
// end up shorter than the target.
//
// Error or absent arguments are returned as-is (checked in argument order),
// and a non-int target length yields a "leftpad" type error. An empty pad
// string cannot extend the input, so the input's string form is returned
// unpadded.
func BIF_leftpad(input1, input2, input3 *mlrval.Mlrval) *mlrval.Mlrval {
	if input1.IsErrorOrAbsent() {
		return input1
	}
	if input2.IsErrorOrAbsent() {
		return input2
	}
	if input3.IsErrorOrAbsent() {
		return input3
	}

	if !input2.IsInt() {
		return mlrval.FromTypeErrorUnary("leftpad", input2)
	}

	inputString := input1.String()
	padString := input3.String()

	inputLength := lib.UTF8Strlen(inputString)
	padLength := lib.UTF8Strlen(padString)
	targetLength := input2.AcquireIntValue()

	// A zero-length pad never advances outputLength, so without this guard
	// the loop below would never terminate whenever the input is no longer
	// than the target.
	if padLength <= 0 {
		return mlrval.FromString(inputString)
	}

	var buffer bytes.Buffer
	outputLength := inputLength
	for outputLength+padLength <= targetLength {
		buffer.WriteString(padString)
		outputLength += padLength
	}
	buffer.WriteString(inputString)

	return mlrval.FromString(buffer.String())
}

// BIF_rightpad appends whole copies of the pad string (input3) to the
// String() form of input1 for as long as the padded length, measured with
// lib.UTF8Strlen, does not exceed the target length (input2). A partial copy
// of the pad is never written, so with a multi-character pad the result can
// end up shorter than the target.
//
// Error or absent arguments are returned as-is (checked in argument order),
// and a non-int target length yields a "rightpad" type error. An empty pad
// string cannot extend the input, so the input's string form is returned
// unpadded.
func BIF_rightpad(input1, input2, input3 *mlrval.Mlrval) *mlrval.Mlrval {
	if input1.IsErrorOrAbsent() {
		return input1
	}
	if input2.IsErrorOrAbsent() {
		return input2
	}
	if input3.IsErrorOrAbsent() {
		return input3
	}

	if !input2.IsInt() {
		return mlrval.FromTypeErrorUnary("rightpad", input2)
	}

	inputString := input1.String()
	padString := input3.String()

	inputLength := lib.UTF8Strlen(inputString)
	padLength := lib.UTF8Strlen(padString)
	targetLength := input2.AcquireIntValue()

	// A zero-length pad never advances outputLength, so without this guard
	// the loop below would never terminate whenever the input is no longer
	// than the target.
	if padLength <= 0 {
		return mlrval.FromString(inputString)
	}

	var buffer bytes.Buffer
	buffer.WriteString(inputString)
	outputLength := inputLength
	for outputLength+padLength <= targetLength {
		buffer.WriteString(padString)
		outputLength += padLength
	}

	return mlrval.FromString(buffer.String())
}

// ================================================================
// BIF_lstrip removes leading spaces and tabs from a string input. Inputs for
// which IsString() is false are returned unchanged.
func BIF_lstrip(input1 *mlrval.Mlrval) *mlrval.Mlrval {
	if !input1.IsString() {
		return input1
	}
	trimmed := strings.TrimLeft(input1.AcquireStringValue(), " \t")
	return mlrval.FromString(trimmed)
}

// BIF_rstrip removes trailing spaces and tabs from a string input. Inputs
// for which IsString() is false are returned unchanged.
func BIF_rstrip(input1 *mlrval.Mlrval) *mlrval.Mlrval {
	if !input1.IsString() {
		return input1
	}
	trimmed := strings.TrimRight(input1.AcquireStringValue(), " \t")
	return mlrval.FromString(trimmed)
}

// BIF_strip removes leading and trailing spaces and tabs from a string
// input. Inputs for which IsString() is false are returned unchanged.
func BIF_strip(input1 *mlrval.Mlrval) *mlrval.Mlrval {
	if !input1.IsString() {
		return input1
	}
	trimmed := strings.Trim(input1.AcquireStringValue(), " \t")
	return mlrval.FromString(trimmed)
}

// ----------------------------------------------------------------
// BIF_collapse_whitespace is BIF_collapse_whitespace_regexp applied with the
// package-level _whitespace_regexp.
func BIF_collapse_whitespace(input1 *mlrval.Mlrval) *mlrval.Mlrval {
	collapsed := BIF_collapse_whitespace_regexp(input1, _whitespace_regexp)
	return collapsed
}

// BIF_collapse_whitespace_regexp replaces every match of whitespaceRegexp in
// a string input with a single space. Inputs for which IsString() is false
// are returned unchanged.
func BIF_collapse_whitespace_regexp(input1 *mlrval.Mlrval, whitespaceRegexp *regexp.Regexp) *mlrval.Mlrval {
	if !input1.IsString() {
		return input1
	}
	collapsed := whitespaceRegexp.ReplaceAllString(input1.AcquireStringValue(), " ")
	return mlrval.FromString(collapsed)
}

// _whitespace_regexp matches a run of one or more whitespace characters. It
// is compiled once at package initialization and used by
// BIF_collapse_whitespace and BIF_clean_whitespace.
var _whitespace_regexp = regexp.MustCompile(`\s+`)

// ================================================================
// BIF_toupper upper-cases a string input via strings.ToUpper. Void and all
// other non-string inputs are returned unchanged.
func BIF_toupper(input1 *mlrval.Mlrval) *mlrval.Mlrval {
	if !input1.IsString() {
		return input1
	}
	return mlrval.FromString(strings.ToUpper(input1.AcquireStringValue()))
}

// BIF_tolower lower-cases a string input via strings.ToLower. Void and all
// other non-string inputs are returned unchanged.
func BIF_tolower(input1 *mlrval.Mlrval) *mlrval.Mlrval {
	if !input1.IsString() {
		return input1
	}
	return mlrval.FromString(strings.ToLower(input1.AcquireStringValue()))
}

// BIF_capitalize upper-cases the first character of a string input and
// leaves the remainder as-is. Non-string inputs and the empty string are
// returned unchanged.
func BIF_capitalize(input1 *mlrval.Mlrval) *mlrval.Mlrval {
	if !input1.IsString() {
		return input1
	}

	text := input1.AcquireStringValue()
	if text == "" {
		return input1
	}

	// Split on a rune boundary so a multi-byte first character stays intact.
	chars := []rune(text)
	head := strings.ToUpper(string(chars[0]))
	tail := string(chars[1:])
	return mlrval.FromString(head + tail)
}

// ----------------------------------------------------------------
// BIF_clean_whitespace collapses whitespace runs to single spaces, strips
// leading and trailing spaces and tabs, and then re-infers the type of the
// resulting text via mlrval.FromInferredType.
func BIF_clean_whitespace(input1 *mlrval.Mlrval) *mlrval.Mlrval {
	collapsed := BIF_collapse_whitespace_regexp(input1, _whitespace_regexp)
	stripped := BIF_strip(collapsed)
	return mlrval.FromInferredType(stripped.String())
}

// ================================================================
// BIF_format substitutes arguments into "{}" placeholders. mlrvals[0] is the
// format string, and mlrvals[1:] are written, in order and in their String()
// form, in place of successive "{}" occurrences.
//
// An empty argument list yields VOID; a non-string format yields a "format"
// type error. Placeholders with no corresponding argument are rendered as
// nothing, and arguments beyond the last placeholder are ignored.
func BIF_format(mlrvals []*mlrval.Mlrval) *mlrval.Mlrval {
	if len(mlrvals) == 0 {
		return mlrval.VOID
	}
	formatString, isString := mlrvals[0].GetStringValue()
	if !isString {
		return mlrval.FromTypeErrorUnary("format", mlrvals[0])
	}

	// The literal text around the placeholders. For format("{}:{}", 8, 9)
	// the literals are "", ":", "" and the output interleaves them with the
	// arguments: "" 8 ":" 9 "".
	literals := lib.SplitString(formatString, "{}")

	var out bytes.Buffer
	argCount := len(mlrvals)
	for position, literal := range literals {
		// Every literal except the first is preceded by a placeholder,
		// which is filled from the same-numbered argument if one exists.
		if position >= 1 && position < argCount {
			out.WriteString(mlrvals[position].String())
		}
		out.WriteString(literal)
	}

	return mlrval.FromString(out.String())
}

// BIF_unformat extracts the fields matching the "{}" placeholders of a
// template (input1) from an input string (input2), inferring each field's
// type from its text.
//
// unformat("{}:{}:{}",  "1:2:3")    gives [1, 2, 3]
// unformat("{}h{}m{}s", "3h47m22s") gives [3, 47, 22]
func BIF_unformat(input1, input2 *mlrval.Mlrval) *mlrval.Mlrval {
	const inferTypes = true
	return bif_unformat_aux(input1, input2, inferTypes)
}
// BIF_unformatx is like BIF_unformat except that the extracted fields are
// kept as strings rather than having their types inferred.
func BIF_unformatx(input1, input2 *mlrval.Mlrval) *mlrval.Mlrval {
	const inferTypes = false
	return bif_unformat_aux(input1, input2, inferTypes)
}

// bif_unformat_aux is the shared implementation of unformat and unformatx.
//
// input1 is a template containing "{}" placeholders and input2 is the text
// to pick apart; both must have string values, else an "unformat" type error
// is returned. The result is an array with one element per placeholder,
// built with mlrval.FromInferredType when inferTypes is true and with
// mlrval.FromString otherwise. If the input does not start with the
// template's leading literal, or a later literal cannot be found, an error
// value is returned.
func bif_unformat_aux(input1, input2 *mlrval.Mlrval, inferTypes bool) *mlrval.Mlrval {
	template, templateIsString := input1.GetStringValue()
	if !templateIsString {
		return mlrval.FromTypeErrorUnary("unformat", input1)
	}
	input, inputIsString := input2.GetStringValue()
	if !inputIsString {
		return mlrval.FromTypeErrorUnary("unformat", input2)
	}

	// Example: template "{}h{}m{}s" and input "12h34m56s" give the literals
	// ["", "h", "m", "s"]: a leading literal followed by one terminator per
	// placeholder.
	literals := strings.Split(template, "{}")
	leader, terminators := literals[0], literals[1:]

	rest := input
	if !strings.HasPrefix(rest, leader) {
		return mlrval.FromError(
			fmt.Errorf(
				"unformat(\"%s\", \"%s\"): component \"%s\" lacks prefix \"%s\"",
				input1.OriginalString(),
				input2.OriginalString(),
				rest,
				leader,
			),
		)
	}
	rest = rest[len(leader):]

	fields := mlrval.FromEmptyArray()
	lastPosition := len(terminators) - 1
	for position, terminator := range terminators {
		// A final empty terminator means "take everything that is left":
		// strings.Index with an empty needle would match at the start of
		// the remainder instead of at its end.
		cut := len(rest)
		if position != lastPosition || terminator != "" {
			cut = strings.Index(rest, terminator)
			if cut < 0 {
				return mlrval.FromError(
					fmt.Errorf(
						"unformat(\"%s\", \"%s\"): component \"%s\" lacks prefix \"%s\"",
						input1.OriginalString(),
						input2.OriginalString(),
						rest,
						terminator,
					),
				)
			}
		}

		field := rest[:cut]
		rest = rest[cut+len(terminator):]

		var element *mlrval.Mlrval
		if inferTypes {
			element = mlrval.FromInferredType(field)
		} else {
			element = mlrval.FromString(field)
		}
		fields.ArrayAppend(element)
	}

	return fields
}

// ================================================================
// BIF_hexfmt renders an int input as a "0x"-prefixed lowercase hexadecimal
// string. The value is converted to uint64 first, so negative ints appear in
// their 64-bit two's-complement form. Non-int inputs are returned unchanged.
func BIF_hexfmt(input1 *mlrval.Mlrval) *mlrval.Mlrval {
	if !input1.IsInt() {
		return input1
	}
	hexDigits := strconv.FormatUint(uint64(input1.AcquireIntValue()), 16)
	return mlrval.FromString("0x" + hexDigits)
}

// ----------------------------------------------------------------
// fmtnum_is is the fmtnum disposition for an int value (input1) with a
// string format (input2). A non-string format yields a "fmtnum" type error,
// and a failure to obtain a formatter yields an error value.
func fmtnum_is(input1, input2 *mlrval.Mlrval) *mlrval.Mlrval {
	if !input2.IsString() {
		return mlrval.FromTypeErrorUnary("fmtnum", input2)
	}

	formatter, err := mlrval.GetFormatter(input2.AcquireStringValue())
	if err != nil {
		return mlrval.FromError(err)
	}
	return formatter.Format(input1)
}

// fmtnum_fs is the fmtnum disposition for a float value (input1) with a
// string format (input2). A non-string format yields a "fmtnum" type error,
// and a failure to obtain a formatter yields an error value.
func fmtnum_fs(input1, input2 *mlrval.Mlrval) *mlrval.Mlrval {
	if !input2.IsString() {
		return mlrval.FromTypeErrorUnary("fmtnum", input2)
	}

	formatter, err := mlrval.GetFormatter(input2.AcquireStringValue())
	if err != nil {
		return mlrval.FromError(err)
	}
	return formatter.Format(input1)
}

// fmtnum_bs is the fmtnum disposition for a boolean value (input1) with a
// string format (input2). The boolean is converted to an int with
// lib.BoolToInt and that int is what gets formatted. A non-string format
// yields a "fmtnum" type error, and a failure to obtain a formatter yields
// an error value.
func fmtnum_bs(input1, input2 *mlrval.Mlrval) *mlrval.Mlrval {
	if !input2.IsString() {
		return mlrval.FromTypeErrorUnary("fmtnum", input2)
	}

	formatter, err := mlrval.GetFormatter(input2.AcquireStringValue())
	if err != nil {
		return mlrval.FromError(err)
	}

	asInt := lib.BoolToInt(input1.AcquireBoolValue())
	return formatter.Format(mlrval.FromInt(asInt))
}

// fmtnum_te is the type-error disposition for fmtnum; it reports both
// arguments under the function name "fmtnum".
func fmtnum_te(input1, input2 *mlrval.Mlrval) *mlrval.Mlrval {
	typeError := mlrval.FromTypeErrorBinary("fmtnum", input1, input2)
	return typeError
}

// fmtnum_dispositions is the dispatch matrix for fmtnum and fmtifnum. It is
// indexed as [type of the value, input1][type of the format, input2], with
// rows and columns labelled per the header comment below. Only the
// INT/FLOAT/BOOL rows paired with the STRING column do any formatting
// (fmtnum_is, fmtnum_fs, fmtnum_bs); all other cells either report a type
// error via fmtnum_te or use _absn, a shared disposition helper declared
// elsewhere in the package.
var fmtnum_dispositions = [mlrval.MT_DIM][mlrval.MT_DIM]BinaryFunc{
	//       .  INT    FLOAT  BOOL   VOID   STRING     ARRAY  MAP    FUNC    ERROR   NULL   ABSENT
	/*INT    */ {fmtnum_te, fmtnum_te, fmtnum_te, fmtnum_te, fmtnum_is, fmtnum_te, fmtnum_te, fmtnum_te, fmtnum_te, fmtnum_te, _absn},
	/*FLOAT  */ {fmtnum_te, fmtnum_te, fmtnum_te, fmtnum_te, fmtnum_fs, fmtnum_te, fmtnum_te, fmtnum_te, fmtnum_te, fmtnum_te, _absn},
	/*BOOL   */ {fmtnum_te, fmtnum_te, fmtnum_te, fmtnum_te, fmtnum_bs, fmtnum_te, fmtnum_te, fmtnum_te, fmtnum_te, fmtnum_te, _absn},
	/*VOID   */ {fmtnum_te, fmtnum_te, fmtnum_te, fmtnum_te, fmtnum_te, fmtnum_te, fmtnum_te, fmtnum_te, fmtnum_te, fmtnum_te, _absn},
	/*STRING */ {fmtnum_te, fmtnum_te, fmtnum_te, fmtnum_te, fmtnum_te, fmtnum_te, fmtnum_te, fmtnum_te, fmtnum_te, fmtnum_te, _absn},
	/*ARRAY  */ {fmtnum_te, fmtnum_te, fmtnum_te, fmtnum_te, fmtnum_te, fmtnum_te, fmtnum_te, fmtnum_te, fmtnum_te, fmtnum_te, _absn},
	/*MAP    */ {fmtnum_te, fmtnum_te, fmtnum_te, fmtnum_te, fmtnum_te, fmtnum_te, fmtnum_te, fmtnum_te, fmtnum_te, fmtnum_te, _absn},
	/*FUNC   */ {fmtnum_te, fmtnum_te, fmtnum_te, fmtnum_te, fmtnum_te, fmtnum_te, fmtnum_te, fmtnum_te, fmtnum_te, fmtnum_te, fmtnum_te},
	/*ERROR  */ {fmtnum_te, fmtnum_te, fmtnum_te, fmtnum_te, fmtnum_te, fmtnum_te, fmtnum_te, fmtnum_te, fmtnum_te, fmtnum_te, fmtnum_te},
	/*NULL   */ {fmtnum_te, fmtnum_te, fmtnum_te, fmtnum_te, fmtnum_te, fmtnum_te, fmtnum_te, fmtnum_te, fmtnum_te, fmtnum_te, _absn},
	/*ABSENT */ {_absn, _absn, fmtnum_te, _absn, _absn, fmtnum_te, fmtnum_te, fmtnum_te, fmtnum_te, _absn, _absn},
}

// BIF_fmtnum formats input1 according to the format in input2. Arrays and
// maps are handled by recursing over input1 with recurseBinaryFuncOnInput1;
// anything else is dispatched through fmtnum_dispositions on the type pair.
func BIF_fmtnum(input1, input2 *mlrval.Mlrval) *mlrval.Mlrval {
	if input1.IsArray() || input1.IsMap() {
		return recurseBinaryFuncOnInput1(BIF_fmtnum, input1, input2)
	}
	handler := fmtnum_dispositions[input1.Type()][input2.Type()]
	return handler(input1, input2)
}

// BIF_fmtifnum behaves like BIF_fmtnum, except that when formatting a
// non-collection input produces an error, the original input1 is returned
// instead. Arrays and maps are handled by recursing over input1 with
// recurseBinaryFuncOnInput1.
func BIF_fmtifnum(input1, input2 *mlrval.Mlrval) *mlrval.Mlrval {
	if input1.IsArray() || input1.IsMap() {
		return recurseBinaryFuncOnInput1(BIF_fmtifnum, input1, input2)
	}

	formatted := fmtnum_dispositions[input1.Type()][input2.Type()](input1, input2)
	if formatted.IsError() {
		// Not formattable: pass the input through untouched.
		return input1
	}
	return formatted
}

// BIF_latin1_to_utf8 converts a string input with lib.TryLatin1ToUTF8.
// Arrays and maps are handled by recursing with recurseUnaryFuncOnInput1,
// and other non-string inputs are returned unchanged. A conversion failure
// is returned as an error value.
func BIF_latin1_to_utf8(input1 *mlrval.Mlrval) *mlrval.Mlrval {
	switch {
	case input1.IsArray() || input1.IsMap():
		return recurseUnaryFuncOnInput1(BIF_latin1_to_utf8, input1)
	case !input1.IsString():
		return input1
	}

	converted, err := lib.TryLatin1ToUTF8(input1.String())
	if err != nil {
		// Somewhat arbitrary design decision: surface the failure rather
		// than returning input1 unmodified.
		return mlrval.FromError(err)
	}
	return mlrval.FromString(converted)
}

// BIF_utf8_to_latin1 converts a string input with lib.TryUTF8ToLatin1.
// Arrays and maps are handled by recursing with recurseUnaryFuncOnInput1,
// and other non-string inputs are returned unchanged. A conversion failure
// is returned as an error value.
func BIF_utf8_to_latin1(input1 *mlrval.Mlrval) *mlrval.Mlrval {
	switch {
	case input1.IsArray() || input1.IsMap():
		return recurseUnaryFuncOnInput1(BIF_utf8_to_latin1, input1)
	case !input1.IsString():
		return input1
	}

	converted, err := lib.TryUTF8ToLatin1(input1.String())
	if err != nil {
		// Somewhat arbitrary design decision: surface the failure rather
		// than returning input1 unmodified.
		return mlrval.FromError(err)
	}
	return mlrval.FromString(converted)
}