From fa9b00ee64a7ab2c775169550558ab7e35ac30e4 Mon Sep 17 00:00:00 2001 From: John Kerl Date: Mon, 27 Mar 2023 00:37:33 -0400 Subject: [PATCH] `index` DSL function [WIP] (#1247) --- docs/src/manpage.md | 22 +++++++++----- docs/src/manpage.txt | 22 +++++++++----- docs/src/reference-dsl-builtin-functions.md | 13 +++++++- internal/pkg/bifs/strings.go | 24 +++++++++++++++ .../pkg/dsl/cst/builtin_function_manager.go | 12 ++++++++ man/manpage.txt | 22 +++++++++----- man/mlr.1 | 30 ++++++++++++++----- test/cases/dsl-index/0001/cmd | 1 + test/cases/dsl-index/0001/experr | 0 test/cases/dsl-index/0001/expout | 13 ++++++++ test/cases/dsl-index/0001/input | 13 ++++++++ test/cases/dsl-index/0001/mlr | 1 + 12 files changed, 143 insertions(+), 30 deletions(-) create mode 100644 test/cases/dsl-index/0001/cmd create mode 100644 test/cases/dsl-index/0001/experr create mode 100644 test/cases/dsl-index/0001/expout create mode 100644 test/cases/dsl-index/0001/input create mode 100644 test/cases/dsl-index/0001/mlr diff --git a/docs/src/manpage.md b/docs/src/manpage.md index 120b4b550..e1a0ac63a 100644 --- a/docs/src/manpage.md +++ b/docs/src/manpage.md @@ -210,12 +210,12 @@ MILLER(1) MILLER(1) capitalize cbrt ceil clean_whitespace collapse_whitespace concat cos cosh depth dhms2fsec dhms2sec erf erfc every exec exp expm1 flatten float floor fmtifnum fmtnum fold format fsec2dhms fsec2hms get_keys get_values - gmt2localtime gmt2sec gssub gsub haskey hexfmt hms2fsec hms2sec hostname int - invqnorm is_absent is_array is_bool is_boolean is_empty is_empty_map is_error - is_float is_int is_map is_nan is_nonempty_map is_not_array is_not_empty - is_not_map is_not_null is_null is_numeric is_present is_string joink joinkv - joinv json_parse json_stringify latin1_to_utf8 leafcount leftpad length - localtime2gmt localtime2sec log log10 log1p logifit lstrip madd mapdiff + gmt2localtime gmt2sec gssub gsub haskey hexfmt hms2fsec hms2sec hostname index + int invqnorm is_absent is_array is_bool is_boolean 
is_empty is_empty_map + is_error is_float is_int is_map is_nan is_nonempty_map is_not_array + is_not_empty is_not_map is_not_null is_null is_numeric is_present is_string + joink joinkv joinv json_parse json_stringify latin1_to_utf8 leafcount leftpad + length localtime2gmt localtime2sec log log10 log1p logifit lstrip madd mapdiff mapexcept mapselect mapsum max md5 mexp min mmul msub os pow qnorm reduce regextract regextract_or_else rightpad round roundm rstrip sec2dhms sec2gmt sec2gmtdate sec2hms sec2localdate sec2localtime select sgn sha1 sha256 sha512 @@ -2387,6 +2387,14 @@ MILLER(1) MILLER(1) 1mhostname0m (class=system #args=0) Returns the hostname as a string. + 1mindex0m + (class=string #args=2) Returns the index (1-based) of the second argument within the first. Returns -1 if the second argument isn't a substring of the first. Stringifies non-string inputs. Uses UTF-8 encoding to count characters, not bytes. + Examples: + index("abcde", "e") gives 5 + index("abcde", "x") gives 01 + index(12345, 34) gives 3 + index("fort", "t") gives 5 + 1mint0m (class=conversion #args=1,2) Convert int/float/bool/string to int. If the second argument is omitted and the first argument is a string, base is inferred from the first argument's prefix. If the second argument is provided and the first argument is a string, the second argument is used as the base. If the second argument is provided and the first argument is not a string, the second argument is ignored. 
Examples: @@ -3346,5 +3354,5 @@ MILLER(1) MILLER(1) - 2023-03-24 MILLER(1) + 2023-03-27 MILLER(1) diff --git a/docs/src/manpage.txt b/docs/src/manpage.txt index c78f1b415..17b05732b 100644 --- a/docs/src/manpage.txt +++ b/docs/src/manpage.txt @@ -189,12 +189,12 @@ MILLER(1) MILLER(1) capitalize cbrt ceil clean_whitespace collapse_whitespace concat cos cosh depth dhms2fsec dhms2sec erf erfc every exec exp expm1 flatten float floor fmtifnum fmtnum fold format fsec2dhms fsec2hms get_keys get_values - gmt2localtime gmt2sec gssub gsub haskey hexfmt hms2fsec hms2sec hostname int - invqnorm is_absent is_array is_bool is_boolean is_empty is_empty_map is_error - is_float is_int is_map is_nan is_nonempty_map is_not_array is_not_empty - is_not_map is_not_null is_null is_numeric is_present is_string joink joinkv - joinv json_parse json_stringify latin1_to_utf8 leafcount leftpad length - localtime2gmt localtime2sec log log10 log1p logifit lstrip madd mapdiff + gmt2localtime gmt2sec gssub gsub haskey hexfmt hms2fsec hms2sec hostname index + int invqnorm is_absent is_array is_bool is_boolean is_empty is_empty_map + is_error is_float is_int is_map is_nan is_nonempty_map is_not_array + is_not_empty is_not_map is_not_null is_null is_numeric is_present is_string + joink joinkv joinv json_parse json_stringify latin1_to_utf8 leafcount leftpad + length localtime2gmt localtime2sec log log10 log1p logifit lstrip madd mapdiff mapexcept mapselect mapsum max md5 mexp min mmul msub os pow qnorm reduce regextract regextract_or_else rightpad round roundm rstrip sec2dhms sec2gmt sec2gmtdate sec2hms sec2localdate sec2localtime select sgn sha1 sha256 sha512 @@ -2366,6 +2366,14 @@ MILLER(1) MILLER(1) 1mhostname0m (class=system #args=0) Returns the hostname as a string. + 1mindex0m + (class=string #args=2) Returns the index (1-based) of the second argument within the first. Returns -1 if the second argument isn't a substring of the first. Stringifies non-string inputs. 
Uses UTF-8 encoding to count characters, not bytes. + Examples: + index("abcde", "e") gives 5 + index("abcde", "x") gives 01 + index(12345, 34) gives 3 + index("fort", "t") gives 5 + 1mint0m (class=conversion #args=1,2) Convert int/float/bool/string to int. If the second argument is omitted and the first argument is a string, base is inferred from the first argument's prefix. If the second argument is provided and the first argument is a string, the second argument is used as the base. If the second argument is provided and the first argument is not a string, the second argument is ignored. Examples: @@ -3325,4 +3333,4 @@ MILLER(1) MILLER(1) - 2023-03-24 MILLER(1) + 2023-03-27 MILLER(1) diff --git a/docs/src/reference-dsl-builtin-functions.md b/docs/src/reference-dsl-builtin-functions.md index 90bf09e00..ae8f5d517 100644 --- a/docs/src/reference-dsl-builtin-functions.md +++ b/docs/src/reference-dsl-builtin-functions.md @@ -74,7 +74,7 @@ is 2. Unary operators such as `!` and `~` show argument-count of 1; the ternary * [**Hashing functions**](#hashing-functions): [md5](#md5), [sha1](#sha1), [sha256](#sha256), [sha512](#sha512). * [**Higher-order-functions functions**](#higher-order-functions-functions): [any](#any), [apply](#apply), [every](#every), [fold](#fold), [reduce](#reduce), [select](#select), [sort](#sort). 
* [**Math functions**](#math-functions): [abs](#abs), [acos](#acos), [acosh](#acosh), [asin](#asin), [asinh](#asinh), [atan](#atan), [atan2](#atan2), [atanh](#atanh), [cbrt](#cbrt), [ceil](#ceil), [cos](#cos), [cosh](#cosh), [erf](#erf), [erfc](#erfc), [exp](#exp), [expm1](#expm1), [floor](#floor), [invqnorm](#invqnorm), [log](#log), [log10](#log10), [log1p](#log1p), [logifit](#logifit), [max](#max), [min](#min), [qnorm](#qnorm), [round](#round), [roundm](#roundm), [sgn](#sgn), [sin](#sin), [sinh](#sinh), [sqrt](#sqrt), [tan](#tan), [tanh](#tanh), [urand](#urand), [urand32](#urand32), [urandelement](#urandelement), [urandint](#urandint), [urandrange](#urandrange). -* [**String functions**](#string-functions): [capitalize](#capitalize), [clean_whitespace](#clean_whitespace), [collapse_whitespace](#collapse_whitespace), [format](#format), [gssub](#gssub), [gsub](#gsub), [latin1_to_utf8](#latin1_to_utf8), [leftpad](#leftpad), [lstrip](#lstrip), [regextract](#regextract), [regextract_or_else](#regextract_or_else), [rightpad](#rightpad), [rstrip](#rstrip), [ssub](#ssub), [strip](#strip), [strlen](#strlen), [sub](#sub), [substr](#substr), [substr0](#substr0), [substr1](#substr1), [tolower](#tolower), [toupper](#toupper), [truncate](#truncate), [unformat](#unformat), [unformatx](#unformatx), [utf8_to_latin1](#utf8_to_latin1), [\.](#dot). 
+* [**String functions**](#string-functions): [capitalize](#capitalize), [clean_whitespace](#clean_whitespace), [collapse_whitespace](#collapse_whitespace), [format](#format), [gssub](#gssub), [gsub](#gsub), [index](#index), [latin1_to_utf8](#latin1_to_utf8), [leftpad](#leftpad), [lstrip](#lstrip), [regextract](#regextract), [regextract_or_else](#regextract_or_else), [rightpad](#rightpad), [rstrip](#rstrip), [ssub](#ssub), [strip](#strip), [strlen](#strlen), [sub](#sub), [substr](#substr), [substr0](#substr0), [substr1](#substr1), [tolower](#tolower), [toupper](#toupper), [truncate](#truncate), [unformat](#unformat), [unformatx](#unformatx), [utf8_to_latin1](#utf8_to_latin1), [\.](#dot). * [**System functions**](#system-functions): [exec](#exec), [hostname](#hostname), [os](#os), [system](#system), [version](#version). * [**Time functions**](#time-functions): [dhms2fsec](#dhms2fsec), [dhms2sec](#dhms2sec), [fsec2dhms](#fsec2dhms), [fsec2hms](#fsec2hms), [gmt2localtime](#gmt2localtime), [gmt2sec](#gmt2sec), [hms2fsec](#hms2fsec), [hms2sec](#hms2sec), [localtime2gmt](#localtime2gmt), [localtime2sec](#localtime2sec), [sec2dhms](#sec2dhms), [sec2gmt](#sec2gmt), [sec2gmtdate](#sec2gmtdate), [sec2hms](#sec2hms), [sec2localdate](#sec2localdate), [sec2localtime](#sec2localtime), [strftime](#strftime), [strftime_local](#strftime_local), [strptime](#strptime), [strptime_local](#strptime_local), [systime](#systime), [systimeint](#systimeint), [uptime](#uptime). 
* [**Typing functions**](#typing-functions): [asserting_absent](#asserting_absent), [asserting_array](#asserting_array), [asserting_bool](#asserting_bool), [asserting_boolean](#asserting_boolean), [asserting_empty](#asserting_empty), [asserting_empty_map](#asserting_empty_map), [asserting_error](#asserting_error), [asserting_float](#asserting_float), [asserting_int](#asserting_int), [asserting_map](#asserting_map), [asserting_nonempty_map](#asserting_nonempty_map), [asserting_not_array](#asserting_not_array), [asserting_not_empty](#asserting_not_empty), [asserting_not_map](#asserting_not_map), [asserting_not_null](#asserting_not_null), [asserting_null](#asserting_null), [asserting_numeric](#asserting_numeric), [asserting_present](#asserting_present), [asserting_string](#asserting_string), [is_absent](#is_absent), [is_array](#is_array), [is_bool](#is_bool), [is_boolean](#is_boolean), [is_empty](#is_empty), [is_empty_map](#is_empty_map), [is_error](#is_error), [is_float](#is_float), [is_int](#is_int), [is_map](#is_map), [is_nan](#is_nan), [is_nonempty_map](#is_nonempty_map), [is_not_array](#is_not_array), [is_not_empty](#is_not_empty), [is_not_map](#is_not_map), [is_not_null](#is_not_null), [is_null](#is_null), [is_numeric](#is_numeric), [is_present](#is_present), [is_string](#is_string), [typeof](#typeof). @@ -1023,6 +1023,17 @@ gsub("prefix4529:suffix8567", "(....ix)([0-9]+)", "[\1 : \2]") gives "[prefix : +### index +
+index  (class=string #args=2) Returns the index (1-based) of the second argument within the first. Returns -1 if the second argument isn't a substring of the first. Stringifies non-string inputs. Uses UTF-8 encoding to count characters, not bytes.
+Examples:
+index("abcde", "e") gives 5
+index("abcde", "x") gives -1
+index(12345, 34) gives 3
+index("forêt", "t") gives 5
+
+ + ### latin1_to_utf8
 latin1_to_utf8  (class=string #args=1) Tries to convert Latin-1-encoded string to UTF-8-encoded string. If argument is array or map, recurses into it.
diff --git a/internal/pkg/bifs/strings.go b/internal/pkg/bifs/strings.go
index 46c86e28d..4cdcdce93 100644
--- a/internal/pkg/bifs/strings.go
+++ b/internal/pkg/bifs/strings.go
@@ -125,6 +125,30 @@ func BIF_substr_0_up(input1, input2, input3 *mlrval.Mlrval) *mlrval.Mlrval {
 	return mlrval.FromString(string(runes[lowerZindex : upperZindex+1]))
 }
 
+// ================================================================
+// BIF_index implements index(string, substring): the 1-up position, counted in UTF-8 characters,
+// of substring within string, or -1 if substring is not found. Non-string inputs are stringified.
+// An error in either argument yields error; otherwise an absent in either argument yields absent.
+func BIF_index(input1, input2 *mlrval.Mlrval) *mlrval.Mlrval {
+	if input1.IsError() || input2.IsError() {
+		return mlrval.ERROR
+	}
+	if input1.IsAbsent() || input2.IsAbsent() {
+		return mlrval.ABSENT
+	}
+	sinput1 := input1.String()
+	sinput2 := input2.String()
+
+	// strings.Index yields a byte offset into sinput1, or -1 when sinput2 does not occur.
+	iindex := strings.Index(sinput1, sinput2)
+	if iindex < 0 {
+		return mlrval.FromInt(int64(iindex))
+	}
+
+	// Count characters, not bytes, ahead of the match. Go indices are 0-up; Miller's are 1-up.
+	return mlrval.FromInt(lib.UTF8Strlen(sinput1[:iindex]) + 1)
+}
+
 // ================================================================
 func BIF_truncate(input1, input2 *mlrval.Mlrval) *mlrval.Mlrval {
 	if input1.IsErrorOrAbsent() {
diff --git a/internal/pkg/dsl/cst/builtin_function_manager.go b/internal/pkg/dsl/cst/builtin_function_manager.go
index 8a6ca2d67..6f616baa4 100644
--- a/internal/pkg/dsl/cst/builtin_function_manager.go
+++ b/internal/pkg/dsl/cst/builtin_function_manager.go
@@ -538,6 +538,18 @@ array and string indices, but, this is a backward-compatibility issue with Mille
 Arrays are new in Miller 6; the substr function is older.`,
 			ternaryFunc: bifs.BIF_substr_0_up,
 		},
+		{ // index: substring search; the examples below are shown verbatim in the online help.
+			name:       "index",
+			class:      FUNC_CLASS_STRING,
+			help:       `Returns the index (1-based) of the second argument within the first. Returns -1 if the second argument isn't a substring of the first. Stringifies non-string inputs. Uses UTF-8 encoding to count characters, not bytes.`,
+			binaryFunc: bifs.BIF_index,
+			examples: []string{
+				`index("abcde", "e") gives 5`,
+				`index("abcde", "x") gives -1`,
+				`index(12345, 34) gives 3`,
+				`index("forêt", "t") gives 5`,
+			},
+		},
 
 		{
 			name:      "tolower",
diff --git a/man/manpage.txt b/man/manpage.txt
index c78f1b415..17b05732b 100644
--- a/man/manpage.txt
+++ b/man/manpage.txt
@@ -189,12 +189,12 @@ MILLER(1)                                                            MILLER(1)
        capitalize cbrt ceil clean_whitespace collapse_whitespace concat cos cosh
        depth dhms2fsec dhms2sec erf erfc every exec exp expm1 flatten float floor
        fmtifnum fmtnum fold format fsec2dhms fsec2hms get_keys get_values
-       gmt2localtime gmt2sec gssub gsub haskey hexfmt hms2fsec hms2sec hostname int
-       invqnorm is_absent is_array is_bool is_boolean is_empty is_empty_map is_error
-       is_float is_int is_map is_nan is_nonempty_map is_not_array is_not_empty
-       is_not_map is_not_null is_null is_numeric is_present is_string joink joinkv
-       joinv json_parse json_stringify latin1_to_utf8 leafcount leftpad length
-       localtime2gmt localtime2sec log log10 log1p logifit lstrip madd mapdiff
+       gmt2localtime gmt2sec gssub gsub haskey hexfmt hms2fsec hms2sec hostname index
+       int invqnorm is_absent is_array is_bool is_boolean is_empty is_empty_map
+       is_error is_float is_int is_map is_nan is_nonempty_map is_not_array
+       is_not_empty is_not_map is_not_null is_null is_numeric is_present is_string
+       joink joinkv joinv json_parse json_stringify latin1_to_utf8 leafcount leftpad
+       length localtime2gmt localtime2sec log log10 log1p logifit lstrip madd mapdiff
        mapexcept mapselect mapsum max md5 mexp min mmul msub os pow qnorm reduce
        regextract regextract_or_else rightpad round roundm rstrip sec2dhms sec2gmt
        sec2gmtdate sec2hms sec2localdate sec2localtime select sgn sha1 sha256 sha512
@@ -2366,6 +2366,14 @@ MILLER(1)                                                            MILLER(1)
    1mhostname0m
         (class=system #args=0) Returns the hostname as a string.
 
+   1mindex0m
+        (class=string #args=2) Returns the index (1-based) of the second argument within the first. Returns -1 if the second argument isn't a substring of the first. Stringifies non-string inputs. Uses UTF-8 encoding to count characters, not bytes.
+       Examples:
+       index("abcde", "e") gives 5
+       index("abcde", "x") gives -1
+       index(12345, 34) gives 3
+       index("forêt", "t") gives 5
+
    1mint0m
         (class=conversion #args=1,2) Convert int/float/bool/string to int. If the second argument is omitted and the first argument is a string, base is inferred from the first argument's prefix. If the second argument is provided and the first argument is a string, the second argument is used as the base. If the second argument is provided and the first argument is not a string, the second argument is ignored.
        Examples:
@@ -3325,4 +3333,4 @@ MILLER(1)                                                            MILLER(1)
 
 
 
-                                  2023-03-24                         MILLER(1)
+                                  2023-03-27                         MILLER(1)
diff --git a/man/mlr.1 b/man/mlr.1
index a56102ec0..1c50f0b02 100644
--- a/man/mlr.1
+++ b/man/mlr.1
@@ -2,12 +2,12 @@
 .\"     Title: mlr
 .\"    Author: [see the "AUTHOR" section]
 .\" Generator: ./mkman.rb
-.\"      Date: 2023-03-24
+.\"      Date: 2023-03-27
 .\"    Manual: \ \&
 .\"    Source: \ \&
 .\"  Language: English
 .\"
-.TH "MILLER" "1" "2023-03-24" "\ \&" "\ \&"
+.TH "MILLER" "1" "2023-03-27" "\ \&" "\ \&"
 .\" -----------------------------------------------------------------
 .\" * Portability definitions
 .\" ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@@ -236,12 +236,12 @@ asserting_present asserting_string atan atan2 atanh bitcount boolean
 capitalize cbrt ceil clean_whitespace collapse_whitespace concat cos cosh
 depth dhms2fsec dhms2sec erf erfc every exec exp expm1 flatten float floor
 fmtifnum fmtnum fold format fsec2dhms fsec2hms get_keys get_values
-gmt2localtime gmt2sec gssub gsub haskey hexfmt hms2fsec hms2sec hostname int
-invqnorm is_absent is_array is_bool is_boolean is_empty is_empty_map is_error
-is_float is_int is_map is_nan is_nonempty_map is_not_array is_not_empty
-is_not_map is_not_null is_null is_numeric is_present is_string joink joinkv
-joinv json_parse json_stringify latin1_to_utf8 leafcount leftpad length
-localtime2gmt localtime2sec log log10 log1p logifit lstrip madd mapdiff
+gmt2localtime gmt2sec gssub gsub haskey hexfmt hms2fsec hms2sec hostname index
+int invqnorm is_absent is_array is_bool is_boolean is_empty is_empty_map
+is_error is_float is_int is_map is_nan is_nonempty_map is_not_array
+is_not_empty is_not_map is_not_null is_null is_numeric is_present is_string
+joink joinkv joinv json_parse json_stringify latin1_to_utf8 leafcount leftpad
+length localtime2gmt localtime2sec log log10 log1p logifit lstrip madd mapdiff
 mapexcept mapselect mapsum max md5 mexp min mmul msub os pow qnorm reduce
 regextract regextract_or_else rightpad round roundm rstrip sec2dhms sec2gmt
 sec2gmtdate sec2hms sec2localdate sec2localtime select sgn sha1 sha256 sha512
@@ -3357,6 +3357,20 @@ gsub("prefix4529:suffix8567", "(....ix)([0-9]+)", "[\e1 : \e2]") gives "[prefix
 .fi
 .if n \{\
 .RE
+.SS "index"
+.if n \{\
+.RS 0
+.\}
+.nf
+ (class=string #args=2) Returns the index (1-based) of the second argument within the first. Returns -1 if the second argument isn't a substring of the first. Stringifies non-string inputs. Uses UTF-8 encoding to count characters, not bytes.
+Examples:
+index("abcde", "e") gives 5
+index("abcde", "x") gives -1
+index(12345, 34) gives 3
+index("forêt", "t") gives 5
+.fi
+.if n \{\
+.RE
 .SS "int"
 .if n \{\
 .RS 0
diff --git a/test/cases/dsl-index/0001/cmd b/test/cases/dsl-index/0001/cmd
new file mode 100644
index 000000000..557ef51a6
--- /dev/null
+++ b/test/cases/dsl-index/0001/cmd
@@ -0,0 +1 @@
+mlr --icsv --opprint put -f ${CASEDIR}/mlr ${CASEDIR}/input
diff --git a/test/cases/dsl-index/0001/experr b/test/cases/dsl-index/0001/experr
new file mode 100644
index 000000000..e69de29bb
diff --git a/test/cases/dsl-index/0001/expout b/test/cases/dsl-index/0001/expout
new file mode 100644
index 000000000..2b4fc0407
--- /dev/null
+++ b/test/cases/dsl-index/0001/expout
@@ -0,0 +1,13 @@
+a            b  c
+abcde        a  1
+abcde        b  2
+abcde        c  3
+abcde        d  4
+abcde        e  5
+abcde        x  -1
+forêt cachée fo 1
+forêt cachée êt 4
+forêt cachée ê  4
+forêt cachée e  12
+forêt cachée x  -1
+12345        34 3
diff --git a/test/cases/dsl-index/0001/input b/test/cases/dsl-index/0001/input
new file mode 100644
index 000000000..5ded386d3
--- /dev/null
+++ b/test/cases/dsl-index/0001/input
@@ -0,0 +1,13 @@
+a,b
+abcde,a
+abcde,b
+abcde,c
+abcde,d
+abcde,e
+abcde,x
+forêt cachée,fo
+forêt cachée,êt
+forêt cachée,ê
+forêt cachée,e
+forêt cachée,x
+12345,34
diff --git a/test/cases/dsl-index/0001/mlr b/test/cases/dsl-index/0001/mlr
new file mode 100644
index 000000000..a95391469
--- /dev/null
+++ b/test/cases/dsl-index/0001/mlr
@@ -0,0 +1 @@
+$c = index($a, $b)