diff --git a/docs/src/manpage.md b/docs/src/manpage.md index 1d8e0185f..3b48c8ef5 100644 --- a/docs/src/manpage.md +++ b/docs/src/manpage.md @@ -50,7 +50,7 @@ DESCRIPTION insertion-ordered hash map. This encompasses a variety of data formats, including but not limited to the familiar CSV, TSV, and JSON. (Miller can handle positionally-indexed data as a special case.) This - manpage documents mlr 6.2.0. + manpage documents mlr 6.2.0-dev. EXAMPLES mlr --icsv --opprint cat example.csv @@ -192,11 +192,11 @@ VERB LIST altkv bar bootstrap cat check clean-whitespace count-distinct count count-similar cut decimate fill-down fill-empty filter flatten format-values fraction gap grep group-by group-like having-fields head histogram json-parse - json-stringify join label least-frequent merge-fields most-frequent nest - nothing put regularize remove-empty-columns rename reorder repeat reshape - sample sec2gmtdate sec2gmt seqgen shuffle skip-trivial-records sort - sort-within-records split stats1 stats2 step tac tail tee template top - unflatten uniq unsparsify + json-stringify join label latin1-to-utf8 utf8-to-latin1 least-frequent + merge-fields most-frequent nest nothing put regularize remove-empty-columns + rename reorder repeat reshape sample sec2gmtdate sec2gmt seqgen shuffle + skip-trivial-records sort sort-within-records split stats1 stats2 step tac + tail tee template top unflatten uniq unsparsify FUNCTION LIST abs acos acosh any append apply arrayify asin asinh asserting_absent @@ -212,16 +212,17 @@ FUNCTION LIST is_absent is_array is_bool is_boolean is_empty is_empty_map is_error is_float is_int is_map is_nan is_nonempty_map is_not_array is_not_empty is_not_map is_not_null is_null is_numeric is_present is_string joink joinkv joinv - json_parse json_stringify leafcount length localtime2gmt localtime2sec log - log10 log1p logifit lstrip madd mapdiff mapexcept mapselect mapsum max md5 - mexp min mmul msub os pow qnorm reduce regextract regextract_or_else round - roundm 
rstrip sec2dhms sec2gmt sec2gmtdate sec2hms sec2localdate sec2localtime - select sgn sha1 sha256 sha512 sin sinh sort splita splitax splitkv splitkvx - splitnv splitnvx sqrt ssub strftime strftime_local string strip strlen - strptime strptime_local sub substr substr0 substr1 system systime systimeint - tan tanh tolower toupper truncate typeof unflatten unformat unformatx uptime - urand urand32 urandelement urandint urandrange version ! != !=~ % & && * ** + - - . .* .+ .- ./ / // < << <= <=> == =~ > >= >> >>> ?: ?? ??? ^ ^^ | || ~ + json_parse json_stringify latin1_to_utf8 leafcount length localtime2gmt + localtime2sec log log10 log1p logifit lstrip madd mapdiff mapexcept mapselect + mapsum max md5 mexp min mmul msub os pow qnorm reduce regextract + regextract_or_else round roundm rstrip sec2dhms sec2gmt sec2gmtdate sec2hms + sec2localdate sec2localtime select sgn sha1 sha256 sha512 sin sinh sort splita + splitax splitkv splitkvx splitnv splitnvx sqrt ssub strftime strftime_local + string strip strlen strptime strptime_local sub substr substr0 substr1 system + systime systimeint tan tanh tolower toupper truncate typeof unflatten unformat + unformatx uptime urand urand32 urandelement urandint urandrange utf8_to_latin1 + version ! != !=~ % & && * ** + - . .* .+ .- ./ / // < << <= <=> == =~ > >= >> + >>> ?: ?? ??? ^ ^^ | || ~ COMMENTS-IN-DATA FLAGS Miller lets you put comments in your data, such as @@ -1319,6 +1320,20 @@ VERBS Options: -h|--help Show this message. + latin1-to-utf8 + Usage: mlr latin1-to-utf8, with no options. + Recursively converts record strings from Latin-1 to UTF-8. + For field-level control, please see the latin1_to_utf8 DSL function. + Options: + -h|--help Show this message. + + utf8-to-latin1 + Usage: mlr utf8-to-latin1, with no options. + Recursively converts record strings from UTF-8 to Latin-1. + For field-level control, please see the utf8_to_latin1 DSL function. + Options: + -h|--help Show this message. 
+ least-frequent Usage: mlr least-frequent [options] Shows the least frequently occurring distinct values for specified field names. @@ -2363,6 +2378,12 @@ FUNCTIONS FOR FILTER/PUT json_stringify (class=collections #args=1,2) Converts value to JSON-formatted string. Default output is single-line. With optional second boolean argument set to true, produces multiline output. + latin1_to_utf8 + (class=string #args=1) Tries to convert Latin-1-encoded string to UTF-8-encoded string. If argument is array or map, recurses into it. + Examples: + $y = latin1_to_utf8($x) + $* = latin1_to_utf8($*) + leafcount (class=collections #args=1) Counts total number of terminal values in map/array. For single-level map/array, same as length. @@ -2694,6 +2715,12 @@ FUNCTIONS FOR FILTER/PUT urandrange (class=math #args=2) Floating-point numbers uniformly distributed on the interval [a, b). + utf8_to_latin1 + (class=string #args=1) Tries to convert UTF-8-encoded string to Latin-1-encoded string. If argument is array or map, recurses into it. + Examples: + $y = utf8_to_latin1($x) + $* = utf8_to_latin1($*) + version (class=system #args=0) Returns the Miller version as a string. @@ -3195,5 +3222,5 @@ SEE ALSO - 2022-03-19 MILLER(1) + 2022-03-20 MILLER(1) diff --git a/docs/src/manpage.txt b/docs/src/manpage.txt index ae13d1f77..90f7c91ba 100644 --- a/docs/src/manpage.txt +++ b/docs/src/manpage.txt @@ -29,7 +29,7 @@ DESCRIPTION insertion-ordered hash map. This encompasses a variety of data formats, including but not limited to the familiar CSV, TSV, and JSON. (Miller can handle positionally-indexed data as a special case.) This - manpage documents mlr 6.2.0. + manpage documents mlr 6.2.0-dev. 
EXAMPLES mlr --icsv --opprint cat example.csv @@ -171,11 +171,11 @@ VERB LIST altkv bar bootstrap cat check clean-whitespace count-distinct count count-similar cut decimate fill-down fill-empty filter flatten format-values fraction gap grep group-by group-like having-fields head histogram json-parse - json-stringify join label least-frequent merge-fields most-frequent nest - nothing put regularize remove-empty-columns rename reorder repeat reshape - sample sec2gmtdate sec2gmt seqgen shuffle skip-trivial-records sort - sort-within-records split stats1 stats2 step tac tail tee template top - unflatten uniq unsparsify + json-stringify join label latin1-to-utf8 utf8-to-latin1 least-frequent + merge-fields most-frequent nest nothing put regularize remove-empty-columns + rename reorder repeat reshape sample sec2gmtdate sec2gmt seqgen shuffle + skip-trivial-records sort sort-within-records split stats1 stats2 step tac + tail tee template top unflatten uniq unsparsify FUNCTION LIST abs acos acosh any append apply arrayify asin asinh asserting_absent @@ -191,16 +191,17 @@ FUNCTION LIST is_absent is_array is_bool is_boolean is_empty is_empty_map is_error is_float is_int is_map is_nan is_nonempty_map is_not_array is_not_empty is_not_map is_not_null is_null is_numeric is_present is_string joink joinkv joinv - json_parse json_stringify leafcount length localtime2gmt localtime2sec log - log10 log1p logifit lstrip madd mapdiff mapexcept mapselect mapsum max md5 - mexp min mmul msub os pow qnorm reduce regextract regextract_or_else round - roundm rstrip sec2dhms sec2gmt sec2gmtdate sec2hms sec2localdate sec2localtime - select sgn sha1 sha256 sha512 sin sinh sort splita splitax splitkv splitkvx - splitnv splitnvx sqrt ssub strftime strftime_local string strip strlen - strptime strptime_local sub substr substr0 substr1 system systime systimeint - tan tanh tolower toupper truncate typeof unflatten unformat unformatx uptime - urand urand32 urandelement urandint urandrange version ! 
!= !=~ % & && * ** + - - . .* .+ .- ./ / // < << <= <=> == =~ > >= >> >>> ?: ?? ??? ^ ^^ | || ~ + json_parse json_stringify latin1_to_utf8 leafcount length localtime2gmt + localtime2sec log log10 log1p logifit lstrip madd mapdiff mapexcept mapselect + mapsum max md5 mexp min mmul msub os pow qnorm reduce regextract + regextract_or_else round roundm rstrip sec2dhms sec2gmt sec2gmtdate sec2hms + sec2localdate sec2localtime select sgn sha1 sha256 sha512 sin sinh sort splita + splitax splitkv splitkvx splitnv splitnvx sqrt ssub strftime strftime_local + string strip strlen strptime strptime_local sub substr substr0 substr1 system + systime systimeint tan tanh tolower toupper truncate typeof unflatten unformat + unformatx uptime urand urand32 urandelement urandint urandrange utf8_to_latin1 + version ! != !=~ % & && * ** + - . .* .+ .- ./ / // < << <= <=> == =~ > >= >> + >>> ?: ?? ??? ^ ^^ | || ~ COMMENTS-IN-DATA FLAGS Miller lets you put comments in your data, such as @@ -1298,6 +1299,20 @@ VERBS Options: -h|--help Show this message. + latin1-to-utf8 + Usage: mlr latin1-to-utf8, with no options. + Recursively converts record strings from Latin-1 to UTF-8. + For field-level control, please see the latin1_to_utf8 DSL function. + Options: + -h|--help Show this message. + + utf8-to-latin1 + Usage: mlr utf8-to-latin1, with no options. + Recursively converts record strings from UTF-8 to Latin-1. + For field-level control, please see the utf8_to_latin1 DSL function. + Options: + -h|--help Show this message. + least-frequent Usage: mlr least-frequent [options] Shows the least frequently occurring distinct values for specified field names. @@ -2342,6 +2357,12 @@ FUNCTIONS FOR FILTER/PUT json_stringify (class=collections #args=1,2) Converts value to JSON-formatted string. Default output is single-line. With optional second boolean argument set to true, produces multiline output. 
+ latin1_to_utf8 + (class=string #args=1) Tries to convert Latin-1-encoded string to UTF-8-encoded string. If argument is array or map, recurses into it. + Examples: + $y = latin1_to_utf8($x) + $* = latin1_to_utf8($*) + leafcount (class=collections #args=1) Counts total number of terminal values in map/array. For single-level map/array, same as length. @@ -2673,6 +2694,12 @@ FUNCTIONS FOR FILTER/PUT urandrange (class=math #args=2) Floating-point numbers uniformly distributed on the interval [a, b). + utf8_to_latin1 + (class=string #args=1) Tries to convert UTF-8-encoded string to Latin-1-encoded string. If argument is array or map, recurses into it. + Examples: + $y = utf8_to_latin1($x) + $* = utf8_to_latin1($*) + version (class=system #args=0) Returns the Miller version as a string. @@ -3174,4 +3201,4 @@ SEE ALSO - 2022-03-19 MILLER(1) + 2022-03-20 MILLER(1) diff --git a/docs/src/pix/latin1-to-utf8.png b/docs/src/pix/latin1-to-utf8.png new file mode 100644 index 000000000..6917dfdfc Binary files /dev/null and b/docs/src/pix/latin1-to-utf8.png differ diff --git a/docs/src/pix/utf8-to-latin1.png b/docs/src/pix/utf8-to-latin1.png new file mode 100644 index 000000000..0a124f213 Binary files /dev/null and b/docs/src/pix/utf8-to-latin1.png differ diff --git a/docs/src/reference-dsl-builtin-functions.md b/docs/src/reference-dsl-builtin-functions.md index 6e7845cc6..051c89ea2 100644 --- a/docs/src/reference-dsl-builtin-functions.md +++ b/docs/src/reference-dsl-builtin-functions.md @@ -74,7 +74,7 @@ is 2. Unary operators such as `!` and `~` show argument-count of 1; the ternary * [**Hashing functions**](#hashing-functions): [md5](#md5), [sha1](#sha1), [sha256](#sha256), [sha512](#sha512). * [**Higher-order-functions functions**](#higher-order-functions-functions): [any](#any), [apply](#apply), [every](#every), [fold](#fold), [reduce](#reduce), [select](#select), [sort](#sort). 
* [**Math functions**](#math-functions): [abs](#abs), [acos](#acos), [acosh](#acosh), [asin](#asin), [asinh](#asinh), [atan](#atan), [atan2](#atan2), [atanh](#atanh), [cbrt](#cbrt), [ceil](#ceil), [cos](#cos), [cosh](#cosh), [erf](#erf), [erfc](#erfc), [exp](#exp), [expm1](#expm1), [floor](#floor), [invqnorm](#invqnorm), [log](#log), [log10](#log10), [log1p](#log1p), [logifit](#logifit), [max](#max), [min](#min), [qnorm](#qnorm), [round](#round), [roundm](#roundm), [sgn](#sgn), [sin](#sin), [sinh](#sinh), [sqrt](#sqrt), [tan](#tan), [tanh](#tanh), [urand](#urand), [urand32](#urand32), [urandelement](#urandelement), [urandint](#urandint), [urandrange](#urandrange). -* [**String functions**](#string-functions): [capitalize](#capitalize), [clean_whitespace](#clean_whitespace), [collapse_whitespace](#collapse_whitespace), [format](#format), [gssub](#gssub), [gsub](#gsub), [lstrip](#lstrip), [regextract](#regextract), [regextract_or_else](#regextract_or_else), [rstrip](#rstrip), [ssub](#ssub), [strip](#strip), [strlen](#strlen), [sub](#sub), [substr](#substr), [substr0](#substr0), [substr1](#substr1), [tolower](#tolower), [toupper](#toupper), [truncate](#truncate), [unformat](#unformat), [unformatx](#unformatx), [\.](#dot). +* [**String functions**](#string-functions): [capitalize](#capitalize), [clean_whitespace](#clean_whitespace), [collapse_whitespace](#collapse_whitespace), [format](#format), [gssub](#gssub), [gsub](#gsub), [latin1_to_utf8](#latin1_to_utf8), [lstrip](#lstrip), [regextract](#regextract), [regextract_or_else](#regextract_or_else), [rstrip](#rstrip), [ssub](#ssub), [strip](#strip), [strlen](#strlen), [sub](#sub), [substr](#substr), [substr0](#substr0), [substr1](#substr1), [tolower](#tolower), [toupper](#toupper), [truncate](#truncate), [unformat](#unformat), [unformatx](#unformatx), [utf8_to_latin1](#utf8_to_latin1), [\.](#dot). * [**System functions**](#system-functions): [hostname](#hostname), [os](#os), [system](#system), [version](#version). 
* [**Time functions**](#time-functions): [dhms2fsec](#dhms2fsec), [dhms2sec](#dhms2sec), [fsec2dhms](#fsec2dhms), [fsec2hms](#fsec2hms), [gmt2localtime](#gmt2localtime), [gmt2sec](#gmt2sec), [hms2fsec](#hms2fsec), [hms2sec](#hms2sec), [localtime2gmt](#localtime2gmt), [localtime2sec](#localtime2sec), [sec2dhms](#sec2dhms), [sec2gmt](#sec2gmt), [sec2gmtdate](#sec2gmtdate), [sec2hms](#sec2hms), [sec2localdate](#sec2localdate), [sec2localtime](#sec2localtime), [strftime](#strftime), [strftime_local](#strftime_local), [strptime](#strptime), [strptime_local](#strptime_local), [systime](#systime), [systimeint](#systimeint), [uptime](#uptime). * [**Typing functions**](#typing-functions): [asserting_absent](#asserting_absent), [asserting_array](#asserting_array), [asserting_bool](#asserting_bool), [asserting_boolean](#asserting_boolean), [asserting_empty](#asserting_empty), [asserting_empty_map](#asserting_empty_map), [asserting_error](#asserting_error), [asserting_float](#asserting_float), [asserting_int](#asserting_int), [asserting_map](#asserting_map), [asserting_nonempty_map](#asserting_nonempty_map), [asserting_not_array](#asserting_not_array), [asserting_not_empty](#asserting_not_empty), [asserting_not_map](#asserting_not_map), [asserting_not_null](#asserting_not_null), [asserting_null](#asserting_null), [asserting_numeric](#asserting_numeric), [asserting_present](#asserting_present), [asserting_string](#asserting_string), [is_absent](#is_absent), [is_array](#is_array), [is_bool](#is_bool), [is_boolean](#is_boolean), [is_empty](#is_empty), [is_empty_map](#is_empty_map), [is_error](#is_error), [is_float](#is_float), [is_int](#is_int), [is_map](#is_map), [is_nan](#is_nan), [is_nonempty_map](#is_nonempty_map), [is_not_array](#is_not_array), [is_not_empty](#is_not_empty), [is_not_map](#is_not_map), [is_not_null](#is_not_null), [is_null](#is_null), [is_numeric](#is_numeric), [is_present](#is_present), [is_string](#is_string), [typeof](#typeof). 
@@ -1012,6 +1012,15 @@ gsub("prefix4529:suffix8567", "(....ix)([0-9]+)", "[\1 : \2]") gives "[prefix : +### latin1_to_utf8 +
+latin1_to_utf8 (class=string #args=1) Tries to convert Latin-1-encoded string to UTF-8-encoded string. If argument is array or map, recurses into it. +Examples: +$y = latin1_to_utf8($x) +$* = latin1_to_utf8($*) ++ + ### lstrip
lstrip (class=string #args=1) Strip leading whitespace from string.
@@ -1130,6 +1139,15 @@ is_error(unformatx("{}h{}m{}s", "3:47:22")) gives true.
+### utf8_to_latin1
++utf8_to_latin1 (class=string #args=1) Tries to convert UTF-8-encoded string to Latin-1-encoded string. If argument is array or map, recurses into it. +Examples: +$y = utf8_to_latin1($x) +$* = utf8_to_latin1($*) ++ + ### \. diff --git a/docs/src/reference-verbs.md b/docs/src/reference-verbs.md index 2b48162e5..7a6b753ee 100644 --- a/docs/src/reference-verbs.md +++ b/docs/src/reference-verbs.md @@ -1873,6 +1873,39 @@ Alice 56 missing Carol 45 present +## latin1-to-utf8 + +
+mlr latin1-to-utf8 -h ++
+Usage: mlr latin1-to-utf8, with no options. +Recursively converts record strings from Latin-1 to UTF-8. +For field-level control, please see the latin1_to_utf8 DSL function. +Options: +-h|--help Show this message. ++ + + +## utf8-to-latin1 + +
+mlr utf8-to-latin1 -h ++
+Usage: mlr utf8-to-latin1, with no options. +Recursively converts record strings from UTF-8 to Latin-1. +For field-level control, please see the utf8_to_latin1 DSL function. +Options: +-h|--help Show this message. ++ +In this example, the English and German pangrams are convertible from UTF-8 to Latin-1, but the +Russian one is not: + + + ## least-frequent
diff --git a/docs/src/reference-verbs.md.in b/docs/src/reference-verbs.md.in index 96df0f9f7..7bc8b6ee2 100644 --- a/docs/src/reference-verbs.md.in +++ b/docs/src/reference-verbs.md.in @@ -615,6 +615,25 @@ GENMD-RUN-COMMAND mlr --icsv --implicit-csv-header --opprint label name,age,status data/headerless.csv GENMD-EOF +## latin1-to-utf8 + +GENMD-RUN-COMMAND +mlr latin1-to-utf8 -h +GENMD-EOF + + + +## utf8-to-latin1 + +GENMD-RUN-COMMAND +mlr utf8-to-latin1 -h +GENMD-EOF + +In this example, the English and German pangrams are convertible from UTF-8 to Latin-1, but the +Russian one is not: + + + ## least-frequent GENMD-RUN-COMMAND diff --git a/docs/src/special-symbols-and-formatting.md b/docs/src/special-symbols-and-formatting.md index e1d39b7a6..c84301d59 100644 --- a/docs/src/special-symbols-and-formatting.md +++ b/docs/src/special-symbols-and-formatting.md @@ -170,6 +170,8 @@ The [`gssub`](reference-dsl-builtin-functions.md#gssub) functions exist precisely for this reason: so you don't have to escape anything. +## Latin-1 and UTF-8 character encodings + The `ssub` and `gssub` functions are also handy for dealing with non-UTF-8 strings such as Latin 1, since Go's `regexp` library -- which Miller uses -- requires UTF-8 strings. For example: @@ -186,6 +188,24 @@ The `ssub` and `gssub` functions are also handy for dealing with non-UTF-8 strin Kaðlín og Þormundr+More generally, though, we have the DSL functions +[`latin1_to_utf8`](reference-dsl-builtin-functions.md#latin1_to_utf8) and +[`utf8_to_latin1`](reference-dsl-builtin-functions.md#utf8_to_latin1) +and the verbs +[`latin1-to-utf8`](reference-verbs.md#latin1-to-utf8) and +[`utf8-to-latin1`](reference-verbs.md#utf8-to-latin1). The former let you fix encodings on a field-by-field +level; the latter, for all records (with less keystroking). (Latin 1 is also known as +[ISO/IEC 8859-1](https://en.wikipedia.org/wiki/ISO/IEC_8859-1).) 
+ +In this example, all the inputs are convertible from Latin-1 to UTF-8: + + + +In this example, the English and German pangrams are convertible from UTF-8 to Latin-1, but the +Russian one is not: + + + ## How to apply math to regex output? * Use parentheses for capture groups diff --git a/docs/src/special-symbols-and-formatting.md.in b/docs/src/special-symbols-and-formatting.md.in index f7e249c54..ff8971805 100644 --- a/docs/src/special-symbols-and-formatting.md.in +++ b/docs/src/special-symbols-and-formatting.md.in @@ -94,6 +94,8 @@ The [`gssub`](reference-dsl-builtin-functions.md#gssub) functions exist precisely for this reason: so you don't have to escape anything. +## Latin-1 and UTF-8 character encodings + The `ssub` and `gssub` functions are also handy for dealing with non-UTF-8 strings such as Latin 1, since Go's `regexp` library -- which Miller uses -- requires UTF-8 strings. For example: @@ -107,6 +109,24 @@ mlr -n put 'end { }' GENMD-EOF +More generally, though, we have the DSL functions +[`latin1_to_utf8`](reference-dsl-builtin-functions.md#latin1_to_utf8) and +[`utf8_to_latin1`](reference-dsl-builtin-functions.md#utf8_to_latin1) +and the verbs +[`latin1-to-utf8`](reference-verbs.md#latin1-to-utf8) and +[`utf8-to-latin1`](reference-verbs.md#utf8-to-latin1). The former let you fix encodings on a field-by-field +level; the latter, for all records (with less keystroking). (Latin 1 is also known as +[ISO/IEC 8859-1](https://en.wikipedia.org/wiki/ISO/IEC_8859-1).) + +In this example, all the inputs are convertible from Latin-1 to UTF-8: + + + +In this example, the English and German pangrams are convertible from UTF-8 to Latin-1, but the +Russian one is not: + + + ## How to apply math to regex output? 
* Use parentheses for capture groups diff --git a/internal/pkg/bifs/base.go b/internal/pkg/bifs/base.go index 500a84103..700cfab26 100644 --- a/internal/pkg/bifs/base.go +++ b/internal/pkg/bifs/base.go @@ -202,9 +202,33 @@ func _more(input1, input2 *mlrval.Mlrval) *mlrval.Mlrval { return mlrval.FromInt(1) } -// recuriseBinaryFuncOnInput1 is for fmtifnum and other functions which apply themselves recursively +// recurseUnaryFuncOnInput1 is for BIF_latin1_to_utf8 and other functions which apply themselves +// recursively on array/map inputs. +func recurseUnaryFuncOnInput1(unaryFunc UnaryFunc, input1 *mlrval.Mlrval) *mlrval.Mlrval { + if input1.IsArray() { + inputArray := input1.GetArray() + lib.InternalCodingErrorIf(inputArray == nil) + outputArray := make([]*mlrval.Mlrval, len(inputArray)) + for i := range inputArray { + outputArray[i] = unaryFunc(inputArray[i]) + } + return mlrval.FromArray(outputArray) + } else if input1.IsMap() { + inputMap := input1.GetMap() + lib.InternalCodingErrorIf(inputMap == nil) + outputMap := mlrval.NewMlrmap() + for pe := inputMap.Head; pe != nil; pe = pe.Next { + outputMap.PutReference(pe.Key, unaryFunc(pe.Value)) + } + return mlrval.FromMap(outputMap) + } else { + return unaryFunc(input1) + } +} + +// recurseBinaryFuncOnInput1 is for fmtifnum and other functions which apply themselves recursively // on array/map inputs. 
-func recuriseBinaryFuncOnInput1(binaryFunc BinaryFunc, input1, input2 *mlrval.Mlrval) *mlrval.Mlrval { +func recurseBinaryFuncOnInput1(binaryFunc BinaryFunc, input1, input2 *mlrval.Mlrval) *mlrval.Mlrval { if input1.IsArray() { inputArray := input1.GetArray() lib.InternalCodingErrorIf(inputArray == nil) @@ -222,6 +246,6 @@ func recuriseBinaryFuncOnInput1(binaryFunc BinaryFunc, input1, input2 *mlrval.Ml } return mlrval.FromMap(outputMap) } else { - return fmtnum_dispositions[input1.Type()][input2.Type()](input1, input2) + return binaryFunc(input1, input2) } } diff --git a/internal/pkg/bifs/strings.go b/internal/pkg/bifs/strings.go index def2d9349..f30d0fd0f 100644 --- a/internal/pkg/bifs/strings.go +++ b/internal/pkg/bifs/strings.go @@ -418,7 +418,7 @@ var fmtnum_dispositions = [mlrval.MT_DIM][mlrval.MT_DIM]BinaryFunc{ func BIF_fmtnum(input1, input2 *mlrval.Mlrval) *mlrval.Mlrval { if input1.IsArray() || input1.IsMap() { - return recuriseBinaryFuncOnInput1(BIF_fmtnum, input1, input2) + return recurseBinaryFuncOnInput1(BIF_fmtnum, input1, input2) } else { return fmtnum_dispositions[input1.Type()][input2.Type()](input1, input2) } @@ -426,7 +426,7 @@ func BIF_fmtnum(input1, input2 *mlrval.Mlrval) *mlrval.Mlrval { func BIF_fmtifnum(input1, input2 *mlrval.Mlrval) *mlrval.Mlrval { if input1.IsArray() || input1.IsMap() { - return recuriseBinaryFuncOnInput1(BIF_fmtifnum, input1, input2) + return recurseBinaryFuncOnInput1(BIF_fmtifnum, input1, input2) } else { output := fmtnum_dispositions[input1.Type()][input2.Type()](input1, input2) if output.IsError() { @@ -436,3 +436,37 @@ func BIF_fmtifnum(input1, input2 *mlrval.Mlrval) *mlrval.Mlrval { } } } + +func BIF_latin1_to_utf8(input1 *mlrval.Mlrval) *mlrval.Mlrval { + if input1.IsArray() || input1.IsMap() { + return recurseUnaryFuncOnInput1(BIF_latin1_to_utf8, input1) + } else if input1.IsString() { + output, err := lib.TryLatin1ToUTF8(input1.String()) + if err != nil { + // Somewhat arbitrary design decision + // return 
input1 + return mlrval.ERROR + } else { + return mlrval.FromString(output) + } + } else { + return input1 + } +} + +func BIF_utf8_to_latin1(input1 *mlrval.Mlrval) *mlrval.Mlrval { + if input1.IsArray() || input1.IsMap() { + return recurseUnaryFuncOnInput1(BIF_utf8_to_latin1, input1) + } else if input1.IsString() { + output, err := lib.TryUTF8ToLatin1(input1.String()) + if err != nil { + // Somewhat arbitrary design decision + // return input1 + return mlrval.ERROR + } else { + return mlrval.FromString(output) + } + } else { + return input1 + } +} diff --git a/internal/pkg/dsl/cst/builtin_function_manager.go b/internal/pkg/dsl/cst/builtin_function_manager.go index 07d065f10..e23012ac9 100644 --- a/internal/pkg/dsl/cst/builtin_function_manager.go +++ b/internal/pkg/dsl/cst/builtin_function_manager.go @@ -598,6 +598,30 @@ with type-inference. On non-match, returns error -- use is_error() to check.`, binaryFunc: bifs.BIF_unformatx, }, + { + name: "latin1_to_utf8", + class: FUNC_CLASS_STRING, + help: `Tries to convert Latin-1-encoded string to UTF-8-encoded string. +If argument is array or map, recurses into it.`, + examples: []string{ + `$y = latin1_to_utf8($x)`, + `$* = latin1_to_utf8($*)`, + }, + unaryFunc: bifs.BIF_latin1_to_utf8, + }, + + { + name: "utf8_to_latin1", + class: FUNC_CLASS_STRING, + help: `Tries to convert UTF-8-encoded string to Latin-1-encoded string. 
+If argument is array or map, recurses into it.`, + examples: []string{ + `$y = utf8_to_latin1($x)`, + `$* = utf8_to_latin1($*)`, + }, + unaryFunc: bifs.BIF_utf8_to_latin1, + }, + // ---------------------------------------------------------------- // FUNC_CLASS_HASHING diff --git a/internal/pkg/dsl/cst/builtin_functions.go b/internal/pkg/dsl/cst/builtin_functions.go index eb8ec1912..65b86023b 100644 --- a/internal/pkg/dsl/cst/builtin_functions.go +++ b/internal/pkg/dsl/cst/builtin_functions.go @@ -507,7 +507,7 @@ func (node *DotCallsiteNode) Evaluate( // Case 1: map.attribute as shorthand for map["attribute"] value2 := mapvalue1.Get(node.string2) if value2 == nil { - return mlrval.ABSENT.StrictModeCheck(state.StrictMode, "map access [" + node.string2 + "]") + return mlrval.ABSENT.StrictModeCheck(state.StrictMode, "map access ["+node.string2+"]") } else { return value2 } diff --git a/internal/pkg/dsl/cst/evaluable.go b/internal/pkg/dsl/cst/evaluable.go index 82e1e063e..d46cab5b1 100644 --- a/internal/pkg/dsl/cst/evaluable.go +++ b/internal/pkg/dsl/cst/evaluable.go @@ -129,7 +129,7 @@ func (node *IndirectFieldValueNode) Evaluate( os.Exit(1) } if value == nil { - return mlrval.ABSENT.StrictModeCheck(state.StrictMode, "$[" + fieldName.String() + "]") + return mlrval.ABSENT.StrictModeCheck(state.StrictMode, "$["+fieldName.String()+"]") } return value } @@ -164,7 +164,7 @@ func (node *IndirectOosvarValueNode) Evaluate( value := state.Oosvars.Get(oosvarName.String()) if value == nil { - return mlrval.ABSENT.StrictModeCheck(state.StrictMode, "@[" + oosvarName.String() + "]") + return mlrval.ABSENT.StrictModeCheck(state.StrictMode, "@["+oosvarName.String()+"]") } return value diff --git a/internal/pkg/lib/latin1.go b/internal/pkg/lib/latin1.go new file mode 100644 index 000000000..121a267d3 --- /dev/null +++ b/internal/pkg/lib/latin1.go @@ -0,0 +1,38 @@ +package lib + +import ( + "bytes" + "fmt" + "unicode/utf8" +) + +func TryLatin1ToUTF8(input string) (string, error) { + 
var buffer bytes.Buffer + for _, b := range []byte(input) { + // 0x00-0xff map to 0x0000-0x00ff + buffer.WriteRune(rune(b)) + } + output := buffer.String() + return output, nil +} + +func TryUTF8ToLatin1(input string) (string, error) { + var buffer bytes.Buffer + + bytes := []byte(input) + for len(bytes) > 0 { + r, size := utf8.DecodeRune(bytes) + + if r < 0x0080 { + buffer.WriteByte(byte(r)) + } else if r >= 0x80 && r <= 0x00ff { + buffer.WriteByte(byte(r)) + } else { + return "", fmt.Errorf("character 0x%08x (%v) is not encodable as Latin-1", int(r), r) + } + + bytes = bytes[size:] + } + output := buffer.String() + return output, nil +} diff --git a/internal/pkg/lib/latin1_test.go b/internal/pkg/lib/latin1_test.go new file mode 100644 index 000000000..947d0f042 --- /dev/null +++ b/internal/pkg/lib/latin1_test.go @@ -0,0 +1,100 @@ +// ================================================================ +// Most Miller tests (thousands of them) are command-line-driven via +// mlr regtest. Here are some cases needing special focus.
+// ================================================================ + +package lib + +import ( + "github.com/stretchr/testify/assert" + "testing" +) + +type tDataForLatin1 struct { + input string + expectedOutput string + expectError bool +} + +var dataForLatin1ToUTF8 = []tDataForLatin1{ + { + "", + "", + false, + }, + { + "The quick brown fox jumped over the lazy dogs.", + "The quick brown fox jumped over the lazy dogs.", + false, + }, + { + "a\xe4o\xf6", + "a\xc3\xa4o\xc3\xb6", // "aäoö" -- showing explicitly here "\u00e4" encodes as "\xc3\xa4" + false, + }, + { + "Victor jagt zw\xf6lf Boxk\xe4mpfer quer \xfcber den gro\xdfen Sylter Deich", + "Victor jagt zwölf Boxkämpfer quer über den großen Sylter Deich", + false, + }, +} + +var dataForUTF8ToLatin1 = []tDataForLatin1{ + { + "", + "", + false, + }, + { + "The quick brown fox jumped over the lazy dogs.", + "The quick brown fox jumped over the lazy dogs.", + false, + }, + { + "a\xc3\xa4o\xc3\xb6", // "aäoö" -- showing explicitly here "\u00e4" encodes as "\xc3\xa4" + "a\xe4o\xf6", + false, + }, + { + "Victor jagt zwölf Boxkämpfer quer über den großen Sylter Deich", + "Victor jagt zw\xf6lf Boxk\xe4mpfer quer \xfcber den gro\xdfen Sylter Deich", + false, + }, + { + "Съешь же ещё этих мягких французских булок да выпей чаю", + "", + true, + }, +} + +func TestLatin1ToUTF8(t *testing.T) { + for i, entry := range dataForLatin1ToUTF8 { + actualOutput, err := TryLatin1ToUTF8(entry.input) + if entry.expectError { + assert.NotNil(t, err) + } else { + assert.Nil(t, err) + } + if actualOutput != entry.expectedOutput { + t.Fatalf("case %d input \"%s\" expected \"%s\" got \"%s\"\n", + i, entry.input, entry.expectedOutput, actualOutput, + ) + } + } +} + +func TestUTF8ToLatin1(t *testing.T) { + for i, entry := range dataForUTF8ToLatin1 { + actualOutput, err := TryUTF8ToLatin1(entry.input) + if entry.expectError { + assert.NotNil(t, err) + } else { + assert.Nil(t, err) + } + if actualOutput != entry.expectedOutput { + 
t.Fatalf("case %d input \"%s\" expected \"%s\" got \"%s\"\n", + i, entry.input, entry.expectedOutput, actualOutput, + ) + } + } +} diff --git a/internal/pkg/mlrval/mlrval_get.go b/internal/pkg/mlrval/mlrval_get.go index cdf9775f9..038e8933c 100644 --- a/internal/pkg/mlrval/mlrval_get.go +++ b/internal/pkg/mlrval/mlrval_get.go @@ -147,7 +147,7 @@ func (mv *Mlrval) AssertNumeric() { _ = mv.GetNumericToFloatValueOrDie() } -func (mv *Mlrval) StrictModeCheck(strictMode bool, description string) *Mlrval{ +func (mv *Mlrval) StrictModeCheck(strictMode bool, description string) *Mlrval { if strictMode && mv.IsAbsent() { fmt.Fprintf(os.Stderr, "mlr: %s is absent and strict mode was requested.\n", description) os.Exit(1) diff --git a/internal/pkg/transformers/aaa_transformer_table.go b/internal/pkg/transformers/aaa_transformer_table.go index 463b745a4..848d24883 100644 --- a/internal/pkg/transformers/aaa_transformer_table.go +++ b/internal/pkg/transformers/aaa_transformer_table.go @@ -39,6 +39,8 @@ var TRANSFORMER_LOOKUP_TABLE = []TransformerSetup{ JSONStringifySetup, JoinSetup, LabelSetup, + Latin1ToUTF8Setup, + UTF8ToLatin1Setup, LeastFrequentSetup, MergeFieldsSetup, MostFrequentSetup, diff --git a/internal/pkg/transformers/latin1_to_utf8.go b/internal/pkg/transformers/latin1_to_utf8.go new file mode 100644 index 000000000..9ba7602f9 --- /dev/null +++ b/internal/pkg/transformers/latin1_to_utf8.go @@ -0,0 +1,122 @@ +package transformers + +import ( + "container/list" + "fmt" + "os" + "strings" + + "github.com/johnkerl/miller/internal/pkg/cli" + "github.com/johnkerl/miller/internal/pkg/lib" + "github.com/johnkerl/miller/internal/pkg/mlrval" + "github.com/johnkerl/miller/internal/pkg/types" +) + +// ---------------------------------------------------------------- +const verbNameLatin1ToUTF8 = "latin1-to-utf8" + +var Latin1ToUTF8Setup = TransformerSetup{ + Verb: verbNameLatin1ToUTF8, + UsageFunc: transformerLatin1ToUTF8Usage, + ParseCLIFunc: transformerLatin1ToUTF8ParseCLI, + 
IgnoresInput: false, +} + +func transformerLatin1ToUTF8Usage( + o *os.File, + doExit bool, + exitCode int, +) { + fmt.Fprintf(o, "Usage: %s %s, with no options.\n", "mlr", verbNameLatin1ToUTF8) + fmt.Fprintf(o, "Recursively converts record strings from Latin-1 to UTF-8.\n") + fmt.Fprintf(o, "For field-level control, please see the latin1_to_utf8 DSL function.\n") + fmt.Fprintf(o, "Options:\n") + fmt.Fprintf(o, "-h|--help Show this message.\n") + if doExit { + os.Exit(exitCode) + } +} + +func transformerLatin1ToUTF8ParseCLI( + pargi *int, + argc int, + args []string, + _ *cli.TOptions, + doConstruct bool, // false for first pass of CLI-parse, true for second pass +) IRecordTransformer { + + // Skip the verb name from the current spot in the mlr command line + argi := *pargi + argi++ + + for argi < argc /* variable increment: 1 or 2 depending on flag */ { + opt := args[argi] + if !strings.HasPrefix(opt, "-") { + break // No more flag options to process + } + if args[argi] == "--" { + break // All transformers must do this so main-flags can follow verb-flags + } + argi++ + + if opt == "-h" || opt == "--help" { + transformerLatin1ToUTF8Usage(os.Stdout, true, 0) + + } else { + transformerLatin1ToUTF8Usage(os.Stderr, true, 1) + } + } + + *pargi = argi + if !doConstruct { // All transformers must do this for main command-line parsing + return nil + } + + transformer, err := NewTransformerLatin1ToUTF8() + if err != nil { + fmt.Fprintln(os.Stderr, err) + os.Exit(1) + } + + return transformer +} + +// ---------------------------------------------------------------- +type TransformerLatin1ToUTF8 struct { +} + +func NewTransformerLatin1ToUTF8() (*TransformerLatin1ToUTF8, error) { + tr := &TransformerLatin1ToUTF8{} + return tr, nil +} + +// ---------------------------------------------------------------- + +func (tr *TransformerLatin1ToUTF8) Transform( + inrecAndContext *types.RecordAndContext, + outputRecordsAndContexts *list.List, // list of *types.RecordAndContext + 
inputDownstreamDoneChannel <-chan bool, + outputDownstreamDoneChannel chan<- bool, +) { + HandleDefaultDownstreamDone(inputDownstreamDoneChannel, outputDownstreamDoneChannel) + if !inrecAndContext.EndOfStream { + inrec := inrecAndContext.Record + + for pe := inrec.Head; pe != nil; pe = pe.Next { + inval := pe.Value + if inval.IsString() { + output, err := lib.TryLatin1ToUTF8(pe.Value.String()) + if err == nil { + pe.Value = mlrval.FromString(output) + } else { + pe.Value = mlrval.ERROR + } + } + } + + outputRecordsAndContexts.PushBack(types.NewRecordAndContext(inrec, &inrecAndContext.Context)) + + } else { // end of record stream + outputRecordsAndContexts.PushBack(inrecAndContext) + } +} diff --git a/internal/pkg/transformers/utf8_to_latin1.go b/internal/pkg/transformers/utf8_to_latin1.go new file mode 100644 index 000000000..d83fd227f --- /dev/null +++ b/internal/pkg/transformers/utf8_to_latin1.go @@ -0,0 +1,122 @@ +package transformers + +import ( + "container/list" + "fmt" + "os" + "strings" + + "github.com/johnkerl/miller/internal/pkg/cli" + "github.com/johnkerl/miller/internal/pkg/lib" + "github.com/johnkerl/miller/internal/pkg/mlrval" + "github.com/johnkerl/miller/internal/pkg/types" +) + +// ---------------------------------------------------------------- +const verbNameUTF8ToLatin1 = "utf8-to-latin1" + +var UTF8ToLatin1Setup = TransformerSetup{ + Verb: verbNameUTF8ToLatin1, + UsageFunc: transformerUTF8ToLatin1Usage, + ParseCLIFunc: transformerUTF8ToLatin1ParseCLI, + IgnoresInput: false, +} + +func transformerUTF8ToLatin1Usage( + o *os.File, + doExit bool, + exitCode int, +) { + fmt.Fprintf(o, "Usage: %s %s, with no options.\n", "mlr", verbNameUTF8ToLatin1) + fmt.Fprintf(o, "Recursively converts record strings from Latin-1 to UTF-8.\n") + fmt.Fprintf(o, "For field-level control, please see the utf8_to_latin1 DSL function.\n") + fmt.Fprintf(o, "Options:\n") + fmt.Fprintf(o, "-h|--help Show this message.\n") + if doExit { + os.Exit(exitCode) + } +} + +func 
transformerUTF8ToLatin1ParseCLI( + pargi *int, + argc int, + args []string, + _ *cli.TOptions, + doConstruct bool, // false for first pass of CLI-parse, true for second pass +) IRecordTransformer { + + // Skip the verb name from the current spot in the mlr command line + argi := *pargi + argi++ + + for argi < argc /* variable increment: 1 or 2 depending on flag */ { + opt := args[argi] + if !strings.HasPrefix(opt, "-") { + break // No more flag options to process + } + if args[argi] == "--" { + break // All transformers must do this so main-flags can follow verb-flags + } + argi++ + + if opt == "-h" || opt == "--help" { + transformerUTF8ToLatin1Usage(os.Stdout, true, 0) + + } else { + transformerUTF8ToLatin1Usage(os.Stderr, true, 1) + } + } + + *pargi = argi + if !doConstruct { // All transformers must do this for main command-line parsing + return nil + } + + transformer, err := NewTransformerUTF8ToLatin1() + if err != nil { + fmt.Fprintln(os.Stderr, err) + os.Exit(1) + } + + return transformer +} + +// ---------------------------------------------------------------- +type TransformerUTF8ToLatin1 struct { +} + +func NewTransformerUTF8ToLatin1() (*TransformerUTF8ToLatin1, error) { + tr := &TransformerUTF8ToLatin1{} + return tr, nil +} + +// ---------------------------------------------------------------- + +func (tr *TransformerUTF8ToLatin1) Transform( + inrecAndContext *types.RecordAndContext, + outputRecordsAndContexts *list.List, // list of *types.RecordAndContext + inputDownstreamDoneChannel <-chan bool, + outputDownstreamDoneChannel chan<- bool, +) { + HandleDefaultDownstreamDone(inputDownstreamDoneChannel, outputDownstreamDoneChannel) + if !inrecAndContext.EndOfStream { + inrec := inrecAndContext.Record + + for pe := inrec.Head; pe != nil; pe = pe.Next { + inval := pe.Value + if inval.IsString() { + output, err := lib.TryUTF8ToLatin1(pe.Value.String()) + if err == nil { + pe.Value = mlrval.FromString(output) + } else { + pe.Value = mlrval.ERROR + } + } + } + 
+ outputRecordsAndContexts.PushBack(types.NewRecordAndContext(inrec, &inrecAndContext.Context)) + + } else { // end of record stream + outputRecordsAndContexts.PushBack(inrecAndContext) + } +} diff --git a/man/manpage.txt b/man/manpage.txt index ae13d1f77..90f7c91ba 100644 --- a/man/manpage.txt +++ b/man/manpage.txt @@ -29,7 +29,7 @@ DESCRIPTION insertion-ordered hash map. This encompasses a variety of data formats, including but not limited to the familiar CSV, TSV, and JSON. (Miller can handle positionally-indexed data as a special case.) This - manpage documents mlr 6.2.0. + manpage documents mlr 6.2.0-dev. EXAMPLES mlr --icsv --opprint cat example.csv @@ -171,11 +171,11 @@ VERB LIST altkv bar bootstrap cat check clean-whitespace count-distinct count count-similar cut decimate fill-down fill-empty filter flatten format-values fraction gap grep group-by group-like having-fields head histogram json-parse - json-stringify join label least-frequent merge-fields most-frequent nest - nothing put regularize remove-empty-columns rename reorder repeat reshape - sample sec2gmtdate sec2gmt seqgen shuffle skip-trivial-records sort - sort-within-records split stats1 stats2 step tac tail tee template top - unflatten uniq unsparsify + json-stringify join label latin1-to-utf8 utf8-to-latin1 least-frequent + merge-fields most-frequent nest nothing put regularize remove-empty-columns + rename reorder repeat reshape sample sec2gmtdate sec2gmt seqgen shuffle + skip-trivial-records sort sort-within-records split stats1 stats2 step tac + tail tee template top unflatten uniq unsparsify FUNCTION LIST abs acos acosh any append apply arrayify asin asinh asserting_absent @@ -191,16 +191,17 @@ FUNCTION LIST is_absent is_array is_bool is_boolean is_empty is_empty_map is_error is_float is_int is_map is_nan is_nonempty_map is_not_array is_not_empty is_not_map is_not_null is_null is_numeric is_present is_string joink joinkv joinv - json_parse json_stringify leafcount length localtime2gmt 
localtime2sec log - log10 log1p logifit lstrip madd mapdiff mapexcept mapselect mapsum max md5 - mexp min mmul msub os pow qnorm reduce regextract regextract_or_else round - roundm rstrip sec2dhms sec2gmt sec2gmtdate sec2hms sec2localdate sec2localtime - select sgn sha1 sha256 sha512 sin sinh sort splita splitax splitkv splitkvx - splitnv splitnvx sqrt ssub strftime strftime_local string strip strlen - strptime strptime_local sub substr substr0 substr1 system systime systimeint - tan tanh tolower toupper truncate typeof unflatten unformat unformatx uptime - urand urand32 urandelement urandint urandrange version ! != !=~ % & && * ** + - - . .* .+ .- ./ / // < << <= <=> == =~ > >= >> >>> ?: ?? ??? ^ ^^ | || ~ + json_parse json_stringify latin1_to_utf8 leafcount length localtime2gmt + localtime2sec log log10 log1p logifit lstrip madd mapdiff mapexcept mapselect + mapsum max md5 mexp min mmul msub os pow qnorm reduce regextract + regextract_or_else round roundm rstrip sec2dhms sec2gmt sec2gmtdate sec2hms + sec2localdate sec2localtime select sgn sha1 sha256 sha512 sin sinh sort splita + splitax splitkv splitkvx splitnv splitnvx sqrt ssub strftime strftime_local + string strip strlen strptime strptime_local sub substr substr0 substr1 system + systime systimeint tan tanh tolower toupper truncate typeof unflatten unformat + unformatx uptime urand urand32 urandelement urandint urandrange utf8_to_latin1 + version ! != !=~ % & && * ** + - . .* .+ .- ./ / // < << <= <=> == =~ > >= >> + >>> ?: ?? ??? ^ ^^ | || ~ COMMENTS-IN-DATA FLAGS Miller lets you put comments in your data, such as @@ -1298,6 +1299,20 @@ VERBS Options: -h|--help Show this message. + latin1-to-utf8 + Usage: mlr latin1-to-utf8, with no options. + Recursively converts record strings from Latin-1 to UTF-8. + For field-level control, please see the latin1_to_utf8 DSL function. + Options: + -h|--help Show this message. + + utf8-to-latin1 + Usage: mlr utf8-to-latin1, with no options. 
+ Recursively converts record strings from UTF-8 to Latin-1. + For field-level control, please see the utf8_to_latin1 DSL function. + Options: + -h|--help Show this message. + least-frequent Usage: mlr least-frequent [options] Shows the least frequently occurring distinct values for specified field names. @@ -2342,6 +2357,12 @@ FUNCTIONS FOR FILTER/PUT json_stringify (class=collections #args=1,2) Converts value to JSON-formatted string. Default output is single-line. With optional second boolean argument set to true, produces multiline output. + latin1_to_utf8 + (class=string #args=1) Tries to convert Latin-1-encoded string to UTF-8-encoded string. If argument is array or map, recurses into it. + Examples: + $y = latin1_to_utf8($x) + $* = latin1_to_utf8($*) + leafcount (class=collections #args=1) Counts total number of terminal values in map/array. For single-level map/array, same as length. @@ -2673,6 +2694,12 @@ FUNCTIONS FOR FILTER/PUT urandrange (class=math #args=2) Floating-point numbers uniformly distributed on the interval [a, b). + utf8_to_latin1 + (class=string #args=1) Tries to convert UTF-8-encoded string to Latin-1-encoded string. If argument is array or map, recurses into it. + Examples: + $y = utf8_to_latin1($x) + $* = utf8_to_latin1($*) + version (class=system #args=0) Returns the Miller version as a string.
@@ -3174,4 +3201,4 @@ SEE ALSO - 2022-03-19 MILLER(1) + 2022-03-20 MILLER(1) diff --git a/man/mlr.1 b/man/mlr.1 index f8d8fb9ab..8561a3659 100644 --- a/man/mlr.1 +++ b/man/mlr.1 @@ -2,12 +2,12 @@ .\" Title: mlr .\" Author: [see the "AUTHOR" section] .\" Generator: ./mkman.rb -.\" Date: 2022-03-19 +.\" Date: 2022-03-20 .\" Manual: \ \& .\" Source: \ \& .\" Language: English .\" -.TH "MILLER" "1" "2022-03-19" "\ \&" "\ \&" +.TH "MILLER" "1" "2022-03-20" "\ \&" "\ \&" .\" ----------------------------------------------------------------- .\" * Portability definitions .\" ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -47,7 +47,7 @@ on integer-indexed fields: if the natural data structure for the latter is the array, then Miller's natural data structure is the insertion-ordered hash map. This encompasses a variety of data formats, including but not limited to the familiar CSV, TSV, and JSON. (Miller can handle positionally-indexed data as -a special case.) This manpage documents mlr 6.2.0. +a special case.) This manpage documents mlr 6.2.0-dev. .SH "EXAMPLES" .sp @@ -212,11 +212,11 @@ for all things with "map" in their names. 
altkv bar bootstrap cat check clean-whitespace count-distinct count count-similar cut decimate fill-down fill-empty filter flatten format-values fraction gap grep group-by group-like having-fields head histogram json-parse -json-stringify join label least-frequent merge-fields most-frequent nest -nothing put regularize remove-empty-columns rename reorder repeat reshape -sample sec2gmtdate sec2gmt seqgen shuffle skip-trivial-records sort -sort-within-records split stats1 stats2 step tac tail tee template top -unflatten uniq unsparsify +json-stringify join label latin1-to-utf8 utf8-to-latin1 least-frequent +merge-fields most-frequent nest nothing put regularize remove-empty-columns +rename reorder repeat reshape sample sec2gmtdate sec2gmt seqgen shuffle +skip-trivial-records sort sort-within-records split stats1 stats2 step tac +tail tee template top unflatten uniq unsparsify .fi .if n \{\ .RE @@ -238,16 +238,17 @@ gmt2sec gssub gsub haskey hexfmt hms2fsec hms2sec hostname int invqnorm is_absent is_array is_bool is_boolean is_empty is_empty_map is_error is_float is_int is_map is_nan is_nonempty_map is_not_array is_not_empty is_not_map is_not_null is_null is_numeric is_present is_string joink joinkv joinv -json_parse json_stringify leafcount length localtime2gmt localtime2sec log -log10 log1p logifit lstrip madd mapdiff mapexcept mapselect mapsum max md5 -mexp min mmul msub os pow qnorm reduce regextract regextract_or_else round -roundm rstrip sec2dhms sec2gmt sec2gmtdate sec2hms sec2localdate sec2localtime -select sgn sha1 sha256 sha512 sin sinh sort splita splitax splitkv splitkvx -splitnv splitnvx sqrt ssub strftime strftime_local string strip strlen -strptime strptime_local sub substr substr0 substr1 system systime systimeint -tan tanh tolower toupper truncate typeof unflatten unformat unformatx uptime -urand urand32 urandelement urandint urandrange version ! != !=~ % & && * ** + -- . .* .+ .- ./ / // < << <= <=> == =~ > >= >> >>> ?: ?? ??? 
^ ^^ | || ~ +json_parse json_stringify latin1_to_utf8 leafcount length localtime2gmt +localtime2sec log log10 log1p logifit lstrip madd mapdiff mapexcept mapselect +mapsum max md5 mexp min mmul msub os pow qnorm reduce regextract +regextract_or_else round roundm rstrip sec2dhms sec2gmt sec2gmtdate sec2hms +sec2localdate sec2localtime select sgn sha1 sha256 sha512 sin sinh sort splita +splitax splitkv splitkvx splitnv splitnvx sqrt ssub strftime strftime_local +string strip strlen strptime strptime_local sub substr substr0 substr1 system +systime systimeint tan tanh tolower toupper truncate typeof unflatten unformat +unformatx uptime urand urand32 urandelement urandint urandrange utf8_to_latin1 +version ! != !=~ % & && * ** + - . .* .+ .- ./ / // < << <= <=> == =~ > >= >> +>>> ?: ?? ??? ^ ^^ | || ~ .fi .if n \{\ .RE @@ -1626,6 +1627,32 @@ have the respective name. (Fields past the nth are left with their original names.) Particularly useful with --inidx or --implicit-csv-header, to give useful names to otherwise integer-indexed fields. +Options: +-h|--help Show this message. +.fi +.if n \{\ +.RE +.SS "latin1-to-utf8" +.if n \{\ +.RS 0 +.\} +.nf +Usage: mlr latin1-to-utf8, with no options. +Recursively converts record strings from Latin-1 to UTF-8. +For field-level control, please see the latin1_to_utf8 DSL function. +Options: +-h|--help Show this message. +.fi +.if n \{\ +.RE +.SS "utf8-to-latin1" +.if n \{\ +.RS 0 +.\} +.nf +Usage: mlr utf8-to-latin1, with no options. +Recursively converts record strings from UTF-8 to Latin-1. +For field-level control, please see the utf8_to_latin1 DSL function. Options: -h|--help Show this message. .fi @@ -3445,6 +3472,18 @@ joinv({"a":3,"b":4,"c":5}, ",") = "3,4,5" .fi .if n \{\ .RE +.SS "latin1_to_utf8" +.if n \{\ +.RS 0 +.\} +.nf + (class=string #args=1) Tries to convert Latin-1-encoded string to UTF-8-encoded string. If argument is array or map, recurses into it.
+Examples: +$y = latin1_to_utf8($x) +$* = latin1_to_utf8($*) +.fi +.if n \{\ +.RE .SS "leafcount" .if n \{\ .RS 0 @@ -4256,6 +4295,18 @@ Int-valued example: '$n=floor(20+urand()*11)'. .fi .if n \{\ .RE +.SS "utf8_to_latin1" +.if n \{\ +.RS 0 +.\} +.nf + (class=string #args=1) Tries to convert UTF-8-encoded string to Latin-1-encoded string. If argument is array or map, recurses into it. +Examples: +$y = utf8_to_latin1($x) +$* = utf8_to_latin1($*) +.fi +.if n \{\ +.RE .SS "version" .if n \{\ .RS 0 diff --git a/test/cases/cli-help/0001/expout b/test/cases/cli-help/0001/expout index b5724e8f2..65e52b571 100644 --- a/test/cases/cli-help/0001/expout +++ b/test/cases/cli-help/0001/expout @@ -481,6 +481,22 @@ useful names to otherwise integer-indexed fields. Options: -h|--help Show this message. +================================================================ +latin1-to-utf8 +Usage: mlr latin1-to-utf8, with no options. +Recursively converts record strings from Latin-1 to UTF-8. +For field-level control, please see the latin1_to_utf8 DSL function. +Options: +-h|--help Show this message. + +================================================================ +utf8-to-latin1 +Usage: mlr utf8-to-latin1, with no options. +Recursively converts record strings from Latin-1 to UTF-8. +For field-level control, please see the utf8_to_latin1 DSL function. +Options: +-h|--help Show this message. 
+ ================================================================ least-frequent Usage: mlr least-frequent [options] diff --git a/test/cases/dsl-latin1/0001/cmd b/test/cases/dsl-latin1/0001/cmd new file mode 100644 index 000000000..34b2574a2 --- /dev/null +++ b/test/cases/dsl-latin1/0001/cmd @@ -0,0 +1 @@ +mlr --xtab --from ${CASEDIR}/input put -f ${CASEDIR}/mlr diff --git a/test/cases/dsl-latin1/0001/experr b/test/cases/dsl-latin1/0001/experr new file mode 100644 index 000000000..e69de29bb diff --git a/test/cases/dsl-latin1/0001/expout b/test/cases/dsl-latin1/0001/expout new file mode 100644 index 000000000..0b71aef17 --- /dev/null +++ b/test/cases/dsl-latin1/0001/expout @@ -0,0 +1,15 @@ +x The quick brown fox jumped over the lazy dogs. +y The quick brown fox jumped over the lazy dogs. +z The quick brown fox jumped over the lazy dogs. + +x Victor jagt zwölf Boxkämpfer quer über den großen Sylter Deich. +y Victor jagt zwlf Boxkmpfer quer ber den groen Sylter Deich. +z Victor jagt zwölf Boxkämpfer quer über den großen Sylter Deich. + +x Съешь же ещё этих мягких французских булок да выпей чаю. +y (error) +z (error) + +x This¹ is² it³. +y This is it. +z This¹ is² it³. diff --git a/test/cases/dsl-latin1/0001/input b/test/cases/dsl-latin1/0001/input new file mode 100644 index 000000000..3b597cd70 --- /dev/null +++ b/test/cases/dsl-latin1/0001/input @@ -0,0 +1,7 @@ +x The quick brown fox jumped over the lazy dogs. + +x Victor jagt zwölf Boxkämpfer quer über den großen Sylter Deich. + +x Съешь же ещё этих мягких французских булок да выпей чаю. + +x This¹ is² it³. 
diff --git a/test/cases/dsl-latin1/0001/mlr b/test/cases/dsl-latin1/0001/mlr new file mode 100644 index 000000000..a26683338 --- /dev/null +++ b/test/cases/dsl-latin1/0001/mlr @@ -0,0 +1,2 @@ +$y = utf8_to_latin1($x); +$z = latin1_to_utf8($y); diff --git a/test/cases/dsl-latin1/0002/cmd b/test/cases/dsl-latin1/0002/cmd new file mode 100644 index 000000000..34b2574a2 --- /dev/null +++ b/test/cases/dsl-latin1/0002/cmd @@ -0,0 +1 @@ +mlr --xtab --from ${CASEDIR}/input put -f ${CASEDIR}/mlr diff --git a/test/cases/dsl-latin1/0002/experr b/test/cases/dsl-latin1/0002/experr new file mode 100644 index 000000000..e69de29bb diff --git a/test/cases/dsl-latin1/0002/expout b/test/cases/dsl-latin1/0002/expout new file mode 100644 index 000000000..ec6c2e346 --- /dev/null +++ b/test/cases/dsl-latin1/0002/expout @@ -0,0 +1,7 @@ +x The quick brown fox jumped over the lazy dogs. + +x Victor jagt zwölf Boxkämpfer quer über den großen Sylter Deich. + +x (error) + +x This¹ is² it³. diff --git a/test/cases/dsl-latin1/0002/input b/test/cases/dsl-latin1/0002/input new file mode 100644 index 000000000..3b597cd70 --- /dev/null +++ b/test/cases/dsl-latin1/0002/input @@ -0,0 +1,7 @@ +x The quick brown fox jumped over the lazy dogs. + +x Victor jagt zwölf Boxkämpfer quer über den großen Sylter Deich. + +x Съешь же ещё этих мягких французских булок да выпей чаю. + +x This¹ is² it³. 
diff --git a/test/cases/dsl-latin1/0002/mlr b/test/cases/dsl-latin1/0002/mlr new file mode 100644 index 000000000..cee468f70 --- /dev/null +++ b/test/cases/dsl-latin1/0002/mlr @@ -0,0 +1,2 @@ +$* = utf8_to_latin1($*); +$* = latin1_to_utf8($*); diff --git a/test/cases/verb-latin1-to-utf8/0001/cmd b/test/cases/verb-latin1-to-utf8/0001/cmd new file mode 100644 index 000000000..366fdb73d --- /dev/null +++ b/test/cases/verb-latin1-to-utf8/0001/cmd @@ -0,0 +1 @@ +mlr --xtab --from test/input/latin1.xtab latin1-to-utf8 diff --git a/test/cases/verb-latin1-to-utf8/0001/experr b/test/cases/verb-latin1-to-utf8/0001/experr new file mode 100644 index 000000000..e69de29bb diff --git a/test/cases/verb-latin1-to-utf8/0001/expout b/test/cases/verb-latin1-to-utf8/0001/expout new file mode 100644 index 000000000..d453b3fd6 --- /dev/null +++ b/test/cases/verb-latin1-to-utf8/0001/expout @@ -0,0 +1,5 @@ +x The quick brown fox jumped over the lazy dogs. + +x Victor jagt zwölf Boxkämpfer quer über den großen Sylter Deich. + +x This¹ is² it³. diff --git a/test/cases/verb-utf8-to-latin1/0001/cmd b/test/cases/verb-utf8-to-latin1/0001/cmd new file mode 100644 index 000000000..141212006 --- /dev/null +++ b/test/cases/verb-utf8-to-latin1/0001/cmd @@ -0,0 +1 @@ +mlr --xtab --from test/input/utf8.xtab utf8-to-latin1 diff --git a/test/cases/verb-utf8-to-latin1/0001/experr b/test/cases/verb-utf8-to-latin1/0001/experr new file mode 100644 index 000000000..e69de29bb diff --git a/test/cases/verb-utf8-to-latin1/0001/expout b/test/cases/verb-utf8-to-latin1/0001/expout new file mode 100644 index 000000000..34e9d3802 --- /dev/null +++ b/test/cases/verb-utf8-to-latin1/0001/expout @@ -0,0 +1,7 @@ +x The quick brown fox jumped over the lazy dogs. + +x Victor jagt zwlf Boxkmpfer quer ber den groen Sylter Deich. + +x (error) + +x This is it. 
diff --git a/test/input/latin1.xtab b/test/input/latin1.xtab new file mode 100644 index 000000000..c445437c1 --- /dev/null +++ b/test/input/latin1.xtab @@ -0,0 +1,5 @@ +x The quick brown fox jumped over the lazy dogs. + +x Victor jagt zwlf Boxkmpfer quer ber den groen Sylter Deich. + +x This is it. diff --git a/test/input/utf8.xtab b/test/input/utf8.xtab new file mode 100644 index 000000000..3b597cd70 --- /dev/null +++ b/test/input/utf8.xtab @@ -0,0 +1,7 @@ +x The quick brown fox jumped over the lazy dogs. + +x Victor jagt zwölf Boxkämpfer quer über den großen Sylter Deich. + +x Съешь же ещё этих мягких французских булок да выпей чаю. + +x This¹ is² it³.