From 2107d520fa850fdc5cc45406c7ccd4cd8f62d09f Mon Sep 17 00:00:00 2001 From: John Kerl Date: Sun, 20 Aug 2023 12:20:15 -0400 Subject: [PATCH] Can't use ${field_name} if it contains UTF-8 characters also encodeable as Latin-1 (#1363) * unit-test data * docgen * windows unit-test accommodations --- docs/src/manpage.md | 2 +- docs/src/manpage.txt | 2 +- internal/pkg/parsing/lexer/lexer.go | 7 ++++--- internal/pkg/parsing/lexer/transitiontable.go | 12 ++++++++++++ internal/pkg/parsing/mlr.bnf | 1 + man/manpage.txt | 2 +- man/mlr.1 | 4 ++-- test/cases/dsl-utf8-field-names/0001/cmd | 1 + test/cases/dsl-utf8-field-names/0001/experr | 0 test/cases/dsl-utf8-field-names/0001/expout | 3 +++ test/cases/dsl-utf8-field-names/0001/mlr | 1 + test/cases/dsl-utf8-field-names/0002/cmd | 1 + test/cases/dsl-utf8-field-names/0002/experr | 0 test/cases/dsl-utf8-field-names/0002/expout | 3 +++ test/cases/dsl-utf8-field-names/0002/mlr | 1 + test/cases/dsl-utf8-field-names/0003/cmd | 1 + test/cases/dsl-utf8-field-names/0003/experr | 0 test/cases/dsl-utf8-field-names/0003/expout | 3 +++ test/cases/dsl-utf8-field-names/0003/mlr | 1 + test/cases/dsl-utf8-field-names/0004/cmd | 1 + test/cases/dsl-utf8-field-names/0004/experr | 0 test/cases/dsl-utf8-field-names/0004/expout | 3 +++ test/cases/dsl-utf8-field-names/0004/mlr | 1 + test/input/datos-plurilingües.csv | 4 ++++ 24 files changed, 46 insertions(+), 8 deletions(-) create mode 100644 test/cases/dsl-utf8-field-names/0001/cmd create mode 100644 test/cases/dsl-utf8-field-names/0001/experr create mode 100644 test/cases/dsl-utf8-field-names/0001/expout create mode 100644 test/cases/dsl-utf8-field-names/0001/mlr create mode 100644 test/cases/dsl-utf8-field-names/0002/cmd create mode 100644 test/cases/dsl-utf8-field-names/0002/experr create mode 100644 test/cases/dsl-utf8-field-names/0002/expout create mode 100644 test/cases/dsl-utf8-field-names/0002/mlr create mode 100644 test/cases/dsl-utf8-field-names/0003/cmd create mode 100644 test/cases/dsl-utf8-field-names/0003/experr create mode 100644 test/cases/dsl-utf8-field-names/0003/expout create mode 100644 test/cases/dsl-utf8-field-names/0003/mlr create mode 100644 test/cases/dsl-utf8-field-names/0004/cmd create mode 100644 test/cases/dsl-utf8-field-names/0004/experr create mode 100644 test/cases/dsl-utf8-field-names/0004/expout create mode 100644 test/cases/dsl-utf8-field-names/0004/mlr create mode 100644 test/input/datos-plurilingües.csv diff --git a/docs/src/manpage.md b/docs/src/manpage.md index 1a9ebea12..d80193433 100644 --- a/docs/src/manpage.md +++ b/docs/src/manpage.md @@ -3470,5 +3470,5 @@ MILLER(1) MILLER(1) - 2023-08-19 MILLER(1) + 2023-08-20 MILLER(1) diff --git a/docs/src/manpage.txt b/docs/src/manpage.txt index 7372e3768..0c04fc330 100644 --- a/docs/src/manpage.txt +++ b/docs/src/manpage.txt @@ -3449,4 +3449,4 @@ MILLER(1) MILLER(1) - 2023-08-19 MILLER(1) + 2023-08-20 MILLER(1) diff --git a/internal/pkg/parsing/lexer/lexer.go b/internal/pkg/parsing/lexer/lexer.go index fa32f4989..18fd8ea7a 100644 --- a/internal/pkg/parsing/lexer/lexer.go +++ b/internal/pkg/parsing/lexer/lexer.go @@ -12,7 +12,7 @@ import ( const ( NoState = -1 NumStates = 336 - NumSymbols = 652 + NumSymbols = 653 ) type Lexer struct { @@ -779,6 +779,7 @@ Lexer symbols: 647: 'A'-'Z' 648: 'a'-'z' 649: '0'-'9' -650: \u0100-\U0010ffff -651: . +650: \u00a0-\u00ff +651: \u0100-\U0010ffff +652: . */ diff --git a/internal/pkg/parsing/lexer/transitiontable.go b/internal/pkg/parsing/lexer/transitiontable.go index ed8c21d83..447900840 100644 --- a/internal/pkg/parsing/lexer/transitiontable.go +++ b/internal/pkg/parsing/lexer/transitiontable.go @@ -1500,6 +1500,8 @@ var TransTab = TransitionTable{ return 159 case r == 126: // ['~','~'] return 159 + case 160 <= r && r <= 255: // [\u00a0,\u00ff] + return 159 case 256 <= r && r <= 1114111: // [\u0100,\U0010ffff] return 159 } @@ -1840,6 +1842,8 @@ var TransTab = TransitionTable{ return 184 case r == 126: // ['~','~'] return 184 + case 160 <= r && r <= 255: // [\u00a0,\u00ff] + return 184 case 256 <= r && r <= 1114111: // [\u0100,\U0010ffff] return 184 } @@ -3144,6 +3148,8 @@ var TransTab = TransitionTable{ return 239 case r == 126: // ['~','~'] return 159 + case 160 <= r && r <= 255: // [\u00a0,\u00ff] + return 159 case 256 <= r && r <= 1114111: // [\u0100,\U0010ffff] return 159 } @@ -3444,6 +3450,8 @@ var TransTab = TransitionTable{ return 254 case r == 126: // ['~','~'] return 184 + case 160 <= r && r <= 255: // [\u00a0,\u00ff] + return 184 case 256 <= r && r <= 1114111: // [\u0100,\U0010ffff] return 184 } @@ -4604,6 +4612,8 @@ var TransTab = TransitionTable{ return 239 case r == 126: // ['~','~'] return 159 + case 160 <= r && r <= 255: // [\u00a0,\u00ff] + return 159 case 256 <= r && r <= 1114111: // [\u0100,\U0010ffff] return 159 } @@ -4792,6 +4802,8 @@ var TransTab = TransitionTable{ return 254 case r == 126: // ['~','~'] return 184 + case 160 <= r && r <= 255: // [\u00a0,\u00ff] + return 184 case 256 <= r && r <= 1114111: // [\u0100,\U0010ffff] return 184 } diff --git a/internal/pkg/parsing/mlr.bnf b/internal/pkg/parsing/mlr.bnf index a14ed4475..6f987c827 100644 --- a/internal/pkg/parsing/mlr.bnf +++ b/internal/pkg/parsing/mlr.bnf @@ -271,6 +271,7 @@ _braced_char | ':' | ';' | '<' | '=' | '>' | '?' | '@' | '[' | ']' | '^' | '_' | '`' | '|' | '~' | ( '\\' '{' ) | ( '\\' '}' ) + | '\u00a0'-'\u00ff' | '\u0100'-'\U0010FFFF' ; braced_field_name: '$' '{' _braced_char { _braced_char } '}' ; diff --git a/man/manpage.txt b/man/manpage.txt index 7372e3768..0c04fc330 100644 --- a/man/manpage.txt +++ b/man/manpage.txt @@ -3449,4 +3449,4 @@ MILLER(1) MILLER(1) - 2023-08-19 MILLER(1) + 2023-08-20 MILLER(1) diff --git a/man/mlr.1 b/man/mlr.1 index 4e1dc9ca3..ab56c69bb 100644 --- a/man/mlr.1 +++ b/man/mlr.1 @@ -2,12 +2,12 @@ .\" Title: mlr .\" Author: [see the "AUTHOR" section] .\" Generator: ./mkman.rb -.\" Date: 2023-08-19 +.\" Date: 2023-08-20 .\" Manual: \ \& .\" Source: \ \& .\" Language: English .\" -.TH "MILLER" "1" "2023-08-19" "\ \&" "\ \&" +.TH "MILLER" "1" "2023-08-20" "\ \&" "\ \&" .\" ----------------------------------------------------------------- .\" * Portability definitions .\" ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/test/cases/dsl-utf8-field-names/0001/cmd b/test/cases/dsl-utf8-field-names/0001/cmd new file mode 100644 index 000000000..c05a5b774 --- /dev/null +++ b/test/cases/dsl-utf8-field-names/0001/cmd @@ -0,0 +1 @@ +mlr --c2p filter -f ${CASEDIR}/mlr test/input/datos-plurilingües.csv diff --git a/test/cases/dsl-utf8-field-names/0001/experr b/test/cases/dsl-utf8-field-names/0001/experr new file mode 100644 index 000000000..e69de29bb diff --git a/test/cases/dsl-utf8-field-names/0001/expout b/test/cases/dsl-utf8-field-names/0001/expout new file mode 100644 index 000000000..79c7e5186 --- /dev/null +++ b/test/cases/dsl-utf8-field-names/0001/expout @@ -0,0 +1,3 @@ +año ποσότητα +2021 130 +2022 145 diff --git a/test/cases/dsl-utf8-field-names/0001/mlr b/test/cases/dsl-utf8-field-names/0001/mlr new file mode 100644 index 000000000..df6b0abb3 --- /dev/null +++ b/test/cases/dsl-utf8-field-names/0001/mlr @@ -0,0 +1 @@ +$año > 2020 diff --git a/test/cases/dsl-utf8-field-names/0002/cmd b/test/cases/dsl-utf8-field-names/0002/cmd new file mode 100644 index 000000000..c05a5b774 --- /dev/null +++ b/test/cases/dsl-utf8-field-names/0002/cmd @@ -0,0 +1 @@ +mlr --c2p filter -f ${CASEDIR}/mlr test/input/datos-plurilingües.csv diff --git a/test/cases/dsl-utf8-field-names/0002/experr b/test/cases/dsl-utf8-field-names/0002/experr new file mode 100644 index 000000000..e69de29bb diff --git a/test/cases/dsl-utf8-field-names/0002/expout b/test/cases/dsl-utf8-field-names/0002/expout new file mode 100644 index 000000000..79c7e5186 --- /dev/null +++ b/test/cases/dsl-utf8-field-names/0002/expout @@ -0,0 +1,3 @@ +año ποσότητα +2021 130 +2022 145 diff --git a/test/cases/dsl-utf8-field-names/0002/mlr b/test/cases/dsl-utf8-field-names/0002/mlr new file mode 100644 index 000000000..2d8badb71 --- /dev/null +++ b/test/cases/dsl-utf8-field-names/0002/mlr @@ -0,0 +1 @@ +${año} > 2020 diff --git a/test/cases/dsl-utf8-field-names/0003/cmd b/test/cases/dsl-utf8-field-names/0003/cmd new file mode 100644 index 000000000..c05a5b774 --- /dev/null +++ b/test/cases/dsl-utf8-field-names/0003/cmd @@ -0,0 +1 @@ +mlr --c2p filter -f ${CASEDIR}/mlr test/input/datos-plurilingües.csv diff --git a/test/cases/dsl-utf8-field-names/0003/experr b/test/cases/dsl-utf8-field-names/0003/experr new file mode 100644 index 000000000..e69de29bb diff --git a/test/cases/dsl-utf8-field-names/0003/expout b/test/cases/dsl-utf8-field-names/0003/expout new file mode 100644 index 000000000..79c7e5186 --- /dev/null +++ b/test/cases/dsl-utf8-field-names/0003/expout @@ -0,0 +1,3 @@ +año ποσότητα +2021 130 +2022 145 diff --git a/test/cases/dsl-utf8-field-names/0003/mlr b/test/cases/dsl-utf8-field-names/0003/mlr new file mode 100644 index 000000000..38bb2d731 --- /dev/null +++ b/test/cases/dsl-utf8-field-names/0003/mlr @@ -0,0 +1 @@ +$ποσότητα > 100 diff --git a/test/cases/dsl-utf8-field-names/0004/cmd b/test/cases/dsl-utf8-field-names/0004/cmd new file mode 100644 index 000000000..c05a5b774 --- /dev/null +++ b/test/cases/dsl-utf8-field-names/0004/cmd @@ -0,0 +1 @@ +mlr --c2p filter -f ${CASEDIR}/mlr test/input/datos-plurilingües.csv diff --git a/test/cases/dsl-utf8-field-names/0004/experr b/test/cases/dsl-utf8-field-names/0004/experr new file mode 100644 index 000000000..e69de29bb diff --git a/test/cases/dsl-utf8-field-names/0004/expout b/test/cases/dsl-utf8-field-names/0004/expout new file mode 100644 index 000000000..79c7e5186 --- /dev/null +++ b/test/cases/dsl-utf8-field-names/0004/expout @@ -0,0 +1,3 @@ +año ποσότητα +2021 130 +2022 145 diff --git a/test/cases/dsl-utf8-field-names/0004/mlr b/test/cases/dsl-utf8-field-names/0004/mlr new file mode 100644 index 000000000..c2d122478 --- /dev/null +++ b/test/cases/dsl-utf8-field-names/0004/mlr @@ -0,0 +1 @@ +${ποσότητα} > 100 diff --git a/test/input/datos-plurilingües.csv b/test/input/datos-plurilingües.csv new file mode 100644 index 000000000..620d3566d --- /dev/null +++ b/test/input/datos-plurilingües.csv @@ -0,0 +1,4 @@ +año,ποσότητα +2020,100 +2021,130 +2022,145 \ No newline at end of file