diff --git a/docs/src/reference-main-strings.md b/docs/src/reference-main-strings.md index 5c5674e32..d74ec4a26 100644 --- a/docs/src/reference-main-strings.md +++ b/docs/src/reference-main-strings.md @@ -173,6 +173,25 @@ You can use the following backslash escapes for strings such as between the doub * `\"`: double quote * `\123`: Octal 123, etc. for `\000` up to `\377` * `\x7f`: Hexadecimal 7f, etc. for `\x00` up to `\xff` +* `\u2766`, `\U00010877`: Unicode literals. For technical reasons, you must supply four hex digits after `\u` and eight hex digits after `\U`. + +
+mlr repl ++
+[mlr] "a\nb" +"a +b" + +[mlr] "a\tb" +"a b" + +[mlr] "a\x62c" +"abc" + +[mlr] "\u2766\U00010877" +"❦𐡷" +See also [https://en.wikipedia.org/wiki/Escape_sequences_in_C](https://en.wikipedia.org/wiki/Escape_sequences_in_C). diff --git a/docs/src/reference-main-strings.md.in b/docs/src/reference-main-strings.md.in index 37e6914a3..30099aab2 100644 --- a/docs/src/reference-main-strings.md.in +++ b/docs/src/reference-main-strings.md.in @@ -121,6 +121,23 @@ You can use the following backslash escapes for strings such as between the doub * `\"`: double quote * `\123`: Octal 123, etc. for `\000` up to `\377` * `\x7f`: Hexadecimal 7f, etc. for `\x00` up to `\xff` +* `\u2766`, `\U00010877`: Unicode literals. For technical reasons, you must supply four hex digits after `\u` and eight hex digits after `\U`. + +GENMD-CARDIFY-HIGHLIGHT-ONE +mlr repl +[mlr] "a\nb" +"a +b" + +[mlr] "a\tb" +"a b" + +[mlr] "a\x62c" +"abc" + +[mlr] "\u2766\U00010877" +"❦𐡷" +GENMD-EOF See also [https://en.wikipedia.org/wiki/Escape_sequences_in_C](https://en.wikipedia.org/wiki/Escape_sequences_in_C). diff --git a/internal/pkg/lib/unbackslash.go b/internal/pkg/lib/unbackslash.go index ac3467c99..f5e411a46 100644 --- a/internal/pkg/lib/unbackslash.go +++ b/internal/pkg/lib/unbackslash.go @@ -49,6 +49,10 @@ func UnbackslashStringLiteral(input string) string { // } else { // return input // } + // + // ... and, given that desire, we don't a priori know how many digits are in Unicode + // escape sequences -- so we *require* that people use four hex digits after \u + // and eight hex digits after \U. 
var buffer bytes.Buffer diff --git a/internal/pkg/parsing/lexer/lexer.go b/internal/pkg/parsing/lexer/lexer.go index fe0b8e30e..461c12904 100644 --- a/internal/pkg/parsing/lexer/lexer.go +++ b/internal/pkg/parsing/lexer/lexer.go @@ -12,7 +12,7 @@ import ( const ( NoState = -1 NumStates = 328 - NumSymbols = 574 + NumSymbols = 578 ) type Lexer struct { @@ -607,100 +607,104 @@ Lexer symbols: 475: '\' 476: 'x' 477: '\' -478: 'u' +478: 'a' 479: '\' -480: 'U' +480: 'v' 481: '\' -482: '.' +482: 'u' 483: '\' -484: '*' +484: 'U' 485: '\' -486: '0' +486: '.' 487: '\' -488: '1' +488: '*' 489: '\' -490: '2' +490: '0' 491: '\' -492: '3' +492: '1' 493: '\' -494: '4' +494: '2' 495: '\' -496: '5' +496: '3' 497: '\' -498: '6' +498: '4' 499: '\' -500: '7' +500: '5' 501: '\' -502: '8' +502: '6' 503: '\' -504: '9' -505: 'e' -506: 'E' -507: 't' -508: 'r' -509: 'u' -510: 'e' -511: 'f' -512: 'a' -513: 'l' -514: 's' -515: 'e' -516: ' ' -517: '!' -518: '#' -519: '$' -520: '%' -521: '&' -522: ''' -523: '\' -524: '(' -525: ')' -526: '*' -527: '+' -528: ',' -529: '-' -530: '.' -531: '/' -532: ':' -533: ';' -534: '<' -535: '=' -536: '>' -537: '?' -538: '@' -539: '[' -540: ']' -541: '^' -542: '_' -543: '`' -544: '|' -545: '~' -546: '\' -547: '{' -548: '\' -549: '}' -550: ' ' -551: '\t' -552: '\n' -553: '\r' -554: '#' -555: '\n' -556: 'a'-'z' -557: 'A'-'Z' -558: \u0100-\U0010ffff -559: '0'-'9' -560: '0'-'9' -561: 'a'-'f' -562: 'A'-'F' -563: '0'-'7' -564: '0'-'1' -565: 'A'-'Z' -566: 'a'-'z' -567: '0'-'9' -568: \u0100-\U0010ffff +504: '7' +505: '\' +506: '8' +507: '\' +508: '9' +509: 'e' +510: 'E' +511: 't' +512: 'r' +513: 'u' +514: 'e' +515: 'f' +516: 'a' +517: 'l' +518: 's' +519: 'e' +520: ' ' +521: '!' +522: '#' +523: '$' +524: '%' +525: '&' +526: ''' +527: '\' +528: '(' +529: ')' +530: '*' +531: '+' +532: ',' +533: '-' +534: '.' +535: '/' +536: ':' +537: ';' +538: '<' +539: '=' +540: '>' +541: '?' 
+542: '@' +543: '[' +544: ']' +545: '^' +546: '_' +547: '`' +548: '|' +549: '~' +550: '\' +551: '{' +552: '\' +553: '}' +554: ' ' +555: '\t' +556: '\n' +557: '\r' +558: '#' +559: '\n' +560: 'a'-'z' +561: 'A'-'Z' +562: \u0100-\U0010ffff +563: '0'-'9' +564: '0'-'9' +565: 'a'-'f' +566: 'A'-'F' +567: '0'-'7' +568: '0'-'1' 569: 'A'-'Z' 570: 'a'-'z' 571: '0'-'9' 572: \u0100-\U0010ffff -573: . +573: 'A'-'Z' +574: 'a'-'z' +575: '0'-'9' +576: \u0100-\U0010ffff +577: . */ diff --git a/internal/pkg/parsing/lexer/transitiontable.go b/internal/pkg/parsing/lexer/transitiontable.go index 009f3ef4b..9f58d64dc 100644 --- a/internal/pkg/parsing/lexer/transitiontable.go +++ b/internal/pkg/parsing/lexer/transitiontable.go @@ -1208,6 +1208,8 @@ var TransTab = TransitionTable{ return 57 case r == 93: // [']',']'] return 57 + case r == 97: // ['a','a'] + return 57 case r == 98: // ['b','b'] return 57 case r == 102: // ['f','f'] @@ -1220,6 +1222,8 @@ var TransTab = TransitionTable{ return 57 case r == 117: // ['u','u'] return 57 + case r == 118: // ['v','v'] + return 57 case r == 120: // ['x','x'] return 57 } diff --git a/internal/pkg/parsing/mlr.bnf b/internal/pkg/parsing/mlr.bnf index 05e097bbd..58d9f33ef 100644 --- a/internal/pkg/parsing/mlr.bnf +++ b/internal/pkg/parsing/mlr.bnf @@ -89,8 +89,7 @@ _string_literal_element | ( '\\' '[' ) | ( '\\' ']' ) | ( '\\' 'b' ) | ( '\\' 'f' ) | ( '\\' 'n' ) | ( '\\' 'r' ) - | ( '\\' 't' ) - | ( '\\' 'x' ) + | ( '\\' 't' ) | ( '\\' 'x' ) | ( '\\' 'a' ) | ( '\\' 'v' ) | ( '\\' 'u' ) | ( '\\' 'U' ) | ( '\\' '.' ) | ( '\\' '*' ) diff --git a/todo.txt b/todo.txt index e71aedbb1..f1b1e09d3 100644 --- a/todo.txt +++ b/todo.txt @@ -1,14 +1,12 @@ =============================================================== RELEASES * plan 6.1.0 - ? strptime - ? inference - ? datediff et al. + ? strptime/882 ? 
mlr join --left-fields a,b,c o fmt/unfmt/regex doc o FAQ/examples reorg m strptime/strftime tabulate options - m unicode string literals + k unicode string literals k natural sort order k IANA-TSV w/ \{X} k still need csv --lazy-quotes @@ -23,8 +21,10 @@ k ?foo and ??foo @ repl help k doc-improves * plan 6.2.0 + ? datediff et al. ? rank ? YAML + ? #908 inferencing options ================================================================ FEATURES