Webdoc information on Unicode string literals (#935)

2026-01-23 02:14:13 +00:00 · 2022-02-09 00:28:04 -05:00 · 2022-02-09 00:28:04 -05:00 · 8eeb82809e
commit 8eeb82809e
parent de9e17f73b
7 changed files with 133 additions and 86 deletions
--- a/docs/src/reference-main-strings.md
+++ b/docs/src/reference-main-strings.md
@ -173,6 +173,25 @@ You can use the following backslash escapes for strings such as between the doub
 * `\"`: double quote
 * `\123`: Octal 123, etc. for `\000` up to `\377`
 * `\x7f`: Hexadecimal 7f, etc. for `\x00` up to `\xff`
+* `\u2766`, `\U00010877`: Unicode literals. For technical reasons, you must supply four hex digits after `\u` and eight hex digits after `\U`.
+
+<pre class="pre-highlight-in-pair">
+<b>mlr repl</b>
+</pre>
+<pre class="pre-non-highlight-in-pair">
+[mlr] "a\nb"
+"a
+b"
+
+[mlr] "a\tb"
+"a	b"
+
+[mlr] "a\x62c"
+"abc"
+
+[mlr] "\u2766\U00010877"
+"❦𐡷"
+</pre>

 See also [https://en.wikipedia.org/wiki/Escape_sequences_in_C](https://en.wikipedia.org/wiki/Escape_sequences_in_C).

--- a/docs/src/reference-main-strings.md.in
+++ b/docs/src/reference-main-strings.md.in
@ -121,6 +121,23 @@ You can use the following backslash escapes for strings such as between the doub
 * `\"`: double quote
 * `\123`: Octal 123, etc. for `\000` up to `\377`
 * `\x7f`: Hexadecimal 7f, etc. for `\x00` up to `\xff`
+* `\u2766`, `\U00010877`: Unicode literals. For technical reasons, you must supply four hex digits after `\u` and eight hex digits after `\U`.
+
+GENMD-CARDIFY-HIGHLIGHT-ONE
+mlr repl
+[mlr] "a\nb"
+"a
+b"
+
+[mlr] "a\tb"
+"a	b"
+
+[mlr] "a\x62c"
+"abc"
+
+[mlr] "\u2766\U00010877"
+"❦𐡷"
+GENMD-EOF

 See also [https://en.wikipedia.org/wiki/Escape_sequences_in_C](https://en.wikipedia.org/wiki/Escape_sequences_in_C).

--- a/internal/pkg/lib/unbackslash.go
+++ b/internal/pkg/lib/unbackslash.go
@ -49,6 +49,10 @@ func UnbackslashStringLiteral(input string) string {
 	//	} else {
 	//		return input
 	//	}
+	//
+	// ... and, given that desire, we don't know a priori how many digits a Unicode
+	// escape sequence has -- so we *require* that people use four hex digits after \u
+	// and eight hex digits after \U.

 	var buffer bytes.Buffer

--- a/internal/pkg/parsing/lexer/lexer.go
+++ b/internal/pkg/parsing/lexer/lexer.go
@ -12,7 +12,7 @@ import (
 const (
 	NoState    = -1
 	NumStates  = 328
-	NumSymbols = 574
+	NumSymbols = 578
 )

 type Lexer struct {
@ -607,100 +607,104 @@ Lexer symbols:
 475: '\'
 476: 'x'
 477: '\'
-478: 'u'
+478: 'a'
 479: '\'
-480: 'U'
+480: 'v'
 481: '\'
-482: '.'
+482: 'u'
 483: '\'
-484: '*'
+484: 'U'
 485: '\'
-486: '0'
+486: '.'
 487: '\'
-488: '1'
+488: '*'
 489: '\'
-490: '2'
+490: '0'
 491: '\'
-492: '3'
+492: '1'
 493: '\'
-494: '4'
+494: '2'
 495: '\'
-496: '5'
+496: '3'
 497: '\'
-498: '6'
+498: '4'
 499: '\'
-500: '7'
+500: '5'
 501: '\'
-502: '8'
+502: '6'
 503: '\'
-504: '9'
-505: 'e'
-506: 'E'
-507: 't'
-508: 'r'
-509: 'u'
-510: 'e'
-511: 'f'
-512: 'a'
-513: 'l'
-514: 's'
-515: 'e'
-516: ' '
-517: '!'
-518: '#'
-519: '$'
-520: '%'
-521: '&'
-522: '''
-523: '\'
-524: '('
-525: ')'
-526: '*'
-527: '+'
-528: ','
-529: '-'
-530: '.'
-531: '/'
-532: ':'
-533: ';'
-534: '<'
-535: '='
-536: '>'
-537: '?'
-538: '@'
-539: '['
-540: ']'
-541: '^'
-542: '_'
-543: '`'
-544: '|'
-545: '~'
-546: '\'
-547: '{'
-548: '\'
-549: '}'
-550: ' '
-551: '\t'
-552: '\n'
-553: '\r'
-554: '#'
-555: '\n'
-556: 'a'-'z'
-557: 'A'-'Z'
-558: \u0100-\U0010ffff
-559: '0'-'9'
-560: '0'-'9'
-561: 'a'-'f'
-562: 'A'-'F'
-563: '0'-'7'
-564: '0'-'1'
-565: 'A'-'Z'
-566: 'a'-'z'
-567: '0'-'9'
-568: \u0100-\U0010ffff
+504: '7'
+505: '\'
+506: '8'
+507: '\'
+508: '9'
+509: 'e'
+510: 'E'
+511: 't'
+512: 'r'
+513: 'u'
+514: 'e'
+515: 'f'
+516: 'a'
+517: 'l'
+518: 's'
+519: 'e'
+520: ' '
+521: '!'
+522: '#'
+523: '$'
+524: '%'
+525: '&'
+526: '''
+527: '\'
+528: '('
+529: ')'
+530: '*'
+531: '+'
+532: ','
+533: '-'
+534: '.'
+535: '/'
+536: ':'
+537: ';'
+538: '<'
+539: '='
+540: '>'
+541: '?'
+542: '@'
+543: '['
+544: ']'
+545: '^'
+546: '_'
+547: '`'
+548: '|'
+549: '~'
+550: '\'
+551: '{'
+552: '\'
+553: '}'
+554: ' '
+555: '\t'
+556: '\n'
+557: '\r'
+558: '#'
+559: '\n'
+560: 'a'-'z'
+561: 'A'-'Z'
+562: \u0100-\U0010ffff
+563: '0'-'9'
+564: '0'-'9'
+565: 'a'-'f'
+566: 'A'-'F'
+567: '0'-'7'
+568: '0'-'1'
 569: 'A'-'Z'
 570: 'a'-'z'
 571: '0'-'9'
 572: \u0100-\U0010ffff
-573: .
+573: 'A'-'Z'
+574: 'a'-'z'
+575: '0'-'9'
+576: \u0100-\U0010ffff
+577: .
 */
--- a/internal/pkg/parsing/lexer/transitiontable.go
+++ b/internal/pkg/parsing/lexer/transitiontable.go
@ -1208,6 +1208,8 @@ var TransTab = TransitionTable{
 			return 57
 		case r == 93: // [']',']']
 			return 57
+		case r == 97: // ['a','a']
+			return 57
 		case r == 98: // ['b','b']
 			return 57
 		case r == 102: // ['f','f']
@ -1220,6 +1222,8 @@ var TransTab = TransitionTable{
 			return 57
 		case r == 117: // ['u','u']
 			return 57
+		case r == 118: // ['v','v']
+			return 57
 		case r == 120: // ['x','x']
 			return 57
 		}
--- a/internal/pkg/parsing/mlr.bnf
+++ b/internal/pkg/parsing/mlr.bnf
@ -89,8 +89,7 @@ _string_literal_element
  | ( '\\' '[' ) | ( '\\' ']' )
  | ( '\\' 'b' ) | ( '\\' 'f' )
  | ( '\\' 'n' ) | ( '\\' 'r' )
-  | ( '\\' 't' )
-  | ( '\\' 'x' )
+  | ( '\\' 't' ) | ( '\\' 'x' ) | ( '\\' 'a' ) | ( '\\' 'v' )
  | ( '\\' 'u' ) | ( '\\' 'U' )
  | ( '\\' '.' )
  | ( '\\' '*' )
--- a/todo.txt
+++ b/todo.txt
@ -1,14 +1,12 @@
 =============================================================== RELEASES

 * plan 6.1.0
-  ? strptime
-  ? inference
-  ? datediff et al.
+  ? strptime/882
  ? mlr join --left-fields a,b,c
  o fmt/unfmt/regex doc
  o FAQ/examples reorg
  m strptime/strftime tabulate options
-  m unicode string literals
+  k unicode string literals
  k natural sort order
  k IANA-TSV w/ \{X}
  k still need csv --lazy-quotes
@ -23,8 +21,10 @@
  k ?foo and ??foo @ repl help
  k doc-improves
 * plan 6.2.0
+  ? datediff et al.
  ? rank
  ? YAML
+  ? #908 inferencing options

 ================================================================
 FEATURES