grammar/case_rule: extend the testcase to check non-ASCII tokens

pmderodat · pmderodat · commit 4048175e62bc · 2024-03-13T16:30:42.000Z
The logic of case/match lexing rules may be complex when working on
source buffers encoded using variable-length encodings such as UTF-8.
Extend this testcase so that the "backwards codepoint lookup" behavior
is exercised with a multi-byte codepoint.
diff --git a/testsuite/tests/grammar/case_rule/expected_concrete_syntax.lkt b/testsuite/tests/grammar/case_rule/expected_concrete_syntax.lkt
@@ -2,7 +2,7 @@ lexer foo_lexer {
 
     char
     dot <- "."
-    id <- p"[a-zA-Z]+"
+    id <- p"[a-zA-Zé🙂]+"
     tick <- "'"
     newline <- p"\n"
 
diff --git a/testsuite/tests/grammar/case_rule/main.py b/testsuite/tests/grammar/case_rule/main.py
@@ -10,6 +10,7 @@
     ('simple-attr', "a'b"),
     ('char-dot', "'a'.b"),
     ('id-char', "a'b'"),
+    ('unicode-id-char', "\xe9'\U0001f642'"),
 ):
     print('== {} =='.format(label))
     u = ctx.get_from_buffer('{}.txt'.format(label), text)
diff --git a/testsuite/tests/grammar/case_rule/test.out b/testsuite/tests/grammar/case_rule/test.out
@@ -24,5 +24,14 @@ main.py: Running...
 <Token Tick "'" at 1:4-1:5>
 <Token Termination at 1:5-1:5>
 
+== unicode-id-char ==
+1:5-1:5: Expected Id, got Termination
+--
+<Token Id 'é' at 1:1-1:2>
+<Token Tick "'" at 1:2-1:3>
+<Token Id '🙂' at 1:3-1:4>
+<Token Tick "'" at 1:4-1:5>
+<Token Termination at 1:5-1:5>
+
 main.py: Done.
 Done