
Commit 2957667

Merge branch 'mr/pmderodat/utf8-preparatory' into 'master'

Preparatory work for the transition of source buffers to UTF-8

See merge request eng/libadalang/langkit!1022

2 parents 6f53bc0 + 4048175, commit 2957667

12 files changed: +354, -1 lines

testsuite/tests/grammar/case_rule/expected_concrete_syntax.lkt

Lines changed: 1 addition & 1 deletion
@@ -2,7 +2,7 @@ lexer foo_lexer {

     char
     dot <- "."
-    id <- p"[a-zA-Z]+"
+    id <- p"[a-zA-Zé🙂]+"
     tick <- "'"
     newline <- p"\n"

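The widened character class above is what lets the new test case below tokenize non-ASCII identifiers. As a rough illustration only (Python's re engine, not the langkit lexer; the input string is the one added to main.py below):

import re

# Pattern copied from the new lexer rule; input copied from the new main.py case.
id_rule = re.compile(r"[a-zA-Zé🙂]+")
print(id_rule.findall("\xe9'\U0001f642'"))  # ['é', '🙂']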

testsuite/tests/grammar/case_rule/main.py

Lines changed: 1 addition & 0 deletions
@@ -10,6 +10,7 @@
     ('simple-attr', "a'b"),
     ('char-dot', "'a'.b"),
     ('id-char', "a'b'"),
+    ('unicode-id-char', "\xe9'\U0001f642'"),
 ):
     print('== {} =='.format(label))
     u = ctx.get_from_buffer('{}.txt'.format(label), text)
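For context, a minimal sketch (not part of this commit) of how the rest of the loop body plausibly produces the token dumps recorded in test.out below. It assumes the generated libfoolang Python bindings expose AnalysisContext, unit.diagnostics, unit.first_token and Token.next; the actual main.py code past this point is not shown in the diff.

import libfoolang

ctx = libfoolang.AnalysisContext()
u = ctx.get_from_buffer('unicode-id-char.txt', "\xe9'\U0001f642'")

# Diagnostics come first, e.g. "1:5-1:5: Expected Id, got Termination"
for d in u.diagnostics:
    print(d)
print('--')

# Then the token stream; tokens print as <Token Id 'é' at 1:1-1:2>
t = u.first_token
while t is not None:
    print(t)
    t = t.next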

testsuite/tests/grammar/case_rule/test.out

Lines changed: 9 additions & 0 deletions
@@ -24,5 +24,14 @@ main.py: Running...
 <Token Tick "'" at 1:4-1:5>
 <Token Termination at 1:5-1:5>

+== unicode-id-char ==
+1:5-1:5: Expected Id, got Termination
+--
+<Token Id 'é' at 1:1-1:2>
+<Token Tick "'" at 1:2-1:3>
+<Token Id '🙂' at 1:3-1:4>
+<Token Tick "'" at 1:4-1:5>
+<Token Termination at 1:5-1:5>
+
 main.py: Done.
 Done
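A side note on the new expected output (not stated in the diff): 'é' and '🙂' each span a single column (1:1-1:2 and 1:3-1:4), which suggests source locations are counted in Unicode code points rather than in encoded bytes. A quick Python illustration of the difference:

text = "\xe9'\U0001f642'"         # the new test input from main.py
print(len(text))                  # 4 code points, matching the 4 columns above
print(len(text.encode("utf-8")))  # 8 bytes once encoded as UTF-8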

testsuite/tests/misc/unicode/empty.txt

Whitespace-only changes.

Lines changed: 16 additions & 0 deletions
@@ -0,0 +1,16 @@
+import lexer_example
+
+@with_lexer(foo_lexer)
+grammar foo_grammar {
+    @main_rule main_rule <- list+(Example(@example StrLit(@string)))
+}
+
+@abstract class FooNode implements Node[FooNode] {
+}
+
+class Example: FooNode {
+    @parse_field f: StrLit
+}
+
+class StrLit: FooNode implements TokenNode {
+}

Lines changed: 43 additions & 0 deletions
@@ -0,0 +1,43 @@
+# ��������������������������������������������������������������������������� #
+# ��������������������������������������������������������������������������� #
+
+example "1�"
+
+# ��������������������������������������������������������������������������� #
+
+example "1�2�"
+
+# ��������������������������������������������������������������������������� #
+
+example "1�2�3�"
+
+# ��������������������������������������������������������������������������� #
+
+example "1�2�3�4�"
+
+# ��������������������������������������������������������������������������� #
+
+example "1�2�3�4�5�"
+
+# ��������������������������������������������������������������������������� #
+
+example "1�2�3�4�5�6�"
+
+# ��������������������������������������������������������������������������� #
+
+example "1�2�3�4�5�6�7�"
+
+# ��������������������������������������������������������������������������� #
+
+example "1�2�3�4�5�6�7�8�"
+
+# ��������������������������������������������������������������������������� #
+
+example "1�2�3�4�5�6�7�8�9�"
+
+# ��������������������������������������������������������������������������� #
+
+example "1�2�3�4�5�6�7�8�9�0�"
+
+# ��������������������������������������������������������������������������� #
+# ��������������������������������������������������������������������������� #
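Note (not part of the diff): main.adb below reads this file with the "iso-8859-1" charset, so its non-ASCII bytes are intentional; they only show up as U+FFFD replacement characters because this listing is rendered as UTF-8. A hypothetical byte string illustrates the effect:

# The actual bytes of the test file are not recoverable from this view;
# 0xE9 ('é' in ISO-8859-1) is just an example of a byte that is valid
# Latin-1 but not valid UTF-8 on its own.
data = b'example "1\xe9"'
print(data.decode("iso-8859-1"))        # example "1é"
print(data.decode("utf-8", "replace"))  # example "1�"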

testsuite/tests/misc/unicode/main.adb

Lines changed: 131 additions & 0 deletions
@@ -0,0 +1,131 @@
+with Ada.Text_IO; use Ada.Text_IO;
+
+with GNAT.Strings; use GNAT.Strings;
+with GNATCOLL.Mmap; use GNATCOLL.Mmap;
+
+with Langkit_Support.File_Readers; use Langkit_Support.File_Readers;
+with Langkit_Support.Slocs; use Langkit_Support.Slocs;
+with Libfoolang.Analysis; use Libfoolang.Analysis;
+with Libfoolang.Common; use Libfoolang.Common;
+
+with Support; use Support;
+
+procedure Main is
+
+   Empty_File : constant String := "empty.txt";
+   Empty_Buffer : aliased constant String := "";
+
+   Example_File : constant String := "main-iso-8859-1.txt";
+   Example_Buffer : String_Access := Read_Whole_File (Example_File);
+
+   procedure Check
+     (From_Buffer : Boolean := False;
+      Empty_File : Boolean := False;
+      Wrong_Encoding : Boolean := False;
+      With_File_Reader : Boolean := False);
+
+   -----------
+   -- Check --
+   -----------
+
+   procedure Check
+     (From_Buffer : Boolean := False;
+      Empty_File : Boolean := False;
+      Wrong_Encoding : Boolean := False;
+      With_File_Reader : Boolean := False)
+   is
+      Charset : constant String :=
+        (if Wrong_Encoding then "utf-8" else "iso-8859-1");
+      Filename : constant String :=
+        (if Empty_File then Main.Empty_File else Example_File);
+      Buffer : constant access constant String :=
+        (if Empty_File then Empty_Buffer'Access else Example_Buffer);
+
+      Ctx : Analysis_Context;
+      U : Analysis_Unit;
+   begin
+      --  Put some label for this check
+
+      Put ("== ");
+      Put (if From_Buffer then "buffer" else "file");
+      Put (" | ");
+      Put (if Empty_File then "empty-file" else "example-file");
+      Put (" | ");
+      Put (if Wrong_Encoding then "wrong-encoding" else "correct-encoding");
+      Put (" | ");
+      Put (if With_File_Reader then "file-reader" else "default");
+      Put_Line (" ==");
+      New_Line;
+
+      --  Parse the source according to requested settings
+
+      Ctx := Create_Context
+        (File_Reader => (if With_File_Reader
+                         then Get_File_Reader
+                         else No_File_Reader_Reference));
+      if From_Buffer then
+         U := Ctx.Get_From_Buffer
+           (Filename => Filename,
+            Charset  => Charset,
+            Buffer   => Buffer.all);
+      else
+         U := Ctx.Get_From_File
+           (Filename => Filename, Charset => Charset);
+      end if;
+
+      --  Display parsing errors, if any
+
+      if U.Has_Diagnostics then
+         Put_Line ("Errors:");
+         for D of U.Diagnostics loop
+            Put_Line (" " & U.Format_GNU_Diagnostic (D));
+         end loop;
+         New_Line;
+      end if;
+
+      --  Summarize the content of the parsed unit
+
+      if U.Root.Is_Null then
+         Put_Line ("No root node");
+      else
+         Put_Line ("Root node children:" & U.Root.Children_Count'Image);
+         declare
+            D : constant Token_Data_Type := Data (U.First_Token);
+         begin
+            Put_Line
+              ("First token: "
+               & Kind (D)'Image
+               & " at " & Image (Sloc_Range (D)));
+         end;
+         declare
+            D : constant Token_Data_Type := Data (U.Last_Token);
+         begin
+            Put_Line
+              ("Last token: "
+               & Kind (D)'Image
+               & " at " & Image (Sloc_Range (D)));
+         end;
+      end if;
+      New_Line;
+   end Check;
+
+begin
+   --  Get_From_File
+
+   Check;
+   Check (With_File_Reader => True);
+
+   Check (Empty_File => True);
+   Check (Empty_File => True, With_File_Reader => True);
+
+   Check (Wrong_Encoding => True);
+   Check (Wrong_Encoding => True, With_File_Reader => True);
+
+   --  Get_From_Buffer
+
+   Check (From_Buffer => True);
+   Check (From_Buffer => True, Empty_File => True);
+   Check (From_Buffer => True, Wrong_Encoding => True);
+
+   Free (Example_Buffer);
+end Main;

Lines changed: 46 additions & 0 deletions
@@ -0,0 +1,46 @@
+with Langkit_Support.Diagnostics; use Langkit_Support.Diagnostics;
+
+package body Support is
+
+   type My_FR is new File_Reader_Interface with null record;
+
+   overriding procedure Read
+     (Self        : My_FR;
+      Filename    : String;
+      Charset     : String;
+      Read_BOM    : Boolean;
+      Contents    : out Decoded_File_Contents;
+      Diagnostics : in out Diagnostics_Vectors.Vector);
+
+   overriding procedure Release (Self : in out My_FR) is null;
+
+   ----------
+   -- Read --
+   ----------
+
+   overriding procedure Read
+     (Self        : My_FR;
+      Filename    : String;
+      Charset     : String;
+      Read_BOM    : Boolean;
+      Contents    : out Decoded_File_Contents;
+      Diagnostics : in out Diagnostics_Vectors.Vector)
+   is
+   begin
+      Direct_Read (Filename, Charset, Read_BOM, Contents, Diagnostics);
+      if Diagnostics.Is_Empty and then Contents.Buffer.all'Length > 79 then
+         Contents.Buffer.all (Contents.First .. Contents.First + 79) :=
+           (1 .. 80 => ' ');
+      end if;
+   end Read;
+
+   ---------------------
+   -- Get_File_Reader --
+   ---------------------
+
+   function Get_File_Reader return File_Reader_Reference is
+   begin
+      return Create_File_Reader_Reference (My_FR'(null record));
+   end Get_File_Reader;
+
+end Support;

Lines changed: 5 additions & 0 deletions
@@ -0,0 +1,5 @@
+with Langkit_Support.File_Readers; use Langkit_Support.File_Readers;
+
+package Support is
+   function Get_File_Reader return File_Reader_Reference;
+end Support;

testsuite/tests/misc/unicode/test.out

Lines changed: 76 additions & 0 deletions
@@ -0,0 +1,76 @@
+== file | example-file | correct-encoding | default ==
+
+Root node children: 10
+First token: FOO_COMMENT at 1:1-1:80
+Last token: FOO_TERMINATION at 44:1-44:1
+
+== file | example-file | correct-encoding | file-reader ==
+
+Root node children: 10
+First token: FOO_WHITESPACE at 1:1-1:81
+Last token: FOO_TERMINATION at 43:1-43:1
+
+== file | empty-file | correct-encoding | default ==
+
+Errors:
+ empty.txt:1:1: Expected 'example', got Termination
+
+Root node children: 0
+First token: FOO_TERMINATION at 1:1-1:1
+Last token: FOO_TERMINATION at 1:1-1:1
+
+== file | empty-file | correct-encoding | file-reader ==
+
+Errors:
+ empty.txt:1:1: Expected 'example', got Termination
+
+Root node children: 0
+First token: FOO_TERMINATION at 1:1-1:1
+Last token: FOO_TERMINATION at 1:1-1:1
+
+== file | example-file | wrong-encoding | default ==
+
+Errors:
+ main-iso-8859-1.txt:1:3: Could not decode source as "utf-8"
+ main-iso-8859-1.txt:1:1: Expected 'example', got Termination
+
+Root node children: 0
+First token: FOO_TERMINATION at 1:1-1:1
+Last token: FOO_TERMINATION at 1:1-1:1
+
+== file | example-file | wrong-encoding | file-reader ==
+
+Errors:
+ main-iso-8859-1.txt:1:3: Could not decode source as "utf-8"
+ main-iso-8859-1.txt:1:1: Expected 'example', got Termination
+
+Root node children: 0
+First token: FOO_TERMINATION at 1:1-1:1
+Last token: FOO_TERMINATION at 1:1-1:1
+
+== buffer | example-file | correct-encoding | default ==
+
+Root node children: 10
+First token: FOO_COMMENT at 1:1-1:80
+Last token: FOO_TERMINATION at 44:1-44:1
+
+== buffer | empty-file | correct-encoding | default ==
+
+Errors:
+ empty.txt:1:1: Expected 'example', got Termination
+
+Root node children: 0
+First token: FOO_TERMINATION at 1:1-1:1
+Last token: FOO_TERMINATION at 1:1-1:1
+
+== buffer | example-file | wrong-encoding | default ==
+
+Errors:
+ main-iso-8859-1.txt:1:3: Could not decode source as "utf-8"
+ main-iso-8859-1.txt:1:1: Expected 'example', got Termination
+
+Root node children: 0
+First token: FOO_TERMINATION at 1:1-1:1
+Last token: FOO_TERMINATION at 1:1-1:1
+
+Done
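One detail worth spelling out (not stated in the diff): the "file-reader" runs start with FOO_WHITESPACE at 1:1-1:81 and end at 43:1 rather than 44:1 because the Read override in package body Support (shown above) blanks the first 80 characters of the decoded buffer, erasing the first line's comment and its newline. A rough sanity check of those numbers, assuming the first line of the example file is a 79-character comment:

# Made-up stand-in for the first two lines of the example file.
buffer = "#" * 79 + "\n" + "# second comment line\n"
blanked = " " * 80 + buffer[80:]                # what the file reader hands back
print(buffer.count("\n"), blanked.count("\n"))  # 2 1  -> one line fewer overall
print(len(blanked) - len(blanked.lstrip(" ")))  # 80   -> whitespace token 1:1-1:81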
