Skip to content

Commit 6f53bc0

Browse files
committed
Merge branch 'mr/pmderodat/lkt-literals-decoding' into 'master'
Lkt: rework error handling in char/string literal decoding See merge request eng/libadalang/langkit!1016
2 parents d6b4ea9 + 67aada8 commit 6f53bc0

24 files changed

+423
-78
lines changed

contrib/lkt/extensions/src/liblktlang-implementation-extensions.adb

Lines changed: 148 additions & 50 deletions
Original file line numberDiff line numberDiff line change
@@ -10,32 +10,46 @@ with Liblktlang.Public_Converters; use Liblktlang.Public_Converters;
1010

1111
package body Liblktlang.Implementation.Extensions is
1212

13-
function Common_Denoted_String (Node : Bare_Lkt_Node) return String_Type;
13+
function Common_Denoted_String
14+
(Node : Bare_Lkt_Node) return Internal_Decoded_String_Value;
1415
-- Common implementation for the ``p_denoted_string`` property of all
1516
-- string/pattern literal nodes.
1617

1718
procedure Read_Denoted_Char
18-
(Buffer : Text_Type;
19-
Cursor : in out Positive;
20-
Result : out Character_Type);
21-
-- Read the next denoted character starting at ``Buffer (Cursor)``. Upon
22-
-- return, ``Cursor`` points to the first item in ``Buffer`` for the next
23-
-- character to read (or to the closing double quote if the character read
24-
-- was the last one), and ``Result`` is set to the character that was just
25-
-- read.
19+
(Buffer : Text_Type;
20+
For_Char_Lit : Boolean;
21+
Cursor : in out Positive;
22+
Cursor_Sloc : in out Source_Location;
23+
Result : out Internal_Decoded_Char_Value);
24+
-- Read the next denoted character starting at ``Buffer (Cursor)``.
25+
--
26+
-- The location of the character at ``Buffer (Cursor)`` must be passed to
27+
-- ``Cursor_Sloc``, which is updated to follow the evolution of ``Cursor``.
28+
--
29+
-- Upon return, ``Cursor`` points to the first item in ``Buffer`` for the
30+
-- next character to read (or to the closing single/double quote if the
31+
-- character read was the last one) and ``Result`` is set to the character
32+
-- that was just read, or to an error message if reading one character was
33+
-- unsuccessful.
2634

2735
---------------------------
2836
-- Common_Denoted_String --
2937
---------------------------
3038

31-
function Common_Denoted_String (Node : Bare_Lkt_Node) return String_Type is
39+
function Common_Denoted_String
40+
(Node : Bare_Lkt_Node) return Internal_Decoded_String_Value
41+
is
42+
Tab_Stop : constant Positive := Node.Unit.Context.Tab_Stop;
43+
3244
N_Text : constant Text_Type := Text (Node);
3345
pragma Assert (N_Text (N_Text'Last) = '"');
3446

35-
Cursor : Natural := N_Text'First + 1;
47+
Cursor : Natural := N_Text'First + 1;
48+
Cursor_Sloc : Source_Location := Start_Sloc (Sloc_Range (Node));
3649

3750
Result : Text_Type (1 .. N_Text'Length);
3851
Result_Last : Natural := Result'First - 1;
52+
Char_Value : Internal_Decoded_Char_Value;
3953
begin
4054
-- Make sure that the slice starts at the first denoted character in the
4155
-- presence of string literal prefix.
@@ -45,29 +59,53 @@ package body Liblktlang.Implementation.Extensions is
4559
Cursor := Cursor + 1;
4660
end if;
4761

62+
-- Update Cursor_Sloc so that it reflects the location of N_Text
63+
-- (Cursor).
64+
65+
Cursor_Sloc.Column :=
66+
Cursor_Sloc.Column
67+
+ Column_Count (N_Text (N_Text'First .. Cursor), Tab_Stop);
68+
4869
while Cursor /= N_Text'Last loop
4970
Result_Last := Result_Last + 1;
50-
Read_Denoted_Char (N_Text, Cursor, Result (Result_Last));
71+
Read_Denoted_Char (N_Text, False, Cursor, Cursor_Sloc, Char_Value);
72+
if Char_Value.Has_Error then
73+
return
74+
(Value => Empty_String,
75+
Has_Error => True,
76+
Error_Sloc => Char_Value.Error_Sloc,
77+
Error_Message => Char_Value.Error_Message);
78+
end if;
79+
Result (Result_Last) := Char_Value.Value;
5180
end loop;
5281

53-
return Create_String (Result (Result'First .. Result_Last));
82+
return
83+
(Value => Create_String (Result (Result'First .. Result_Last)),
84+
Has_Error => False,
85+
Error_Sloc => No_Source_Location,
86+
Error_Message => Empty_String);
5487
end Common_Denoted_String;
5588

5689
-----------------------
5790
-- Read_Denoted_Char --
5891
-----------------------
5992

6093
procedure Read_Denoted_Char
61-
(Buffer : Text_Type;
62-
Cursor : in out Positive;
63-
Result : out Character_Type)
64-
is
65-
-- Note that, since buffer comes from a successfully lexed character,
66-
-- string or pattern literal token, it is supposed to be well-formed:
67-
-- "when other" clauses in the code below are thus dead code.
94+
(Buffer : Text_Type;
95+
For_Char_Lit : Boolean;
96+
Cursor : in out Positive;
97+
Cursor_Sloc : in out Source_Location;
98+
Result : out Internal_Decoded_Char_Value) is
6899
begin
100+
Result :=
101+
(Value => ' ',
102+
Has_Error => False,
103+
Error_Sloc => Cursor_Sloc,
104+
Error_Message => Empty_String);
105+
69106
if Buffer (Cursor) = '\' then
70107
Cursor := Cursor + 1;
108+
Cursor_Sloc.Column := Cursor_Sloc.Column + 1;
71109
declare
72110
function Read_Digits (N : Positive) return Character_Type;
73111
-- Read N hexadecimal digits (encoding a codepoint number) and
@@ -90,7 +128,10 @@ package body Liblktlang.Implementation.Extensions is
90128
Digit_Value : Unsigned_32;
91129
begin
92130
for I in 1 .. N loop
93-
Digit_Char := Buffer (Cursor + I);
131+
Cursor := Cursor + 1;
132+
Cursor_Sloc.Column := Cursor_Sloc.Column + 1;
133+
134+
Digit_Char := Buffer (Cursor);
94135
case Digit_Char is
95136
when '0' .. '9' =>
96137
Digit_Value :=
@@ -105,14 +146,19 @@ package body Liblktlang.Implementation.Extensions is
105146
Character_Type'Pos (Digit_Char)
106147
- Character_Type'Pos ('A') + 10;
107148
when others =>
108-
raise Program_Error;
149+
Result.Has_Error := True;
150+
Result.Error_Message :=
151+
Create_String ("invalid escape sequence");
152+
return ' ';
109153
end case;
110154
Codepoint := 16 * Codepoint + Digit_Value;
111155
end loop;
112156

113-
-- Move past the escape sequence prefix and the digits
157+
-- Move past the last digit of the escape sequence
158+
159+
Cursor := Cursor + 1;
160+
Cursor_Sloc.Column := Cursor_Sloc.Column + 1;
114161

115-
Cursor := Cursor + 1 + N;
116162
return Character_Type'Val (Codepoint);
117163
end Read_Digits;
118164

@@ -124,36 +170,73 @@ package body Liblktlang.Implementation.Extensions is
124170
(Codepoint : Character) return Character_Type is
125171
begin
126172
Cursor := Cursor + 1;
173+
Cursor_Sloc.Column := Cursor_Sloc.Column + 1;
127174
return Character_Type'Val (Character'Pos (Codepoint));
128175
end Short_Escape_Sequence;
129176

130177
begin
131-
Result :=
132-
(case Buffer (Cursor) is
178+
case Buffer (Cursor) is
133179

134180
-- Escape sequences for codepoint numbers
135181

136-
when 'x' => Read_Digits (2),
137-
when 'u' => Read_Digits (4),
138-
when 'U' => Read_Digits (8),
182+
when 'x' =>
183+
Result.Value := Read_Digits (2);
184+
when 'u' =>
185+
Result.Value := Read_Digits (4);
186+
when 'U' =>
187+
Result.Value := Read_Digits (8);
139188

140189
-- Short escape sequences
141190

142-
when '0' => Short_Escape_Sequence (ASCII.NUL),
143-
when 'a' => Short_Escape_Sequence (ASCII.BEL),
144-
when 'b' => Short_Escape_Sequence (ASCII.BS),
145-
when 't' => Short_Escape_Sequence (ASCII.HT),
146-
when 'n' => Short_Escape_Sequence (ASCII.LF),
147-
when 'v' => Short_Escape_Sequence (ASCII.VT),
148-
when 'f' => Short_Escape_Sequence (ASCII.FF),
149-
when 'r' => Short_Escape_Sequence (ASCII.CR),
150-
when '\' => Short_Escape_Sequence ('\'),
151-
when '"' => Short_Escape_Sequence ('"'),
152-
153-
when others => raise Program_Error);
191+
when '0' =>
192+
Result.Value := Short_Escape_Sequence (ASCII.NUL);
193+
when 'a' =>
194+
Result.Value := Short_Escape_Sequence (ASCII.BEL);
195+
when 'b' =>
196+
Result.Value := Short_Escape_Sequence (ASCII.BS);
197+
when 't' =>
198+
Result.Value := Short_Escape_Sequence (ASCII.HT);
199+
when 'n' =>
200+
Result.Value := Short_Escape_Sequence (ASCII.LF);
201+
when 'v' =>
202+
Result.Value := Short_Escape_Sequence (ASCII.VT);
203+
when 'f' =>
204+
Result.Value := Short_Escape_Sequence (ASCII.FF);
205+
when 'r' =>
206+
Result.Value := Short_Escape_Sequence (ASCII.CR);
207+
when '\' =>
208+
Result.Value := Short_Escape_Sequence ('\');
209+
when '"' =>
210+
if For_Char_Lit then
211+
Result.Has_Error := True;
212+
Result.Error_Message :=
213+
Create_String ("invalid escape sequence");
214+
else
215+
Result.Value := Short_Escape_Sequence ('"');
216+
end if;
217+
when ''' =>
218+
if For_Char_Lit then
219+
Result.Value := Short_Escape_Sequence (''');
220+
else
221+
Result.Has_Error := True;
222+
Result.Error_Message :=
223+
Create_String ("invalid escape sequence");
224+
end if;
225+
226+
when others =>
227+
Result.Has_Error := True;
228+
Result.Error_Message :=
229+
Create_String ("invalid escape sequence");
230+
end case;
231+
232+
if Result.Has_Error then
233+
return;
234+
end if;
154235
end;
155236
else
156-
Result := Buffer (Cursor);
237+
Result.Value := Buffer (Cursor);
238+
Cursor_Sloc.Column :=
239+
Cursor_Sloc.Column + Column_Count (Buffer (Cursor .. Cursor));
157240
Cursor := Cursor + 1;
158241
end if;
159242
end Read_Denoted_Char;
@@ -331,17 +414,32 @@ package body Liblktlang.Implementation.Extensions is
331414
------------------------------
332415

333416
function Char_Lit_P_Denoted_Value
334-
(Node : Bare_Char_Lit) return Character_Type
417+
(Node : Bare_Char_Lit) return Internal_Decoded_Char_Value
335418
is
336419
N_Text : constant Text_Type := Text (Node);
337420
pragma Assert (N_Text (N_Text'First) = ''');
338421
pragma Assert (N_Text (N_Text'Last) = ''');
339422

340-
Cursor : Positive := N_Text'First + 1;
341-
Result : Character_Type;
423+
Cursor : Positive := N_Text'First + 1;
424+
Cursor_Sloc : Source_Location := Start_Sloc (Sloc_Range (Node));
425+
Result : Internal_Decoded_Char_Value;
342426
begin
343-
Read_Denoted_Char (N_Text, Cursor, Result);
344-
pragma Assert (Cursor = N_Text'Last);
427+
-- Before reading the denoted character, update Cursor_Sloc so that it
428+
-- corresponds to the character right after the opening single quote.
429+
430+
Cursor_Sloc.Column := Cursor_Sloc.Column + 1;
431+
Read_Denoted_Char (N_Text, True, Cursor, Cursor_Sloc, Result);
432+
433+
-- Ensure that reading one character has moved the cursor to the closing
434+
-- quote. If it is not the case, there are too many characters in this
435+
-- literal.
436+
437+
if not Result.Has_Error and then Cursor /= N_Text'Last then
438+
Result.Has_Error := True;
439+
Result.Error_Sloc := Cursor_Sloc;
440+
Result.Error_Message :=
441+
Create_String ("exactly one character expected");
442+
end if;
345443
return Result;
346444
end Char_Lit_P_Denoted_Value;
347445

@@ -350,7 +448,7 @@ package body Liblktlang.Implementation.Extensions is
350448
--------------------------------
351449

352450
function String_Lit_P_Denoted_Value
353-
(Node : Bare_String_Lit) return String_Type is
451+
(Node : Bare_String_Lit) return Internal_Decoded_String_Value is
354452
begin
355453
return Common_Denoted_String (Node);
356454
end String_Lit_P_Denoted_Value;
@@ -360,7 +458,7 @@ package body Liblktlang.Implementation.Extensions is
360458
-------------------------------
361459

362460
function Token_Lit_P_Denoted_Value
363-
(Node : Bare_Token_Lit) return String_Type is
461+
(Node : Bare_Token_Lit) return Internal_Decoded_String_Value is
364462
begin
365463
return Common_Denoted_String (Node);
366464
end Token_Lit_P_Denoted_Value;
@@ -370,7 +468,7 @@ package body Liblktlang.Implementation.Extensions is
370468
---------------------------------------
371469

372470
function Token_Pattern_Lit_P_Denoted_Value
373-
(Node : Bare_Token_Pattern_Lit) return String_Type is
471+
(Node : Bare_Token_Pattern_Lit) return Internal_Decoded_String_Value is
374472
begin
375473
return Common_Denoted_String (Node);
376474
end Token_Pattern_Lit_P_Denoted_Value;

contrib/lkt/extensions/src/liblktlang-implementation-extensions.ads

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -31,19 +31,19 @@ package Liblktlang.Implementation.Extensions is
3131
-- Return the prefix of this string
3232

3333
function Char_Lit_P_Denoted_Value
34-
(Node : Bare_Char_Lit) return Character_Type;
34+
(Node : Bare_Char_Lit) return Internal_Decoded_Char_Value;
3535
-- Return the content of the given character literal node
3636

3737
function String_Lit_P_Denoted_Value
38-
(Node : Bare_String_Lit) return String_Type;
38+
(Node : Bare_String_Lit) return Internal_Decoded_String_Value;
3939
-- Return the content of the given string literal node
4040

4141
function Token_Lit_P_Denoted_Value
42-
(Node : Bare_Token_Lit) return String_Type;
42+
(Node : Bare_Token_Lit) return Internal_Decoded_String_Value;
4343
-- Return the content of the given token literal node
4444

4545
function Token_Pattern_Lit_P_Denoted_Value
46-
(Node : Bare_Token_Pattern_Lit) return String_Type;
46+
(Node : Bare_Token_Pattern_Lit) return Internal_Decoded_String_Value;
4747
-- Return the content of the given token pattern literal node
4848

4949
end Liblktlang.Implementation.Extensions;

contrib/lkt/language/lexer.py

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -91,9 +91,8 @@ class Token(LexerToken):
9191
("HEX_DIGITS_2", r'{HEX_DIGIT}{HEX_DIGIT}'),
9292
("HEX_DIGITS_4", r'{HEX_DIGITS_2}{HEX_DIGITS_2}'),
9393
("HEX_DIGITS_8", r'{HEX_DIGITS_4}{HEX_DIGITS_4}'),
94-
("STRING_LIT", r'\"(\\\"|[^\n\"])*\"'),
95-
("CHAR_LIT",
96-
r"'(\\'|[^\n']|\\x{HEX_DIGITS_2}|\\u{HEX_DIGITS_4}|\\U{HEX_DIGITS_8})'"),
94+
("STRING_LIT", r'"(\\"|\\[^"]|[^\n"\\])*"'),
95+
("CHAR_LIT", r"'(\\'|[^\n']*)'"),
9796
)
9897

9998

0 commit comments

Comments
 (0)