Skip to content

Commit 3d93c50

Browse files
nepalmaximyurchuk
authored and committed
Merge GH PR #9404
commit_hash:d780798556aedbe2be898d69185380f2ecb95f9c
1 parent 8937a36 commit 3d93c50

File tree

17 files changed

+296
-62
lines changed

17 files changed

+296
-62
lines changed

yql/essentials/parser/lexer_common/lexer.h

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,9 @@ struct TParsedToken {
1818
// TODO: TStringBuf for Name & Content
1919
TString Name;
2020
TString Content;
21-
// Position of first token symbol
21+
// Position of first token byte/symbol
22+
// When antlr3 lexer is used, LinePos is a position as in a byte array,
23+
// but when antlr4 lexer is used, LinePos is a position as in a symbol array,
2224
ui32 Line = 0; // starts from 1
2325
ui32 LinePos = 0; // starts from 0
2426
};

yql/essentials/parser/lexer_common/tokens.cpp

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,5 @@
11
#include "lexer.h"
22

3-
43
namespace NSQLTranslation {
54

65
IOutputStream& OutputTokens(IOutputStream& out, TParsedTokenList::const_iterator begin, TParsedTokenList::const_iterator end) {
@@ -18,5 +17,4 @@ bool Tokenize(ILexer& lexer, const TString& query, const TString& queryName, TPa
1817
return lexer.Tokenize(query, queryName, onNextToken, issues, maxErrors);
1918
}
2019

21-
2220
}

yql/essentials/parser/proto_ast/antlr3/proto_ast_antlr3.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -60,6 +60,7 @@ namespace NProtoAST {
6060
try {
6161
Lexer.ReportErrors(&errors);
6262
auto src = Lexer.get_tokSource();
63+
6364
for (;;) {
6465
auto token = src->nextToken();
6566
auto type = token->getType();
@@ -69,6 +70,7 @@ namespace NProtoAST {
6970
last.Content = token->getText();
7071
last.Line = token->get_line();
7172
last.LinePos = token->get_charPositionInLine();
73+
7274
onNextToken(std::move(last));
7375
if (isEOF) {
7476
break;

yql/essentials/parser/proto_ast/antlr4/proto_ast_antlr4.h

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -89,6 +89,11 @@ namespace NProtoAST {
8989

9090
void CollectTokens(IErrorCollector& errors, const NSQLTranslation::ILexer::TTokenCallback& onNextToken) {
9191
try {
92+
bool error = false;
93+
typename antlr4::YqlErrorListener listener(&errors, &error);
94+
Lexer.removeErrorListeners();
95+
Lexer.addErrorListener(&listener);
96+
9297
for (;;) {
9398
auto token = Lexer.nextToken();
9499
auto type = token->getType();

yql/essentials/sql/v1/SQLv1.g.in

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1724,7 +1724,6 @@ MINUS: '-';
17241724
TILDA: '~';
17251725
ASTERISK: '*';
17261726
SLASH: '/';
1727-
BACKSLASH: '\\';
17281727
PERCENT: '%';
17291728
SEMICOLON: ';';
17301729
DOT: '.';
@@ -1736,9 +1735,6 @@ COLON: ':';
17361735
COMMAT: '@';
17371736
DOUBLE_COMMAT: '@@';
17381737
DOLLAR: '$';
1739-
QUOTE_DOUBLE: '"'; // This comment for fix syntax highlighting "
1740-
QUOTE_SINGLE: '\'';
1741-
BACKTICK: '`';
17421738
LBRACE_CURLY: '{';
17431739
RBRACE_CURLY: '}';
17441740
CARET: '^';
@@ -1747,6 +1743,11 @@ ARROW: '->';
17471743
RBRACE_SQUARE: ']';
17481744
LBRACE_SQUARE: '['; // pair ]
17491745

1746+
fragment BACKSLASH: '\\';
1747+
fragment QUOTE_DOUBLE: '"';
1748+
fragment QUOTE_SINGLE: '\'';
1749+
fragment BACKTICK: '`';
1750+
17501751
// http://www.antlr.org/wiki/pages/viewpage.action?pageId=1782
17511752
fragment A:('a'|'A');
17521753
fragment B:('b'|'B');

yql/essentials/sql/v1/SQLv1Antlr4.g.in

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1723,7 +1723,6 @@ MINUS: '-';
17231723
TILDA: '~';
17241724
ASTERISK: '*';
17251725
SLASH: '/';
1726-
BACKSLASH: '\\';
17271726
PERCENT: '%';
17281727
SEMICOLON: ';';
17291728
DOT: '.';
@@ -1735,9 +1734,6 @@ COLON: ':';
17351734
COMMAT: '@';
17361735
DOUBLE_COMMAT: '@@';
17371736
DOLLAR: '$';
1738-
QUOTE_DOUBLE: '"'; // This comment for fix syntax highlighting "
1739-
QUOTE_SINGLE: '\'';
1740-
BACKTICK: '`';
17411737
LBRACE_CURLY: '{';
17421738
RBRACE_CURLY: '}';
17431739
CARET: '^';
@@ -1746,6 +1742,11 @@ ARROW: '->';
17461742
RBRACE_SQUARE: ']';
17471743
LBRACE_SQUARE: '['; // pair ]
17481744

1745+
fragment BACKSLASH: '\\';
1746+
fragment QUOTE_DOUBLE: '"';
1747+
fragment QUOTE_SINGLE: '\'';
1748+
fragment BACKTICK: '`';
1749+
17491750
// http://www.antlr.org/wiki/pages/viewpage.action?pageId=1782
17501751
fragment A:('a'|'A');
17511752
fragment B:('b'|'B');

yql/essentials/sql/v1/format/sql_format.cpp

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@ using namespace NSQLv1Generated;
2626

2727
using NSQLTranslation::TParsedToken;
2828
using NSQLTranslation::TParsedTokenList;
29+
using NSQLTranslationV1::IsProbablyKeyword;
2930
using TTokenIterator = TParsedTokenList::const_iterator;
3031

3132
TTokenIterator SkipWS(TTokenIterator curr, TTokenIterator end) {
@@ -55,7 +56,7 @@ bool Validate(const TParsedTokenList& query, const TParsedTokenList& formattedQu
5556
if (in->Name != out->Name) {
5657
return false;
5758
}
58-
if (AsciiEqualsIgnoreCase(in->Name, in->Content)) {
59+
if (IsProbablyKeyword(*in)) {
5960
if (!AsciiEqualsIgnoreCase(in->Content, out->Content)) {
6061
return false;
6162
}

yql/essentials/sql/v1/lexer/lexer.cpp

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,8 @@
99
#include <yql/essentials/parser/proto_ast/gen/v1_antlr4/SQLv1Antlr4Lexer.h>
1010
#include <yql/essentials/parser/proto_ast/gen/v1_ansi_antlr4/SQLv1Antlr4Lexer.h>
1111

12+
#include <util/string/ascii.h>
13+
1214
#if defined(_tsan_enabled_)
1315
#include <util/system/mutex.h>
1416
#endif
@@ -74,4 +76,8 @@ NSQLTranslation::ILexer::TPtr MakeLexer(bool ansi, bool antlr4) {
7476
return NSQLTranslation::ILexer::TPtr(new TV1Lexer(ansi, antlr4));
7577
}
7678

79+
bool IsProbablyKeyword(const NSQLTranslation::TParsedToken& token) {
80+
return AsciiEqualsIgnoreCase(token.Name, token.Content);
81+
}
82+
7783
} // namespace NSQLTranslationV1

yql/essentials/sql/v1/lexer/lexer.h

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,4 +6,10 @@ namespace NSQLTranslationV1 {
66

77
NSQLTranslation::ILexer::TPtr MakeLexer(bool ansi, bool antlr4);
88

9+
// "Probably" because YQL keyword can be an identifier
10+
// depending on a query context. For example
11+
// in SELECT * FROM group - group is an identifier, but
12+
// in SELECT * FROM ... GROUP BY ... - group is a keyword.
13+
bool IsProbablyKeyword(const NSQLTranslation::TParsedToken& token);
14+
915
}
Lines changed: 185 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,185 @@
1+
#include "lexer.h"
2+
3+
#include <yql/essentials/core/issue/yql_issue.h>
4+
#include <yql/essentials/sql/settings/translation_settings.h>
5+
6+
#include <library/cpp/testing/unittest/registar.h>
7+
8+
using namespace NSQLTranslation;
9+
using namespace NSQLTranslationV1;
10+
11+
std::pair<TParsedTokenList, NYql::TIssues> Tokenize(ILexer::TPtr& lexer, TString queryUtf8) {
12+
TParsedTokenList tokens;
13+
NYql::TIssues issues;
14+
Tokenize(*lexer, queryUtf8, "Query", tokens, issues, SQL_MAX_PARSER_ERRORS);
15+
return {tokens, issues};
16+
}
17+
18+
TVector<TString> GetIssueMessages(ILexer::TPtr& lexer, TString queryUtf8) {
19+
TVector<TString> messages;
20+
for (const auto& issue : Tokenize(lexer, queryUtf8).second) {
21+
messages.emplace_back(issue.ToString(/* oneLine = */ true));
22+
}
23+
return messages;
24+
}
25+
26+
TVector<TString> GetTokenViews(ILexer::TPtr& lexer, TString queryUtf8) {
27+
TVector<TString> names;
28+
for (auto& token : Tokenize(lexer, queryUtf8).first) {
29+
TString view = std::move(token.Name);
30+
if (view == "ID_PLAIN" || view == "STRING_VALUE") {
31+
view.append(" (");
32+
view.append(token.Content);
33+
view.append(")");
34+
}
35+
names.emplace_back(std::move(view));
36+
}
37+
return names;
38+
}
39+
40+
void AssertEquivialent(const TParsedToken& lhs, const TParsedToken& rhs) {
41+
if (lhs.Name == "EOF" && rhs.Name == "EOF") {
42+
return;
43+
}
44+
45+
UNIT_ASSERT_VALUES_EQUAL(lhs.Name, rhs.Name);
46+
UNIT_ASSERT_VALUES_EQUAL(lhs.Content, rhs.Content);
47+
UNIT_ASSERT_VALUES_EQUAL(lhs.Line, rhs.Line);
48+
}
49+
50+
void AssertEquivialent(const TParsedTokenList& lhs, const TParsedTokenList& rhs) {
51+
UNIT_ASSERT_VALUES_EQUAL(lhs.size(), rhs.size());
52+
for (size_t i = 0; i < lhs.size(); ++i) {
53+
AssertEquivialent(lhs.at(i), rhs.at(i));
54+
}
55+
}
56+
57+
Y_UNIT_TEST_SUITE(SQLv1Lexer) {
58+
Y_UNIT_TEST(AntlrVersionIndependent) {
59+
const TVector<TString> queriesUtf8 = {
60+
"",
61+
" ",
62+
"SELECT",
63+
"SEL", // identifier
64+
"SELECT FROM test",
65+
"SELECT * FROM",
66+
" SELECT * FROM ",
67+
"SELECT \"\xF0\x9F\x98\x8A\" FROM ydb",
68+
(
69+
"SELECT \"\xF0\x9F\x98\x8A Hello, друзья\", count, name\n"
70+
"FROM table -- главная таблица 数据库 \n"
71+
"WHERE count < 6\n"
72+
" AND name = \"可靠性\"\n"
73+
" AND count > 12"),
74+
"\"select\"select",
75+
};
76+
77+
auto lexer3 = MakeLexer(/* ansi = */ false, /* antlr4 = */ false);
78+
auto lexer4 = MakeLexer(/* ansi = */ false, /* antlr4 = */ true);
79+
80+
for (const auto& query : queriesUtf8) {
81+
auto [tokens3, issues3] = Tokenize(lexer3, query);
82+
auto [tokens4, issues4] = Tokenize(lexer4, query);
83+
AssertEquivialent(tokens3, tokens4);
84+
UNIT_ASSERT(issues3.Empty());
85+
UNIT_ASSERT(issues4.Empty());
86+
}
87+
}
88+
89+
TVector<TString> InvalidQueries();
90+
91+
void TestInvalidTokensSkipped(bool antlr4, const TVector<TVector<TString>>& expected) {
92+
auto lexer = MakeLexer(/* ansi = */ false, antlr4);
93+
94+
auto input = InvalidQueries();
95+
UNIT_ASSERT_VALUES_EQUAL(input.size(), expected.size());
96+
97+
for (size_t i = 0; i < input.size(); ++i) {
98+
UNIT_ASSERT_VALUES_EQUAL(GetTokenViews(lexer, input[i]), expected[i]);
99+
}
100+
}
101+
102+
TVector<TString> InvalidQueries() {
103+
return {
104+
/* 0: */ "\xF0\x9F\x98\x8A",
105+
/* 1: */ "select \"aaaa",
106+
/* 2: */ "\"\\\"",
107+
/* 3: */ "\xF0\x9F\x98\x8A SELECT * FR",
108+
/* 4: */ "! SELECT * from",
109+
/* 5: */ "\xF0\x9F\x98\x8Aselect ! from",
110+
/* 6: */ "\"",
111+
/* 7: */ "!select",
112+
/* 8: */ "SELECT \\\"\xF0\x9F\x98\x8A\\\" FROM test",
113+
};
114+
}
115+
116+
Y_UNIT_TEST(ErrorRecoveryAntlr3) {
117+
TVector<TVector<TString>> actual = {
118+
/* 0: */ {"EOF"},
119+
/* 1: */ {"SELECT", "WS", "EOF"},
120+
/* 2: */ {"EOF"},
121+
/* 3: */ {"WS", "SELECT", "WS", "ASTERISK", "WS", "ID_PLAIN (FR)", "EOF"},
122+
/* 4: */ {"ID_PLAIN (ELECT)", "WS", "ASTERISK", "WS", "WS", "FROM", "EOF"},
123+
/* 5: */ {"SELECT", "WS", "ID_PLAIN (rom)", "EOF"},
124+
/* 6: */ {"EOF"},
125+
/* 7: */ {"ID_PLAIN (lect)", "EOF"},
126+
/* 8: */ {"SELECT", "WS", "EOF"},
127+
};
128+
TestInvalidTokensSkipped(/* antlr4 = */ false, actual);
129+
}
130+
131+
Y_UNIT_TEST(ErrorRecoveryAntlr4) {
132+
TVector<TVector<TString>> actual = {
133+
/* 0: */ {"EOF"},
134+
/* 1: */ {"SELECT", "WS", "EOF"},
135+
/* 2: */ {"EOF"},
136+
/* 3: */ {"WS", "SELECT", "WS", "ASTERISK", "WS", "ID_PLAIN (FR)", "EOF"},
137+
/* 4: */ {"SELECT", "WS", "ASTERISK", "WS", "WS", "FROM", "EOF"},
138+
/* 5: */ {"SELECT", "WS", "FROM", "EOF"},
139+
/* 6: */ {"EOF"},
140+
/* 7: */ {"ID_PLAIN (elect)", "EOF"},
141+
/* 8: */ {"SELECT", "WS", "EOF"},
142+
};
143+
TestInvalidTokensSkipped(/* antlr4 = */ true, actual);
144+
}
145+
146+
Y_UNIT_TEST(IssuesCollected) {
147+
auto lexer3 = MakeLexer(/* ansi = */ false, /* antlr4 = */ false);
148+
auto lexer4 = MakeLexer(/* ansi = */ false, /* antlr4 = */ true);
149+
150+
for (const auto& query : InvalidQueries()) {
151+
auto issues3 = GetIssueMessages(lexer3, query);
152+
auto issues4 = GetIssueMessages(lexer4, query);
153+
154+
UNIT_ASSERT(!issues3.empty());
155+
UNIT_ASSERT(!issues4.empty());
156+
}
157+
}
158+
159+
Y_UNIT_TEST(IssueMessagesAntlr3) {
160+
auto lexer3 = MakeLexer(/* ansi = */ false, /* antlr4 = */ false);
161+
162+
auto actual = GetIssueMessages(lexer3, "\xF0\x9F\x98\x8A SELECT * FR");
163+
164+
TVector<TString> expected = {
165+
"<main>:1:0: Error: Unexpected character '\xF0\x9F\x98\x8A' (Unicode character <128522>) : cannot match to any predicted input...",
166+
"<main>:1:1: Error: Unexpected character : cannot match to any predicted input...",
167+
"<main>:1:2: Error: Unexpected character : cannot match to any predicted input...",
168+
"<main>:1:3: Error: Unexpected character : cannot match to any predicted input...",
169+
};
170+
171+
UNIT_ASSERT_VALUES_EQUAL(actual, expected);
172+
}
173+
174+
Y_UNIT_TEST(IssueMessagesAntlr4) {
175+
auto lexer4 = MakeLexer(/* ansi = */ false, /* antlr4 = */ true);
176+
177+
auto actual = GetIssueMessages(lexer4, "\xF0\x9F\x98\x8A SELECT * FR");
178+
179+
TVector<TString> expected = {
180+
"<main>:1:0: Error: token recognition error at: '\xF0\x9F\x98\x8A'",
181+
};
182+
183+
UNIT_ASSERT_VALUES_EQUAL(actual, expected);
184+
}
185+
}

0 commit comments

Comments
 (0)