Skip to content

Commit 7f2c7fe

Browse files
authored
Adds numeric literals to grammar. (#20)
Specifically adds the `Number` rule for parsing decimals and integers. Also adds the `IntegerLiteral` and `DecimalLiteral` content variants to capture the respective literal tokens. Notes: * Adds `num-traits`, `num-bigint`, and `bigdecimal` to provide support for integer/decimal parsing. * Adds `PairExt::unexpected()` for dealing with unexpected rules. * Adds `PairExt::syntax_error()` for easily creating syntax errors from a pair. * Updates scanner doc test to `panic!` for unexpected tokens. * Fixes the bad recognizer test: the recognizer only has scanner support, so it cannot detect that `99_RANCH` is syntactically incorrect, because the input simply lexes to `99` followed by `_RANCH`. Real PartiQL parsing will fix this in the future.
1 parent a5a616f commit 7f2c7fe

File tree

5 files changed

+262
-12
lines changed

5 files changed

+262
-12
lines changed

partiql-parser/Cargo.toml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,9 @@ version = "0.0.0"
2323
pest = "~2.1.3"
2424
pest_derive = "~2.1.0"
2525
thiserror = "~1.0.24"
26+
num-traits = "~0.2.14"
27+
num-bigint = "~0.4.0"
28+
bigdecimal = "~0.2.0"
2629

2730
[dev-dependencies]
2831
rstest = "~0.9.0"

partiql-parser/src/lib.rs

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@
2121
//! // get the parsed variant of the token
2222
//! match first.content() {
2323
//! Keyword(kw) => assert_eq!("SELECT", kw),
24-
//! Identifier(_) | StringLiteral(_) => panic!("Didn't get a keyword!"),
24+
//! _ => panic!("Didn't get a keyword!"),
2525
//! }
2626
//! // the entire text of a token can be fetched--which looks roughly the
2727
//! // same for a keyword.
@@ -31,7 +31,7 @@
3131
//! // get the parsed variant of the token
3232
//! match second.content() {
3333
//! StringLiteral(text) => assert_eq!("🦄💩", text),
34-
//! Keyword(_) | Identifier(_) => panic!("Didn't get a string literal!"),
34+
//! _ => panic!("Didn't get a string literal!"),
3535
//! }
3636
//! // the other thing we can do is get line/column information from a token
3737
//! assert_eq!(LineAndColumn::try_at(1, 8)?, second.start());

partiql-parser/src/partiql.pest

Lines changed: 34 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,9 +14,42 @@ Scanner = _{ SOI ~ Token }
1414
Token = _{
1515
Keyword
1616
| String
17+
| Number
1718
| Identifier
1819
}
1920

21+
//
22+
// Numeric Literals
23+
//
24+
25+
Number = ${
26+
DecimalExp
27+
| Decimal
28+
| Integer
29+
}
30+
31+
DecimalExp = {
32+
Decimal ~ ("e" | "E") ~ Integer
33+
}
34+
35+
Decimal = {
36+
Integer? ~ "." ~ Fraction
37+
}
38+
39+
Fraction = {
40+
Digit*
41+
}
42+
43+
// XXX this explicitly supports arbitrary zero prefixing in various places
44+
// which is part of the SQL grammar and also supported in implementations
45+
// like Postgres/SQLite/MySQL/etc.
46+
Integer = {
47+
Sign? ~ Digit+
48+
}
49+
50+
Sign = _{ "+" | "-" }
51+
Digit = _{ '0'..'9' }
52+
2053
//
2154
// String Literals
2255
//
@@ -40,7 +73,7 @@ NonQuotedIdentifier = @{
4073

4174
NonQuotedIdentifierStart = _{ "_" | "$" | 'a'..'z' | 'A'..'Z' }
4275

43-
NonQuotedIdentifierCont = _{ NonQuotedIdentifierStart | '0'..'9' }
76+
NonQuotedIdentifierCont = _{ NonQuotedIdentifierStart | Digit }
4477

4578
QuotedIdentifier = @{ "\"" ~ QuotedIdentifierContent* ~ "\"" }
4679

partiql-parser/src/peg.rs

Lines changed: 15 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -46,8 +46,11 @@ pub(crate) trait PairExt<'val, R: RuleType> {
4646
/// Translates the end position of the [`Pair`] into a [`LineAndColumn`].
4747
fn end(&self) -> ParserResult<LineAndColumn>;
4848

49-
/// Returns an `Err` with a syntax error from the pair
50-
fn syntax_error<T>(&self) -> ParserResult<T>;
49+
/// Returns an `Err` with a syntax error from the unexpected pair.
50+
fn unexpected<T>(&self) -> ParserResult<T>;
51+
52+
/// Returns an `Err` with a syntax error from this pair with a message.
53+
fn syntax_error<T, S: Into<String>>(&self, message: S) -> ParserResult<T>;
5154
}
5255

5356
impl<'val, R: RuleType> PairExt<'val, R> for Pair<'val, R> {
@@ -61,8 +64,15 @@ impl<'val, R: RuleType> PairExt<'val, R> for Pair<'val, R> {
6164
self.as_span().end_pos().line_col().try_into()
6265
}
6366

64-
fn syntax_error<T>(&self) -> ParserResult<T> {
65-
syntax_error(format!("Unexpected rule: {:?}", self), self.start()?.into())
67+
fn unexpected<T>(&self) -> ParserResult<T> {
68+
self.syntax_error(format!("Unexpected rule: {:?}", self))
69+
}
70+
71+
fn syntax_error<T, S: Into<String>>(&self, message: S) -> ParserResult<T> {
72+
let position = self
73+
.start()
74+
.map_or(Position::Unknown, |location| location.into());
75+
syntax_error(message, position)
6676
}
6777
}
6878

@@ -86,7 +96,7 @@ mod tests {
8696
#[rstest]
8797
#[case::simple("select \"🍦\" fRoM \"🚽\" WHERE is_defined", Ok(()))]
8898
#[case::error(
89-
"SELECT SOMETHING FROM 99_RANCH",
99+
"SELECT SOMETHING FROM 💩",
90100
syntax_error("IGNORED MESSAGE", Position::at(1, 23))
91101
)]
92102
fn recognize(#[case] input: &str, #[case] expected: ParserResult<()>) -> ParserResult<()> {

partiql-parser/src/scanner.rs

Lines changed: 208 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -9,17 +9,31 @@
99
1010
use crate::peg::{PairExt, PairsExt, PartiQLParser, Rule};
1111
use crate::prelude::*;
12+
use bigdecimal::BigDecimal;
13+
use num_bigint::BigInt;
14+
use num_traits::Num;
1215
use pest::iterators::Pair;
13-
use pest::Parser;
16+
use pest::{Parser, RuleType};
1417
use std::borrow::Cow;
1518

1619
/// The parsed content associated with a [`Token`] that has been scanned.
1720
#[derive(Clone, Debug, Eq, PartialEq)]
1821
pub enum Content<'val> {
1922
/// A PartiQL keyword. Contains the slice for the keyword case folded to upper case.
2023
Keyword(Cow<'val, str>),
24+
2125
/// An identifier. Contains the slice for the text of the identifier.
2226
Identifier(Cow<'val, str>),
27+
28+
/// An integer literal. Stores this as a [`BigInt`].
29+
///
30+
/// Users will likely deal with smaller integers and encode this in execution/compilation
31+
/// as `i64` or the like, but the parser need not deal with that detail.
32+
IntegerLiteral(BigInt),
33+
34+
/// A decimal literal. Contains the parsed [`BigDecimal`] for the literal.
35+
DecimalLiteral(BigDecimal),
36+
2337
/// A string literal. Contains the slice for the content of the literal.
2438
StringLiteral(Cow<'val, str>),
2539
// TODO things like literals, punctuation, etc.
@@ -123,6 +137,18 @@ fn normalize_quoted_ident(raw_text: &str) -> Cow<str> {
123137
.into()
124138
}
125139

140+
fn parse_num<T, R, E>(pair: Pair<R>) -> ParserResult<T>
141+
where
142+
T: Num<FromStrRadixErr = E>,
143+
R: RuleType,
144+
E: std::fmt::Display,
145+
{
146+
match T::from_str_radix(pair.as_str(), 10) {
147+
Ok(value) => Ok(value),
148+
Err(e) => pair.syntax_error(format!("Could not parse number {}: {}", pair.as_str(), e)),
149+
}
150+
}
151+
126152
impl<'val> PartiQLScanner<'val> {
127153
fn do_next_token(&mut self) -> ParserResult<Token<'val>> {
128154
// the scanner rule is expected to return a single node
@@ -144,10 +170,20 @@ impl<'val> PartiQLScanner<'val> {
144170
Rule::QuotedIdentifier => {
145171
Content::Identifier(normalize_quoted_ident(ident_pair.as_str()))
146172
}
147-
_ => return ident_pair.syntax_error(),
173+
_ => return ident_pair.unexpected(),
174+
}
175+
}
176+
Rule::Number => {
177+
let number_pair = pair.into_inner().exactly_one()?;
178+
match number_pair.as_rule() {
179+
Rule::Integer => Content::IntegerLiteral(parse_num(number_pair)?),
180+
Rule::Decimal | Rule::DecimalExp => {
181+
Content::DecimalLiteral(parse_num(number_pair)?)
182+
}
183+
_ => return number_pair.unexpected(),
148184
}
149185
}
150-
_ => return pair.syntax_error(),
186+
_ => return pair.unexpected(),
151187
};
152188

153189
Ok(Token {
@@ -292,7 +328,7 @@ mod test {
292328
]
293329
)]
294330
#[case::bad_identifier(
295-
" 99ranch",
331+
" 💩",
296332
vec![
297333
syntax_error("IGNORED MESSAGE", Position::at(1, 9)),
298334
]
@@ -349,6 +385,174 @@ mod test {
349385
syntax_error("IGNORED MESSAGE", Position::at(1, 32)),
350386
]
351387
)]
388+
#[case::numeric_literals(
389+
"1 -0099 1.1 +00055.023100 99.1234e0010",
390+
vec![
391+
Ok(Token {
392+
content: Content::IntegerLiteral(1.into()),
393+
start: LineAndColumn::at(1, 1),
394+
end: LineAndColumn::at(1, 2),
395+
text: "1",
396+
remainder: Remainder {
397+
input: " -0099 1.1 +00055.023100 99.1234e0010",
398+
offset: LineAndColumn::at(1, 2)
399+
}
400+
}),
401+
Ok(Token {
402+
content: Content::IntegerLiteral(BigInt::from(-99)),
403+
start: LineAndColumn::at(1, 3),
404+
end: LineAndColumn::at(1, 8),
405+
text: "-0099",
406+
remainder: Remainder {
407+
input: " 1.1 +00055.023100 99.1234e0010",
408+
offset: LineAndColumn::at(1, 8)
409+
}
410+
}),
411+
Ok(Token {
412+
content: Content::DecimalLiteral(BigDecimal::from_str_radix("1.1", 10).unwrap()),
413+
start: LineAndColumn::at(1, 9),
414+
end: LineAndColumn::at(1, 12),
415+
text: "1.1",
416+
remainder: Remainder {
417+
input: " +00055.023100 99.1234e0010",
418+
offset: LineAndColumn::at(1, 12)
419+
}
420+
}),
421+
Ok(Token {
422+
content: Content::DecimalLiteral(BigDecimal::from_str_radix("55.023100", 10).unwrap()),
423+
start: LineAndColumn::at(1, 13),
424+
end: LineAndColumn::at(1, 26),
425+
text: "+00055.023100",
426+
remainder: Remainder {
427+
input: " 99.1234e0010",
428+
offset: LineAndColumn::at(1, 26)
429+
}
430+
}),
431+
Ok(Token {
432+
content: Content::DecimalLiteral(BigDecimal::from_str_radix("99.1234e10", 10).unwrap()),
433+
start: LineAndColumn::at(1, 27),
434+
end: LineAndColumn::at(1, 39),
435+
text: "99.1234e0010",
436+
remainder: Remainder {
437+
input: "",
438+
offset: LineAndColumn::at(1, 39)
439+
}
440+
}),
441+
syntax_error("IGNORED MESSAGE", Position::at(1, 39)),
442+
]
443+
)]
444+
#[case::numeric_literals_with_pads(
445+
"+0005 .0001 -00.0002 000003.004E+001",
446+
vec![
447+
Ok(Token {
448+
content: Content::IntegerLiteral(5.into()),
449+
start: LineAndColumn::at(1, 1),
450+
end: LineAndColumn::at(1, 6),
451+
text: "+0005",
452+
remainder: Remainder {
453+
input: " .0001 -00.0002 000003.004E+001",
454+
offset: LineAndColumn::at(1, 6)
455+
}
456+
}),
457+
Ok(Token {
458+
content: Content::DecimalLiteral(BigDecimal::from_str_radix("0.0001", 10).unwrap()),
459+
start: LineAndColumn::at(1, 7),
460+
end: LineAndColumn::at(1, 12),
461+
text: ".0001",
462+
remainder: Remainder {
463+
input: " -00.0002 000003.004E+001",
464+
offset: LineAndColumn::at(1, 12)
465+
}
466+
}),
467+
Ok(Token {
468+
content: Content::DecimalLiteral(BigDecimal::from_str_radix("-0.0002", 10).unwrap()),
469+
start: LineAndColumn::at(1, 13),
470+
end: LineAndColumn::at(1, 21),
471+
text: "-00.0002",
472+
remainder: Remainder {
473+
input: " 000003.004E+001",
474+
offset: LineAndColumn::at(1, 21)
475+
}
476+
}),
477+
Ok(Token {
478+
content: Content::DecimalLiteral(BigDecimal::from_str_radix("3.004e1", 10).unwrap()),
479+
start: LineAndColumn::at(1, 22),
480+
end: LineAndColumn::at(1, 37),
481+
text: "000003.004E+001",
482+
remainder: Remainder {
483+
input: "",
484+
offset: LineAndColumn::at(1, 37)
485+
}
486+
}),
487+
syntax_error("IGNORED MESSAGE", Position::at(1, 37)),
488+
]
489+
)]
490+
#[case::zeroes(
491+
"0 000 .0 000.000 .0e0 0.0e000",
492+
vec![
493+
Ok(Token {
494+
content: Content::IntegerLiteral(0.into()),
495+
start: LineAndColumn::at(1, 1),
496+
end: LineAndColumn::at(1, 2),
497+
text: "0",
498+
remainder: Remainder {
499+
input: " 000 .0 000.000 .0e0 0.0e000",
500+
offset: LineAndColumn::at(1, 2)
501+
}
502+
}),
503+
Ok(Token {
504+
content: Content::IntegerLiteral(0.into()),
505+
start: LineAndColumn::at(1, 3),
506+
end: LineAndColumn::at(1, 6),
507+
text: "000",
508+
remainder: Remainder {
509+
input: " .0 000.000 .0e0 0.0e000",
510+
offset: LineAndColumn::at(1, 6)
511+
}
512+
}),
513+
Ok(Token {
514+
content: Content::DecimalLiteral(BigDecimal::from_str_radix("0.0", 10).unwrap()),
515+
start: LineAndColumn::at(1, 7),
516+
end: LineAndColumn::at(1, 9),
517+
text: ".0",
518+
remainder: Remainder {
519+
input: " 000.000 .0e0 0.0e000",
520+
offset: LineAndColumn::at(1, 9)
521+
}
522+
}),
523+
Ok(Token {
524+
content: Content::DecimalLiteral(BigDecimal::from_str_radix("0.000", 10).unwrap()),
525+
start: LineAndColumn::at(1, 10),
526+
end: LineAndColumn::at(1, 17),
527+
text: "000.000",
528+
remainder: Remainder {
529+
input: " .0e0 0.0e000",
530+
offset: LineAndColumn::at(1, 17)
531+
}
532+
}),
533+
Ok(Token {
534+
content: Content::DecimalLiteral(BigDecimal::from_str_radix("0.0", 10).unwrap()),
535+
start: LineAndColumn::at(1, 18),
536+
end: LineAndColumn::at(1, 22),
537+
text: ".0e0",
538+
remainder: Remainder {
539+
input: " 0.0e000",
540+
offset: LineAndColumn::at(1, 22)
541+
}
542+
}),
543+
Ok(Token {
544+
content: Content::DecimalLiteral(BigDecimal::from_str_radix("0.0", 10).unwrap()),
545+
start: LineAndColumn::at(1, 23),
546+
end: LineAndColumn::at(1, 30),
547+
text: "0.0e000",
548+
remainder: Remainder {
549+
input: "",
550+
offset: LineAndColumn::at(1, 30)
551+
}
552+
}),
553+
syntax_error("IGNORED MESSAGE", Position::at(1, 30)),
554+
]
555+
)]
352556
#[case::select_from(
353557
r#"SelEct '✨✨✨' fROM "┬─┬" "#,
354558
vec![

0 commit comments

Comments
 (0)