diff --git a/.gitignore b/.gitignore
index 6dfd81a68..baccda415 100644
--- a/.gitignore
+++ b/.gitignore
@@ -13,3 +13,5 @@ Cargo.lock
 # IDEs
 .idea
 .vscode
+
+*.swp
diff --git a/src/dialect/postgresql.rs b/src/dialect/postgresql.rs
index b1f261b2e..fe1953d2a 100644
--- a/src/dialect/postgresql.rs
+++ b/src/dialect/postgresql.rs
@@ -52,7 +52,7 @@ pub fn parse_comment(parser: &mut Parser) -> Result<Statement, ParserError> {
     parser.expect_keyword(Keyword::ON)?;
     let token = parser.next_token();
 
-    let (object_type, object_name) = match token {
+    let (object_type, object_name) = match token.token {
         Token::Word(w) if w.keyword == Keyword::COLUMN => {
             let object_name = parser.parse_object_name()?;
             (CommentObject::Column, object_name)
diff --git a/src/parser.rs b/src/parser.rs
index d1d26293d..45c472acd 100644
--- a/src/parser.rs
+++ b/src/parser.rs
@@ -105,7 +105,7 @@ impl fmt::Display for ParserError {
 impl std::error::Error for ParserError {}
 
 pub struct Parser<'a> {
-    tokens: Vec<Token>,
+    tokens: Vec<TokenWithLocation>,
     /// The index of the first unprocessed token in `self.tokens`
     index: usize,
     dialect: &'a dyn Dialect,
@@ -113,7 +113,26 @@ pub struct Parser<'a> {
 
 impl<'a> Parser<'a> {
     /// Parse the specified tokens
+    /// To avoid breaking backwards compatibility, this function accepts
+    /// bare tokens.
     pub fn new(tokens: Vec<Token>, dialect: &'a dyn Dialect) -> Self {
+        Parser::new_without_locations(tokens, dialect)
+    }
+
+    pub fn new_without_locations(tokens: Vec<Token>, dialect: &'a dyn Dialect) -> Self {
+        // Put in dummy locations
+        let tokens_with_locations: Vec<TokenWithLocation> = tokens
+            .into_iter()
+            .map(|token| TokenWithLocation {
+                token,
+                location: Location { line: 0, column: 0 },
+            })
+            .collect();
+        Parser::new_with_locations(tokens_with_locations, dialect)
+    }
+
+    /// Parse the specified tokens
+    pub fn new_with_locations(tokens: Vec<TokenWithLocation>, dialect: &'a dyn Dialect) -> Self {
         Parser {
             tokens,
             index: 0,
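The constructor split above keeps the public API source-compatible: `Parser::new` still accepts bare `Token`s and stamps them with the dummy location 0:0, while location-aware callers go through `new_with_locations`. A sketch of both entry points (hypothetical driver code using only names from this patch; unwraps for brevity):

    use sqlparser::dialect::GenericDialect;
    use sqlparser::parser::Parser;
    use sqlparser::tokenizer::Tokenizer;

    let dialect = GenericDialect {};
    let sql = "SELECT 1";

    // Existing call sites keep compiling; tokens receive dummy 0:0 locations.
    let tokens = Tokenizer::new(&dialect, sql).tokenize().unwrap();
    let _parser = Parser::new(tokens, &dialect);

    // New call sites can thread real line/column information through.
    let located = Tokenizer::new(&dialect, sql).tokenize_with_location().unwrap();
    let _parser = Parser::new_with_locations(located, &dialect);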
@@ -157,7 +176,8 @@ impl<'a> Parser<'a> {
             return statement;
         }
 
-        match self.next_token() {
+        let next_token = self.next_token();
+        match &next_token.token {
             Token::Word(w) => match w.keyword {
                 Keyword::KILL => Ok(self.parse_kill()?),
                 Keyword::DESCRIBE => Ok(self.parse_explain(true)?),
@@ -202,13 +222,13 @@ impl<'a> Parser<'a> {
                 Keyword::EXECUTE => Ok(self.parse_execute()?),
                 Keyword::PREPARE => Ok(self.parse_prepare()?),
                 Keyword::MERGE => Ok(self.parse_merge()?),
-                _ => self.expected("an SQL statement", Token::Word(w)),
+                _ => self.expected("an SQL statement", next_token),
             },
             Token::LParen => {
                 self.prev_token();
                 Ok(Statement::Query(Box::new(self.parse_query()?)))
             }
-            unexpected => self.expected("an SQL statement", unexpected),
+            _ => self.expected("an SQL statement", next_token),
         }
     }
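The hunk above shows the rewrite pattern repeated through the rest of src/parser.rs: bind the result of `next_token()` first, match on its `.token` field (often by reference), and collapse the old `unexpected => ...` arms into `_ => ...` so the intact `TokenWithLocation` can be handed to `expected()`. A binding arm would move the inner `Token` out of the wrapper and lose the location. A minimal sketch of the ownership issue, with hypothetical stand-in types (not the crate's real definitions):

    enum Token {
        Word(String),
        Eof,
    }

    struct TokenWithLocation {
        token: Token,
        location: (u64, u64), // line, column
    }

    fn report(found: TokenWithLocation) {
        println!("unexpected token at {:?}", found.location);
    }

    fn demo(next_token: TokenWithLocation) {
        match next_token.token {
            Token::Word(w) => println!("word: {w}"),
            // An `unexpected => ...` binding here would move the Token out of
            // `next_token`; `_` moves nothing, so the whole wrapper (location
            // included) is still available to pass along:
            _ => report(next_token),
        }
    }

    fn main() {
        demo(TokenWithLocation { token: Token::Word("SELECT".into()), location: (1, 1) });
        demo(TokenWithLocation { token: Token::Eof, location: (1, 7) });
    }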
@@ -314,18 +334,20 @@ impl<'a> Parser<'a> {
     pub fn parse_wildcard_expr(&mut self) -> Result<WildcardExpr, ParserError> {
         let index = self.index;
 
-        match self.next_token() {
-            Token::Word(w) if self.peek_token() == Token::Period => {
+        let next_token = self.next_token();
+        match next_token.token {
+            Token::Word(w) if self.peek_token().token == Token::Period => {
                 let mut id_parts: Vec<Ident> = vec![w.to_ident()];
                 while self.consume_token(&Token::Period) {
-                    match self.next_token() {
+                    let next_token = self.next_token();
+                    match next_token.token {
                         Token::Word(w) => id_parts.push(w.to_ident()),
                         Token::Mul => {
                             return Ok(WildcardExpr::QualifiedWildcard(ObjectName(id_parts)));
                         }
-                        unexpected => {
-                            return self.expected("an identifier or a '*' after '.'", unexpected);
+                        _ => {
+                            return self.expected("an identifier or a '*' after '.'", next_token);
                         }
                     }
                 }
@@ -385,7 +407,7 @@ impl<'a> Parser<'a> {
     pub fn get_next_interval_precedence(&self) -> Result<u8, ParserError> {
         let token = self.peek_token();
 
-        match token {
+        match token.token {
             Token::Word(w) if w.keyword == Keyword::AND => Ok(0),
             Token::Word(w) if w.keyword == Keyword::OR => Ok(0),
             Token::Word(w) if w.keyword == Keyword::XOR => Ok(0),
@@ -450,7 +472,8 @@ impl<'a> Parser<'a> {
             }
         }));
 
-        let expr = match self.next_token() {
+        let next_token = self.next_token();
+        let expr = match next_token.token {
             Token::Word(w) => match w.keyword {
                 Keyword::TRUE | Keyword::FALSE | Keyword::NULL => {
                     self.prev_token();
@@ -510,15 +533,16 @@ impl<'a> Parser<'a> {
                 }
                 // Here `w` is a word, check if it's a part of a multi-part
                 // identifier, a function call, or a simple identifier:
-                _ => match self.peek_token() {
+                _ => match self.peek_token().token {
                     Token::LParen | Token::Period => {
                         let mut id_parts: Vec<Ident> = vec![w.to_ident()];
                         while self.consume_token(&Token::Period) {
-                            match self.next_token() {
+                            let next_token = self.next_token();
+                            match next_token.token {
                                 Token::Word(w) => id_parts.push(w.to_ident()),
-                                unexpected => {
+                                _ => {
                                     return self
-                                        .expected("an identifier or a '*' after '.'", unexpected);
+                                        .expected("an identifier or a '*' after '.'", next_token);
                                 }
                             }
                         }
@@ -598,7 +622,7 @@ impl<'a> Parser<'a> {
             Ok(expr)
         } else {
             let tok = self.next_token();
-            let key = match tok {
+            let key = match tok.token {
                 Token::Word(word) => word.to_ident(),
                 _ => return parser_err!(format!("Expected identifier, found: {}", tok)),
             };
@@ -612,7 +636,7 @@ impl<'a> Parser<'a> {
                 self.prev_token();
                 Ok(Expr::Value(self.parse_value()?))
             }
-            unexpected => self.expected("an expression:", unexpected),
+            _ => self.expected("an expression:", next_token),
         }?;
 
         if self.parse_keyword(Keyword::COLLATE) {
@@ -684,14 +708,15 @@ impl<'a> Parser<'a> {
     }
 
     pub fn parse_window_frame_units(&mut self) -> Result<WindowFrameUnits, ParserError> {
-        match self.next_token() {
+        let next_token = self.next_token();
+        match &next_token.token {
             Token::Word(w) => match w.keyword {
                 Keyword::ROWS => Ok(WindowFrameUnits::Rows),
                 Keyword::RANGE => Ok(WindowFrameUnits::Range),
                 Keyword::GROUPS => Ok(WindowFrameUnits::Groups),
-                _ => self.expected("ROWS, RANGE, GROUPS", Token::Word(w))?,
+                _ => self.expected("ROWS, RANGE, GROUPS", next_token)?,
             },
-            unexpected => self.expected("ROWS, RANGE, GROUPS", unexpected),
+            _ => self.expected("ROWS, RANGE, GROUPS", next_token),
         }
     }
@@ -720,7 +745,7 @@ impl<'a> Parser<'a> {
         let rows = if self.parse_keyword(Keyword::UNBOUNDED) {
             None
         } else {
-            Some(Box::new(match self.peek_token() {
+            Some(Box::new(match self.peek_token().token {
                 Token::SingleQuotedString(_) => self.parse_interval()?,
                 _ => self.parse_expr()?,
             }))
@@ -980,7 +1005,7 @@ impl<'a> Parser<'a> {
     pub fn parse_trim_expr(&mut self) -> Result<Expr, ParserError> {
         self.expect_token(&Token::LParen)?;
         let mut trim_where = None;
-        if let Token::Word(word) = self.peek_token() {
+        if let Token::Word(word) = self.peek_token().token {
            if [Keyword::BOTH, Keyword::LEADING, Keyword::TRAILING]
                .iter()
                .any(|d| word.keyword == *d)
@@ -1009,21 +1034,22 @@ impl<'a> Parser<'a> {
     }
 
     pub fn parse_trim_where(&mut self) -> Result<TrimWhereField, ParserError> {
-        match self.next_token() {
+        let next_token = self.next_token();
+        match &next_token.token {
             Token::Word(w) => match w.keyword {
                 Keyword::BOTH => Ok(TrimWhereField::Both),
                 Keyword::LEADING => Ok(TrimWhereField::Leading),
                 Keyword::TRAILING => Ok(TrimWhereField::Trailing),
-                _ => self.expected("trim_where field", Token::Word(w))?,
+                _ => self.expected("trim_where field", next_token)?,
             },
-            unexpected => self.expected("trim_where field", unexpected),
+            _ => self.expected("trim_where field", next_token),
         }
     }
 
     /// Parses an array expression `[ex1, ex2, ..]`
     /// if `named` is `true`, it came from an expression like `ARRAY[ex1, ex2]`
     pub fn parse_array_expr(&mut self, named: bool) -> Result<Expr, ParserError> {
-        if self.peek_token() == Token::RBracket {
+        if self.peek_token().token == Token::RBracket {
             let _ = self.next_token();
             Ok(Expr::Array(Array {
                 elem: vec![],
@@ -1060,7 +1086,7 @@ impl<'a> Parser<'a> {
             Some(ListAggOnOverflow::Error)
         } else {
             self.expect_keyword(Keyword::TRUNCATE)?;
-            let filler = match self.peek_token() {
+            let filler = match self.peek_token().token {
                 Token::Word(w)
                     if w.keyword == Keyword::WITH || w.keyword == Keyword::WITHOUT =>
                 {
@@ -1070,9 +1096,10 @@ impl<'a> Parser<'a> {
                 | Token::EscapedStringLiteral(_)
                 | Token::NationalStringLiteral(_)
                 | Token::HexStringLiteral(_) => Some(Box::new(self.parse_expr()?)),
-                unexpected => {
-                    self.expected("either filler, WITH, or WITHOUT in LISTAGG", unexpected)?
-                }
+                _ => self.expected(
+                    "either filler, WITH, or WITHOUT in LISTAGG",
+                    self.peek_token(),
+                )?,
             };
             let with_count = self.parse_keyword(Keyword::WITH);
             if !with_count && !self.parse_keyword(Keyword::WITHOUT) {
@@ -1158,7 +1185,8 @@ impl<'a> Parser<'a> {
     // EXTRACT supports a wider set of date/time fields than interval qualifiers,
     // so this function may need to be split in two.
     pub fn parse_date_time_field(&mut self) -> Result<DateTimeField, ParserError> {
-        match self.next_token() {
+        let next_token = self.next_token();
+        match &next_token.token {
             Token::Word(w) => match w.keyword {
                 Keyword::YEAR => Ok(DateTimeField::Year),
                 Keyword::MONTH => Ok(DateTimeField::Month),
@@ -1186,14 +1214,14 @@ impl<'a> Parser<'a> {
                 Keyword::TIMEZONE => Ok(DateTimeField::Timezone),
                 Keyword::TIMEZONE_HOUR => Ok(DateTimeField::TimezoneHour),
                 Keyword::TIMEZONE_MINUTE => Ok(DateTimeField::TimezoneMinute),
-                _ => self.expected("date/time field", Token::Word(w))?,
+                _ => self.expected("date/time field", next_token),
             },
-            unexpected => self.expected("date/time field", unexpected),
+            _ => self.expected("date/time field", next_token),
         }
     }
 
     pub fn parse_not(&mut self) -> Result<Expr, ParserError> {
-        match self.peek_token() {
+        match self.peek_token().token {
             Token::Word(w) => match w.keyword {
                 Keyword::EXISTS => {
                     let negated = true;
@@ -1291,7 +1319,7 @@ impl<'a> Parser<'a> {
         //
        // Note that PostgreSQL allows omitting the qualifier, so we provide
        // this more general implementation.
-        let leading_field = match self.peek_token() {
+        let leading_field = match self.peek_token().token {
             Token::Word(kw)
                 if [
                     Keyword::YEAR,
@@ -1371,7 +1399,7 @@ impl<'a> Parser<'a> {
 
         let tok = self.next_token();
 
-        let regular_binary_operator = match &tok {
+        let regular_binary_operator = match &tok.token {
             Token::Spaceship => Some(BinaryOperator::Spaceship),
             Token::DoubleEq => Some(BinaryOperator::Eq),
             Token::Eq => Some(BinaryOperator::Eq),
@@ -1451,7 +1479,7 @@ impl<'a> Parser<'a> {
                     right: Box::new(self.parse_subexpr(precedence)?),
                 })
             }
-        } else if let Token::Word(w) = &tok {
+        } else if let Token::Word(w) = &tok.token {
             match w.keyword {
                 Keyword::IS => {
                     if self.parse_keyword(Keyword::NULL) {
@@ -1489,7 +1517,7 @@ impl<'a> Parser<'a> {
                     //   self.expect_keyword(Keyword::ZONE)?;
                     if self.parse_keywords(&[Keyword::TIME, Keyword::ZONE]) {
                         let time_zone = self.next_token();
-                        match time_zone {
+                        match time_zone.token {
                             Token::SingleQuotedString(time_zone) => {
                                 log::trace!("Peek token: {:?}", self.peek_token());
                                 Ok(Expr::AtTimeZone {
@@ -1497,9 +1525,9 @@ impl<'a> Parser<'a> {
                                     time_zone,
                                 })
                             }
-                            tok => self.expected(
+                            _ => self.expected(
                                 "Expected Token::SingleQuotedString after AT TIME ZONE",
-                                tok,
+                                time_zone,
                             ),
                         }
                     } else {
@@ -1544,7 +1572,7 @@ impl<'a> Parser<'a> {
                 }
             }
             // Can only happen if `get_next_precedence` got out of sync with this function
-            _ => parser_err!(format!("No infix parser for token {:?}", tok)),
+            _ => parser_err!(format!("No infix parser for token {:?}", tok.token)),
         }
     } else if Token::DoubleColon == tok {
         self.parse_pg_cast(expr)
@@ -1571,7 +1599,7 @@ impl<'a> Parser<'a> {
             || Token::HashArrow == tok
             || Token::HashLongArrow == tok
         {
-            let operator = match tok {
+            let operator = match tok.token {
                 Token::Arrow => JsonOperator::Arrow,
                 Token::LongArrow => JsonOperator::LongArrow,
                 Token::HashArrow => JsonOperator::HashArrow,
@@ -1585,7 +1613,7 @@ impl<'a> Parser<'a> {
             })
         } else {
             // Can only happen if `get_next_precedence` got out of sync with this function
-            parser_err!(format!("No infix parser for token {:?}", tok))
+            parser_err!(format!("No infix parser for token {:?}", tok.token))
         }
     }
@@ -1713,13 +1741,13 @@ impl<'a> Parser<'a> {
         let token_1 = self.peek_nth_token(1);
         let token_2 = self.peek_nth_token(2);
         debug!("0: {token_0} 1: {token_1} 2: {token_2}");
-        match token {
+        match token.token {
             Token::Word(w) if w.keyword == Keyword::OR => Ok(Self::OR_PREC),
             Token::Word(w) if w.keyword == Keyword::AND => Ok(Self::AND_PREC),
             Token::Word(w) if w.keyword == Keyword::XOR => Ok(Self::XOR_PREC),
 
             Token::Word(w) if w.keyword == Keyword::AT => {
-                match (self.peek_nth_token(1), self.peek_nth_token(2)) {
+                match (self.peek_nth_token(1).token, self.peek_nth_token(2).token) {
                     (Token::Word(w), Token::Word(w2))
                         if w.keyword == Keyword::TIME && w2.keyword == Keyword::ZONE =>
                     {
@@ -1729,7 +1757,7 @@ impl<'a> Parser<'a> {
                 }
             }
 
-            Token::Word(w) if w.keyword == Keyword::NOT => match self.peek_nth_token(1) {
+            Token::Word(w) if w.keyword == Keyword::NOT => match self.peek_nth_token(1).token {
                 // The precedence of NOT varies depending on the keyword that
                 // follows it. If it is followed by IN, BETWEEN, or LIKE,
                 // it takes on the precedence of those tokens. Otherwise it
@@ -1780,20 +1808,26 @@ impl<'a> Parser<'a> {
 
     /// Return the first non-whitespace token that has not yet been processed
     /// (or None if reached end-of-file)
-    pub fn peek_token(&self) -> Token {
+    pub fn peek_token(&self) -> TokenWithLocation {
         self.peek_nth_token(0)
     }
 
     /// Return nth non-whitespace token that has not yet been processed
-    pub fn peek_nth_token(&self, mut n: usize) -> Token {
+    pub fn peek_nth_token(&self, mut n: usize) -> TokenWithLocation {
         let mut index = self.index;
         loop {
             index += 1;
             match self.tokens.get(index - 1) {
-                Some(Token::Whitespace(_)) => continue,
+                Some(TokenWithLocation {
+                    token: Token::Whitespace(_),
+                    location: _,
+                }) => continue,
                 non_whitespace => {
                     if n == 0 {
-                        return non_whitespace.cloned().unwrap_or(Token::EOF);
+                        return non_whitespace.cloned().unwrap_or(TokenWithLocation {
+                            token: Token::EOF,
+                            location: Location { line: 0, column: 0 },
+                        });
                     }
                     n -= 1;
                 }
@@ -1804,18 +1838,25 @@ impl<'a> Parser<'a> {
     /// Return the first non-whitespace token that has not yet been processed
     /// (or None if reached end-of-file) and mark it as processed. OK to call
     /// repeatedly after reaching EOF.
-    pub fn next_token(&mut self) -> Token {
+    pub fn next_token(&mut self) -> TokenWithLocation {
         loop {
             self.index += 1;
             match self.tokens.get(self.index - 1) {
-                Some(Token::Whitespace(_)) => continue,
-                token => return token.cloned().unwrap_or(Token::EOF),
+                Some(TokenWithLocation {
+                    token: Token::Whitespace(_),
+                    location: _,
+                }) => continue,
+                token => {
+                    return token
+                        .cloned()
+                        .unwrap_or_else(|| TokenWithLocation::wrap(Token::EOF))
+                }
             }
         }
     }
 
     /// Return the first unprocessed token, possibly whitespace.
-    pub fn next_token_no_skip(&mut self) -> Option<&Token> {
+    pub fn next_token_no_skip(&mut self) -> Option<&TokenWithLocation> {
         self.index += 1;
         self.tokens.get(self.index - 1)
     }
@@ -1827,7 +1868,11 @@ impl<'a> Parser<'a> {
         loop {
             assert!(self.index > 0);
             self.index -= 1;
-            if let Some(Token::Whitespace(_)) = self.tokens.get(self.index) {
+            if let Some(TokenWithLocation {
+                token: Token::Whitespace(_),
+                location: _,
+            }) = self.tokens.get(self.index)
+            {
                 continue;
             }
             return;
@@ -1835,14 +1880,14 @@ impl<'a> Parser<'a> {
     }
 
     /// Report unexpected token
-    pub fn expected<T>(&self, expected: &str, found: Token) -> Result<T, ParserError> {
+    pub fn expected<T>(&self, expected: &str, found: TokenWithLocation) -> Result<T, ParserError> {
         parser_err!(format!("Expected {}, found: {}", expected, found))
     }
 
     /// Look for an expected keyword and consume it if it exists
     #[must_use]
     pub fn parse_keyword(&mut self, expected: Keyword) -> bool {
-        match self.peek_token() {
+        match self.peek_token().token {
             Token::Word(w) if expected == w.keyword => {
                 self.next_token();
                 true
@@ -1869,7 +1914,7 @@ impl<'a> Parser<'a> {
     /// Look for one of the given keywords and return the one that matches.
     #[must_use]
     pub fn parse_one_of_keywords(&mut self, keywords: &[Keyword]) -> Option<Keyword> {
-        match self.peek_token() {
+        match self.peek_token().token {
             Token::Word(w) => {
                 keywords
                     .iter()
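Note that `expected()` above now receives the full `TokenWithLocation`, yet its message format is untouched: the `Display` impl added in src/tokenizer.rs (further down in this patch) delegates to the inner token, and the symmetric `PartialEq` impls there keep direct comparisons such as `self.peek_token() == Token::EOF` compiling unchanged. A small sketch of those semantics (a sketch, not a test from this patch):

    let tok = TokenWithLocation::new(Token::Comma, 3, 7);

    // Comparisons against a bare Token ignore the location, in both directions:
    assert!(tok == Token::Comma);
    assert!(Token::Comma == tok);

    // The derived PartialEq between two TokenWithLocation values does compare
    // locations, so the same token wrapped with the dummy 0:0 location differs:
    assert!(tok != TokenWithLocation::wrap(Token::Comma));

    // Display delegates to the token, so error messages don't (yet) show locations:
    assert_eq!(tok.to_string(), Token::Comma.to_string());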
@@ -1945,7 +1990,7 @@ impl<'a> Parser<'a> {
         // BigQuery allows trailing commas.
         // e.g. `SELECT 1, 2, FROM t`
         // https://cloud.google.com/bigquery/docs/reference/standard-sql/lexical#trailing_commas
-        match self.peek_token() {
+        match self.peek_token().token {
             Token::Word(kw)
                 if keywords::RESERVED_FOR_COLUMN_ALIAS
                     .iter()
@@ -2057,14 +2102,14 @@ impl<'a> Parser<'a> {
         let (mut table_flag, mut options, mut has_as, mut query) = (None, vec![], false, None);
         if self.parse_keyword(Keyword::TABLE) {
             let table_name = self.parse_object_name()?;
-            if self.peek_token() != Token::EOF {
-                if let Token::Word(word) = self.peek_token() {
+            if self.peek_token().token != Token::EOF {
+                if let Token::Word(word) = self.peek_token().token {
                     if word.keyword == Keyword::OPTIONS {
                         options = self.parse_options(Keyword::OPTIONS)?
                     }
                };
 
-                if self.peek_token() != Token::EOF {
+                if self.peek_token().token != Token::EOF {
                     let (a, q) = self.parse_as_query()?;
                     has_as = a;
                     query = Some(q);
@@ -2091,7 +2136,7 @@ impl<'a> Parser<'a> {
             if self.parse_keyword(Keyword::TABLE) {
                 let table_name = self.parse_object_name()?;
                 if self.peek_token() != Token::EOF {
-                    if let Token::Word(word) = self.peek_token() {
+                    if let Token::Word(word) = self.peek_token().token {
                         if word.keyword == Keyword::OPTIONS {
                             options = self.parse_options(Keyword::OPTIONS)?
                         }
@@ -2130,7 +2175,7 @@ impl<'a> Parser<'a> {
 
     /// Parse 'AS' before a query, such as `WITH XXX AS SELECT XXX` or `CACHE TABLE AS SELECT XXX`
     pub fn parse_as_query(&mut self) -> Result<(bool, Query), ParserError> {
-        match self.peek_token() {
+        match self.peek_token().token {
             Token::Word(word) => match word.keyword {
                 Keyword::AS => {
                     self.next_token();
@@ -2148,7 +2193,7 @@ impl<'a> Parser<'a> {
         if has_table {
             let if_exists = self.parse_keywords(&[Keyword::IF, Keyword::EXISTS]);
             let table_name = self.parse_object_name()?;
-            if self.peek_token() == Token::EOF {
+            if self.peek_token().token == Token::EOF {
                 Ok(Statement::UNCache {
                     table_name,
                     if_exists,
@@ -2248,7 +2293,7 @@ impl<'a> Parser<'a> {
             Keyword::ARCHIVE => Ok(Some(CreateFunctionUsing::Archive(uri))),
             _ => self.expected(
                 "JAR, FILE or ARCHIVE, got {:?}",
-                Token::make_keyword(format!("{:?}", keyword).as_str()),
+                TokenWithLocation::wrap(Token::make_keyword(format!("{:?}", keyword).as_str())),
             ),
         }
     }
@@ -2410,7 +2455,8 @@ impl<'a> Parser<'a> {
     }
 
     pub fn parse_file_format(&mut self) -> Result<FileFormat, ParserError> {
-        match self.next_token() {
+        let next_token = self.next_token();
+        match &next_token.token {
             Token::Word(w) => match w.keyword {
                 Keyword::AVRO => Ok(FileFormat::AVRO),
                 Keyword::JSONFILE => Ok(FileFormat::JSONFILE),
@@ -2419,21 +2465,22 @@ impl<'a> Parser<'a> {
                 Keyword::RCFILE => Ok(FileFormat::RCFILE),
                 Keyword::SEQUENCEFILE => Ok(FileFormat::SEQUENCEFILE),
                 Keyword::TEXTFILE => Ok(FileFormat::TEXTFILE),
-                _ => self.expected("fileformat", Token::Word(w)),
+                _ => self.expected("fileformat", next_token),
             },
-            unexpected => self.expected("fileformat", unexpected),
+            _ => self.expected("fileformat", next_token),
         }
     }
 
     pub fn parse_analyze_format(&mut self) -> Result<AnalyzeFormat, ParserError> {
-        match self.next_token() {
+        let next_token = self.next_token();
+        match &next_token.token {
             Token::Word(w) => match w.keyword {
                 Keyword::TEXT => Ok(AnalyzeFormat::TEXT),
                 Keyword::GRAPHVIZ => Ok(AnalyzeFormat::GRAPHVIZ),
                 Keyword::JSON => Ok(AnalyzeFormat::JSON),
-                _ => self.expected("fileformat", Token::Word(w)),
+                _ => self.expected("fileformat", next_token),
             },
-            unexpected => self.expected("fileformat", unexpected),
+            _ => self.expected("fileformat", next_token),
         }
     }
@@ -2949,10 +2996,11 @@ impl<'a> Parser<'a> {
         // Clickhouse has `ON CLUSTER 'cluster'` syntax for DDLs
         let on_cluster = if self.parse_keywords(&[Keyword::ON, Keyword::CLUSTER]) {
-            match self.next_token() {
+            let next_token = self.next_token();
+            match next_token.token {
                 Token::SingleQuotedString(s) => Some(s),
                 Token::Word(s) => Some(s.to_string()),
-                unexpected => self.expected("identifier or cluster literal", unexpected)?,
+                _ => self.expected("identifier or cluster literal", next_token)?,
             }
         } else {
             None
@@ -2990,9 +3038,10 @@ impl<'a> Parser<'a> {
 
         let engine = if self.parse_keyword(Keyword::ENGINE) {
             self.expect_token(&Token::Eq)?;
-            match self.next_token() {
+            let next_token = self.next_token();
+            match next_token.token {
                 Token::Word(w) => Some(w.value),
-                unexpected => self.expected("identifier", unexpected)?,
+                _ => self.expected("identifier", next_token)?,
             }
         } else {
             None
@@ -3000,9 +3049,10 @@ impl<'a> Parser<'a> {
 
         let default_charset = if self.parse_keywords(&[Keyword::DEFAULT, Keyword::CHARSET]) {
             self.expect_token(&Token::Eq)?;
-            match self.next_token() {
+            let next_token = self.next_token();
+            match next_token.token {
                 Token::Word(w) => Some(w.value),
-                unexpected => self.expected("identifier", unexpected)?,
+                _ => self.expected("identifier", next_token)?,
             }
         } else {
             None
@@ -3010,9 +3060,10 @@ impl<'a> Parser<'a> {
 
         let collation = if self.parse_keywords(&[Keyword::COLLATE]) {
             self.expect_token(&Token::Eq)?;
-            match self.next_token() {
+            let next_token = self.next_token();
+            match next_token.token {
                 Token::Word(w) => Some(w.value),
-                unexpected => self.expected("identifier", unexpected)?,
+                _ => self.expected("identifier", next_token)?,
             }
         } else {
             None
@@ -3068,7 +3119,7 @@ impl<'a> Parser<'a> {
         loop {
             if let Some(constraint) = self.parse_optional_table_constraint()? {
                 constraints.push(constraint);
-            } else if let Token::Word(_) = self.peek_token() {
+            } else if let Token::Word(_) = self.peek_token().token {
                 columns.push(self.parse_column_def()?);
             } else {
                 return self.expected("column name or constraint definition", self.peek_token());
@@ -3125,9 +3176,10 @@ impl<'a> Parser<'a> {
         } else if self.parse_keywords(&[Keyword::NOT, Keyword::NULL]) {
             Ok(Some(ColumnOption::NotNull))
         } else if self.parse_keywords(&[Keyword::COMMENT]) {
-            match self.next_token() {
+            let next_token = self.next_token();
+            match next_token.token {
                 Token::SingleQuotedString(value, ..) => Ok(Some(ColumnOption::Comment(value))),
-                unexpected => self.expected("string", unexpected),
+                _ => self.expected("string", next_token),
             }
         } else if self.parse_keyword(Keyword::NULL) {
             Ok(Some(ColumnOption::Null))
@@ -3218,7 +3270,9 @@ impl<'a> Parser<'a> {
         } else {
             None
         };
-        match self.next_token() {
+
+        let next_token = self.next_token();
+        match next_token.token {
             Token::Word(w) if w.keyword == Keyword::PRIMARY || w.keyword == Keyword::UNIQUE => {
                 let is_primary = w.keyword == Keyword::PRIMARY;
                 if is_primary {
@@ -3271,7 +3325,7 @@ impl<'a> Parser<'a> {
             {
                 let display_as_key = w.keyword == Keyword::KEY;
 
-                let name = match self.peek_token() {
+                let name = match self.peek_token().token {
                     Token::Word(word) if word.keyword == Keyword::USING => None,
                     _ => self.maybe_parse(|parser| parser.parse_identifier()),
                 };
@@ -3297,7 +3351,10 @@ impl<'a> Parser<'a> {
                 if let Some(name) = name {
                     return self.expected(
                         "FULLTEXT or SPATIAL option without constraint name",
-                        Token::make_keyword(&name.to_string()),
+                        TokenWithLocation {
+                            token: Token::make_keyword(&name.to_string()),
+                            location: next_token.location,
+                        },
                     );
                 }
@@ -3322,9 +3379,9 @@ impl<'a> Parser<'a> {
                     columns,
                 }))
             }
-            unexpected => {
+            _ => {
                 if name.is_some() {
-                    self.expected("PRIMARY, UNIQUE, FOREIGN, or CHECK", unexpected)
+                    self.expected("PRIMARY, UNIQUE, FOREIGN, or CHECK", next_token)
                 } else {
                     self.prev_token();
                     Ok(None)
@@ -3712,7 +3769,7 @@ impl<'a> Parser<'a> {
     pub fn parse_tab_value(&mut self) -> Vec<Option<String>> {
         let mut values = vec![];
         let mut content = String::from("");
-        while let Some(t) = self.next_token_no_skip() {
+        while let Some(t) = self.next_token_no_skip().map(|t| &t.token) {
             match t {
                 Token::Whitespace(Whitespace::Tab) => {
                     values.push(Some(content.to_string()));
@@ -3726,7 +3783,7 @@ impl<'a> Parser<'a> {
                     if self.consume_token(&Token::Period) {
                         return values;
                     }
-                    if let Token::Word(w) = self.next_token() {
+                    if let Token::Word(w) = self.next_token().token {
                         if w.value == "N" {
                             values.push(None);
                         }
@@ -3742,7 +3799,9 @@ impl<'a> Parser<'a> {
 
     /// Parse a literal value (numbers, strings, date/time, booleans)
     pub fn parse_value(&mut self) -> Result<Value, ParserError> {
-        match self.next_token() {
+        let next_token = self.next_token();
+        let location = next_token.location;
+        match next_token.token {
             Token::Word(w) => match w.keyword {
                 Keyword::TRUE => Ok(Value::Boolean(true)),
                 Keyword::FALSE => Ok(Value::Boolean(false)),
@@ -3750,13 +3809,25 @@ impl<'a> Parser<'a> {
                 Keyword::NoKeyword if w.quote_style.is_some() => match w.quote_style {
                     Some('"') => Ok(Value::DoubleQuotedString(w.value)),
                     Some('\'') => Ok(Value::SingleQuotedString(w.value)),
-                    _ => self.expected("A value?", Token::Word(w))?,
+                    _ => self.expected(
+                        "A value?",
+                        TokenWithLocation {
+                            token: Token::Word(w),
+                            location,
+                        },
+                    )?,
                 },
                 // Case when Snowflake Semi-structured data like key:value
                 Keyword::NoKeyword | Keyword::LOCATION
                     if dialect_of!(self is SnowflakeDialect | GenericDialect) =>
                 {
                     Ok(Value::UnQuotedString(w.value))
                 }
-                _ => self.expected("a concrete value", Token::Word(w)),
+                _ => self.expected(
+                    "a concrete value",
+                    TokenWithLocation {
+                        token: Token::Word(w),
+                        location,
+                    },
+                ),
             },
             // The call to n.parse() returns a bigdecimal when the
             // bigdecimal feature is enabled, and is otherwise a no-op
@@ -3776,7 +3847,13 @@ impl<'a> Parser<'a> {
                 let placeholder = tok.to_string() + &ident.value;
                 Ok(Value::Placeholder(placeholder))
             }
-            unexpected => self.expected("a value", unexpected),
+            unexpected => self.expected(
+                "a value",
+                TokenWithLocation {
+                    token: unexpected,
+                    location,
+                },
+            ),
         }
     }
@@ -3793,30 +3870,33 @@ impl<'a> Parser<'a> {
 
     /// Parse an unsigned literal integer/long
     pub fn parse_literal_uint(&mut self) -> Result<u64, ParserError> {
-        match self.next_token() {
+        let next_token = self.next_token();
+        match next_token.token {
             Token::Number(s, _) => s.parse::<u64>().map_err(|e| {
                 ParserError::ParserError(format!("Could not parse '{}' as u64: {}", s, e))
             }),
-            unexpected => self.expected("literal int", unexpected),
+            _ => self.expected("literal int", next_token),
         }
     }
 
     /// Parse a literal string
     pub fn parse_literal_string(&mut self) -> Result<String, ParserError> {
-        match self.next_token() {
+        let next_token = self.next_token();
+        match next_token.token {
             Token::Word(Word { value, keyword, .. }) if keyword == Keyword::NoKeyword => Ok(value),
             Token::SingleQuotedString(s) => Ok(s),
             Token::DoubleQuotedString(s) => Ok(s),
             Token::EscapedStringLiteral(s)
                 if dialect_of!(self is PostgreSqlDialect | GenericDialect) =>
             {
                 Ok(s)
             }
-            unexpected => self.expected("literal string", unexpected),
+            _ => self.expected("literal string", next_token),
         }
     }
 
     /// Parse a map key string
     pub fn parse_map_key(&mut self) -> Result<Expr, ParserError> {
-        match self.next_token() {
+        let next_token = self.next_token();
+        match next_token.token {
             Token::Word(Word { value, keyword, .. }) if keyword == Keyword::NoKeyword => {
                 if self.peek_token() == Token::LParen {
                     return self.parse_function(ObjectName(vec![Ident::new(value)]));
@@ -3828,13 +3908,14 @@ impl<'a> Parser<'a> {
             Token::Number(s, _) => Ok(Expr::Value(Value::Number(s, false))),
             #[cfg(feature = "bigdecimal")]
             Token::Number(s, _) => Ok(Expr::Value(Value::Number(s.parse().unwrap(), false))),
-            unexpected => self.expected("literal string, number or function", unexpected),
+            _ => self.expected("literal string, number or function", next_token),
         }
     }
 
     /// Parse a SQL datatype (in the context of a CREATE TABLE statement for example)
     pub fn parse_data_type(&mut self) -> Result<DataType, ParserError> {
-        let mut data = match self.next_token() {
+        let next_token = self.next_token();
+        let mut data = match next_token.token {
             Token::Word(w) => match w.keyword {
                 Keyword::BOOLEAN => Ok(DataType::Boolean),
                 Keyword::FLOAT => Ok(DataType::Float(self.parse_optional_precision()?)),
@@ -4003,7 +4084,7 @@ impl<'a> Parser<'a> {
                     }
                 }
             },
-            unexpected => self.expected("a data type name", unexpected),
+            _ => self.expected("a data type name", next_token),
         }?;
 
         // Parse array data types. Note: this is postgresql-specific and different from
@@ -4019,14 +4100,16 @@ impl<'a> Parser<'a> {
         self.expect_token(&Token::LParen)?;
         let mut values = Vec::new();
         loop {
-            match self.next_token() {
+            let next_token = self.next_token();
+            match next_token.token {
                 Token::SingleQuotedString(value) => values.push(value),
-                unexpected => self.expected("a string", unexpected)?,
+                _ => self.expected("a string", next_token)?,
             }
-            match self.next_token() {
+            let next_token = self.next_token();
+            match next_token.token {
                 Token::Comma => (),
                 Token::RParen => break,
-                unexpected => self.expected(", or }", unexpected)?,
+                _ => self.expected(", or }", next_token)?,
            }
        }
        Ok(values)
@@ -4040,7 +4123,8 @@ impl<'a> Parser<'a> {
         reserved_kwds: &[Keyword],
     ) -> Result<Option<Ident>, ParserError> {
         let after_as = self.parse_keyword(Keyword::AS);
-        match self.next_token() {
+        let next_token = self.next_token();
+        match next_token.token {
             // Accept any identifier after `AS` (though many dialects have restrictions on
             // keywords that may appear here). If there's no `AS`: don't parse keywords,
             // which may start a construct allowed in this position, to be parsed as aliases.
@@ -4064,9 +4148,9 @@ impl<'a> Parser<'a> {
             Token::SingleQuotedString(s) => Ok(Some(Ident::with_quote('\'', s))),
             // Support for MySql dialect double-quoted string, `AS "HOUR"` for example
             Token::DoubleQuotedString(s) => Ok(Some(Ident::with_quote('\"', s))),
-            not_an_ident => {
+            _ => {
                 if after_as {
-                    return self.expected("an identifier after AS", not_an_ident);
+                    return self.expected("an identifier after AS", next_token);
                 }
                 self.prev_token();
                 Ok(None) // no alias found
@@ -4108,7 +4192,7 @@ impl<'a> Parser<'a> {
     pub fn parse_identifiers(&mut self) -> Result<Vec<Ident>, ParserError> {
         let mut idents = vec![];
         loop {
-            match self.peek_token() {
+            match self.peek_token().token {
                 Token::Word(w) => {
                     idents.push(w.to_ident());
                 }
@@ -4122,11 +4206,12 @@ impl<'a> Parser<'a> {
 
     /// Parse a simple one-word identifier (possibly quoted, possibly a keyword)
     pub fn parse_identifier(&mut self) -> Result<Ident, ParserError> {
-        match self.next_token() {
+        let next_token = self.next_token();
+        match next_token.token {
             Token::Word(w) => Ok(w.to_ident()),
             Token::SingleQuotedString(s) => Ok(Ident::with_quote('\'', s)),
             Token::DoubleQuotedString(s) => Ok(Ident::with_quote('\"', s)),
-            unexpected => self.expected("identifier", unexpected),
+            _ => self.expected("identifier", next_token),
         }
     }
@@ -4231,7 +4316,8 @@ impl<'a> Parser<'a> {
         if self.consume_token(&Token::LParen) {
             let mut modifiers = Vec::new();
             loop {
-                match self.next_token() {
+                let next_token = self.next_token();
+                match next_token.token {
                     Token::Word(w) => modifiers.push(w.to_string()),
                     Token::Number(n, _) => modifiers.push(n),
                     Token::SingleQuotedString(s) => modifiers.push(s),
@@ -4242,7 +4328,7 @@ impl<'a> Parser<'a> {
                     Token::RParen => {
                         break;
                     }
-                    unexpected => self.expected("type modifiers", unexpected)?,
+                    _ => self.expected("type modifiers", next_token)?,
                 }
             }
@@ -4485,7 +4571,7 @@ impl<'a> Parser<'a> {
 
         loop {
             // The query can be optionally followed by a set operator:
-            let op = self.parse_set_operator(&self.peek_token());
+            let op = self.parse_set_operator(&self.peek_token().token);
             let next_precedence = match op {
                 // UNION and EXCEPT have the same binding power and evaluate left-to-right
                 Some(SetOperator::Union) | Some(SetOperator::Except) => 10,
@@ -4683,7 +4769,7 @@ impl<'a> Parser<'a> {
         let table_name;
         let schema_name;
         if token2 == Token::Period {
-            match token1 {
+            match token1.token {
                 Token::Word(w) => {
                     schema_name = w.value;
                 }
@@ -4691,7 +4777,7 @@ impl<'a> Parser<'a> {
                     return self.expected("Schema name", token1);
                 }
             }
-            match token3 {
+            match token3.token {
                 Token::Word(w) => {
                     table_name = w.value;
                 }
@@ -4704,7 +4790,7 @@ impl<'a> Parser<'a> {
                 schema_name: Some(schema_name),
             })
         } else {
-            match token1 {
+            match token1.token {
                 Token::Word(w) => {
                     table_name = w.value;
                 }
@@ -4986,7 +5072,7 @@ impl<'a> Parser<'a> {
             }
         } else {
             let natural = self.parse_keyword(Keyword::NATURAL);
-            let peek_keyword = if let Token::Word(w) = self.peek_token() {
+            let peek_keyword = if let Token::Word(w) = self.peek_token().token {
                 w.keyword
             } else {
                 Keyword::NoKeyword
@@ -6704,4 +6790,31 @@ mod tests {
             r#"UPDATE test SET name = $1, value = $2, where = $3, create = $4, is_default = $5, classification = $6, sort = $7 WHERE id = $8"#
         );
     }
+
+    #[test]
+    fn test_tokenizer_error_loc() {
+        let sql = "foo '";
+        let ast = Parser::parse_sql(&GenericDialect, sql);
+        assert_eq!(
+            ast,
+            Err(ParserError::TokenizerError(
+                "Unterminated string literal at Line: 1, Column 5".to_string()
+            ))
+        );
+    }
+
+    #[test]
+    fn test_parser_error_loc() {
+        // TODO: Once we thread token locations through the parser, we should update this
+        // test to assert the locations of the referenced token
+        let sql = "SELECT this is a syntax error";
+        let ast = Parser::parse_sql(&GenericDialect, sql);
+        assert_eq!(
+            ast,
+            Err(ParserError::ParserError(
+                "Expected [NOT] NULL or TRUE|FALSE or [NOT] DISTINCT FROM after IS, found: a"
+                    .to_string()
+            ))
+        );
+    }
 }
diff --git a/src/tokenizer.rs b/src/tokenizer.rs
index 5dfac47f0..68dd652d5 100644
--- a/src/tokenizer.rs
+++ b/src/tokenizer.rs
@@ -300,6 +300,53 @@ impl fmt::Display for Whitespace {
     }
 }
 
+/// Location in input string
+#[derive(Debug, Eq, PartialEq, Clone)]
+pub struct Location {
+    /// Line number, starting from 1
+    pub line: u64,
+    /// Line column, starting from 1
+    pub column: u64,
+}
+
+/// A [Token] with [Location] attached to it
+#[derive(Debug, Eq, PartialEq, Clone)]
+pub struct TokenWithLocation {
+    pub token: Token,
+    pub location: Location,
+}
+
+impl TokenWithLocation {
+    pub fn new(token: Token, line: u64, column: u64) -> TokenWithLocation {
+        TokenWithLocation {
+            token,
+            location: Location { line, column },
+        }
+    }
+
+    pub fn wrap(token: Token) -> TokenWithLocation {
+        TokenWithLocation::new(token, 0, 0)
+    }
+}
+
+impl PartialEq<Token> for TokenWithLocation {
+    fn eq(&self, other: &Token) -> bool {
+        &self.token == other
+    }
+}
+
+impl PartialEq<TokenWithLocation> for Token {
+    fn eq(&self, other: &TokenWithLocation) -> bool {
+        self == &other.token
+    }
+}
+
+impl fmt::Display for TokenWithLocation {
+    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+        self.token.fmt(f)
+    }
+}
+
 /// Tokenizer error
 #[derive(Debug, PartialEq, Eq)]
 pub struct TokenizerError {
@@ -321,58 +368,88 @@ impl fmt::Display for TokenizerError {
 #[cfg(feature = "std")]
 impl std::error::Error for TokenizerError {}
 
+struct State<'a> {
+    peekable: Peekable<Chars<'a>>,
+    pub line: u64,
+    pub col: u64,
+}
+
+impl<'a> State<'a> {
+    pub fn next(&mut self) -> Option<char> {
+        match self.peekable.next() {
+            None => None,
+            Some(s) => {
+                if s == '\n' {
+                    self.line += 1;
+                    self.col = 1;
+                } else {
+                    self.col += 1;
+                }
+                Some(s)
+            }
+        }
+    }
+
+    pub fn peek(&mut self) -> Option<&char> {
+        self.peekable.peek()
+    }
+
+    pub fn location(&self) -> Location {
+        Location {
+            line: self.line,
+            column: self.col,
+        }
+    }
+}
+
 /// SQL Tokenizer
 pub struct Tokenizer<'a> {
     dialect: &'a dyn Dialect,
     query: &'a str,
-    line: u64,
-    col: u64,
 }
 
 impl<'a> Tokenizer<'a> {
     /// Create a new SQL tokenizer for the specified SQL statement
     pub fn new(dialect: &'a dyn Dialect, query: &'a str) -> Self {
-        Self {
-            dialect,
-            query,
-            line: 1,
-            col: 1,
-        }
+        Self { dialect, query }
     }
 
     /// Tokenize the statement and produce a vector of tokens
     pub fn tokenize(&mut self) -> Result<Vec<Token>, TokenizerError> {
-        let mut peekable = self.query.chars().peekable();
+        let twl = self.tokenize_with_location()?;
 
         let mut tokens: Vec<Token> = vec![];
+        tokens.reserve(twl.len());
+        for token_with_location in twl {
+            tokens.push(token_with_location.token);
+        }
+        Ok(tokens)
+    }
 
-        while let Some(token) = self.next_token(&mut peekable)? {
-            match &token {
-                Token::Whitespace(Whitespace::Newline) => {
-                    self.line += 1;
-                    self.col = 1;
-                }
+    /// Tokenize the statement and produce a vector of tokens with location information
+    pub fn tokenize_with_location(&mut self) -> Result<Vec<TokenWithLocation>, TokenizerError> {
+        let mut state = State {
+            peekable: self.query.chars().peekable(),
+            line: 1,
+            col: 1,
+        };
 
-                Token::Whitespace(Whitespace::Tab) => self.col += 4,
-                Token::Word(w) => {
-                    self.col += w.value.chars().count() as u64;
-                    if w.quote_style.is_some() {
-                        self.col += 2
-                    }
-                }
-                Token::Number(s, _) => self.col += s.chars().count() as u64,
-                Token::SingleQuotedString(s) => self.col += s.chars().count() as u64,
-                Token::Placeholder(s) => self.col += s.chars().count() as u64,
-                _ => self.col += 1,
-            }
+        let mut tokens: Vec<TokenWithLocation> = vec![];
+
+        let mut location = state.location();
+        while let Some(token) = self.next_token(&mut state)? {
+            tokens.push(TokenWithLocation {
+                token,
+                location: location.clone(),
+            });
 
-            tokens.push(token);
+            location = state.location();
         }
         Ok(tokens)
     }
 
     /// Get the next token or return None
-    fn next_token(&self, chars: &mut Peekable<Chars<'_>>) -> Result<Option<Token>, TokenizerError> {
+    fn next_token(&self, chars: &mut State) -> Result<Option<Token>, TokenizerError> {
         //println!("next_token: {:?}", chars.peek());
         match chars.peek() {
             Some(&ch) => match ch {
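One subtlety in `tokenize_with_location` above: `state.location()` is captured before `next_token` consumes anything, so each token is stamped with the position of its first character rather than the position reached after it (which is what the old per-token column arithmetic approximated). A consumer sketch (hypothetical driver code using only APIs from this patch):

    let dialect = GenericDialect {};
    let tokens = Tokenizer::new(&dialect, "SELECT a,\n b")
        .tokenize_with_location()
        .unwrap();
    for TokenWithLocation { token, location } in &tokens {
        // e.g. the word "b" is reported at line 2, column 2
        println!("{}:{} {}", location.line, location.column, token);
    }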
@@ -405,10 +482,12 @@ impl<'a> Tokenizer<'a> {
                 }
                 // PostgreSQL accepts "escape" string constants, which are an extension to the SQL standard.
                 x @ 'e' | x @ 'E' => {
+                    let starting_loc = chars.location();
                     chars.next(); // consume, to check the next char
                     match chars.peek() {
                         Some('\'') => {
-                            let s = self.tokenize_escaped_single_quoted_string(chars)?;
+                            let s =
+                                self.tokenize_escaped_single_quoted_string(starting_loc, chars)?;
                             Ok(Some(Token::EscapedStringLiteral(s)))
                         }
                         _ => {
@@ -441,7 +520,12 @@ impl<'a> Tokenizer<'a> {
                     let s = self.tokenize_word(ch, chars);
 
                     if s.chars().all(|x| ('0'..='9').contains(&x) || x == '.') {
-                        let mut s = peeking_take_while(&mut s.chars().peekable(), |ch| {
+                        let mut inner_state = State {
+                            peekable: s.chars().peekable(),
+                            line: 0,
+                            col: 0,
+                        };
+                        let mut s = peeking_take_while(&mut inner_state, |ch| {
                             matches!(ch, '0'..='9' | '.')
                         });
                         let s2 = peeking_take_while(chars, |ch| matches!(ch, '0'..='9' | '.'));
@@ -469,8 +553,9 @@ impl<'a> Tokenizer<'a> {
                     if self.dialect.is_delimited_identifier_start(ch)
                         && self
                             .dialect
-                            .is_proper_identifier_inside_quotes(chars.clone()) =>
+                            .is_proper_identifier_inside_quotes(chars.peekable.clone()) =>
                 {
+                    let error_loc = chars.location();
                     chars.next(); // consume the opening quote
                     let quote_end = Word::matching_end_quote(quote_start);
                     let (s, last_char) = parse_quoted_ident(chars, quote_end);
@@ -478,10 +563,10 @@ impl<'a> Tokenizer<'a> {
                     if last_char == Some(quote_end) {
                         Ok(Some(Token::make_word(&s, Some(quote_start))))
                     } else {
-                        self.tokenizer_error(format!(
-                            "Expected close delimiter '{}' before EOF.",
-                            quote_end
-                        ))
+                        self.tokenizer_error(
+                            error_loc,
+                            format!("Expected close delimiter '{}' before EOF.", quote_end),
+                        )
                     }
                 }
                 // numbers and period
@@ -698,16 +783,20 @@ impl<'a> Tokenizer<'a> {
         }
     }
 
-    fn tokenizer_error<R>(&self, message: impl Into<String>) -> Result<R, TokenizerError> {
+    fn tokenizer_error<R>(
+        &self,
+        loc: Location,
+        message: impl Into<String>,
+    ) -> Result<R, TokenizerError> {
         Err(TokenizerError {
             message: message.into(),
-            col: self.col,
-            line: self.line,
+            col: loc.column,
+            line: loc.line,
         })
     }
 
     // Consume characters until newline
-    fn tokenize_single_line_comment(&self, chars: &mut Peekable<Chars<'_>>) -> String {
+    fn tokenize_single_line_comment(&self, chars: &mut State) -> String {
         let mut comment = peeking_take_while(chars, |ch| ch != '\n');
         if let Some(ch) = chars.next() {
             assert_eq!(ch, '\n');
@@ -717,7 +806,7 @@ impl<'a> Tokenizer<'a> {
     }
 
     /// Tokenize an identifier or keyword, after the first char is already consumed.
-    fn tokenize_word(&self, first_char: char, chars: &mut Peekable<Chars<'_>>) -> String {
+    fn tokenize_word(&self, first_char: char, chars: &mut State) -> String {
         let mut s = first_char.to_string();
         s.push_str(&peeking_take_while(chars, |ch| {
             self.dialect.is_identifier_part(ch)
@@ -728,9 +817,13 @@ impl<'a> Tokenizer<'a> {
     /// Read a single quoted string, starting with the opening quote.
     fn tokenize_escaped_single_quoted_string(
         &self,
-        chars: &mut Peekable<Chars<'_>>,
+        starting_loc: Location,
+        chars: &mut State,
     ) -> Result<String, TokenizerError> {
         let mut s = String::new();
+
+        // This case is a bit tricky
+
         chars.next(); // consume the opening quote
 
         // slash escaping
@@ -782,16 +875,18 @@ impl<'a> Tokenizer<'a> {
                 }
             }
         }
-        self.tokenizer_error("Unterminated encoded string literal")
+        self.tokenizer_error(starting_loc, "Unterminated encoded string literal")
     }
 
     /// Read a single quoted string, starting with the opening quote.
     fn tokenize_quoted_string(
         &self,
-        chars: &mut Peekable<Chars<'_>>,
+        chars: &mut State,
         quote_style: char,
     ) -> Result<String, TokenizerError> {
         let mut s = String::new();
+        let error_loc = chars.location();
+
         chars.next(); // consume the opening quote
 
         // slash escaping is specific to MySQL dialect
@@ -824,12 +919,12 @@ impl<'a> Tokenizer<'a> {
                 }
             }
         }
-        self.tokenizer_error("Unterminated string literal")
+        self.tokenizer_error(error_loc, "Unterminated string literal")
     }
 
     fn tokenize_multiline_comment(
         &self,
-        chars: &mut Peekable<Chars<'_>>,
+        chars: &mut State,
     ) -> Result<Option<Token>, TokenizerError> {
         let mut s = String::new();
         let mut nested = 1;
@@ -850,7 +945,12 @@ impl<'a> Tokenizer<'a> {
                     s.push(ch);
                     last_ch = ch;
                 }
-                None => break self.tokenizer_error("Unexpected EOF while in a multi-line comment"),
+                None => {
+                    break self.tokenizer_error(
+                        chars.location(),
+                        "Unexpected EOF while in a multi-line comment",
+                    )
+                }
             }
         }
     }
@@ -858,7 +958,7 @@ impl<'a> Tokenizer<'a> {
     #[allow(clippy::unnecessary_wraps)]
     fn consume_and_return(
         &self,
-        chars: &mut Peekable<Chars<'_>>,
+        chars: &mut State,
         t: Token,
     ) -> Result<Option<Token>, TokenizerError> {
         chars.next();
 
/// Read from `chars` until `predicate` returns `false` or EOF is hit.
/// Return the characters read as String, and keep the first non-matching
/// char available as `chars.next()`.
-fn peeking_take_while(
-    chars: &mut Peekable<Chars<'_>>,
-    mut predicate: impl FnMut(char) -> bool,
-) -> String {
+fn peeking_take_while(chars: &mut State, mut predicate: impl FnMut(char) -> bool) -> String {
     let mut s = String::new();
 
     while let Some(&ch) = chars.peek() {
         if predicate(ch) {
@@ -885,7 +982,7 @@ fn peeking_take_while(
     s
 }
 
-fn parse_quoted_ident(chars: &mut Peekable<Chars<'_>>, quote_end: char) -> (String, Option<char>) {
+fn parse_quoted_ident(chars: &mut State, quote_end: char) -> (String, Option<char>) {
     let mut last_char = None;
     let mut s = String::new();
     while let Some(ch) = chars.next() {
@@ -1518,7 +1615,25 @@ mod tests {
         compare(expected, tokens);
     }
 
-    fn compare(expected: Vec<Token>, actual: Vec<Token>) {
+    #[test]
+    fn tokenize_with_location() {
+        let sql = "SELECT a,\n b";
+        let dialect = GenericDialect {};
+        let mut tokenizer = Tokenizer::new(&dialect, sql);
+        let tokens = tokenizer.tokenize_with_location().unwrap();
+        let expected = vec![
+            TokenWithLocation::new(Token::make_keyword("SELECT"), 1, 1),
+            TokenWithLocation::new(Token::Whitespace(Whitespace::Space), 1, 7),
+            TokenWithLocation::new(Token::make_word("a", None), 1, 8),
+            TokenWithLocation::new(Token::Comma, 1, 9),
+            TokenWithLocation::new(Token::Whitespace(Whitespace::Newline), 1, 10),
+            TokenWithLocation::new(Token::Whitespace(Whitespace::Space), 2, 1),
+            TokenWithLocation::new(Token::make_word("b", None), 2, 2),
+        ];
+        compare(expected, tokens);
+    }
+
+    fn compare<T: PartialEq + std::fmt::Debug>(expected: Vec<T>, actual: Vec<T>) {
        //println!("------------------------------");
        //println!("tokens = {:?}", actual);
        //println!("expected = {:?}", expected);
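Taken together with the new parser tests, the patch leaves error reporting in a split state: tokenizer errors carry a real line/column, while parser errors still print only the offending token (the `Display` impl on `TokenWithLocation` drops the location, and the TODO in `test_parser_error_loc` tracks threading it further). A hedged sketch of what callers observe, mirroring the assertions in the tests above:

    use sqlparser::dialect::GenericDialect;
    use sqlparser::parser::{Parser, ParserError};

    // Tokenizer errors are located...
    let e = Parser::parse_sql(&GenericDialect, "foo '").unwrap_err();
    assert!(matches!(e, ParserError::TokenizerError(msg) if msg.contains("Line: 1, Column 5")));

    // ...but parser errors are not yet; the message names the token only.
    let e = Parser::parse_sql(&GenericDialect, "SELECT this is a syntax error").unwrap_err();
    assert!(matches!(e, ParserError::ParserError(msg) if !msg.contains("Line")));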