From 785864b1a1275843f9547554e5d5ed2632f50b1f Mon Sep 17 00:00:00 2001 From: Alex Yaroslavsky <70210301+alexsatori@users.noreply.github.com> Date: Tue, 3 May 2022 13:09:05 +0300 Subject: [PATCH 1/2] Support unicode whitespace --- src/tokenizer.rs | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/tokenizer.rs b/src/tokenizer.rs index 566deacec..84f7ebbd8 100644 --- a/src/tokenizer.rs +++ b/src/tokenizer.rs @@ -648,6 +648,10 @@ impl<'a> Tokenizer<'a> { ); Ok(Some(Token::Placeholder(String::from("$") + &s))) } + // Whitespace check (including Unicode whitespace characters) must come last, because it also matches some of the characters handled by the arms above + ch if ch.is_whitespace() => { + self.consume_and_return(chars, Token::Whitespace(Whitespace::Space)) + } other => self.consume_and_return(chars, Token::Char(other)), }, None => Ok(None), From 7ff50a6317b2f7bd6b124fcf28645434fe0219c4 Mon Sep 17 00:00:00 2001 From: Alex Yaroslavsky Date: Thu, 12 May 2022 17:21:12 +0300 Subject: [PATCH 2/2] Add test --- src/tokenizer.rs | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/src/tokenizer.rs b/src/tokenizer.rs index 84f7ebbd8..4f139b758 100644 --- a/src/tokenizer.rs +++ b/src/tokenizer.rs @@ -1253,6 +1253,21 @@ mod tests { compare(expected, tokens); } + #[test] + fn tokenize_unicode_whitespace() { + let sql = String::from(" \u{2003}\n"); + + let dialect = GenericDialect {}; + let mut tokenizer = Tokenizer::new(&dialect, &sql); + let tokens = tokenizer.tokenize().unwrap(); + let expected = vec![ + Token::Whitespace(Whitespace::Space), + Token::Whitespace(Whitespace::Space), + Token::Whitespace(Whitespace::Newline), + ]; + compare(expected, tokens); + } + #[test] fn tokenize_mismatched_quotes() { let sql = String::from("\"foo");