-
Notifications
You must be signed in to change notification settings - Fork 179
Tokenize Unicode identifiers #2284
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change | ||||||||||
---|---|---|---|---|---|---|---|---|---|---|---|---|
|
@@ -22,6 +22,7 @@ | |||||||||||
#include "rust-linemap.h" | ||||||||||||
#include "rust-session-manager.h" | ||||||||||||
#include "safe-ctype.h" | ||||||||||||
#include "cpplib.h" | ||||||||||||
|
||||||||||||
namespace Rust { | ||||||||||||
// TODO: move to separate compilation unit? | ||||||||||||
|
@@ -103,11 +104,17 @@ check_valid_float_dot_end (char character) | |||||||||||
return character != '.' && character != '_' && !ISALPHA (character); | ||||||||||||
} | ||||||||||||
|
||||||||||||
// ISSPACE from safe-ctype but may change in future | ||||||||||||
bool | ||||||||||||
is_whitespace (char character) | ||||||||||||
is_whitespace (int character) | ||||||||||||
{ | ||||||||||||
return ISSPACE (character); | ||||||||||||
// https://doc.rust-lang.org/reference/whitespace.html | ||||||||||||
return character == '\t' || character == '\n' || character == '\v' | ||||||||||||
|| character == '\f' || character == '\r' || character == ' ' | ||||||||||||
|| character == 0x0085 // next line | ||||||||||||
|| character == 0x200e // left-to-right mark | ||||||||||||
|| character == 0x200f // right-to-left mark | ||||||||||||
|| character == 0x2028 // line separator | ||||||||||||
|| character == 0x2029; // paragraph separator | ||||||||||||
Comment on lines
+113
to
+117
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Are all of those characters accepted by There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yes. All of these values are defined in the Rust ref. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Oh, I missed this! Sorry! Thanks for pointing it out haha |
||||||||||||
} | ||||||||||||
|
||||||||||||
bool | ||||||||||||
|
@@ -116,6 +123,18 @@ is_non_decimal_int_literal_separator (char character) | |||||||||||
return character == 'x' || character == 'o' || character == 'b'; | ||||||||||||
} | ||||||||||||
|
||||||||||||
bool | ||||||||||||
is_identifier_start (int codepoint) | ||||||||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Do we want to unify with the There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yes, I think gccrs/gcc/rust/lex/rust-lex.cc Lines 83 to 87 in d535c82
|
||||||||||||
{ | ||||||||||||
return (check_xid_property (codepoint) & XID_START) || codepoint == '_'; | ||||||||||||
} | ||||||||||||
|
||||||||||||
bool | ||||||||||||
is_identifier_continue (int codepoint) | ||||||||||||
{ | ||||||||||||
return check_xid_property (codepoint) & XID_CONTINUE; | ||||||||||||
} | ||||||||||||
|
||||||||||||
Lexer::Lexer (const std::string &input) | ||||||||||||
: input (RAIIFile::create_error ()), current_line (1), current_column (1), | ||||||||||||
line_map (nullptr), dump_lex_out (Optional<std::ofstream &>::none ()), | ||||||||||||
|
@@ -283,22 +302,22 @@ Lexer::build_token () | |||||||||||
while (true) | ||||||||||||
{ | ||||||||||||
Location loc = get_current_location (); | ||||||||||||
current_char = peek_input (); | ||||||||||||
skip_input (); | ||||||||||||
|
||||||||||||
// detect UTF8 bom | ||||||||||||
// | ||||||||||||
// Must be the first thing on the first line. | ||||||||||||
// There might be an optional BOM (Byte Order Mark), which for UTF-8 is | ||||||||||||
// the three bytes 0xEF, 0xBB and 0xBF. These can simply be skipped. | ||||||||||||
if (current_line == 1 && current_column == 1 && current_char == 0xef | ||||||||||||
&& peek_input () == 0xbb && peek_input (1) == 0xbf) | ||||||||||||
if (current_line == 1 && current_column == 1 && peek_input () == 0xef | ||||||||||||
&& peek_input (1) == 0xbb && peek_input (2) == 0xbf) | ||||||||||||
{ | ||||||||||||
skip_input (1); | ||||||||||||
current_char = peek_input (); | ||||||||||||
skip_input (); | ||||||||||||
skip_input (2); | ||||||||||||
} | ||||||||||||
|
||||||||||||
current_char = peek_input (); | ||||||||||||
current_char32 = peek_codepoint_input (); | ||||||||||||
skip_codepoint_input (); | ||||||||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. why are we skipping the codepoint input here but not the char? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. If we skip one byte here, only the first byte of current utf-8 character can be skipped by the lexer, which we do not expect. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I see, thank you! |
||||||||||||
|
||||||||||||
// detect shebang | ||||||||||||
// Must be the first thing on the first line, starting with #! | ||||||||||||
// But since an attribute can also start with an #! we don't count it as a | ||||||||||||
|
@@ -311,6 +330,7 @@ Lexer::build_token () | |||||||||||
int n = 1; | ||||||||||||
while (true) | ||||||||||||
{ | ||||||||||||
// TODO use utf-8 codepoint to skip whitespaces | ||||||||||||
int next_char = peek_input (n); | ||||||||||||
if (is_whitespace (next_char)) | ||||||||||||
n++; | ||||||||||||
|
@@ -1051,7 +1071,8 @@ Lexer::build_token () | |||||||||||
int peek = peek_input (); | ||||||||||||
int peek1 = peek_input (1); | ||||||||||||
|
||||||||||||
if (peek == '#' && (ISALPHA (peek1) || peek1 == '_')) | ||||||||||||
// TODO (tamaron) parse Unicode ident | ||||||||||||
if (peek == '#' && is_identifier_start (peek1)) | ||||||||||||
{ | ||||||||||||
TokenPtr raw_ident_ptr = parse_raw_identifier (loc); | ||||||||||||
if (raw_ident_ptr != nullptr) | ||||||||||||
|
@@ -1068,8 +1089,8 @@ Lexer::build_token () | |||||||||||
} | ||||||||||||
} | ||||||||||||
|
||||||||||||
// find identifiers and keywords | ||||||||||||
if (ISALPHA (current_char) || current_char == '_') | ||||||||||||
// find identifiers and keywords. | ||||||||||||
if (is_identifier_start (current_char32.value)) | ||||||||||||
return parse_identifier_or_keyword (loc); | ||||||||||||
|
||||||||||||
// int and float literals | ||||||||||||
|
@@ -1467,6 +1488,7 @@ Lexer::parse_partial_string_continue () | |||||||||||
int additional_length_offset = 1; | ||||||||||||
|
||||||||||||
// string continue | ||||||||||||
// TODO use utf-8 codepoint to skip whitespaces | ||||||||||||
while (is_whitespace (current_char)) | ||||||||||||
{ | ||||||||||||
if (current_char == '\n') | ||||||||||||
|
@@ -1610,6 +1632,7 @@ Lexer::parse_partial_unicode_escape () | |||||||||||
// wrong bracket, whitespace or single/double quotes are wrong | ||||||||||||
// termination, otherwise it is a wrong character, then skip to the actual | ||||||||||||
// terminator. | ||||||||||||
// TODO use utf-8 codepoint to skip whitespaces | ||||||||||||
if (current_char == '{' || is_whitespace (current_char) | ||||||||||||
|| current_char == '\'' || current_char == '"') | ||||||||||||
{ | ||||||||||||
|
@@ -1622,6 +1645,7 @@ Lexer::parse_partial_unicode_escape () | |||||||||||
rust_error_at (get_current_location (), | ||||||||||||
"invalid character %<%c%> in unicode escape", | ||||||||||||
current_char); | ||||||||||||
// TODO use utf-8 codepoint to skip whitespaces | ||||||||||||
while (current_char != '}' && current_char != '{' | ||||||||||||
&& !is_whitespace (current_char) && current_char != '\'' | ||||||||||||
&& current_char != '"') | ||||||||||||
|
@@ -1904,8 +1928,7 @@ Lexer::parse_raw_identifier (Location loc) | |||||||||||
int length = 0; | ||||||||||||
current_char = peek_input (); | ||||||||||||
// loop through entire name | ||||||||||||
while (ISALPHA (current_char) || ISDIGIT (current_char) | ||||||||||||
|| current_char == '_') | ||||||||||||
while (is_identifier_continue (current_char)) | ||||||||||||
{ | ||||||||||||
length++; | ||||||||||||
|
||||||||||||
|
@@ -2041,21 +2064,22 @@ Lexer::parse_identifier_or_keyword (Location loc) | |||||||||||
{ | ||||||||||||
std::string str; | ||||||||||||
str.reserve (16); // default | ||||||||||||
str += current_char; | ||||||||||||
str += current_char32.as_string (); | ||||||||||||
|
||||||||||||
bool first_is_underscore = current_char == '_'; | ||||||||||||
|
||||||||||||
int length = 1; | ||||||||||||
current_char = peek_input (); | ||||||||||||
current_char32 = peek_codepoint_input (); | ||||||||||||
|
||||||||||||
// loop through entire name | ||||||||||||
while (ISALPHA (current_char) || ISDIGIT (current_char) | ||||||||||||
|| current_char == '_') | ||||||||||||
while (is_identifier_continue (current_char32.value)) | ||||||||||||
{ | ||||||||||||
auto s = current_char32.as_string (); | ||||||||||||
length++; | ||||||||||||
|
||||||||||||
str += current_char; | ||||||||||||
skip_input (); | ||||||||||||
current_char = peek_input (); | ||||||||||||
str += current_char32.as_string (); | ||||||||||||
skip_codepoint_input (); | ||||||||||||
current_char32 = peek_codepoint_input (); | ||||||||||||
} | ||||||||||||
|
||||||||||||
current_column += length; | ||||||||||||
|
@@ -2443,28 +2467,29 @@ Lexer::parse_char_or_lifetime (Location loc) | |||||||||||
|
||||||||||||
return Token::make_char (loc, current_char32); | ||||||||||||
} | ||||||||||||
else if (ISDIGIT (current_char32.value) || ISALPHA (current_char32.value) | ||||||||||||
|| current_char32.value == '_') | ||||||||||||
else if (is_identifier_start (current_char32.value)) | ||||||||||||
{ | ||||||||||||
// parse lifetime name | ||||||||||||
std::string str; | ||||||||||||
str += current_char32; | ||||||||||||
length++; | ||||||||||||
|
||||||||||||
current_char = peek_input (); | ||||||||||||
while (ISDIGIT (current_char) || ISALPHA (current_char) | ||||||||||||
|| current_char == '_') | ||||||||||||
current_char32 = peek_codepoint_input (); | ||||||||||||
while (is_identifier_continue (current_char32.value)) | ||||||||||||
{ | ||||||||||||
str += current_char; | ||||||||||||
skip_input (); | ||||||||||||
current_char = peek_input (); | ||||||||||||
str += current_char32; | ||||||||||||
skip_codepoint_input (); | ||||||||||||
current_char32 = peek_codepoint_input (); | ||||||||||||
length++; | ||||||||||||
} | ||||||||||||
|
||||||||||||
current_column += length; | ||||||||||||
|
||||||||||||
loc += length - 1; | ||||||||||||
|
||||||||||||
// TODO some keywords cannot be used for a lifetime label #2306 | ||||||||||||
// https://doc.rust-lang.org/reference/tokens.html | ||||||||||||
|
||||||||||||
str.shrink_to_fit (); | ||||||||||||
return Token::make_lifetime (loc, std::move (str)); | ||||||||||||
} | ||||||||||||
|
@@ -2636,6 +2661,8 @@ Lexer::peek_codepoint_input () | |||||||||||
void | ||||||||||||
Lexer::skip_codepoint_input () | ||||||||||||
{ | ||||||||||||
if (peek_input () == EOF) | ||||||||||||
return; | ||||||||||||
int toSkip = get_input_codepoint_length (); | ||||||||||||
gcc_assert (toSkip >= 1); | ||||||||||||
|
||||||||||||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Added some whitespace code points.
However, non-ASCII whitespace is not actually checked during tokenization, because this function is called with an argument whose type is
char
(1 byte)