Skip to content

Commit 41e4ea7

Browse files
committed
add utils to check XID properties
Signed-off-by: Raiki Tamura <[email protected]>
1 parent c7b7e29 commit 41e4ea7

File tree

4 files changed

+104
-32
lines changed

4 files changed

+104
-32
lines changed

gcc/rust/lex/rust-lex.cc

+57-30
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@
2222
#include "rust-linemap.h"
2323
#include "rust-session-manager.h"
2424
#include "safe-ctype.h"
25+
#include "cpplib.h"
2526

2627
namespace Rust {
2728
// TODO: move to separate compilation unit?
@@ -103,11 +104,17 @@ check_valid_float_dot_end (char character)
103104
return character != '.' && character != '_' && !ISALPHA (character);
104105
}
105106

106-
// ISSPACE from safe-ctype but may change in future
107107
bool
108-
is_whitespace (char character)
108+
is_whitespace (int character)
109109
{
110-
return ISSPACE (character);
110+
// https://doc.rust-lang.org/reference/whitespace.html
111+
return character == '\t' || character == '\n' || character == '\v'
112+
|| character == '\f' || character == '\r' || character == ' '
113+
|| character == 0x0085 // next line
114+
|| character == 0x200e // left-to-right mark
115+
|| character == 0x200f // right-to-left mark
116+
|| character == 0x2028 // line separator
117+
|| character == 0x2029; // paragraph separator
111118
}
112119

113120
bool
@@ -116,6 +123,18 @@ is_non_decimal_int_literal_separator (char character)
116123
return character == 'x' || character == 'o' || character == 'b';
117124
}
118125

// Returns true if CODEPOINT may begin an identifier: either
// check_xid_property reports the XID_START flag for it, or it is '_'.
126+
bool
127+
is_identifier_start (int codepoint)
128+
{
// '_' is tested explicitly because the ASCII fast path of
// check_xid_property reports it as XID_CONTINUE only.
129+
return (check_xid_property (codepoint) & XID_START) || codepoint == '_';
130+
}
131+
// Returns true if check_xid_property reports the XID_CONTINUE flag for
// CODEPOINT, i.e. it may appear after the first character of an identifier.
132+
bool
133+
is_identifier_continue (int codepoint)
134+
{
// The masked flag value (XID_CONTINUE or 0) converts implicitly to bool.
135+
return check_xid_property (codepoint) & XID_CONTINUE;
136+
}
137+
119138
Lexer::Lexer (const std::string &input)
120139
: input (RAIIFile::create_error ()), current_line (1), current_column (1),
121140
line_map (nullptr), dump_lex_out (Optional<std::ofstream &>::none ()),
@@ -283,22 +302,22 @@ Lexer::build_token ()
283302
while (true)
284303
{
285304
Location loc = get_current_location ();
286-
current_char = peek_input ();
287-
skip_input ();
288305

289306
// detect UTF8 bom
290307
//
291308
// Must be the first thing on the first line.
292309
// There might be an optional BOM (Byte Order Mark), which for UTF-8 is
293310
// the three bytes 0xEF, 0xBB and 0xBF. These can simply be skipped.
294-
if (current_line == 1 && current_column == 1 && current_char == 0xef
295-
&& peek_input () == 0xbb && peek_input (1) == 0xbf)
311+
if (current_line == 1 && current_column == 1 && peek_input () == 0xef
312+
&& peek_input (1) == 0xbb && peek_input (2) == 0xbf)
296313
{
297-
skip_input (1);
298-
current_char = peek_input ();
299-
skip_input ();
314+
skip_input (2);
300315
}
301316

317+
current_char = peek_input ();
318+
current_char32 = peek_codepoint_input ();
319+
skip_codepoint_input ();
320+
302321
// detect shebang
303322
// Must be the first thing on the first line, starting with #!
304323
// But since an attribute can also start with an #! we don't count it as a
@@ -311,6 +330,7 @@ Lexer::build_token ()
311330
int n = 1;
312331
while (true)
313332
{
333+
// TODO use utf-8 codepoint to skip whitespaces
314334
int next_char = peek_input (n);
315335
if (is_whitespace (next_char))
316336
n++;
@@ -1051,7 +1071,8 @@ Lexer::build_token ()
10511071
int peek = peek_input ();
10521072
int peek1 = peek_input (1);
10531073

1054-
if (peek == '#' && (ISALPHA (peek1) || peek1 == '_'))
1074+
// TODO (tamaron) parse Unicode ident
1075+
if (peek == '#' && is_identifier_start (peek1))
10551076
{
10561077
TokenPtr raw_ident_ptr = parse_raw_identifier (loc);
10571078
if (raw_ident_ptr != nullptr)
@@ -1068,8 +1089,8 @@ Lexer::build_token ()
10681089
}
10691090
}
10701091

1071-
// find identifiers and keywords
1072-
if (ISALPHA (current_char) || current_char == '_')
1092+
// find identifiers and keywords.
1093+
if (is_identifier_start (current_char32.value))
10731094
return parse_identifier_or_keyword (loc);
10741095

10751096
// int and float literals
@@ -1467,6 +1488,7 @@ Lexer::parse_partial_string_continue ()
14671488
int additional_length_offset = 1;
14681489

14691490
// string continue
1491+
// TODO use utf-8 codepoint to skip whitespaces
14701492
while (is_whitespace (current_char))
14711493
{
14721494
if (current_char == '\n')
@@ -1610,6 +1632,7 @@ Lexer::parse_partial_unicode_escape ()
16101632
// wrong bracket, whitespace or single/double quotes are wrong
16111633
// termination, otherwise it is a wrong character, then skip to the actual
16121634
// terminator.
1635+
// TODO use utf-8 codepoint to skip whitespaces
16131636
if (current_char == '{' || is_whitespace (current_char)
16141637
|| current_char == '\'' || current_char == '"')
16151638
{
@@ -1622,6 +1645,7 @@ Lexer::parse_partial_unicode_escape ()
16221645
rust_error_at (get_current_location (),
16231646
"invalid character %<%c%> in unicode escape",
16241647
current_char);
1648+
// TODO use utf-8 codepoint to skip whitespaces
16251649
while (current_char != '}' && current_char != '{'
16261650
&& !is_whitespace (current_char) && current_char != '\''
16271651
&& current_char != '"')
@@ -1904,8 +1928,7 @@ Lexer::parse_raw_identifier (Location loc)
19041928
int length = 0;
19051929
current_char = peek_input ();
19061930
// loop through entire name
1907-
while (ISALPHA (current_char) || ISDIGIT (current_char)
1908-
|| current_char == '_')
1931+
while (is_identifier_continue (current_char))
19091932
{
19101933
length++;
19111934

@@ -2041,21 +2064,22 @@ Lexer::parse_identifier_or_keyword (Location loc)
20412064
{
20422065
std::string str;
20432066
str.reserve (16); // default
2044-
str += current_char;
2067+
str += current_char32.as_string ();
20452068

20462069
bool first_is_underscore = current_char == '_';
20472070

20482071
int length = 1;
2049-
current_char = peek_input ();
2072+
current_char32 = peek_codepoint_input ();
2073+
20502074
// loop through entire name
2051-
while (ISALPHA (current_char) || ISDIGIT (current_char)
2052-
|| current_char == '_')
2075+
while (is_identifier_continue (current_char32.value))
20532076
{
2077+
auto s = current_char32.as_string ();
20542078
length++;
20552079

2056-
str += current_char;
2057-
skip_input ();
2058-
current_char = peek_input ();
2080+
str += current_char32.as_string ();
2081+
skip_codepoint_input ();
2082+
current_char32 = peek_codepoint_input ();
20592083
}
20602084

20612085
current_column += length;
@@ -2443,28 +2467,29 @@ Lexer::parse_char_or_lifetime (Location loc)
24432467

24442468
return Token::make_char (loc, current_char32);
24452469
}
2446-
else if (ISDIGIT (current_char32.value) || ISALPHA (current_char32.value)
2447-
|| current_char32.value == '_')
2470+
else if (is_identifier_start (current_char32.value))
24482471
{
24492472
// parse lifetime name
24502473
std::string str;
24512474
str += current_char32;
24522475
length++;
24532476

2454-
current_char = peek_input ();
2455-
while (ISDIGIT (current_char) || ISALPHA (current_char)
2456-
|| current_char == '_')
2477+
current_char32 = peek_codepoint_input ();
2478+
while (is_identifier_continue (current_char32.value))
24572479
{
2458-
str += current_char;
2459-
skip_input ();
2460-
current_char = peek_input ();
2480+
str += current_char32;
2481+
skip_codepoint_input ();
2482+
current_char32 = peek_codepoint_input ();
24612483
length++;
24622484
}
24632485

24642486
current_column += length;
24652487

24662488
loc += length - 1;
24672489

2490+
// TODO some keywords cannot be used for a lifetime label
2491+
// https://doc.rust-lang.org/reference/tokens.html#lifetimes-and-loop-labels
2492+
24682493
str.shrink_to_fit ();
24692494
return Token::make_lifetime (loc, std::move (str));
24702495
}
@@ -2636,6 +2661,8 @@ Lexer::peek_codepoint_input ()
26362661
void
26372662
Lexer::skip_codepoint_input ()
26382663
{
2664+
if (peek_input () == EOF)
2665+
return;
26392666
int toSkip = get_input_codepoint_length ();
26402667
gcc_assert (toSkip >= 1);
26412668

gcc/rust/lex/rust-lex.h

+4-2
Original file line numberDiff line numberDiff line change
@@ -118,9 +118,9 @@ class Lexer
118118
// Advances current input char to n + 1 chars ahead of current position.
119119
void skip_input (int n);
120120

121-
// Returns char n chars ahead of current position.
122-
int peek_input ();
123121
// Peeks the current char.
122+
int peek_input ();
123+
// Returns char n bytes ahead of current position.
124124
int peek_input (int n);
125125

126126
// Classifies keyword (i.e. gets id for keyword).
@@ -140,6 +140,7 @@ class Lexer
140140

141141
int get_input_codepoint_length ();
142142
int test_get_input_codepoint_n_length (int n_start_offset);
143+
// Peeks the current utf-8 char
143144
Codepoint peek_codepoint_input ();
144145
Codepoint test_peek_codepoint_input (int n);
145146
void skip_codepoint_input ();
@@ -220,6 +221,7 @@ class Lexer
220221
int current_column;
221222
// Current character.
222223
int current_char;
224+
Codepoint current_char32;
223225
// Line map.
224226
Linemap *line_map;
225227

libcpp/charset.cc

+36
Original file line numberDiff line numberDiff line change
@@ -1256,6 +1256,42 @@ _cpp_uname2c_uax44_lm2 (const char *name, size_t len, char *canon_name)
12561256
return result;
12571257
}
12581258

1259+
/* Returns flags representing the XID properties of the given codepoint.
   The result is a bitmask built from XID_START and XID_CONTINUE:
   XID_START | XID_CONTINUE, XID_CONTINUE alone, or 0 when neither flag
   applies.  */
1260+
unsigned int
1261+
check_xid_property (cppchar_t c)
1262+
{
1263+
// fast path for ASCII
// ASCII letters get both flags; digits and '_' get XID_CONTINUE only.
// Any other ASCII value is not answered here and falls through to the
// table lookup below.
1264+
if (c < 0x80)
1265+
{
1266+
if (('A' <= c && c <= 'Z') || ('a' <= c && c <= 'z'))
1267+
return XID_START | XID_CONTINUE;
1268+
if (('0' <= c && c <= '9') || c == '_')
1269+
return XID_CONTINUE;
1270+
}
1271+
// Values above UCS_LIMIT are never looked up in the table.
1272+
if (c > UCS_LIMIT)
1273+
return 0;
1274+
// Binary search for the first ucnranges entry whose `end' is >= c.
// NOTE(review): assumes ucnranges is sorted by ascending `end' and that its
// last entry covers every value up to UCS_LIMIT -- confirm against the
// table definition.
1275+
int mn, mx, md;
1276+
mn = 0;
1277+
mx = ARRAY_SIZE (ucnranges) - 1;
1278+
while (mx != mn)
1279+
{
1280+
md = (mn + mx) / 2;
1281+
if (c <= ucnranges[md].end)
1282+
mx = md;
1283+
else
1284+
mn = md + 1;
1285+
}
1286+
// mn now indexes the entry selected by the search; read its flags.
1287+
unsigned short flags = ucnranges[mn].flags;
1288+
// Map the table flags onto the XID bitmask: CXX23 yields both flags,
// NXX23 yields XID_CONTINUE only.
// NOTE(review): presumably CXX23 / NXX23 mark characters valid at the
// start / only after the start of a C++23 identifier -- confirm where
// these flags are defined.
1289+
if (flags & CXX23)
1290+
return XID_START | XID_CONTINUE;
1291+
if (flags & NXX23)
1292+
return XID_CONTINUE;
1293+
return 0;
1294+
}
12591295

12601296
/* Returns 1 if C is valid in an identifier, 2 if C is valid except at
12611297
the start of an identifier, and 0 if C is not valid in an

libcpp/include/cpplib.h

+7
Original file line numberDiff line numberDiff line change
@@ -1602,4 +1602,11 @@ bool cpp_input_conversion_is_trivial (const char *input_charset);
16021602
int cpp_check_utf8_bom (const char *data, size_t data_length);
16031603
bool cpp_valid_utf8_p (const char *data, size_t num_bytes);
16041604

/* Bit flags returned, possibly OR-ed together, by check_xid_property.  */
1605+
enum {
1606+
XID_START = 1,
1607+
XID_CONTINUE = 2
1608+
};
1609+
1610+
unsigned int check_xid_property (cppchar_t c);
1611+
16051612
#endif /* ! LIBCPP_CPPLIB_H */

0 commit comments

Comments
 (0)