22
22
#include " rust-linemap.h"
23
23
#include " rust-session-manager.h"
24
24
#include " safe-ctype.h"
25
+ #include " cpplib.h"
25
26
26
27
namespace Rust {
27
28
// TODO: move to separate compilation unit?
@@ -103,11 +104,17 @@ check_valid_float_dot_end (char character)
103
104
return character != ' .' && character != ' _' && !ISALPHA (character);
104
105
}
105
106
106
- // ISSPACE from safe-ctype but may change in future
107
107
bool
108
- is_whitespace (char character)
108
+ is_whitespace (int character)
109
109
{
110
- return ISSPACE (character);
110
+ // https://doc.rust-lang.org/reference/whitespace.html
111
+ return character == ' \t ' || character == ' \n ' || character == ' \v '
112
+ || character == ' \f ' || character == ' \r ' || character == ' '
113
+ || character == 0x0085 // next line
114
+ || character == 0x200e // left-to-right mark
115
+ || character == 0x200f // right-to-left mark
116
+ || character == 0x2028 // line separator
117
+ || character == 0x2029 ; // paragraph separator
111
118
}
112
119
113
120
bool
@@ -116,6 +123,18 @@ is_non_decimal_int_literal_separator (char character)
116
123
return character == ' x' || character == ' o' || character == ' b' ;
117
124
}
118
125
126
+ bool
127
+ is_identifier_start (int codepoint)
128
+ {
129
+ return (check_xid_property (codepoint) & XID_START) || codepoint == ' _' ;
130
+ }
131
+
132
+ bool
133
+ is_identifier_continue (int codepoint)
134
+ {
135
+ return check_xid_property (codepoint) & XID_CONTINUE;
136
+ }
137
+
119
138
Lexer::Lexer (const std::string &input)
120
139
: input (RAIIFile::create_error ()), current_line (1 ), current_column (1 ),
121
140
line_map (nullptr ), dump_lex_out (Optional<std::ofstream &>::none ()),
@@ -283,22 +302,22 @@ Lexer::build_token ()
283
302
while (true )
284
303
{
285
304
Location loc = get_current_location ();
286
- current_char = peek_input ();
287
- skip_input ();
288
305
289
306
// detect UTF8 bom
290
307
//
291
308
// Must be the first thing on the first line.
292
309
// There might be an optional BOM (Byte Order Mark), which for UTF-8 is
293
310
// the three bytes 0xEF, 0xBB and 0xBF. These can simply be skipped.
294
- if (current_line == 1 && current_column == 1 && current_char == 0xef
295
- && peek_input () == 0xbb && peek_input (1 ) == 0xbf )
311
+ if (current_line == 1 && current_column == 1 && peek_input () == 0xef
312
+ && peek_input (1 ) == 0xbb && peek_input (2 ) == 0xbf )
296
313
{
297
- skip_input (1 );
298
- current_char = peek_input ();
299
- skip_input ();
314
+ skip_input (2 );
300
315
}
301
316
317
+ current_char = peek_input ();
318
+ current_char32 = peek_codepoint_input ();
319
+ skip_codepoint_input ();
320
+
302
321
// detect shebang
303
322
// Must be the first thing on the first line, starting with #!
304
323
// But since an attribute can also start with an #! we don't count it as a
@@ -311,6 +330,7 @@ Lexer::build_token ()
311
330
int n = 1 ;
312
331
while (true )
313
332
{
333
+ // TODO use utf-8 codepoint to skip whitespaces
314
334
int next_char = peek_input (n);
315
335
if (is_whitespace (next_char))
316
336
n++;
@@ -1051,7 +1071,8 @@ Lexer::build_token ()
1051
1071
int peek = peek_input ();
1052
1072
int peek1 = peek_input (1 );
1053
1073
1054
- if (peek == ' #' && (ISALPHA (peek1) || peek1 == ' _' ))
1074
+ // TODO (tamaron) parse Unicode ident
1075
+ if (peek == ' #' && is_identifier_start (peek1))
1055
1076
{
1056
1077
TokenPtr raw_ident_ptr = parse_raw_identifier (loc);
1057
1078
if (raw_ident_ptr != nullptr )
@@ -1068,8 +1089,8 @@ Lexer::build_token ()
1068
1089
}
1069
1090
}
1070
1091
1071
- // find identifiers and keywords
1072
- if (ISALPHA (current_char) || current_char == ' _ ' )
1092
+ // find identifiers and keywords.
1093
+ if (is_identifier_start (current_char32. value ) )
1073
1094
return parse_identifier_or_keyword (loc);
1074
1095
1075
1096
// int and float literals
@@ -1467,6 +1488,7 @@ Lexer::parse_partial_string_continue ()
1467
1488
int additional_length_offset = 1 ;
1468
1489
1469
1490
// string continue
1491
+ // TODO use utf-8 codepoint to skip whitespaces
1470
1492
while (is_whitespace (current_char))
1471
1493
{
1472
1494
if (current_char == ' \n ' )
@@ -1610,6 +1632,7 @@ Lexer::parse_partial_unicode_escape ()
1610
1632
// wrong bracket, whitespace or single/double quotes are wrong
1611
1633
// termination, otherwise it is a wrong character, then skip to the actual
1612
1634
// terminator.
1635
+ // TODO use utf-8 codepoint to skip whitespaces
1613
1636
if (current_char == ' {' || is_whitespace (current_char)
1614
1637
|| current_char == ' \' ' || current_char == ' "' )
1615
1638
{
@@ -1622,6 +1645,7 @@ Lexer::parse_partial_unicode_escape ()
1622
1645
rust_error_at (get_current_location (),
1623
1646
" invalid character %<%c%> in unicode escape" ,
1624
1647
current_char);
1648
+ // TODO use utf-8 codepoint to skip whitespaces
1625
1649
while (current_char != ' }' && current_char != ' {'
1626
1650
&& !is_whitespace (current_char) && current_char != ' \' '
1627
1651
&& current_char != ' "' )
@@ -1904,8 +1928,7 @@ Lexer::parse_raw_identifier (Location loc)
1904
1928
int length = 0 ;
1905
1929
current_char = peek_input ();
1906
1930
// loop through entire name
1907
- while (ISALPHA (current_char) || ISDIGIT (current_char)
1908
- || current_char == ' _' )
1931
+ while (is_identifier_continue (current_char))
1909
1932
{
1910
1933
length++;
1911
1934
@@ -2041,21 +2064,22 @@ Lexer::parse_identifier_or_keyword (Location loc)
2041
2064
{
2042
2065
std::string str;
2043
2066
str.reserve (16 ); // default
2044
- str += current_char ;
2067
+ str += current_char32. as_string () ;
2045
2068
2046
2069
bool first_is_underscore = current_char == ' _' ;
2047
2070
2048
2071
int length = 1 ;
2049
- current_char = peek_input ();
2072
+ current_char32 = peek_codepoint_input ();
2073
+
2050
2074
// loop through entire name
2051
- while (ISALPHA (current_char) || ISDIGIT (current_char)
2052
- || current_char == ' _' )
2075
+ while (is_identifier_continue (current_char32.value ))
2053
2076
{
2077
+ auto s = current_char32.as_string ();
2054
2078
length++;
2055
2079
2056
- str += current_char ;
2057
- skip_input ();
2058
- current_char = peek_input ();
2080
+ str += current_char32. as_string () ;
2081
+ skip_codepoint_input ();
2082
+ current_char32 = peek_codepoint_input ();
2059
2083
}
2060
2084
2061
2085
current_column += length;
@@ -2443,28 +2467,29 @@ Lexer::parse_char_or_lifetime (Location loc)
2443
2467
2444
2468
return Token::make_char (loc, current_char32);
2445
2469
}
2446
- else if (ISDIGIT (current_char32.value ) || ISALPHA (current_char32.value )
2447
- || current_char32.value == ' _' )
2470
+ else if (is_identifier_start (current_char32.value ))
2448
2471
{
2449
2472
// parse lifetime name
2450
2473
std::string str;
2451
2474
str += current_char32;
2452
2475
length++;
2453
2476
2454
- current_char = peek_input ();
2455
- while (ISDIGIT (current_char) || ISALPHA (current_char)
2456
- || current_char == ' _' )
2477
+ current_char32 = peek_codepoint_input ();
2478
+ while (is_identifier_continue (current_char32.value ))
2457
2479
{
2458
- str += current_char ;
2459
- skip_input ();
2460
- current_char = peek_input ();
2480
+ str += current_char32 ;
2481
+ skip_codepoint_input ();
2482
+ current_char32 = peek_codepoint_input ();
2461
2483
length++;
2462
2484
}
2463
2485
2464
2486
current_column += length;
2465
2487
2466
2488
loc += length - 1 ;
2467
2489
2490
+ // TODO some keywords cannot be used for a lifetime label
2491
+ // https://doc.rust-lang.org/reference/tokens.html#lifetimes-and-loop-labels
2492
+
2468
2493
str.shrink_to_fit ();
2469
2494
return Token::make_lifetime (loc, std::move (str));
2470
2495
}
@@ -2636,6 +2661,8 @@ Lexer::peek_codepoint_input ()
2636
2661
void
2637
2662
Lexer::skip_codepoint_input ()
2638
2663
{
2664
+ if (peek_input () == EOF)
2665
+ return ;
2639
2666
int toSkip = get_input_codepoint_length ();
2640
2667
gcc_assert (toSkip >= 1 );
2641
2668
0 commit comments