add utils to check XID properties

tamaroning · tamaroning · commit 41e4ea755fa6 · 2023-06-16T10:08:45.000+09:00
Signed-off-by: Raiki Tamura <tamaron1203@gmail.com>
diff --git a/gcc/rust/lex/rust-lex.cc b/gcc/rust/lex/rust-lex.cc
@@ -22,6 +22,7 @@
 #include "rust-linemap.h"
 #include "rust-session-manager.h"
 #include "safe-ctype.h"
+#include "cpplib.h"
 
 namespace Rust {
 // TODO: move to separate compilation unit?
@@ -103,11 +104,17 @@ check_valid_float_dot_end (char character)
   return character != '.' && character != '_' && !ISALPHA (character);
 }
 
-// ISSPACE from safe-ctype but may change in future
 bool
-is_whitespace (char character)
+is_whitespace (int character)
 {
-  return ISSPACE (character);
+  // Whitespace code points per https://doc.rust-lang.org/reference/whitespace.html
+  return character == '\t' || character == '\n' || character == '\v'
+	 || character == '\f' || character == '\r' || character == ' '
+	 || character == 0x0085	 // U+0085 next line
+	 || character == 0x200e	 // U+200E left-to-right mark
+	 || character == 0x200f	 // U+200F right-to-left mark
+	 || character == 0x2028	 // U+2028 line separator
+	 || character == 0x2029; // U+2029 paragraph separator
 }
 
 bool
@@ -116,6 +123,18 @@ is_non_decimal_int_literal_separator (char character)
   return character == 'x' || character == 'o' || character == 'b';
 }
 
+bool
+is_identifier_start (int codepoint)
+{
+  return (check_xid_property (codepoint) & XID_START) || codepoint == '_'; // '_' is reported as XID_CONTINUE only, so accept it explicitly
+}
+
+bool
+is_identifier_continue (int codepoint)
+{
+  return check_xid_property (codepoint) & XID_CONTINUE; // nonzero masked flag converts to true
+}
+
 Lexer::Lexer (const std::string &input)
   : input (RAIIFile::create_error ()), current_line (1), current_column (1),
     line_map (nullptr), dump_lex_out (Optional<std::ofstream &>::none ()),
@@ -283,22 +302,22 @@ Lexer::build_token ()
   while (true)
     {
       Location loc = get_current_location ();
-      current_char = peek_input ();
-      skip_input ();
 
       // detect UTF8 bom
       //
       // Must be the first thing on the first line.
       // There might be an optional BOM (Byte Order Mark), which for UTF-8 is
       // the three bytes 0xEF, 0xBB and 0xBF. These can simply be skipped.
-      if (current_line == 1 && current_column == 1 && current_char == 0xef
-	  && peek_input () == 0xbb && peek_input (1) == 0xbf)
+      if (current_line == 1 && current_column == 1 && peek_input () == 0xef
+	  && peek_input (1) == 0xbb && peek_input (2) == 0xbf)
 	{
-	  skip_input (1);
-	  current_char = peek_input ();
-	  skip_input ();
+	  skip_input (2);
 	}
 
+      current_char = peek_input ();
+      current_char32 = peek_codepoint_input ();
+      skip_codepoint_input ();
+
       // detect shebang
       // Must be the first thing on the first line, starting with #!
       // But since an attribute can also start with an #! we don't count it as a
@@ -311,6 +330,7 @@ Lexer::build_token ()
 	  int n = 1;
 	  while (true)
 	    {
+	      // TODO use utf-8 codepoint to skip whitespaces
 	      int next_char = peek_input (n);
 	      if (is_whitespace (next_char))
 		n++;
@@ -1051,7 +1071,8 @@ Lexer::build_token ()
 	  int peek = peek_input ();
 	  int peek1 = peek_input (1);
 
-	  if (peek == '#' && (ISALPHA (peek1) || peek1 == '_'))
+	  // TODO (tamaron) parse Unicode ident
+	  if (peek == '#' && is_identifier_start (peek1))
 	    {
 	      TokenPtr raw_ident_ptr = parse_raw_identifier (loc);
 	      if (raw_ident_ptr != nullptr)
@@ -1068,8 +1089,8 @@ Lexer::build_token ()
 	    }
 	}
 
-      // find identifiers and keywords
-      if (ISALPHA (current_char) || current_char == '_')
+      // find identifiers and keywords.
+      if (is_identifier_start (current_char32.value))
 	return parse_identifier_or_keyword (loc);
 
       // int and float literals
@@ -1467,6 +1488,7 @@ Lexer::parse_partial_string_continue ()
   int additional_length_offset = 1;
 
   // string continue
+  // TODO use utf-8 codepoint to skip whitespaces
   while (is_whitespace (current_char))
     {
       if (current_char == '\n')
@@ -1610,6 +1632,7 @@ Lexer::parse_partial_unicode_escape ()
       // wrong bracketm whitespace or single/double quotes are wrong
       // termination, otherwise it is a wrong character, then skip to the actual
       // terminator.
+      // TODO use utf-8 codepoint to skip whitespaces
       if (current_char == '{' || is_whitespace (current_char)
 	  || current_char == '\'' || current_char == '"')
 	{
@@ -1622,6 +1645,7 @@ Lexer::parse_partial_unicode_escape ()
 	  rust_error_at (get_current_location (),
 			 "invalid character %<%c%> in unicode escape",
 			 current_char);
+	  // TODO use utf-8 codepoint to skip whitespaces
 	  while (current_char != '}' && current_char != '{'
 		 && !is_whitespace (current_char) && current_char != '\''
 		 && current_char != '"')
@@ -1904,8 +1928,7 @@ Lexer::parse_raw_identifier (Location loc)
   int length = 0;
   current_char = peek_input ();
   // loop through entire name
-  while (ISALPHA (current_char) || ISDIGIT (current_char)
-	 || current_char == '_')
+  while (is_identifier_continue (current_char))
     {
       length++;
 
@@ -2041,21 +2064,22 @@ Lexer::parse_identifier_or_keyword (Location loc)
 {
   std::string str;
   str.reserve (16); // default
-  str += current_char;
+  str += current_char32.as_string ();
 
   bool first_is_underscore = current_char == '_';
 
   int length = 1;
-  current_char = peek_input ();
+  current_char32 = peek_codepoint_input ();
+
   // loop through entire name
-  while (ISALPHA (current_char) || ISDIGIT (current_char)
-	 || current_char == '_')
+  while (is_identifier_continue (current_char32.value))
     {
+      auto s = current_char32.as_string ();
       length++;
 
-      str += current_char;
-      skip_input ();
-      current_char = peek_input ();
+      str += current_char32.as_string ();
+      skip_codepoint_input ();
+      current_char32 = peek_codepoint_input ();
     }
 
   current_column += length;
@@ -2443,28 +2467,29 @@ Lexer::parse_char_or_lifetime (Location loc)
 
 	  return Token::make_char (loc, current_char32);
 	}
-      else if (ISDIGIT (current_char32.value) || ISALPHA (current_char32.value)
-	       || current_char32.value == '_')
+      else if (is_identifier_start (current_char32.value))
 	{
 	  // parse lifetime name
 	  std::string str;
 	  str += current_char32;
 	  length++;
 
-	  current_char = peek_input ();
-	  while (ISDIGIT (current_char) || ISALPHA (current_char)
-		 || current_char == '_')
+	  current_char32 = peek_codepoint_input ();
+	  while (is_identifier_continue (current_char32.value))
 	    {
-	      str += current_char;
-	      skip_input ();
-	      current_char = peek_input ();
+	      str += current_char32;
+	      skip_codepoint_input ();
+	      current_char32 = peek_codepoint_input ();
 	      length++;
 	    }
 
 	  current_column += length;
 
 	  loc += length - 1;
 
+	  // TODO some keywords cannot be used for a lifetime label
+	  // https://doc.rust-lang.org/reference/tokens.html#lifetimes-and-loop-labels
+
 	  str.shrink_to_fit ();
 	  return Token::make_lifetime (loc, std::move (str));
 	}
@@ -2636,6 +2661,8 @@ Lexer::peek_codepoint_input ()
 void
 Lexer::skip_codepoint_input ()
 {
+  if (peek_input () == EOF)
+    return;
   int toSkip = get_input_codepoint_length ();
   gcc_assert (toSkip >= 1);
 
diff --git a/gcc/rust/lex/rust-lex.h b/gcc/rust/lex/rust-lex.h
@@ -118,9 +118,9 @@ class Lexer
   // Advances current input char to n + 1 chars ahead of current position.
   void skip_input (int n);
 
-  // Returns char n chars ahead of current position.
-  int peek_input ();
   // Peeks the current char.
+  int peek_input ();
+  // Returns char n bytes ahead of current position.
   int peek_input (int n);
 
   // Classifies keyword (i.e. gets id for keyword).
@@ -140,6 +140,7 @@ class Lexer
 
   int get_input_codepoint_length ();
   int test_get_input_codepoint_n_length (int n_start_offset);
+  // Peeks the current utf-8 char
   Codepoint peek_codepoint_input ();
   Codepoint test_peek_codepoint_input (int n);
   void skip_codepoint_input ();
@@ -220,6 +221,7 @@ class Lexer
   int current_column;
   // Current character.
   int current_char;
+  Codepoint current_char32;
   // Line map.
   Linemap *line_map;
 
diff --git a/libcpp/charset.cc b/libcpp/charset.cc
@@ -1256,6 +1256,42 @@ _cpp_uname2c_uax44_lm2 (const char *name, size_t len, char *canon_name)
   return result;
 }
 
+/* Returns the XID_START / XID_CONTINUE bitmask for codepoint C (0 if none).  */
+unsigned int
+check_xid_property (cppchar_t c)
+{
+  // fast path for ASCII letters, digits and '_'; other ASCII uses the table
+  if (c < 0x80)
+  {
+    if (('A' <= c && c <= 'Z') || ('a' <= c && c <= 'z'))
+  return XID_START | XID_CONTINUE;
+    if (('0' <= c && c <= '9') || c == '_')
+  return XID_CONTINUE;
+  }
+
+  if (c > UCS_LIMIT)
+    return 0;
+
+  int mn, mx, md;
+  mn = 0;
+  mx = ARRAY_SIZE (ucnranges) - 1;
+  while (mx != mn) // binary search for the first range whose end >= c
+    {
+      md = (mn + mx) / 2;
+      if (c <= ucnranges[md].end)
+  mx = md;
+      else
+  mn = md + 1;
+    }
+
+  unsigned short flags = ucnranges[mn].flags;
+
+  if (flags & CXX23) // range flagged CXX23: reported as start and continue
+    return XID_START | XID_CONTINUE;
+  if (flags & NXX23) // range flagged NXX23: reported as continue only
+    return XID_CONTINUE;
+  return 0;
+}
 
 /* Returns 1 if C is valid in an identifier, 2 if C is valid except at
    the start of an identifier, and 0 if C is not valid in an
diff --git a/libcpp/include/cpplib.h b/libcpp/include/cpplib.h
@@ -1602,4 +1602,11 @@ bool cpp_input_conversion_is_trivial (const char *input_charset);
 int cpp_check_utf8_bom (const char *data, size_t data_length);
 bool cpp_valid_utf8_p (const char *data, size_t num_bytes);
 
+enum { /* Bit flags returned by check_xid_property; may be OR-ed together.  */
+   XID_START = 1, /* Set when the codepoint has the XID start property.  */
+   XID_CONTINUE = 2 /* Set when the codepoint has the XID continue property.  */
+};
+
+unsigned int check_xid_property (cppchar_t c);
+
 #endif /* ! LIBCPP_CPPLIB_H */