5454#include < liblangutil/Exceptions.h>
5555#include < liblangutil/Scanner.h>
5656
57- #include < algorithm>
57+ #include < boost/algorithm/string/classification.hpp>
58+
5859#include < optional>
59- #include < ostream >
60+ #include < string_view >
6061#include < tuple>
6162
6263using namespace std ;
@@ -79,6 +80,8 @@ string to_string(ScannerError _errorCode)
7980 case ScannerError::IllegalExponent: return " Invalid exponent." ;
8081 case ScannerError::IllegalNumberEnd: return " Identifier-start is not allowed at end of a number." ;
8182 case ScannerError::OctalNotAllowed: return " Octal numbers not allowed." ;
83+ case ScannerError::DirectionalOverrideUnderflowInComment: return " Unicode direction override underflow in comment or string literal." ;
84+ case ScannerError::DirectionalOverrideMismatchInComment: return " Mismatching directional override markers in comment or string literal." ;
8285 default :
8386 solAssert (false , " Unhandled case in to_string(ScannerError)" );
8487 return " " ;
@@ -271,12 +274,59 @@ bool Scanner::skipWhitespaceExceptUnicodeLinebreak()
271274 return sourcePos () != startPosition;
272275}
273276
277+
278+ namespace
279+ {
280+ // / Tries to scan for an RLO/LRO/RLE/LRE/PDF and keeps track of script writing direction override depth.
281+ // /
282+ // / @returns ScannerError::NoError in case of successful parsing and directional encodings are paired
283+ // / and error code in case the input's lexical parser state is invalid and this error should be reported
284+ // / to the user.
285+ static ScannerError validateBiDiMarkup (CharStream& _stream, size_t _startPosition)
286+ {
287+ static array<pair<string_view, int >, 5 > constexpr directionalSequences{
288+ pair<string_view, int >{" \xE2\x80\xAD " , 1 }, // U+202D (LRO - Left-to-Right Override)
289+ pair<string_view, int >{" \xE2\x80\xAE " , 1 }, // U+202E (RLO - Right-to-Left Override)
290+ pair<string_view, int >{" \xE2\x80\xAA " , 1 }, // U+202A (LRE - Left-to-Right Embedding)
291+ pair<string_view, int >{" \xE2\x80\xAB " , 1 }, // U+202B (RLE - Right-to-Left Embedding)
292+ pair<string_view, int >{" \xE2\x80\xAC " , -1 } // PDF - Pop Directional Formatting
293+ };
294+
295+ size_t endPosition = _stream.position ();
296+ _stream.setPosition (_startPosition);
297+
298+ int directionOverrideDepth = 0 ;
299+
300+ for (size_t currentPos = _startPosition; currentPos < endPosition; ++currentPos)
301+ {
302+ _stream.setPosition (currentPos);
303+
304+ for (auto const & [sequence, depthChange]: directionalSequences)
305+ if (_stream.prefixMatch (sequence))
306+ directionOverrideDepth += depthChange;
307+
308+ if (directionOverrideDepth < 0 )
309+ return ScannerError::DirectionalOverrideUnderflowInComment;
310+ }
311+
312+ _stream.setPosition (endPosition);
313+
314+ return directionOverrideDepth > 0 ? ScannerError::DirectionalOverrideMismatchInComment : ScannerError::NoError;
315+ }
316+ }
317+
274318Token Scanner::skipSingleLineComment ()
275319{
276320 // Line terminator is not part of the comment. If it is a
277321 // non-ascii line terminator, it will result in a parser error.
322+ size_t startPosition = m_source->position ();
278323 while (!isUnicodeLinebreak ())
279- if (!advance ()) break ;
324+ if (!advance ())
325+ break ;
326+
327+ ScannerError unicodeDirectionError = validateBiDiMarkup (*m_source, startPosition);
328+ if (unicodeDirectionError != ScannerError::NoError)
329+ return setError (unicodeDirectionError);
280330
281331 return Token::Whitespace;
282332}
@@ -349,16 +399,21 @@ size_t Scanner::scanSingleLineDocComment()
349399
350400Token Scanner::skipMultiLineComment ()
351401{
402+ size_t startPosition = m_source->position ();
352403 while (!isSourcePastEndOfInput ())
353404 {
354- char ch = m_char;
405+ char prevChar = m_char;
355406 advance ();
356407
357408 // If we have reached the end of the multi-line comment, we
358409 // consume the '/' and insert a whitespace. This way all
359410 // multi-line comments are treated as whitespace.
360- if (ch == ' *' && m_char == ' /' )
411+ if (prevChar == ' *' && m_char == ' /' )
361412 {
413+ ScannerError unicodeDirectionError = validateBiDiMarkup (*m_source, startPosition);
414+ if (unicodeDirectionError != ScannerError::NoError)
415+ return setError (unicodeDirectionError);
416+
362417 m_char = ' ' ;
363418 return Token::Whitespace;
364419 }
@@ -785,6 +840,7 @@ bool Scanner::isUnicodeLinebreak()
785840
786841Token Scanner::scanString (bool const _isUnicode)
787842{
843+ size_t startPosition = m_source->position ();
788844 char const quote = m_char;
789845 advance (); // consume quote
790846 LiteralScope literal (this , LITERAL_TYPE_STRING);
@@ -812,6 +868,14 @@ Token Scanner::scanString(bool const _isUnicode)
812868 }
813869 if (m_char != quote)
814870 return setError (ScannerError::IllegalStringEndQuote);
871+
872+ if (_isUnicode)
873+ {
874+ ScannerError unicodeDirectionError = validateBiDiMarkup (*m_source, startPosition);
875+ if (unicodeDirectionError != ScannerError::NoError)
876+ return setError (unicodeDirectionError);
877+ }
878+
815879 literal.complete ();
816880 advance (); // consume quote
817881 return _isUnicode ? Token::UnicodeStringLiteral : Token::StringLiteral;
0 commit comments