Skip to content

Commit db2814c

Browse files
Scanner: Generate error on unbalanced RLO/LRO/PDF override markers.
1 parent ae39c96 commit db2814c

File tree

3 files changed

+76
-10
lines changed

3 files changed

+76
-10
lines changed

liblangutil/CharStream.h

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -85,6 +85,19 @@ class CharStream
8585
/// @returns The character of the current location after update is returned.
8686
char setPosition(size_t _location);
8787

88+
/// Tests whether or not given octect sequence is present at the current reading position.
89+
/// @returns true if the sequence could be found, false otherwise.
90+
bool prefixMatch(std::string_view _sequence) const
91+
{
92+
if (m_position + _sequence.size() >= m_source.size())
93+
return false;
94+
95+
for (size_t i = 0; i < _sequence.size(); ++i)
96+
if (_sequence[i] != get(i))
97+
return false;
98+
return true;
99+
}
100+
88101
/// Sets the reading position back to zero.
void reset()
{
	m_position = 0;
}
89102

90103
/// @returns a const reference to the stored m_source string (no copy is made).
std::string const& source() const noexcept
{
	return m_source;
}

liblangutil/Scanner.cpp

Lines changed: 48 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -79,6 +79,7 @@ string to_string(ScannerError _errorCode)
7979
case ScannerError::IllegalExponent: return "Invalid exponent.";
8080
case ScannerError::IllegalNumberEnd: return "Identifier-start is not allowed at end of a number.";
8181
case ScannerError::OctalNotAllowed: return "Octal numbers not allowed.";
82+
case ScannerError::MismatchingDirectionalOverridesInComment: return "Mismatching directional override markers in comment.";
8283
default:
8384
solAssert(false, "Unhandled case in to_string(ScannerError)");
8485
return "";
@@ -273,10 +274,29 @@ bool Scanner::skipWhitespaceExceptUnicodeLinebreak()
273274

274275
/// Skips the remainder of a single-line comment, stopping in front of the line terminator.
///
/// While skipping, the UTF-8 encodings of U+202D (LRO) and U+202E (RLO) are counted as
/// opening a directional override and U+202C (PDF) as closing one.
/// @returns Token::Whitespace if the markers are balanced, otherwise the result of
/// setError(ScannerError::MismatchingDirectionalOverridesInComment).
Token Scanner::skipSingleLineComment()
{
	int rtlOverrideDepth = 0;
	// Set once a PDF is seen while no override is open. Checking only the net depth
	// is not enough: "PDF ... RLO" nets to zero although the markers are not balanced.
	bool popWithoutOverride = false;

	// Line terminator is not part of the comment. If it is a
	// non-ascii line terminator, it will result in a parser error.
	while (!isUnicodeLinebreak())
	{
		if (
			tryScanByteSequence("\xE2\x80\xAD") || // U+202D (LRO - Left-to-Right Override)
			tryScanByteSequence("\xE2\x80\xAE") // U+202E (RLO - Right-to-Left Override)
		)
		{
			++rtlOverrideDepth;
		}
		else if (tryScanByteSequence("\xE2\x80\xAC")) // U+202C (PDF - Pop Directional Formatting)
		{
			if (rtlOverrideDepth == 0)
				popWithoutOverride = true;
			else
				--rtlOverrideDepth;
		}
		else if (!advance())
			break;
	}

	// The error is reported only after the loop has finished, so the comment is
	// skipped to the same position whether or not the markers are balanced.
	if (popWithoutOverride || rtlOverrideDepth != 0)
		// Unbalanced RLO/LRO/PDF codepoint sequences in comment.
		return setError(ScannerError::MismatchingDirectionalOverridesInComment);

	return Token::Whitespace;
}
@@ -349,18 +369,36 @@ size_t Scanner::scanSingleLineDocComment()
349369

350370
Token Scanner::skipMultiLineComment()
351371
{
372+
int rtlOverrideDepth = 0;
352373
while (!isSourcePastEndOfInput())
353374
{
354-
char ch = m_char;
355-
advance();
356-
357-
// If we have reached the end of the multi-line comment, we
358-
// consume the '/' and insert a whitespace. This way all
359-
// multi-line comments are treated as whitespace.
360-
if (ch == '*' && m_char == '/')
375+
if (tryScanByteSequence("\xE2\x80\xAD") || // U+202D (LRO - Left-to-Right Override)
376+
tryScanByteSequence("\xE2\x80\xAE") // U+202E (RLO - Right-to-Left Override)
377+
)
361378
{
362-
m_char = ' ';
363-
return Token::Whitespace;
379+
rtlOverrideDepth++;
380+
}
381+
else if (tryScanByteSequence("\xE2\x80\xAC")) // U+202C (PDF - Pop Directional Formatting)
382+
{
383+
rtlOverrideDepth--;
384+
}
385+
else
386+
{
387+
char ch = m_char;
388+
advance();
389+
390+
// If we have reached the end of the multi-line comment, we
391+
// consume the '/' and insert a whitespace. This way all
392+
// multi-line comments are treated as whitespace.
393+
if (ch == '*' && m_char == '/')
394+
{
395+
if (rtlOverrideDepth != 0)
396+
// Unbalanced RLO/LRO/PDF codepoint sequences in comment.
397+
return setError(ScannerError::MismatchingDirectionalOverridesInComment);
398+
399+
m_char = ' ';
400+
return Token::Whitespace;
401+
}
364402
}
365403
}
366404
// Unterminated multi-line comment.

liblangutil/Scanner.h

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -89,6 +89,8 @@ enum class ScannerError
8989
IllegalExponent,
9090
IllegalNumberEnd,
9191

92+
MismatchingDirectionalOverridesInComment,
93+
9294
OctalNotAllowed,
9395
};
9496

@@ -248,6 +250,19 @@ class Scanner
248250
/// Scans a slash '/' and depending on the characters returns the appropriate token
249251
Token scanSlash();
250252

253+
/// Tries scanning given octect sequence and advances reading position respectively iff found.
254+
/// @returns true if it could be scanned, false otherwise.
255+
bool tryScanByteSequence(std::string_view _sequence)
256+
{
257+
if (!m_source->prefixMatch(_sequence))
258+
return false;
259+
260+
for (size_t i = 0; i < _sequence.size(); ++i)
261+
advance();
262+
263+
return true;
264+
}
265+
251266
/// Scans an escape-sequence which is part of a string and adds the
252267
/// decoded character to the current literal. Returns true if a pattern
253268
/// is scanned.

0 commit comments

Comments
 (0)