Skip to content

Commit b666df5

Browse files
Scanner: Generate error on inbalanced RLO/LRO/PDF override markers.
1 parent b62de4f commit b666df5

17 files changed

+276
-11
lines changed

Changelog.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,8 @@
33
Compiler Features:
44
* SMTChecker: Support named arguments in function calls.
55

6+
Language Features:
7+
* Scanner: Generates a parser error when comments do contain an unbalanced or underflowing set of unicode direction override markers (LRO, RLO, PDF)
68

79
### 0.7.5 (2020-11-18)
810

liblangutil/CharStream.h

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -85,6 +85,19 @@ class CharStream
8585
/// @returns The character of the current location after update is returned.
8686
char setPosition(size_t _location);
8787

88+
/// Tests whether or not given octet sequence is present at the current reading position.
89+
/// @returns true if the sequence could be found, false otherwise.
90+
bool prefixMatch(std::string_view _sequence) const
91+
{
92+
if (m_position + _sequence.size() >= m_source.size())
93+
return false;
94+
95+
for (size_t i = 0; i < _sequence.size(); ++i)
96+
if (_sequence[i] != get(i))
97+
return false;
98+
return true;
99+
}
100+
88101
void reset() { m_position = 0; }
89102

90103
std::string const& source() const noexcept { return m_source; }

liblangutil/Scanner.cpp

Lines changed: 86 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -79,6 +79,8 @@ string to_string(ScannerError _errorCode)
7979
case ScannerError::IllegalExponent: return "Invalid exponent.";
8080
case ScannerError::IllegalNumberEnd: return "Identifier-start is not allowed at end of a number.";
8181
case ScannerError::OctalNotAllowed: return "Octal numbers not allowed.";
82+
case ScannerError::DirectionalOverrideUnderflowInComment: return "Unicode direction override underflow in comment.";
83+
case ScannerError::DirectionalOverrideMismatchInComment: return "Mismatching directional override markers in comment.";
8284
default:
8385
solAssert(false, "Unhandled case in to_string(ScannerError)");
8486
return "";
@@ -271,12 +273,74 @@ bool Scanner::skipWhitespaceExceptUnicodeLinebreak()
271273
return sourcePos() != startPosition;
272274
}
273275

276+
// A little helper class for counting depth of RLO/LRO/PDF uses.
277+
struct Scanner::UnicodeDirectionOverrideProcessor
278+
{
279+
Scanner& scanner;
280+
int rtlOverrideDepth = 0;
281+
282+
/// Tries to scan for an RLO/LRO/PDF and keeps track of script writing direction override depth.
283+
///
284+
/// @returns a 2-tuple indicating whether or not successfully some script writing direction
285+
/// override has been consumed from the input and an error code in case
286+
/// the input's lexical parser state is invalid and this error should be reported
287+
/// to the user.
288+
pair<bool, ScannerError> operator()()
289+
{
290+
// U+202D (LRO - Left-to-Right Override)
291+
// U+202E (RLO - Right-to-Left Override)
292+
if (
293+
scanner.tryScanByteSequence("\xE2\x80\xAD") ||
294+
scanner.tryScanByteSequence("\xE2\x80\xAE")
295+
)
296+
{
297+
rtlOverrideDepth++;
298+
return pair{true, ScannerError::NoError};
299+
}
300+
else if (scanner.tryScanByteSequence("\xE2\x80\xAC")) // U+202C (PDF - Pop Directional Formatting)
301+
{
302+
rtlOverrideDepth--;
303+
if (rtlOverrideDepth < 0)
304+
return pair{true, ScannerError::DirectionalOverrideUnderflowInComment};
305+
else
306+
return pair{true, ScannerError::NoError};
307+
}
308+
else
309+
return pair{false, ScannerError::NoError};
310+
}
311+
312+
ScannerError error() const noexcept
313+
{
314+
if (rtlOverrideDepth < 0)
315+
return ScannerError::DirectionalOverrideUnderflowInComment;
316+
else if (rtlOverrideDepth > 0)
317+
return ScannerError::DirectionalOverrideMismatchInComment;
318+
else
319+
return ScannerError::NoError;
320+
}
321+
};
322+
274323
Token Scanner::skipSingleLineComment()
275324
{
325+
UnicodeDirectionOverrideProcessor unicodeDirectionalOverride{*this};
326+
276327
// Line terminator is not part of the comment. If it is a
277328
// non-ascii line terminator, it will result in a parser error.
278329
while (!isUnicodeLinebreak())
279-
if (!advance()) break;
330+
{
331+
auto const [processed, errorCode] = unicodeDirectionalOverride();
332+
if (processed && errorCode != ScannerError::NoError)
333+
return setError(errorCode);
334+
else if (!processed)
335+
{
336+
if (!advance())
337+
break;
338+
}
339+
}
340+
341+
if (unicodeDirectionalOverride.error() != ScannerError::NoError)
342+
// Unbalanced RLO/LRO/PDF codepoint sequences in comment.
343+
return setError(unicodeDirectionalOverride.error());
280344

281345
return Token::Whitespace;
282346
}
@@ -349,18 +413,30 @@ size_t Scanner::scanSingleLineDocComment()
349413

350414
Token Scanner::skipMultiLineComment()
351415
{
416+
UnicodeDirectionOverrideProcessor unicodeDirectionalOverride{*this};
417+
352418
while (!isSourcePastEndOfInput())
353419
{
354-
char ch = m_char;
355-
advance();
356-
357-
// If we have reached the end of the multi-line comment, we
358-
// consume the '/' and insert a whitespace. This way all
359-
// multi-line comments are treated as whitespace.
360-
if (ch == '*' && m_char == '/')
420+
auto const [processed, errorCode] = unicodeDirectionalOverride();
421+
if (processed && errorCode != ScannerError::NoError)
422+
return setError(errorCode);
423+
else if (!processed)
361424
{
362-
m_char = ' ';
363-
return Token::Whitespace;
425+
char ch = m_char;
426+
advance();
427+
428+
// If we have reached the end of the multi-line comment, we
429+
// consume the '/' and insert a whitespace. This way all
430+
// multi-line comments are treated as whitespace.
431+
if (ch == '*' && m_char == '/')
432+
{
433+
if (unicodeDirectionalOverride.error() != ScannerError::NoError)
434+
// Unbalanced RLO/LRO/PDF codepoint sequences in comment.
435+
return setError(unicodeDirectionalOverride.error());
436+
437+
m_char = ' ';
438+
return Token::Whitespace;
439+
}
364440
}
365441
}
366442
// Unterminated multi-line comment.

liblangutil/Scanner.h

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -89,6 +89,9 @@ enum class ScannerError
8989
IllegalExponent,
9090
IllegalNumberEnd,
9191

92+
DirectionalOverrideUnderflowInComment,
93+
DirectionalOverrideMismatchInComment,
94+
9295
OctalNotAllowed,
9396
};
9497

@@ -183,6 +186,8 @@ class Scanner
183186
///@}
184187

185188
private:
189+
struct UnicodeDirectionOverrideProcessor;
190+
186191
inline Token setError(ScannerError _error) noexcept
187192
{
188193
m_tokens[NextNext].error = _error;
@@ -248,6 +253,19 @@ class Scanner
248253
/// Scans a slash '/' and depending on the characters returns the appropriate token
249254
Token scanSlash();
250255

256+
/// Tries scanning given octet sequence and advances reading position respectively iff found.
257+
/// @returns true if it could be scanned, false otherwise.
258+
bool tryScanByteSequence(std::string_view _sequence)
259+
{
260+
if (!m_source->prefixMatch(_sequence))
261+
return false;
262+
263+
for (size_t i = 0; i < _sequence.size(); ++i)
264+
advance();
265+
266+
return true;
267+
}
268+
251269
/// Scans an escape-sequence which is part of a string and adds the
252270
/// decoded character to the current literal. Returns true if a pattern
253271
/// is scanned.

scripts/test_antlr_grammar.sh

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -116,7 +116,9 @@ done < <(
116116
grep -riL -E \
117117
"^\/\/ (Syntax|Type|Declaration)Error|^\/\/ ParserError (2837|3716|3997|5333|6275|6281|6933|7319)|^==== Source:" \
118118
"${ROOT_DIR}/test/libsolidity/syntaxTests" \
119-
"${ROOT_DIR}/test/libsolidity/semanticTests" \
119+
"${ROOT_DIR}/test/libsolidity/semanticTests" |
120+
grep -v -E 'comments/.*_unicode.*.sol'
121+
# Skipping the unicode tests as I couldn't adapt the lexical grammar to recursively counting RLO/LRO/PDF's.
120122
)
121123

122124
YUL_FILES=()
Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
// SPDX-License-Identifier: UNLICENSED
2+
pragma solidity ^0.7.0;
3+
4+
contract C {
5+
function f() public pure
6+
{
7+
// pop 1
8+
/*underflow ‬*/
9+
}
10+
}
11+
// ----
12+
// ParserError 8936: (137-152): Unicode direction override underflow in comment.
Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
// SPDX-License-Identifier: UNLICENSED
2+
pragma solidity ^0.7.0;
3+
4+
contract C {
5+
function f() public pure
6+
{
7+
// pop 2
8+
/*underflow ‬‬*/
9+
}
10+
}
11+
// ----
12+
// ParserError 8936: (137-152): Unicode direction override underflow in comment.
Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
// SPDX-License-Identifier: UNLICENSED
2+
pragma solidity ^0.7.0;
3+
4+
contract C {
5+
function f() public pure
6+
{
7+
// push 1
8+
/*overflow ‮*/
9+
}
10+
}
11+
// ----
12+
// ParserError 8936: (138-153): Mismatching directional override markers in comment.
Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
// SPDX-License-Identifier: UNLICENSED
2+
pragma solidity ^0.7.0;
3+
4+
contract C {
5+
function f() public pure
6+
{
7+
// push 2
8+
/*overflow ‮‮*/
9+
}
10+
}
11+
// ----
12+
// ParserError 8936: (138-156): Mismatching directional override markers in comment.
Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
// SPDX-License-Identifier: UNLICENSED
2+
pragma solidity ^0.7.0;
3+
4+
contract C {
5+
function f() public pure
6+
{
7+
// first push 1, then pop 1
8+
/*ok ‮‬*/
9+
10+
// first push 2, then pop 2
11+
/*ok ‮‮‬‬*/
12+
13+
// first push 3, then pop 3
14+
/*ok ‮‮‮‬‬‬*/
15+
}
16+
}
17+
// ----
Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
// SPDX-License-Identifier: UNLICENSED
2+
pragma solidity ^0.7.0;
3+
4+
contract C {
5+
function f() public pure
6+
{
7+
// first pop, then push
8+
/*overflow ‬‮*/
9+
}
10+
}
11+
// ----
12+
// ParserError 8936: (152-166): Unicode direction override underflow in comment.
Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
// SPDX-License-Identifier: UNLICENSED
2+
pragma solidity ^0.7.0;
3+
4+
contract C {
5+
function f() public pure
6+
{
7+
// pop 1
8+
// underflow ‬
9+
}
10+
}
11+
// ----
12+
// ParserError 8936: (137-153): Unicode direction override underflow in comment.
Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
// SPDX-License-Identifier: UNLICENSED
2+
pragma solidity ^0.7.0;
3+
4+
contract C {
5+
function f() public pure
6+
{
7+
// pop 2
8+
// underflow ‬‬
9+
}
10+
}
11+
// ----
12+
// ParserError 8936: (137-153): Unicode direction override underflow in comment.
Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
// SPDX-License-Identifier: UNLICENSED
2+
pragma solidity ^0.7.0;
3+
4+
contract C {
5+
function f() public pure
6+
{
7+
// push 1
8+
// overflow ‮
9+
}
10+
}
11+
// ----
12+
// ParserError 8936: (138-153): Mismatching directional override markers in comment.
Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
// SPDX-License-Identifier: UNLICENSED
2+
pragma solidity ^0.7.0;
3+
4+
contract C {
5+
function f() public pure
6+
{
7+
// push 2
8+
// overflow ‮‮
9+
}
10+
}
11+
// ----
12+
// ParserError 8936: (138-156): Mismatching directional override markers in comment.
Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
// SPDX-License-Identifier: UNLICENSED
2+
pragma solidity ^0.7.0;
3+
4+
contract C {
5+
function f() public pure
6+
{
7+
// first push 1, then pop 1
8+
// ok ‮‬
9+
10+
// first push 2, then pop 2
11+
// ok ‮‮‬‬
12+
13+
// first push 3, then pop 3
14+
// ok ‮‮‮‬‬‬
15+
}
16+
}
17+
// ----
Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
// SPDX-License-Identifier: UNLICENSED
2+
pragma solidity ^0.7.0;
3+
4+
contract C {
5+
function f() public pure
6+
{
7+
// first pop, then push
8+
// underflow ‬‮
9+
}
10+
}
11+
// ----
12+
// ParserError 8936: (152-168): Unicode direction override underflow in comment.

0 commit comments

Comments
 (0)