Skip to content

Commit a743337

Browse files
christianparpartmijovic
authored andcommitted
Scanner: Generate error on inbalanced RLO/LRO/PDF override markers.
1 parent 1521e9e commit a743337

31 files changed

+331
-7
lines changed

Changelog.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@ Language Features:
66
* Code generator: Support conversion from calldata slices to memory and storage arrays.
77
* The fallback function can now also have a single ``calldata`` argument (equaling ``msg.data``) and return ``bytes memory`` (which will not be ABI-encoded but returned as-is).
88
* Wasm backend: Add ``i32.select`` and ``i64.select`` instructions.
9+
* Scanner: Generate a parser error when comments or unicode strings do contain an unbalanced or underflowing set of unicode direction override markers (LRO, RLO, PDF).
910

1011
Compiler Features:
1112
* Build System: Optionally support dynamic loading of Z3 and use that mechanism for Linux release builds.

liblangutil/CharStream.h

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -98,6 +98,20 @@ class CharStream
9898
std::tuple<int, int> translatePositionToLineColumn(int _position) const;
9999
///@}
100100

101+
/// Tests whether or not given octet sequence is present at the current position in stream.
102+
/// @returns true if the sequence could be found, false otherwise.
103+
bool prefixMatch(std::string_view _sequence)
104+
{
105+
if (isPastEndOfInput(_sequence.size()))
106+
return false;
107+
108+
for (size_t i = 0; i < _sequence.size(); ++i)
109+
if (_sequence[i] != get(i))
110+
return false;
111+
112+
return true;
113+
}
114+
101115
private:
102116
std::string m_source;
103117
std::string m_name;

liblangutil/Scanner.cpp

Lines changed: 69 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -54,9 +54,10 @@
5454
#include <liblangutil/Exceptions.h>
5555
#include <liblangutil/Scanner.h>
5656

57-
#include <algorithm>
57+
#include <boost/algorithm/string/classification.hpp>
58+
5859
#include <optional>
59-
#include <ostream>
60+
#include <string_view>
6061
#include <tuple>
6162

6263
using namespace std;
@@ -79,6 +80,8 @@ string to_string(ScannerError _errorCode)
7980
case ScannerError::IllegalExponent: return "Invalid exponent.";
8081
case ScannerError::IllegalNumberEnd: return "Identifier-start is not allowed at end of a number.";
8182
case ScannerError::OctalNotAllowed: return "Octal numbers not allowed.";
83+
case ScannerError::DirectionalOverrideUnderflowInComment: return "Unicode direction override underflow in comment or string literal.";
84+
case ScannerError::DirectionalOverrideMismatchInComment: return "Mismatching directional override markers in comment or string literal.";
8285
default:
8386
solAssert(false, "Unhandled case in to_string(ScannerError)");
8487
return "";
@@ -271,12 +274,59 @@ bool Scanner::skipWhitespaceExceptUnicodeLinebreak()
271274
return sourcePos() != startPosition;
272275
}
273276

277+
278+
namespace
279+
{
280+
/// Tries to scan for an RLO/LRO/RLE/LRE/PDF and keeps track of script writing direction override depth.
281+
///
282+
/// @returns ScannerError::NoError in case of successful parsing and directional encodings are paired
283+
/// and error code in case the input's lexical parser state is invalid and this error should be reported
284+
/// to the user.
285+
static ScannerError validateBiDiMarkup(CharStream& _stream, size_t _startPosition)
286+
{
287+
static array<pair<string_view, int>, 5> constexpr directionalSequences{
288+
pair<string_view, int>{"\xE2\x80\xAD", 1}, // U+202D (LRO - Left-to-Right Override)
289+
pair<string_view, int>{"\xE2\x80\xAE", 1}, // U+202E (RLO - Right-to-Left Override)
290+
pair<string_view, int>{"\xE2\x80\xAA", 1}, // U+202A (LRE - Left-to-Right Embedding)
291+
pair<string_view, int>{"\xE2\x80\xAB", 1}, // U+202B (RLE - Right-to-Left Embedding)
292+
pair<string_view, int>{"\xE2\x80\xAC", -1} // PDF - Pop Directional Formatting
293+
};
294+
295+
size_t endPosition = _stream.position();
296+
_stream.setPosition(_startPosition);
297+
298+
int directionOverrideDepth = 0;
299+
300+
for (size_t currentPos = _startPosition; currentPos < endPosition; ++currentPos)
301+
{
302+
_stream.setPosition(currentPos);
303+
304+
for (auto const& [sequence, depthChange]: directionalSequences)
305+
if (_stream.prefixMatch(sequence))
306+
directionOverrideDepth += depthChange;
307+
308+
if (directionOverrideDepth < 0)
309+
return ScannerError::DirectionalOverrideUnderflowInComment;
310+
}
311+
312+
_stream.setPosition(endPosition);
313+
314+
return directionOverrideDepth > 0 ? ScannerError::DirectionalOverrideMismatchInComment : ScannerError::NoError;
315+
}
316+
}
317+
274318
Token Scanner::skipSingleLineComment()
275319
{
276320
// Line terminator is not part of the comment. If it is a
277321
// non-ascii line terminator, it will result in a parser error.
322+
size_t startPosition = m_source->position();
278323
while (!isUnicodeLinebreak())
279-
if (!advance()) break;
324+
if (!advance())
325+
break;
326+
327+
ScannerError unicodeDirectionError = validateBiDiMarkup(*m_source, startPosition);
328+
if (unicodeDirectionError != ScannerError::NoError)
329+
return setError(unicodeDirectionError);
280330

281331
return Token::Whitespace;
282332
}
@@ -349,16 +399,21 @@ size_t Scanner::scanSingleLineDocComment()
349399

350400
Token Scanner::skipMultiLineComment()
351401
{
402+
size_t startPosition = m_source->position();
352403
while (!isSourcePastEndOfInput())
353404
{
354-
char ch = m_char;
405+
char prevChar = m_char;
355406
advance();
356407

357408
// If we have reached the end of the multi-line comment, we
358409
// consume the '/' and insert a whitespace. This way all
359410
// multi-line comments are treated as whitespace.
360-
if (ch == '*' && m_char == '/')
411+
if (prevChar == '*' && m_char == '/')
361412
{
413+
ScannerError unicodeDirectionError = validateBiDiMarkup(*m_source, startPosition);
414+
if (unicodeDirectionError != ScannerError::NoError)
415+
return setError(unicodeDirectionError);
416+
362417
m_char = ' ';
363418
return Token::Whitespace;
364419
}
@@ -785,6 +840,7 @@ bool Scanner::isUnicodeLinebreak()
785840

786841
Token Scanner::scanString(bool const _isUnicode)
787842
{
843+
size_t startPosition = m_source->position();
788844
char const quote = m_char;
789845
advance(); // consume quote
790846
LiteralScope literal(this, LITERAL_TYPE_STRING);
@@ -812,6 +868,14 @@ Token Scanner::scanString(bool const _isUnicode)
812868
}
813869
if (m_char != quote)
814870
return setError(ScannerError::IllegalStringEndQuote);
871+
872+
if (_isUnicode)
873+
{
874+
ScannerError unicodeDirectionError = validateBiDiMarkup(*m_source, startPosition);
875+
if (unicodeDirectionError != ScannerError::NoError)
876+
return setError(unicodeDirectionError);
877+
}
878+
815879
literal.complete();
816880
advance(); // consume quote
817881
return _isUnicode ? Token::UnicodeStringLiteral : Token::StringLiteral;

liblangutil/Scanner.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -89,6 +89,9 @@ enum class ScannerError
8989
IllegalExponent,
9090
IllegalNumberEnd,
9191

92+
DirectionalOverrideUnderflowInComment,
93+
DirectionalOverrideMismatchInComment,
94+
9295
OctalNotAllowed,
9396
};
9497

@@ -183,6 +186,7 @@ class Scanner
183186
///@}
184187

185188
private:
189+
186190
inline Token setError(ScannerError _error) noexcept
187191
{
188192
m_tokens[NextNext].error = _error;

scripts/check_style.sh

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,10 @@
66
REPO_ROOT="$(dirname "$0")"/..
77
cd $REPO_ROOT
88

9-
WHITESPACE=$(git grep -n -I -E "^.*[[:space:]]+$" | grep -v "test/libsolidity/ASTJSON\|test/libsolidity/ASTRecoveryTests\|test/compilationTests/zeppelin/LICENSE")
9+
WHITESPACE=$(git grep -n -I -E "^.*[[:space:]]+$" |
10+
grep -v "test/libsolidity/ASTJSON\|test/libsolidity/ASTRecoveryTests\|test/compilationTests/zeppelin/LICENSE" |
11+
grep -v -E "test/libsolidity/syntaxTests/comments/unicode_direction_override_1.sol"
12+
)
1013

1114
if [[ "$WHITESPACE" != "" ]]
1215
then

scripts/test_antlr_grammar.sh

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -116,7 +116,10 @@ done < <(
116116
grep -riL -E \
117117
"^\/\/ (Syntax|Type|Declaration)Error|^\/\/ ParserError (2837|3716|3997|5333|6275|6281|6933|7319)|^==== Source:" \
118118
"${ROOT_DIR}/test/libsolidity/syntaxTests" \
119-
"${ROOT_DIR}/test/libsolidity/semanticTests" \
119+
"${ROOT_DIR}/test/libsolidity/semanticTests" |
120+
grep -v -E 'comments/.*_direction_override.*.sol' |
121+
grep -v -E 'literals/.*_direction_override.*.sol'
122+
# Skipping the unicode tests as I couldn't adapt the lexical grammar to recursively counting RLO/LRO/PDF's.
120123
)
121124

122125
YUL_FILES=()
Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
contract C {
2+
function f() public pure
3+
{
4+
// PDF
5+
/*underflow ‬*/
6+
}
7+
}
8+
// ----
9+
// ParserError 8936: (71-83): Unicode direction override underflow in comment or string literal.
Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
contract C {
2+
function f() public pure
3+
{
4+
// PDF PDF
5+
/*underflow ‬‬*/
6+
}
7+
}
8+
// ----
9+
// ParserError 8936: (75-87): Unicode direction override underflow in comment or string literal.
Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
contract C {
2+
function f() public pure
3+
{
4+
// RLO
5+
/*overflow ‮*/
6+
}
7+
}
8+
// ----
9+
// ParserError 8936: (71-86): Mismatching directional override markers in comment or string literal.
Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
contract C {
2+
function f() public pure
3+
{
4+
// RLO RLO
5+
/*overflow ‮‮*/
6+
}
7+
}
8+
// ----
9+
// ParserError 8936: (75-93): Mismatching directional override markers in comment or string literal.
Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,14 @@
1+
contract C {
2+
function f() public pure
3+
{
4+
// RLO PDF
5+
/*ok ‮‬*/
6+
7+
// RLO RLO PDF PDF
8+
/*ok ‮‮‬‬*/
9+
10+
// RLO RLO RLO PDF PDF PDF
11+
/*ok ‮‮‮‬‬‬*/
12+
}
13+
}
14+
// ----
Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
contract C {
2+
function f() public pure
3+
{
4+
// PDF RLO
5+
/*overflow ‬‮*/
6+
}
7+
}
8+
// ----
9+
// ParserError 8936: (75-86): Unicode direction override underflow in comment or string literal.
Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
contract C {
2+
function f() public pure {
3+
/* LRO‭ LRE‪ RLE ‫ PDF‬ RLO‮ PDF ‬ PDF‬
4+
}
5+
}
6+
// ----
7+
// ParserError 8936: (52-115): Expected multi-line comment-terminator.
Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
contract C {
2+
function f() public pure
3+
{
4+
// PDF
5+
// underflow ‬
6+
}
7+
}
8+
// ----
9+
// ParserError 8936: (71-84): Unicode direction override underflow in comment or string literal.
Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
contract C {
2+
function f() public pure
3+
{
4+
// PDF PDF
5+
// underflow ‬‬
6+
}
7+
}
8+
// ----
9+
// ParserError 8936: (75-88): Unicode direction override underflow in comment or string literal.
Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
contract C {
2+
function f() public pure
3+
{
4+
// RLO
5+
// overflow ‮
6+
}
7+
}
8+
// ----
9+
// ParserError 8936: (71-86): Mismatching directional override markers in comment or string literal.
Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
contract C {
2+
function f() public pure
3+
{
4+
// RLO RLO
5+
// overflow ‮‮
6+
}
7+
}
8+
// ----
9+
// ParserError 8936: (75-93): Mismatching directional override markers in comment or string literal.
Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,14 @@
1+
contract C {
2+
function f() public pure
3+
{
4+
// RLO PDF
5+
// ok ‮‬
6+
7+
// RLO RLO PDF PDF
8+
// ok ‮‮‬‬
9+
10+
// RLO RLO RLO PDF PDF PDF
11+
// ok ‮‮‮‬‬‬
12+
}
13+
}
14+
// ----
Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
contract C {
2+
function f() public pure
3+
{
4+
// PDF RLO
5+
// underflow ‬‮
6+
}
7+
}
8+
// ----
9+
// ParserError 8936: (75-88): Unicode direction override underflow in comment or string literal.
Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
contract C {
2+
function f(bool b) public pure
3+
{
4+
if ‬(b) { return; }
5+
}
6+
}
7+
// ----
8+
// ParserError 2314: (65-66): Expected '(' but got 'ILLEGAL'
Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
contract C {
2+
function f(bool b) public pure
3+
{
4+
uint a = 10; ‬
5+
}
6+
}
7+
// ----
8+
// ParserError 8936: (75-76): Invalid token.
Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
contract TimelockUpgrade {
2+
function confirmUpgrade() external {
3+
uint256 m;
4+
uint256 d;
5+
(/*year*/,/*month‮*/,d/*yad*/,m/*‬‬hour*/,/*minute*/,/*second*/) = BokkyDateTime.timestampToDateTime(block.timestamp);
6+
}
7+
}
8+
9+
// ----
10+
// ParserError 8936: (128-139): Mismatching directional override markers in comment or string literal.
Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
contract C {
2+
function f() public pure
3+
{
4+
// PDF
5+
bytes memory s = unicode"underflow ‬";
6+
}
7+
}
8+
// ----
9+
// ParserError 8936: (88-106): Unicode direction override underflow in comment or string literal.
Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
contract C {
2+
function f() public pure
3+
{
4+
// PDF PDF
5+
bytes memory m = unicode"underflow ‬‬";
6+
}
7+
}
8+
// ----
9+
// ParserError 8936: (92-110): Unicode direction override underflow in comment or string literal.
Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
contract C {
2+
function f() public pure
3+
{
4+
// RLO
5+
bytes memory m = unicode"overflow ‮";
6+
}
7+
}
8+
// ----
9+
// ParserError 8936: (88-108): Mismatching directional override markers in comment or string literal.

0 commit comments

Comments
 (0)