Skip to content

Commit 39121ad

Browse files
authored
Reject bidirectional formatting characters (#11391)
1 parent a618213 commit 39121ad

File tree

4 files changed

+95
-41
lines changed

4 files changed

+95
-41
lines changed

lib/elixir/src/elixir_interpolation.erl

Lines changed: 17 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -3,9 +3,7 @@
33
-export([extract/6, unescape_string/1, unescape_string/2,
44
unescape_tokens/1, unescape_map/1]).
55
-include("elixir.hrl").
6-
-define(is_hex(S), ((S >= $0 andalso S =< $9) orelse
7-
(S >= $A andalso S =< $F) orelse
8-
(S >= $a andalso S =< $f))).
6+
-include("elixir_tokenizer.hrl").
97

108
%% Extract string interpolations
119

@@ -60,8 +58,8 @@ extract([$#, ${ | Rest], Buffer, Output, Line, Column, Scope, true, Last) ->
6058
{error, {string, Line, Column, "missing interpolation terminator: \"}\"", []}}
6159
end;
6260

63-
extract([$\\, Char | Rest], Buffer, Output, Line, Column, Scope, Interpol, Last) ->
64-
extract(Rest, [Char, $\\ | Buffer], Output, Line, Column + 2, Scope, Interpol, Last);
61+
extract([$\\ | Rest], Buffer, Output, Line, Column, Scope, Interpol, Last) ->
62+
extract_char(Rest, [$\\ | Buffer], Output, Line, Column + 1, Scope, Interpol, Last);
6563

6664
%% Catch all clause
6765

@@ -70,8 +68,21 @@ extract([Char1, Char2 | Rest], Buffer, Output, Line, Column, Scope, Interpol, La
7068
extract([Char2 | Rest], [Char1 | Buffer], Output, Line, Column + 1, Scope, Interpol, Last);
7169

7270
extract(Rest, Buffer, Output, Line, Column, Scope, Interpol, Last) ->
71+
extract_char(Rest, Buffer, Output, Line, Column, Scope, Interpol, Last).
72+
73+
%% Consumes the next grapheme cluster of Rest (as returned by unicode_util:gc/1)
%% and pushes it onto Buffer, rejecting raw bidirectional formatting characters.
%%
%% Returns {error, {Line, Column, {Pre, Pos}, Token}} when the cluster is one of
%% the code points matched by ?bidi. Token is the offending character rendered
%% as \uXXXX (uppercase hex, zero-padded to four digits). Otherwise extraction
%% continues through extract/8 with Column advanced by one.
extract_char(Rest, Buffer, Output, Line, Column, Scope, Interpol, Last) ->
  case unicode_util:gc(Rest) of
    [Char | _] when ?bidi(Char) ->
      Token = io_lib:format("\\u~4.16.0B", [Char]),
      Pre = "invalid bidirectional formatting character in string: ",
      Pos = io_lib:format(". If you want to use such character, use it in its escaped ~ts form instead", [Token]),
      {error, {Line, Column, {Pre, Pos}, Token}};

    [Char | NewRest] ->
      extract(NewRest, [Char | Buffer], Output, Line, Column + 1, Scope, Interpol, Last);

    [] ->
      %% unicode_util:gc([]) returns [], which happens when the input ends
      %% right after a backslash. Matching [Char | NewRest] directly would
      %% crash with badmatch, so let extract/8 report the unterminated input.
      %% NOTE(review): assumes extract/8 has an end-of-input ([]) clause ahead
      %% of its catch-all clause; that clause is not visible here - confirm.
      extract([], Buffer, Output, Line, Column, Scope, Interpol, Last)
  end.
7586

7687
%% Handle newlines. Heredocs require special attention
7788

lib/elixir/src/elixir_tokenizer.erl

Lines changed: 29 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -1,28 +1,8 @@
11
-module(elixir_tokenizer).
22
-include("elixir.hrl").
3+
-include("elixir_tokenizer.hrl").
34
-export([tokenize/1, tokenize/3, tokenize/4, invalid_do_error/1]).
45

5-
%% Numbers
6-
-define(is_hex(S), (?is_digit(S) orelse (S >= $A andalso S =< $F) orelse (S >= $a andalso S =< $f))).
7-
-define(is_bin(S), (S >= $0 andalso S =< $1)).
8-
-define(is_octal(S), (S >= $0 andalso S =< $7)).
9-
10-
%% Digits and letters
11-
-define(is_digit(S), (S >= $0 andalso S =< $9)).
12-
-define(is_upcase(S), (S >= $A andalso S =< $Z)).
13-
-define(is_downcase(S), (S >= $a andalso S =< $z)).
14-
15-
%% Others
16-
-define(is_quote(S), (S =:= $" orelse S =:= $')).
17-
-define(is_sigil(S), (S =:= $/ orelse S =:= $< orelse S =:= $" orelse S =:= $' orelse
18-
S =:= $[ orelse S =:= $( orelse S =:= ${ orelse S =:= $|)).
19-
20-
%% Spaces
21-
-define(is_horizontal_space(S), (S =:= $\s orelse S =:= $\t)).
22-
-define(is_vertical_space(S), (S =:= $\r orelse S =:= $\n)).
23-
-define(is_space(S), (?is_horizontal_space(S) orelse ?is_vertical_space(S))).
24-
25-
%% Operators
266
-define(at_op(T),
277
T =:= $@).
288

@@ -198,9 +178,13 @@ tokenize([$0, $o, H | T], Line, Column, Scope, Tokens) when ?is_octal(H) ->
198178
% Comments
199179

200180
tokenize([$# | String], Line, Column, Scope, Tokens) ->
201-
{Rest, Comment} = tokenize_comment(String, [$#]),
202-
preserve_comments(Line, Column, Tokens, Comment, Rest, Scope),
203-
tokenize(Rest, Line, Column, Scope, reset_eol(Tokens));
181+
case tokenize_comment(String, [$#]) of
182+
{error, Char} ->
183+
error_comment(Char, [$# | String], Line, Column, Scope, Tokens);
184+
{Rest, Comment} ->
185+
preserve_comments(Line, Column, Tokens, Comment, Rest, Scope),
186+
tokenize(Rest, Line, Column, Scope, reset_eol(Tokens))
187+
end;
204188

205189
% Sigils
206190

@@ -645,9 +629,7 @@ tokenize([$% | T], Line, Column, Scope, Tokens) ->
645629
tokenize(T, Line, Column + 1, Scope, [{'%', {Line, Column, nil}} | Tokens]);
646630

647631
tokenize([$. | T], Line, Column, Scope, Tokens) ->
648-
DotInfo = {Line, Column, nil},
649-
{Rest, EndLine, EndColumn} = strip_dot_space(T, Line, Column + 1, [{'.', DotInfo}| Tokens], Scope),
650-
handle_dot([$. | Rest], EndLine, EndColumn, DotInfo, Scope, Tokens);
632+
tokenize_dot(T, Line, Column + 1, {Line, Column, nil}, Scope, Tokens);
651633

652634
% Identifiers
653635

@@ -719,18 +701,23 @@ strip_horizontal_space([H | T], Counter) when ?is_horizontal_space(H) ->
719701
strip_horizontal_space(T, Counter) ->
720702
{T, Counter}.
721703

722-
strip_dot_space(T, Line, Column, Tokens, Scope) ->
704+
%% Handles the input that follows a '.' token. DotInfo is the dot's metadata
%% and is threaded through unchanged until handle_dot/6 is finally called.
%%
%% After stripping horizontal space, one of four things happens:
%%   * a comment follows: it is read with tokenize_comment/2. An {error, Char}
%%     result is turned into an error via error_comment/6; otherwise the
%%     comment is passed to preserve_comments/6 and scanning continues on the
%%     same Line with Column reset to 1 (Rest still starts at the line break).
%%   * "\r\n" or "\n" follows: scanning continues on Line + 1, Column 1.
%%   * anything else: handle_dot/6 receives [$. | Rest] with Column advanced by
%%     the amount of horizontal space that was stripped.
%%
%% NOTE(review): in the comment branch the stripped-space length is discarded,
%% so the Column given to error_comment/6 and preserve_comments/6 is the one
%% passed in, not the column where the '#' actually sits.
tokenize_dot(T, Line, Column, DotInfo, Scope, Tokens) ->
723705
case strip_horizontal_space(T, 0) of
724-
{"#" ++ R, _} ->
725-
{Rest, Comment} = tokenize_comment(R, [$#]),
726-
preserve_comments(Line, Column, Tokens, Comment, Rest, Scope),
727-
strip_dot_space(Rest, Line, 1, reset_eol(Tokens), Scope);
706+
{[$# | R], _} ->
707+
case tokenize_comment(R, [$#]) of
708+
{error, Char} ->
709+
error_comment(Char, [$# | R], Line, Column, Scope, Tokens);
710+
711+
{Rest, Comment} ->
712+
preserve_comments(Line, Column, Tokens, Comment, Rest, Scope),
713+
tokenize_dot(Rest, Line, 1, DotInfo, Scope, Tokens)
714+
end;
728715
{"\r\n" ++ Rest, _} ->
729-
strip_dot_space(Rest, Line + 1, 1, eol(Line, Column, Tokens), Scope);
716+
tokenize_dot(Rest, Line + 1, 1, DotInfo, Scope, Tokens);
730717
{"\n" ++ Rest, _} ->
731-
strip_dot_space(Rest, Line + 1, 1, eol(Line, Column, Tokens), Scope);
718+
tokenize_dot(Rest, Line + 1, 1, DotInfo, Scope, Tokens);
732719
{Rest, Length} ->
733-
{Rest, Line, Column + Length}
720+
handle_dot([$. | Rest], Line, Column + Length, DotInfo, Scope, Tokens)
734721
end.
735722

736723
handle_char(0) -> {"\\0", "null byte"};
@@ -1171,11 +1158,18 @@ tokenize_comment("\r\n" ++ _ = Rest, Acc) ->
11711158
{Rest, lists:reverse(Acc)};
11721159
tokenize_comment("\n" ++ _ = Rest, Acc) ->
11731160
{Rest, lists:reverse(Acc)};
1161+
tokenize_comment([H | _Rest], _) when ?bidi(H) ->
1162+
{error, H};
11741163
tokenize_comment([H | Rest], Acc) ->
11751164
tokenize_comment(Rest, [H | Acc]);
11761165
tokenize_comment([], Acc) ->
11771166
{[], lists:reverse(Acc)}.
11781167

1168+
%% Reports a bidirectional formatting character found inside a comment.
%% H is the offending code point and Comment is the comment text handed to
%% error/4 as the remaining input. The code point is rendered as \uXXXX
%% (uppercase hex, zero-padded to four digits) and used as the error token.
error_comment(H, Comment, Line, Column, Scope, Tokens) ->
  Escaped = io_lib:format("\\u~4.16.0B", [H]),
  Message = "invalid bidirectional formatting character in comment: ",
  error({Line, Column, Message, Escaped}, Comment, Scope, Tokens).
1172+
11791173
preserve_comments(Line, Column, Tokens, Comment, Rest, Scope) ->
11801174
case Scope#elixir_tokenizer.preserve_comments of
11811175
Fun when is_function(Fun) ->

lib/elixir/src/elixir_tokenizer.hrl

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,31 @@
1+
%% Numbers
2+
-define(is_hex(S), (?is_digit(S) orelse (S >= $A andalso S =< $F) orelse (S >= $a andalso S =< $f))).
3+
-define(is_bin(S), (S >= $0 andalso S =< $1)).
4+
-define(is_octal(S), (S >= $0 andalso S =< $7)).
5+
6+
%% Digits and letters
7+
-define(is_digit(S), (S >= $0 andalso S =< $9)).
8+
-define(is_upcase(S), (S >= $A andalso S =< $Z)).
9+
-define(is_downcase(S), (S >= $a andalso S =< $z)).
10+
11+
%% Others
12+
-define(is_quote(S), (S =:= $" orelse S =:= $')).
13+
-define(is_sigil(S), (S =:= $/ orelse S =:= $< orelse S =:= $" orelse S =:= $' orelse
14+
S =:= $[ orelse S =:= $( orelse S =:= ${ orelse S =:= $|)).
15+
16+
%% Spaces
17+
-define(is_horizontal_space(S), (S =:= $\s orelse S =:= $\t)).
18+
-define(is_vertical_space(S), (S =:= $\r orelse S =:= $\n)).
19+
-define(is_space(S), (?is_horizontal_space(S) orelse ?is_vertical_space(S))).
20+
21+
%% Bidirectional control
22+
%% Retrieved from https://trojansource.codes/trojan-source.pdf
23+
%% True when C is one of the nine Unicode bidirectional formatting code points:
%% LRE, RLE, LRO, RLO (16#202A, 16#202B, 16#202D, 16#202E), LRI, RLI, FSI
%% (16#2066..16#2068), PDF (16#202C) and PDI (16#2069).
%%
%% Written as one parenthesized orelse-expression, like the other predicates in
%% this header. A bare `;`-separated guard sequence would mis-bind when combined
%% with further tests (`when ?bidi(C), Other` would apply Other to the last
%% alternative only) and could not be used outside a guard at all.
-define(bidi(C), (C =:= 16#202A orelse
                  C =:= 16#202B orelse
                  C =:= 16#202D orelse
                  C =:= 16#202E orelse
                  C =:= 16#2066 orelse
                  C =:= 16#2067 orelse
                  C =:= 16#2068 orelse
                  C =:= 16#202C orelse
                  C =:= 16#2069)).

lib/elixir/test/elixir/kernel/errors_test.exs

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -816,6 +816,24 @@ defmodule Kernel.ErrorsTest do
816816
'x = 8; <<a, b::size(^x)>> = <<?a, ?b>>'
817817
end
818818

819+
# Checks that bidirectional formatting characters in source are rejected with a
# SyntaxError. The character is written as a \u202A escape inside each charlist.
# The four cases, in order, with the line:column each regex expects:
#   1. comment at the start of the line       -> 1:1
#   2. comment following `foo.`               -> 1:5
#   3. character inside a double-quoted string -> 1:12
#   4. character right after a backslash in a string -> 1:13
# The string cases also expect the hint to use the escaped \u202A form.
test "invalid bidi in source" do
820+
assert_eval_raise SyntaxError,
821+
~r"nofile:1:1: invalid bidirectional formatting character in comment: \\u202A",
822+
'# This is a \u202A'
823+
824+
assert_eval_raise SyntaxError,
825+
~r"nofile:1:5: invalid bidirectional formatting character in comment: \\u202A",
826+
'foo. # This is a \u202A'
827+
828+
assert_eval_raise SyntaxError,
829+
~r"nofile:1:12: invalid bidirectional formatting character in string: \\u202A. If you want to use such character, use it in its escaped \\u202A form instead",
830+
'"this is a \u202A"'
831+
832+
assert_eval_raise SyntaxError,
833+
~r"nofile:1:13: invalid bidirectional formatting character in string: \\u202A. If you want to use such character, use it in its escaped \\u202A form instead",
834+
'"this is a \\\u202A"'
835+
end
836+
819837
test "function head with guard" do
820838
assert_eval_raise CompileError, "nofile:2: missing :do option in \"def\"", '''
821839
defmodule Kernel.ErrorsTest.BodyessFunctionWithGuard do

0 commit comments

Comments
 (0)