|
1 | 1 | -module(elixir_tokenizer).
|
2 | 2 | -include("elixir.hrl").
|
| 3 | +-include("elixir_tokenizer.hrl"). |
3 | 4 | -export([tokenize/1, tokenize/3, tokenize/4, invalid_do_error/1]).
|
4 | 5 |
|
5 |
| -%% Numbers |
6 |
| --define(is_hex(S), (?is_digit(S) orelse (S >= $A andalso S =< $F) orelse (S >= $a andalso S =< $f))). |
7 |
| --define(is_bin(S), (S >= $0 andalso S =< $1)). |
8 |
| --define(is_octal(S), (S >= $0 andalso S =< $7)). |
9 |
| - |
10 |
| -%% Digits and letters |
11 |
| --define(is_digit(S), (S >= $0 andalso S =< $9)). |
12 |
| --define(is_upcase(S), (S >= $A andalso S =< $Z)). |
13 |
| --define(is_downcase(S), (S >= $a andalso S =< $z)). |
14 |
| - |
15 |
| -%% Others |
16 |
| --define(is_quote(S), (S =:= $" orelse S =:= $')). |
17 |
| --define(is_sigil(S), (S =:= $/ orelse S =:= $< orelse S =:= $" orelse S =:= $' orelse |
18 |
| - S =:= $[ orelse S =:= $( orelse S =:= ${ orelse S =:= $|)). |
19 |
| - |
20 |
| -%% Spaces |
21 |
| --define(is_horizontal_space(S), (S =:= $\s orelse S =:= $\t)). |
22 |
| --define(is_vertical_space(S), (S =:= $\r orelse S =:= $\n)). |
23 |
| --define(is_space(S), (?is_horizontal_space(S) orelse ?is_vertical_space(S))). |
24 |
| - |
25 |
| -%% Operators |
26 | 6 | -define(at_op(T),
|
27 | 7 | T =:= $@).
|
28 | 8 |
|
@@ -198,9 +178,13 @@ tokenize([$0, $o, H | T], Line, Column, Scope, Tokens) when ?is_octal(H) ->
|
198 | 178 | % Comments
|
199 | 179 |
|
200 | 180 | tokenize([$# | String], Line, Column, Scope, Tokens) ->
|
201 |
| - {Rest, Comment} = tokenize_comment(String, [$#]), |
202 |
| - preserve_comments(Line, Column, Tokens, Comment, Rest, Scope), |
203 |
| - tokenize(Rest, Line, Column, Scope, reset_eol(Tokens)); |
| 181 | + case tokenize_comment(String, [$#]) of |
| 182 | + {error, Char} -> |
| 183 | + error_comment(Char, [$# | String], Line, Column, Scope, Tokens); |
| 184 | + {Rest, Comment} -> |
| 185 | + preserve_comments(Line, Column, Tokens, Comment, Rest, Scope), |
| 186 | + tokenize(Rest, Line, Column, Scope, reset_eol(Tokens)) |
| 187 | + end; |
204 | 188 |
|
205 | 189 | % Sigils
|
206 | 190 |
|
@@ -645,9 +629,7 @@ tokenize([$% | T], Line, Column, Scope, Tokens) ->
|
645 | 629 | tokenize(T, Line, Column + 1, Scope, [{'%', {Line, Column, nil}} | Tokens]);
|
646 | 630 |
|
647 | 631 | tokenize([$. | T], Line, Column, Scope, Tokens) ->
|
648 |
| - DotInfo = {Line, Column, nil}, |
649 |
| - {Rest, EndLine, EndColumn} = strip_dot_space(T, Line, Column + 1, [{'.', DotInfo}| Tokens], Scope), |
650 |
| - handle_dot([$. | Rest], EndLine, EndColumn, DotInfo, Scope, Tokens); |
| 632 | + tokenize_dot(T, Line, Column + 1, {Line, Column, nil}, Scope, Tokens); |
651 | 633 |
|
652 | 634 | % Identifiers
|
653 | 635 |
|
@@ -719,18 +701,23 @@ strip_horizontal_space([H | T], Counter) when ?is_horizontal_space(H) ->
|
719 | 701 | strip_horizontal_space(T, Counter) ->
|
720 | 702 | {T, Counter}.
|
721 | 703 |
|
722 |
| -strip_dot_space(T, Line, Column, Tokens, Scope) -> |
| 704 | +tokenize_dot(T, Line, Column, DotInfo, Scope, Tokens) -> |
723 | 705 | case strip_horizontal_space(T, 0) of
|
724 |
| - {"#" ++ R, _} -> |
725 |
| - {Rest, Comment} = tokenize_comment(R, [$#]), |
726 |
| - preserve_comments(Line, Column, Tokens, Comment, Rest, Scope), |
727 |
| - strip_dot_space(Rest, Line, 1, reset_eol(Tokens), Scope); |
| 706 | + {[$# | R], _} -> |
| 707 | + case tokenize_comment(R, [$#]) of |
| 708 | + {error, Char} -> |
| 709 | + error_comment(Char, [$# | R], Line, Column, Scope, Tokens); |
| 710 | + |
| 711 | + {Rest, Comment} -> |
| 712 | + preserve_comments(Line, Column, Tokens, Comment, Rest, Scope), |
| 713 | + tokenize_dot(Rest, Line, 1, DotInfo, Scope, Tokens) |
| 714 | + end; |
728 | 715 | {"\r\n" ++ Rest, _} ->
|
729 |
| - strip_dot_space(Rest, Line + 1, 1, eol(Line, Column, Tokens), Scope); |
| 716 | + tokenize_dot(Rest, Line + 1, 1, DotInfo, Scope, Tokens); |
730 | 717 | {"\n" ++ Rest, _} ->
|
731 |
| - strip_dot_space(Rest, Line + 1, 1, eol(Line, Column, Tokens), Scope); |
| 718 | + tokenize_dot(Rest, Line + 1, 1, DotInfo, Scope, Tokens); |
732 | 719 | {Rest, Length} ->
|
733 |
| - {Rest, Line, Column + Length} |
| 720 | + handle_dot([$. | Rest], Line, Column + Length, DotInfo, Scope, Tokens) |
734 | 721 | end.
|
735 | 722 |
|
736 | 723 | handle_char(0) -> {"\\0", "null byte"};
|
@@ -1171,11 +1158,18 @@ tokenize_comment("\r\n" ++ _ = Rest, Acc) ->
|
1171 | 1158 | {Rest, lists:reverse(Acc)};
|
1172 | 1159 | tokenize_comment("\n" ++ _ = Rest, Acc) ->
|
1173 | 1160 | {Rest, lists:reverse(Acc)};
|
| 1161 | +tokenize_comment([H | _Rest], _) when ?bidi(H) -> |
| 1162 | + {error, H}; |
1174 | 1163 | tokenize_comment([H | Rest], Acc) ->
|
1175 | 1164 | tokenize_comment(Rest, [H | Acc]);
|
1176 | 1165 | tokenize_comment([], Acc) ->
|
1177 | 1166 | {[], lists:reverse(Acc)}.
|
1178 | 1167 |
|
| 1168 | +error_comment(H, Comment, Line, Column, Scope, Tokens) -> |
| 1169 | + Token = io_lib:format("\\u~4.16.0B", [H]), |
| 1170 | + Reason = {Line, Column, "invalid bidirectional formatting character in comment: ", Token}, |
| 1171 | + error(Reason, Comment, Scope, Tokens). |
| 1172 | + |
1179 | 1173 | preserve_comments(Line, Column, Tokens, Comment, Rest, Scope) ->
|
1180 | 1174 | case Scope#elixir_tokenizer.preserve_comments of
|
1181 | 1175 | Fun when is_function(Fun) ->
|
|
0 commit comments