Skip to content

Commit 5ec7d1d

Browse files
committed
Auto merge of #90559 - rusticstuff:optimize-bidi-detection, r=davidtwco
Optimize bidi character detection. Should fix most of the performance regression of the bidi character detection (#90514), to be confirmed with a perf run.
2 parents 3326f19 + 39110be commit 5ec7d1d

File tree

5 files changed

+46
-16
lines changed

5 files changed

+46
-16
lines changed

compiler/rustc_ast/src/lib.rs

+2
Original file line number | Diff line number | Diff line change
@@ -16,6 +16,7 @@
1616
#![feature(nll)]
1717
#![feature(min_specialization)]
1818
#![recursion_limit = "256"]
19+
#![feature(slice_internals)]
1920

2021
#[macro_use]
2122
extern crate rustc_macros;
@@ -25,6 +26,7 @@ pub mod util {
2526
pub mod comments;
2627
pub mod literal;
2728
pub mod parser;
29+
pub mod unicode;
2830
}
2931

3032
pub mod ast;
compiler/rustc_ast/src/util/unicode.rs

+35
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,35 @@
1+
/// The Unicode text flow control (bidirectional formatting) characters that the
/// text-direction lints look for: every code point in `U+202A..=U+202E` and
/// `U+2066..=U+2069`.
///
/// - `U+202A` LEFT-TO-RIGHT EMBEDDING, `U+202B` RIGHT-TO-LEFT EMBEDDING
/// - `U+202D` LEFT-TO-RIGHT OVERRIDE, `U+202E` RIGHT-TO-LEFT OVERRIDE
/// - `U+2066` LEFT-TO-RIGHT ISOLATE, `U+2067` RIGHT-TO-LEFT ISOLATE,
///   `U+2068` FIRST STRONG ISOLATE
/// - `U+202C` POP DIRECTIONAL FORMATTING, `U+2069` POP DIRECTIONAL ISOLATE
///
/// Callers use this table with `.contains(&c)` to locate the individual
/// offending characters once a string is known to contain at least one.
pub const TEXT_FLOW_CONTROL_CHARS: &[char] = &[
2+
'\u{202A}', '\u{202B}', '\u{202D}', '\u{202E}', '\u{2066}', '\u{2067}', '\u{2068}', '\u{202C}',
3+
'\u{2069}',
4+
];
5+
6+
#[inline]
7+
pub fn contains_text_flow_control_chars(s: &str) -> bool {
8+
// Char - UTF-8
9+
// U+202A - E2 80 AA
10+
// U+202B - E2 80 AB
11+
// U+202C - E2 80 AC
12+
// U+202D - E2 80 AD
13+
// U+202E - E2 80 AE
14+
// U+2066 - E2 81 A6
15+
// U+2067 - E2 81 A7
16+
// U+2068 - E2 81 A8
17+
// U+2069 - E2 81 A9
18+
let mut bytes = s.as_bytes();
19+
loop {
20+
match core::slice::memchr::memchr(0xE2, &bytes) {
21+
Some(idx) => {
22+
// bytes are valid UTF-8 -> E2 must be followed by two bytes
23+
let ch = &bytes[idx..idx + 3];
24+
match ch {
25+
[_, 0x80, 0xAA..=0xAE] | [_, 0x81, 0xA6..=0xA9] => break true,
26+
_ => {}
27+
}
28+
bytes = &bytes[idx + 3..];
29+
}
30+
None => {
31+
break false;
32+
}
33+
}
34+
}
35+
}

compiler/rustc_lint/src/context.rs

+2-2
Original file line number | Diff line number | Diff line change
@@ -16,9 +16,9 @@
1616
1717
use self::TargetLint::*;
1818

19-
use crate::hidden_unicode_codepoints::UNICODE_TEXT_FLOW_CHARS;
2019
use crate::levels::{is_known_lint_tool, LintLevelsBuilder};
2120
use crate::passes::{EarlyLintPassObject, LateLintPassObject};
21+
use ast::util::unicode::TEXT_FLOW_CONTROL_CHARS;
2222
use rustc_ast as ast;
2323
use rustc_data_structures::fx::FxHashMap;
2424
use rustc_data_structures::sync;
@@ -602,7 +602,7 @@ pub trait LintContext: Sized {
602602
let spans: Vec<_> = content
603603
.char_indices()
604604
.filter_map(|(i, c)| {
605-
UNICODE_TEXT_FLOW_CHARS.contains(&c).then(|| {
605+
TEXT_FLOW_CONTROL_CHARS.contains(&c).then(|| {
606606
let lo = span.lo() + BytePos(2 + i as u32);
607607
(c, span.with_lo(lo).with_hi(lo + BytePos(c.len_utf8() as u32)))
608608
})

compiler/rustc_lint/src/hidden_unicode_codepoints.rs

+4-8
Original file line number | Diff line number | Diff line change
@@ -1,4 +1,5 @@
11
use crate::{EarlyContext, EarlyLintPass, LintContext};
2+
use ast::util::unicode::{contains_text_flow_control_chars, TEXT_FLOW_CONTROL_CHARS};
23
use rustc_ast as ast;
34
use rustc_errors::{Applicability, SuggestionStyle};
45
use rustc_span::{BytePos, Span, Symbol};
@@ -37,11 +38,6 @@ declare_lint! {
3738

3839
declare_lint_pass!(HiddenUnicodeCodepoints => [TEXT_DIRECTION_CODEPOINT_IN_LITERAL]);
3940

40-
crate const UNICODE_TEXT_FLOW_CHARS: &[char] = &[
41-
'\u{202A}', '\u{202B}', '\u{202D}', '\u{202E}', '\u{2066}', '\u{2067}', '\u{2068}', '\u{202C}',
42-
'\u{2069}',
43-
];
44-
4541
impl HiddenUnicodeCodepoints {
4642
fn lint_text_direction_codepoint(
4743
&self,
@@ -57,7 +53,7 @@ impl HiddenUnicodeCodepoints {
5753
.as_str()
5854
.char_indices()
5955
.filter_map(|(i, c)| {
60-
UNICODE_TEXT_FLOW_CHARS.contains(&c).then(|| {
56+
TEXT_FLOW_CONTROL_CHARS.contains(&c).then(|| {
6157
let lo = span.lo() + BytePos(i as u32 + padding);
6258
(c, span.with_lo(lo).with_hi(lo + BytePos(c.len_utf8() as u32)))
6359
})
@@ -131,7 +127,7 @@ impl HiddenUnicodeCodepoints {
131127
impl EarlyLintPass for HiddenUnicodeCodepoints {
132128
fn check_attribute(&mut self, cx: &EarlyContext<'_>, attr: &ast::Attribute) {
133129
if let ast::AttrKind::DocComment(_, comment) = attr.kind {
134-
if comment.as_str().contains(UNICODE_TEXT_FLOW_CHARS) {
130+
if contains_text_flow_control_chars(&comment.as_str()) {
135131
self.lint_text_direction_codepoint(cx, comment, attr.span, 0, false, "doc comment");
136132
}
137133
}
@@ -142,7 +138,7 @@ impl EarlyLintPass for HiddenUnicodeCodepoints {
142138
let (text, span, padding) = match &expr.kind {
143139
ast::ExprKind::Lit(ast::Lit { token, kind, span }) => {
144140
let text = token.symbol;
145-
if !text.as_str().contains(UNICODE_TEXT_FLOW_CHARS) {
141+
if !contains_text_flow_control_chars(&text.as_str()) {
146142
return;
147143
}
148144
let padding = match kind {

compiler/rustc_parse/src/lexer/mod.rs

+3-6
Original file line number | Diff line number | Diff line change
@@ -1,6 +1,7 @@
11
use rustc_ast::ast::{self, AttrStyle};
22
use rustc_ast::token::{self, CommentKind, Token, TokenKind};
33
use rustc_ast::tokenstream::{Spacing, TokenStream};
4+
use rustc_ast::util::unicode::contains_text_flow_control_chars;
45
use rustc_errors::{error_code, Applicability, DiagnosticBuilder, FatalError, PResult};
56
use rustc_lexer::unescape::{self, Mode};
67
use rustc_lexer::{Base, DocStyle, RawStrError};
@@ -137,12 +138,8 @@ impl<'a> StringReader<'a> {
137138
// Opening delimiter of the length 2 is not included into the comment text.
138139
let content_start = start + BytePos(2);
139140
let content = self.str_from(content_start);
140-
let span = self.mk_sp(start, self.pos);
141-
const UNICODE_TEXT_FLOW_CHARS: &[char] = &[
142-
'\u{202A}', '\u{202B}', '\u{202D}', '\u{202E}', '\u{2066}', '\u{2067}', '\u{2068}',
143-
'\u{202C}', '\u{2069}',
144-
];
145-
if content.contains(UNICODE_TEXT_FLOW_CHARS) {
141+
if contains_text_flow_control_chars(content) {
142+
let span = self.mk_sp(start, self.pos);
146143
self.sess.buffer_lint_with_diagnostic(
147144
&TEXT_DIRECTION_CODEPOINT_IN_COMMENT,
148145
span,

0 commit comments

Comments
 (0)