Skip to content

Commit 651f498

Browse files
ndmitchell authored and meta-codesync[bot] committed
Optimise comment parsing
Summary: While profiling, comment parsing ended up taking 5% of the runtime. After optimising, it doesn't even show up anywhere I can find (likely it got inlined, as the code is now also much simpler). The trick is to work on bytes rather than chars: every character that matters for comment detection is a single ASCII byte, and char iterators are hilariously expensive.
Reviewed By: rchen152
Differential Revision: D96801831
fbshipit-source-id: 6b59e049fb2e0f079156cf68d89d1bf45f9e0a9e
1 parent 372fe49 commit 651f498

File tree

1 file changed: +27 -41 lines changed

crates/pyrefly_python/src/ignore.rs

Lines changed: 27 additions & 41 deletions
Original file line number | Diff line number | Diff line change
@@ -26,9 +26,6 @@
2626
//! We are permissive with whitespace, allowing `#type:ignore[code]` and
2727
//! `# type: ignore [ code ]`, but do not allow a space before the colon.
2828
29-
use std::iter::Peekable;
30-
use std::str::CharIndices;
31-
3229
use clap::ValueEnum;
3330
use dupe::Dupe;
3431
use enum_iterator::Sequence;
@@ -42,6 +39,10 @@ use starlark_map::smallset;
4239
/// Finds the byte offset of the first '#' character that starts a comment, tracking
4340
/// whether we're inside a multi-line triple-quoted string.
4441
///
42+
/// All interesting characters (`#`, `'`, `"`, `\`) are ASCII, so we operate
43+
/// on bytes directly — UTF-8 guarantees these never appear inside multi-byte
44+
/// sequences.
45+
///
4546
/// `in_triple_quote` should be `Some('"')` or `Some('\'')` if the line begins
4647
/// inside an open triple-quoted string from a previous line, or `None` otherwise.
4748
///
@@ -50,66 +51,51 @@ pub fn find_comment_start(
5051
line: &str,
5152
in_triple_quote: Option<char>,
5253
) -> (Option<usize>, Option<char>) {
53-
let mut chars = line.char_indices().peekable();
54-
let mut triple_quote = in_triple_quote;
55-
let mut single_quote = None;
56-
57-
let advance_if_matches = |chars: &mut Peekable<CharIndices>, q| {
58-
if chars.peek().is_some_and(|(_, next)| *next == q) {
59-
chars.next();
60-
true
61-
} else {
62-
false
63-
}
64-
};
54+
let mut bytes = line.bytes().enumerate().peekable();
55+
let mut triple_quote: Option<u8> = in_triple_quote.map(|c| c as u8);
56+
let mut single_quote: Option<u8> = None;
6557

66-
while let Some((idx, ch)) = chars.next() {
58+
while let Some((idx, b)) = bytes.next() {
6759
if let Some(q) = triple_quote {
68-
// Inside triple-quoted string
69-
if ch == '\\' {
70-
// Skip next char if escaped
71-
chars.next();
72-
} else if ch == q
73-
// This check consumes zero, one, or two additional chars:
74-
// - zero or one: this is a single quote or a pair of quotes, not interesting
75-
// - two: this is the end of a triple-quoted string
76-
&& advance_if_matches(&mut chars, q)
77-
&& advance_if_matches(&mut chars, q)
60+
// Inside triple-quoted string.
61+
if b == b'\\' {
62+
bytes.next(); // Skip escaped character.
63+
} else if b == q
64+
&& bytes.next_if(|&(_, next)| next == q).is_some()
65+
&& bytes.next_if(|&(_, next)| next == q).is_some()
7866
{
7967
triple_quote = None;
8068
}
8169
continue;
8270
}
8371

8472
if let Some(q) = single_quote {
85-
// Inside regular string
86-
if ch == '\\' {
87-
// Skip next char if escaped
88-
chars.next();
89-
} else if ch == q {
73+
// Inside regular string.
74+
if b == b'\\' {
75+
bytes.next(); // Skip escaped character.
76+
} else if b == q {
9077
single_quote = None;
9178
}
9279
continue;
9380
}
9481

9582
// Normal code.
96-
match ch {
97-
'"' | '\'' => {
98-
if advance_if_matches(&mut chars, ch) {
99-
if advance_if_matches(&mut chars, ch) {
100-
triple_quote = Some(ch);
101-
} else {
102-
// We've advanced past the opening and closing quotes of an empty string
83+
match b {
84+
b'"' | b'\'' => {
85+
if bytes.next_if(|&(_, next)| next == b).is_some() {
86+
if bytes.next_if(|&(_, next)| next == b).is_some() {
87+
triple_quote = Some(b);
10388
}
89+
// else: empty string ("" or ''), both quotes already consumed.
10490
} else {
105-
single_quote = Some(ch);
91+
single_quote = Some(b);
10692
}
10793
}
108-
'#' => return (Some(idx), None),
94+
b'#' => return (Some(idx), None),
10995
_ => {}
11096
}
11197
}
112-
(None, triple_quote)
98+
(None, triple_quote.map(|b| b as char))
11399
}
114100

115101
/// Finds the byte offset of the first '#' character that starts a comment.

0 commit comments

Comments
 (0)