
Commit 47ad7b4

Approximate tokens len (#9546)

Pre-allocates the token vectors with an approximated capacity (about 0.15 tokens per source byte) instead of growing them from empty, which avoids repeated reallocations while lexing.

1 parent b3a6f0c commit 47ad7b4

File tree

5 files changed: 38 additions & 13 deletions

crates/ruff_benchmark/benches/formatter.rs
crates/ruff_python_index/src/comment_ranges.rs
crates/ruff_python_parser/src/lib.rs
crates/ruff_python_parser/src/parser.rs
crates/ruff_wasm/src/lib.rs


crates/ruff_benchmark/benches/formatter.rs

Lines changed: 2 additions & 2 deletions

@@ -7,7 +7,7 @@ use ruff_benchmark::{TestCase, TestFile, TestFileDownloadError};
 use ruff_python_formatter::{format_module_ast, PreviewMode, PyFormatOptions};
 use ruff_python_index::CommentRangesBuilder;
 use ruff_python_parser::lexer::lex;
-use ruff_python_parser::{parse_tokens, Mode};
+use ruff_python_parser::{allocate_tokens_vec, parse_tokens, Mode};

 #[cfg(target_os = "windows")]
 #[global_allocator]
@@ -52,7 +52,7 @@ fn benchmark_formatter(criterion: &mut Criterion) {
             BenchmarkId::from_parameter(case.name()),
             &case,
             |b, case| {
-                let mut tokens = Vec::new();
+                let mut tokens = allocate_tokens_vec(case.code());
                 let mut comment_ranges = CommentRangesBuilder::default();

                 for result in lex(case.code(), Mode::Module) {
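The benchmark previously grew its token vector from Vec::new(); allocate_tokens_vec reserves an estimated capacity up front. A minimal, dependency-free sketch of why that matters (the 1000-byte input and u32 elements are illustrative stand-ins, not ruff types):

fn main() {
    // Stand-in for the heuristic: a 1000-byte source at ~0.15 tokens
    // per byte yields an estimated 150 tokens.
    let estimate = 1000 * 15 / 100;

    let mut tokens: Vec<u32> = Vec::with_capacity(estimate);
    let initial_capacity = tokens.capacity();

    // Pushing up to the reserved capacity never reallocates; a vector
    // started with Vec::new() would grow (and copy) several times instead.
    for i in 0..estimate as u32 {
        tokens.push(i);
    }
    assert_eq!(tokens.capacity(), initial_capacity);
}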

crates/ruff_python_index/src/comment_ranges.rs

Lines changed: 2 additions & 2 deletions

@@ -2,7 +2,7 @@ use std::fmt::Debug;

 use ruff_python_ast::PySourceType;
 use ruff_python_parser::lexer::{lex, LexResult, LexicalError};
-use ruff_python_parser::{AsMode, Tok};
+use ruff_python_parser::{allocate_tokens_vec, AsMode, Tok};
 use ruff_python_trivia::CommentRanges;
 use ruff_text_size::TextRange;

@@ -28,7 +28,7 @@ pub fn tokens_and_ranges(
     source: &str,
     source_type: PySourceType,
 ) -> Result<(Vec<LexResult>, CommentRanges), LexicalError> {
-    let mut tokens = Vec::new();
+    let mut tokens = allocate_tokens_vec(source);
     let mut comment_ranges = CommentRangesBuilder::default();

     for result in lex(source, source_type.as_mode()) {

crates/ruff_python_parser/src/lib.rs

Lines changed: 30 additions & 4 deletions

@@ -78,14 +78,14 @@
 //! These tokens can be directly fed into the `ruff_python_parser` to generate an AST:
 //!
 //! ```
-//! use ruff_python_parser::{lexer::lex, Mode, parse_tokens};
+//! use ruff_python_parser::{Mode, parse_tokens, tokenize_all};
 //!
 //! let python_source = r#"
 //! def is_odd(i):
 //!     return bool(i & 1)
 //! "#;
-//! let tokens = lex(python_source, Mode::Module);
-//! let ast = parse_tokens(tokens.collect(), python_source, Mode::Module);
+//! let tokens = tokenize_all(python_source, Mode::Module);
+//! let ast = parse_tokens(tokens, python_source, Mode::Module);
 //!
 //! assert!(ast.is_ok());
 //! ```
@@ -133,17 +133,43 @@ pub mod typing;

 /// Collect tokens up to and including the first error.
 pub fn tokenize(contents: &str, mode: Mode) -> Vec<LexResult> {
-    let mut tokens: Vec<LexResult> = vec![];
+    let mut tokens: Vec<LexResult> = allocate_tokens_vec(contents);
     for tok in lexer::lex(contents, mode) {
         let is_err = tok.is_err();
         tokens.push(tok);
         if is_err {
             break;
         }
     }
+
+    tokens
+}
+
+/// Tokenizes all tokens.
+///
+/// It differs from [`tokenize`] in that it tokenizes all tokens and doesn't stop
+/// after the first `Err`.
+pub fn tokenize_all(contents: &str, mode: Mode) -> Vec<LexResult> {
+    let mut tokens = allocate_tokens_vec(contents);
+    for token in lexer::lex(contents, mode) {
+        tokens.push(token);
+    }
     tokens
 }

+/// Allocates a [`Vec`] with an approximated capacity to fit all tokens
+/// of `contents`.
+///
+/// See [#9546](https://github.com/astral-sh/ruff/pull/9546) for a more detailed explanation.
+pub fn allocate_tokens_vec(contents: &str) -> Vec<LexResult> {
+    Vec::with_capacity(approximate_tokens_lower_bound(contents))
+}
+
+/// Approximates the number of tokens when lexing `contents`.
+fn approximate_tokens_lower_bound(contents: &str) -> usize {
+    contents.len().saturating_mul(15) / 100
+}
+
 /// Parse a full Python program from its tokens.
 pub fn parse_program_tokens(
     tokens: Vec<LexResult>,
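The heuristic reserves roughly 0.15 token slots per input byte, so a 10,000-byte file starts with room for 1,500 tokens. A standalone sketch of the same arithmetic, with the helper reimplemented here so it runs without the ruff crates:

// Mirrors approximate_tokens_lower_bound from the hunk above.
fn approximate_tokens_lower_bound(contents: &str) -> usize {
    // saturating_mul guards against overflow on pathologically large inputs.
    contents.len().saturating_mul(15) / 100
}

fn main() {
    let small = "def is_odd(i):\n    return bool(i & 1)\n"; // 38 bytes
    let large = "a = b + c\n".repeat(1_000); // 10,000 bytes
    for source in [small, large.as_str()] {
        println!(
            "{} bytes -> reserve capacity for {} tokens",
            source.len(),
            approximate_tokens_lower_bound(source)
        );
    }
}

Since the estimate is a lower bound, the vector can still grow when a file is unusually token-dense, but the common case avoids most intermediate reallocations.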

crates/ruff_python_parser/src/parser.rs

Lines changed: 2 additions & 3 deletions

@@ -31,7 +31,7 @@ use crate::{
     lexer::{self, LexicalError, LexicalErrorType},
     python,
     token::Tok,
-    Mode,
+    tokenize_all, Mode,
 };

 /// Parse a full Python program usually consisting of multiple lines.
@@ -55,8 +55,7 @@ use crate::{
 /// assert!(program.is_ok());
 /// ```
 pub fn parse_program(source: &str) -> Result<ModModule, ParseError> {
-    let lexer = lex(source, Mode::Module);
-    match parse_tokens(lexer.collect(), source, Mode::Module)? {
+    match parse_tokens(tokenize_all(source, Mode::Module), source, Mode::Module)? {
         Mod::Module(m) => Ok(m),
         Mod::Expression(_) => unreachable!("Mode::Module doesn't return other variant"),
     }
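With this change parse_program no longer drives the lexer inline; it tokenizes the whole source via tokenize_all and forwards the tokens to parse_tokens. A usage sketch, assuming ruff_python_parser from this tree as a dependency (module.body being the statement list matches ModModule in ruff_python_ast):

use ruff_python_parser::parse_program;

fn main() {
    // parse_program tokenizes the full source up front (tokenize_all)
    // and then parses the resulting token stream.
    let source = "def is_odd(i):\n    return bool(i & 1)\n";
    let module = parse_program(source).expect("source should parse");
    println!("parsed {} top-level statement(s)", module.body.len());
}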

crates/ruff_wasm/src/lib.rs

Lines changed: 2 additions & 2 deletions

@@ -17,7 +17,7 @@ use ruff_python_codegen::Stylist;
 use ruff_python_formatter::{format_module_ast, pretty_comments, PyFormatContext, QuoteStyle};
 use ruff_python_index::{CommentRangesBuilder, Indexer};
 use ruff_python_parser::lexer::LexResult;
-use ruff_python_parser::{parse_tokens, AsMode, Mode};
+use ruff_python_parser::{parse_tokens, tokenize_all, AsMode, Mode};
 use ruff_python_trivia::CommentRanges;
 use ruff_source_file::{Locator, SourceLocation};
 use ruff_text_size::Ranged;
@@ -272,7 +272,7 @@ struct ParsedModule<'a> {

 impl<'a> ParsedModule<'a> {
     fn from_source(source_code: &'a str) -> Result<Self, Error> {
-        let tokens: Vec<_> = ruff_python_parser::lexer::lex(source_code, Mode::Module).collect();
+        let tokens: Vec<_> = tokenize_all(source_code, Mode::Module);
         let mut comment_ranges = CommentRangesBuilder::default();

         for (token, range) in tokens.iter().flatten() {
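The wasm crate keeps iterating token/range pairs with iter().flatten(), which skips any Err entries that tokenize_all (unlike tokenize) carries past the first lexical error. A small sketch of that pattern, again assuming the workspace crates as dependencies (each Ok element being a (Tok, TextRange) pair is an assumption about LexResult at this point in the tree):

use ruff_python_parser::{tokenize_all, Mode};

fn main() {
    let source = "# a comment\nx = 1\n";
    let tokens = tokenize_all(source, Mode::Module);

    // flatten() yields only the Ok (token, range) pairs, mirroring the
    // loop in ParsedModule::from_source above.
    for (token, range) in tokens.iter().flatten() {
        println!("{token:?} @ {range:?}");
    }
}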
