
Commit 47ad7b4

Approximate tokens len (#9546)

Pre-allocates the token vectors with an approximated capacity (about 0.15 tokens per source byte) instead of growing them from empty, which avoids repeated reallocations while lexing.

1 parent b3a6f0c commit 47ad7b4

File tree

5 files changed: 38 additions & 13 deletions

crates/ruff_benchmark/benches/formatter.rs
crates/ruff_python_index/src/comment_ranges.rs
crates/ruff_python_parser/src/lib.rs
crates/ruff_python_parser/src/parser.rs
crates/ruff_wasm/src/lib.rs


crates/ruff_benchmark/benches/formatter.rs

Lines changed: 2 additions & 2 deletions

@@ -7,7 +7,7 @@ use ruff_benchmark::{TestCase, TestFile, TestFileDownloadError};
 use ruff_python_formatter::{format_module_ast, PreviewMode, PyFormatOptions};
 use ruff_python_index::CommentRangesBuilder;
 use ruff_python_parser::lexer::lex;
-use ruff_python_parser::{parse_tokens, Mode};
+use ruff_python_parser::{allocate_tokens_vec, parse_tokens, Mode};

 #[cfg(target_os = "windows")]
 #[global_allocator]
@@ -52,7 +52,7 @@ fn benchmark_formatter(criterion: &mut Criterion) {
             BenchmarkId::from_parameter(case.name()),
             &case,
             |b, case| {
-                let mut tokens = Vec::new();
+                let mut tokens = allocate_tokens_vec(case.code());
                 let mut comment_ranges = CommentRangesBuilder::default();

                 for result in lex(case.code(), Mode::Module) {
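The benchmark previously grew its token vector from Vec::new(); allocate_tokens_vec reserves an estimated capacity up front. A minimal, dependency-free sketch of why that matters (the 1000-byte input and u32 elements are illustrative stand-ins, not ruff types):

fn main() {
    // Stand-in for the heuristic: a 1000-byte source at ~0.15 tokens
    // per byte yields an estimated 150 tokens.
    let estimate = 1000 * 15 / 100;

    let mut tokens: Vec<u32> = Vec::with_capacity(estimate);
    let initial_capacity = tokens.capacity();

    // Pushing up to the reserved capacity never reallocates; a vector
    // started with Vec::new() would grow (and copy) several times instead.
    for i in 0..estimate as u32 {
        tokens.push(i);
    }
    assert_eq!(tokens.capacity(), initial_capacity);
}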

crates/ruff_python_index/src/comment_ranges.rs

Lines changed: 2 additions & 2 deletions

@@ -2,7 +2,7 @@ use std::fmt::Debug;

 use ruff_python_ast::PySourceType;
 use ruff_python_parser::lexer::{lex, LexResult, LexicalError};
-use ruff_python_parser::{AsMode, Tok};
+use ruff_python_parser::{allocate_tokens_vec, AsMode, Tok};
 use ruff_python_trivia::CommentRanges;
 use ruff_text_size::TextRange;

@@ -28,7 +28,7 @@ pub fn tokens_and_ranges(
     source: &str,
     source_type: PySourceType,
 ) -> Result<(Vec<LexResult>, CommentRanges), LexicalError> {
-    let mut tokens = Vec::new();
+    let mut tokens = allocate_tokens_vec(source);
     let mut comment_ranges = CommentRangesBuilder::default();

     for result in lex(source, source_type.as_mode()) {

crates/ruff_python_parser/src/lib.rs

Lines changed: 30 additions & 4 deletions

@@ -78,14 +78,14 @@
 //! These tokens can be directly fed into the `ruff_python_parser` to generate an AST:
 //!
 //! ```
-//! use ruff_python_parser::{lexer::lex, Mode, parse_tokens};
+//! use ruff_python_parser::{Mode, parse_tokens, tokenize_all};
 //!
 //! let python_source = r#"
 //! def is_odd(i):
 //!     return bool(i & 1)
 //! "#;
-//! let tokens = lex(python_source, Mode::Module);
-//! let ast = parse_tokens(tokens.collect(), python_source, Mode::Module);
+//! let tokens = tokenize_all(python_source, Mode::Module);
+//! let ast = parse_tokens(tokens, python_source, Mode::Module);
 //!
 //! assert!(ast.is_ok());
 //! ```
@@ -133,17 +133,43 @@ pub mod typing;

 /// Collect tokens up to and including the first error.
 pub fn tokenize(contents: &str, mode: Mode) -> Vec<LexResult> {
-    let mut tokens: Vec<LexResult> = vec![];
+    let mut tokens: Vec<LexResult> = allocate_tokens_vec(contents);
     for tok in lexer::lex(contents, mode) {
         let is_err = tok.is_err();
         tokens.push(tok);
         if is_err {
             break;
         }
     }
+
+    tokens
+}
+
+/// Tokenizes all tokens.
+///
+/// It differs from [`tokenize`] in that it tokenizes all tokens and doesn't stop
+/// after the first `Err`.
+pub fn tokenize_all(contents: &str, mode: Mode) -> Vec<LexResult> {
+    let mut tokens = allocate_tokens_vec(contents);
+    for token in lexer::lex(contents, mode) {
+        tokens.push(token);
+    }
     tokens
 }

+/// Allocates a [`Vec`] with an approximated capacity to fit all tokens
+/// of `contents`.
+///
+/// See [#9546](https://github.com/astral-sh/ruff/pull/9546) for a more detailed explanation.
+pub fn allocate_tokens_vec(contents: &str) -> Vec<LexResult> {
+    Vec::with_capacity(approximate_tokens_lower_bound(contents))
+}
+
+/// Approximates the number of tokens when lexing `contents`.
+fn approximate_tokens_lower_bound(contents: &str) -> usize {
+    contents.len().saturating_mul(15) / 100
+}
+
 /// Parse a full Python program from its tokens.
 pub fn parse_program_tokens(
     tokens: Vec<LexResult>,
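The heuristic reserves roughly 0.15 token slots per input byte, so a 10,000-byte file starts with room for 1,500 tokens. A standalone sketch of the same arithmetic, with the helper reimplemented here so it runs without the ruff crates:

// Mirrors approximate_tokens_lower_bound from the hunk above.
fn approximate_tokens_lower_bound(contents: &str) -> usize {
    // saturating_mul guards against overflow on pathologically large inputs.
    contents.len().saturating_mul(15) / 100
}

fn main() {
    let small = "def is_odd(i):\n    return bool(i & 1)\n"; // 38 bytes
    let large = "a = b + c\n".repeat(1_000); // 10,000 bytes
    for source in [small, large.as_str()] {
        println!(
            "{} bytes -> reserve capacity for {} tokens",
            source.len(),
            approximate_tokens_lower_bound(source)
        );
    }
}

Since the estimate is a lower bound, the vector can still grow when a file is unusually token-dense, but the common case avoids most intermediate reallocations.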

crates/ruff_python_parser/src/parser.rs

Lines changed: 2 additions & 3 deletions

@@ -31,7 +31,7 @@ use crate::{
     lexer::{self, LexicalError, LexicalErrorType},
     python,
     token::Tok,
-    Mode,
+    tokenize_all, Mode,
 };

 /// Parse a full Python program usually consisting of multiple lines.
@@ -55,8 +55,7 @@ use crate::{
 /// assert!(program.is_ok());
 /// ```
 pub fn parse_program(source: &str) -> Result<ModModule, ParseError> {
-    let lexer = lex(source, Mode::Module);
-    match parse_tokens(lexer.collect(), source, Mode::Module)? {
+    match parse_tokens(tokenize_all(source, Mode::Module), source, Mode::Module)? {
         Mod::Module(m) => Ok(m),
         Mod::Expression(_) => unreachable!("Mode::Module doesn't return other variant"),
     }
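With this change parse_program no longer drives the lexer inline; it tokenizes the whole source via tokenize_all and forwards the tokens to parse_tokens. A usage sketch, assuming ruff_python_parser from this tree as a dependency (module.body being the statement list matches ModModule in ruff_python_ast):

use ruff_python_parser::parse_program;

fn main() {
    // parse_program tokenizes the full source up front (tokenize_all)
    // and then parses the resulting token stream.
    let source = "def is_odd(i):\n    return bool(i & 1)\n";
    let module = parse_program(source).expect("source should parse");
    println!("parsed {} top-level statement(s)", module.body.len());
}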

crates/ruff_wasm/src/lib.rs

Lines changed: 2 additions & 2 deletions

@@ -17,7 +17,7 @@ use ruff_python_codegen::Stylist;
 use ruff_python_formatter::{format_module_ast, pretty_comments, PyFormatContext, QuoteStyle};
 use ruff_python_index::{CommentRangesBuilder, Indexer};
 use ruff_python_parser::lexer::LexResult;
-use ruff_python_parser::{parse_tokens, AsMode, Mode};
+use ruff_python_parser::{parse_tokens, tokenize_all, AsMode, Mode};
 use ruff_python_trivia::CommentRanges;
 use ruff_source_file::{Locator, SourceLocation};
 use ruff_text_size::Ranged;
@@ -272,7 +272,7 @@ struct ParsedModule<'a> {

 impl<'a> ParsedModule<'a> {
     fn from_source(source_code: &'a str) -> Result<Self, Error> {
-        let tokens: Vec<_> = ruff_python_parser::lexer::lex(source_code, Mode::Module).collect();
+        let tokens: Vec<_> = tokenize_all(source_code, Mode::Module);
         let mut comment_ranges = CommentRangesBuilder::default();

         for (token, range) in tokens.iter().flatten() {
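The wasm crate keeps iterating token/range pairs with iter().flatten(), which skips any Err entries that tokenize_all (unlike tokenize) carries past the first lexical error. A small sketch of that pattern, again assuming the workspace crates as dependencies (each Ok element being a (Tok, TextRange) pair is an assumption about LexResult at this point in the tree):

use ruff_python_parser::{tokenize_all, Mode};

fn main() {
    let source = "# a comment\nx = 1\n";
    let tokens = tokenize_all(source, Mode::Module);

    // flatten() yields only the Ok (token, range) pairs, mirroring the
    // loop in ParsedModule::from_source above.
    for (token, range) in tokens.iter().flatten() {
        println!("{token:?} @ {range:?}");
    }
}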
