 //! These tokens can be directly fed into the `ruff_python_parser` to generate an AST:
 //!
 //! ```
-//! use ruff_python_parser::{lexer::lex, Mode, parse_tokens};
+//! use ruff_python_parser::{Mode, parse_tokens, tokenize_all};
 //!
 //! let python_source = r#"
 //! def is_odd(i):
 //!     return bool(i & 1)
 //! "#;
-//! let tokens = lex(python_source, Mode::Module);
-//! let ast = parse_tokens(tokens.collect(), python_source, Mode::Module);
+//! let tokens = tokenize_all(python_source, Mode::Module);
+//! let ast = parse_tokens(tokens, python_source, Mode::Module);
 //!
 //! assert!(ast.is_ok());
 //! ```
@@ -133,17 +133,43 @@ pub mod typing;
 
 /// Collect tokens up to and including the first error.
 pub fn tokenize(contents: &str, mode: Mode) -> Vec<LexResult> {
-    let mut tokens: Vec<LexResult> = vec![];
+    let mut tokens: Vec<LexResult> = allocate_tokens_vec(contents);
     for tok in lexer::lex(contents, mode) {
         let is_err = tok.is_err();
         tokens.push(tok);
         if is_err {
             break;
         }
     }
+
+    tokens
+}
+
+/// Tokenizes all tokens.
+///
+/// It differs from [`tokenize`] in that it tokenizes all tokens and doesn't stop
+/// after the first `Err`.
+pub fn tokenize_all(contents: &str, mode: Mode) -> Vec<LexResult> {
+    let mut tokens = allocate_tokens_vec(contents);
+    for token in lexer::lex(contents, mode) {
+        tokens.push(token);
+    }
     tokens
 }
 
+/// Allocates a [`Vec`] with an approximated capacity to fit all tokens
+/// of `contents`.
+///
+/// See [#9546](https://github.com/astral-sh/ruff/pull/9546) for a more detailed explanation.
+pub fn allocate_tokens_vec(contents: &str) -> Vec<LexResult> {
+    Vec::with_capacity(approximate_tokens_lower_bound(contents))
+}
+
+/// Approximates the number of tokens when lexing `contents`.
+fn approximate_tokens_lower_bound(contents: &str) -> usize {
+    contents.len().saturating_mul(15) / 100
+}
+
 /// Parse a full Python program from its tokens.
 pub fn parse_program_tokens(
     tokens: Vec<LexResult>,