Skip to content

Commit c0c1b98

Browse files
authored
Format markdown code blocks with line-by-line regex parse (#22996)
Format markdown with line-by-line regex parse

- Uses basic `regex` crate, so no backtracking or backreferences needed
- Supports `~~~` and arbitrary length code fences
- Supports `<!-- fmt:off -->` to skip formatting code blocks
- Includes test cases from previous PRs, as well as new ones

Obviates #22962 and #22937
1 parent 9f8f3e1 commit c0c1b98

File tree

3 files changed

+238
-37
lines changed

3 files changed

+238
-37
lines changed

Cargo.lock

Lines changed: 2 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

crates/ruff_markdown/Cargo.toml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,8 @@ license = { workspace = true }
1313
ruff_python_ast = { workspace = true }
1414
ruff_python_formatter = { workspace = true }
1515
ruff_python_trivia = { workspace = true }
16+
ruff_source_file = { workspace = true }
17+
ruff_text_size = { workspace = true }
1618
ruff_workspace = { workspace = true }
1719

1820
insta = { workspace = true }

crates/ruff_markdown/src/lib.rs

Lines changed: 234 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,8 @@ use regex::Regex;
44
use ruff_python_ast::PySourceType;
55
use ruff_python_formatter::format_module_source;
66
use ruff_python_trivia::textwrap::{dedent, indent};
7+
use ruff_source_file::{Line, UniversalNewlines};
8+
use ruff_text_size::{TextRange, TextSize};
79
use ruff_workspace::FormatterSettings;
810

911
#[derive(Debug, PartialEq, Eq)]
@@ -12,67 +14,115 @@ pub enum MarkdownResult {
1214
Unchanged,
1315
}
1416

15-
// TODO: account for ~~~ and arbitrary length code fences
1617
// TODO: support code blocks nested inside block quotes, etc
17-
static MARKDOWN_CODE_BLOCK: LazyLock<Regex> = LazyLock::new(|| {
18-
// adapted from blacken-docs
19-
// https://github.com/adamchainz/blacken-docs/blob/fb107c1dce25f9206e29297aaa1ed7afc2980a5a/src/blacken_docs/__init__.py#L17
18+
static MARKDOWN_CODE_FENCE: LazyLock<Regex> = LazyLock::new(|| {
2019
Regex::new(
21-
r"(?imsx)
22-
(?<before>
23-
^(?<indent>\ *)```[^\S\r\n]*
24-
(?<lang>(?:python|py|python3|py3|pyi)?)
25-
(?:\ .*?)?\n
26-
)
27-
(?<code>.*?)
28-
(?<after>
29-
^\ *```[^\S\r\n]*$
30-
)
31-
",
20+
r"(?ix)
21+
^
22+
(?<indent>\s*)
23+
(?<fence>(?:```+|~~~+))\s*
24+
(?<language>(?:\w+)?)\s*
25+
(?<info>(?:.*))\s*
26+
$
27+
",
3228
)
3329
.unwrap()
3430
});
3531

32+
static OFF_ON_DIRECTIVES: LazyLock<Regex> = LazyLock::new(|| {
33+
Regex::new(
34+
r"(?imx)
35+
^
36+
\s*<!--\s*(?:blacken-docs|fmt)\s*:\s*(?<action>off|on)\s*-->
37+
",
38+
)
39+
.unwrap()
40+
});
41+
42+
#[derive(Debug, Default, PartialEq, Eq)]
43+
enum MarkdownState {
44+
#[default]
45+
On,
46+
Off,
47+
}
48+
3649
pub fn format_code_blocks(
3750
source: &str,
3851
path: Option<&Path>,
3952
settings: &FormatterSettings,
4053
) -> MarkdownResult {
54+
let mut state = MarkdownState::On;
4155
let mut changed = false;
4256
let mut formatted = String::with_capacity(source.len());
43-
let mut last_match = 0;
57+
let mut last_match = TextSize::new(0);
4458

45-
for capture in MARKDOWN_CODE_BLOCK.captures_iter(source) {
46-
let (_, [before, code_indent, language, code, after]) = capture.extract();
59+
let mut lines = source.universal_newlines().peekable();
60+
while let Some(line) = lines.next() {
61+
// Toggle code block formatting off/on
62+
if let Some(capture) = OFF_ON_DIRECTIVES.captures(&line) {
63+
let (_, [action]) = capture.extract();
64+
state = match action {
65+
"off" => MarkdownState::Off,
66+
"on" => MarkdownState::On,
67+
_ => state,
68+
};
69+
// Process code blocks
70+
} else if let Some(opening_capture) = MARKDOWN_CODE_FENCE.captures(&line) {
71+
let (_, [code_indent, opening_fence, language, _info]) = opening_capture.extract();
72+
let start = lines.peek().map(Line::start).unwrap_or_default();
4773

48-
let py_source_type = PySourceType::from_extension(language);
49-
let unformatted_code = dedent(code);
50-
let options = settings.to_format_options(py_source_type, &unformatted_code, path);
74+
// Consume lines until reaching the matching/ending code fence
75+
for code_line in lines.by_ref() {
76+
let Some((_, [_, closing_fence, _, _])) = MARKDOWN_CODE_FENCE
77+
.captures(&code_line)
78+
.map(|cap| cap.extract())
79+
else {
80+
continue;
81+
};
5182

52-
// Using `Printed::into_code` requires adding `ruff_formatter` as a direct dependency, and I suspect that Rust can optimize the closure away regardless.
53-
#[expect(clippy::redundant_closure_for_method_calls)]
54-
let formatted_code =
55-
format_module_source(&unformatted_code, options).map(|formatted| formatted.into_code());
83+
// Found the matching end of the code block
84+
if closing_fence == opening_fence {
85+
let language = language.to_ascii_lowercase();
86+
if state == MarkdownState::On
87+
&& matches!(
88+
language.as_str(),
89+
"python" | "py" | "python3" | "py3" | "pyi" | ""
90+
)
91+
{
92+
// Maybe python, try formatting it
93+
let end = code_line.start();
94+
let unformatted_code = dedent(&source[TextRange::new(start, end)]);
5695

57-
if let Ok(formatted_code) = formatted_code {
58-
if formatted_code.len() != unformatted_code.len() || formatted_code != *unformatted_code
59-
{
60-
let m = capture.get_match();
61-
formatted.push_str(&source[last_match..m.start()]);
96+
let py_source_type = PySourceType::from_extension(&language);
97+
let options =
98+
settings.to_format_options(py_source_type, &unformatted_code, path);
6299

63-
let indented_code = indent(&formatted_code, code_indent);
64-
// otherwise I need to deal with a result from write!
65-
#[expect(clippy::format_push_string)]
66-
formatted.push_str(&format!("{before}{indented_code}{after}"));
100+
// Using `Printed::into_code` requires adding `ruff_formatter` as a direct
101+
// dependency, and I suspect that Rust can optimize the closure away regardless.
102+
#[expect(clippy::redundant_closure_for_method_calls)]
103+
let formatted_code = format_module_source(&unformatted_code, options)
104+
.map(|formatted| formatted.into_code());
67105

68-
last_match = m.end();
69-
changed = true;
106+
// Formatting produced changes
107+
if let Ok(formatted_code) = formatted_code
108+
&& (formatted_code.len() != unformatted_code.len()
109+
|| formatted_code != *unformatted_code)
110+
{
111+
formatted.push_str(&source[TextRange::new(last_match, start)]);
112+
let formatted_code = indent(&formatted_code, code_indent);
113+
formatted.push_str(&formatted_code);
114+
last_match = end;
115+
changed = true;
116+
}
117+
}
118+
break;
119+
}
70120
}
71121
}
72122
}
73123

74124
if changed {
75-
formatted.push_str(&source[last_match..]);
125+
formatted.push_str(&source[last_match.to_usize()..]);
76126
MarkdownResult::Formatted(formatted)
77127
} else {
78128
MarkdownResult::Unchanged
@@ -187,4 +237,151 @@ fn (foo: &str) -> &str {
187237
format_code_blocks(code, None, &FormatterSettings::default()),
188238
@"Unchanged");
189239
}
240+
241+
#[test]
242+
fn format_code_blocks_tildes() {
243+
let code = r#"
244+
~~~py
245+
print( 'hello' )
246+
~~~
247+
"#;
248+
assert_snapshot!(
249+
format_code_blocks(code, None, &FormatterSettings::default()),
250+
@r#"
251+
~~~py
252+
print("hello")
253+
~~~
254+
"#);
255+
}
256+
257+
#[test]
258+
fn format_code_blocks_long_fence() {
259+
let code = r#"
260+
````py
261+
print( 'hello' )
262+
````
263+
~~~~~py
264+
print( 'hello' )
265+
~~~~~
266+
"#;
267+
assert_snapshot!(
268+
format_code_blocks(code, None, &FormatterSettings::default()),
269+
@r#"
270+
````py
271+
print("hello")
272+
````
273+
~~~~~py
274+
print("hello")
275+
~~~~~
276+
"#);
277+
}
278+
279+
#[test]
280+
fn format_code_blocks_nested() {
281+
let code = r#"
282+
````markdown
283+
```py
284+
print( 'hello' )
285+
```
286+
````
287+
"#;
288+
assert_snapshot!(
289+
format_code_blocks(code, None, &FormatterSettings::default()),
290+
@"Unchanged");
291+
}
292+
293+
#[test]
294+
fn format_code_blocks_ignore_blackendocs_off() {
295+
let code = r#"
296+
```py
297+
print( 'hello' )
298+
```
299+
300+
<!-- blacken-docs:off -->
301+
```py
302+
print( 'hello' )
303+
```
304+
<!-- blacken-docs:on -->
305+
306+
```py
307+
print( 'hello' )
308+
```
309+
"#;
310+
assert_snapshot!(format_code_blocks(
311+
code,
312+
None,
313+
&FormatterSettings::default()
314+
), @r#"
315+
```py
316+
print("hello")
317+
```
318+
319+
<!-- blacken-docs:off -->
320+
```py
321+
print( 'hello' )
322+
```
323+
<!-- blacken-docs:on -->
324+
325+
```py
326+
print("hello")
327+
```
328+
"#);
329+
}
330+
331+
#[test]
332+
fn format_code_blocks_ignore_ruff_off() {
333+
let code = r#"
334+
```py
335+
print( 'hello' )
336+
```
337+
338+
<!-- fmt:off -->
339+
```py
340+
print( 'hello' )
341+
```
342+
<!-- fmt:on -->
343+
344+
```py
345+
print( 'hello' )
346+
```
347+
"#;
348+
assert_snapshot!(format_code_blocks(
349+
code,
350+
None,
351+
&FormatterSettings::default()
352+
), @r#"
353+
```py
354+
print("hello")
355+
```
356+
357+
<!-- fmt:off -->
358+
```py
359+
print( 'hello' )
360+
```
361+
<!-- fmt:on -->
362+
363+
```py
364+
print("hello")
365+
```
366+
"#);
367+
}
368+
369+
#[test]
370+
fn format_code_blocks_ignore_to_end() {
371+
let code = r#"
372+
<!-- fmt:off -->
373+
```py
374+
print( 'hello' )
375+
```
376+
377+
```py
378+
print( 'hello' )
379+
```
380+
"#;
381+
assert_snapshot!(format_code_blocks(
382+
code,
383+
None,
384+
&FormatterSettings::default()
385+
), @"Unchanged");
386+
}
190387
}

0 commit comments

Comments
 (0)