Skip to content

Commit f2335fe

Browse files
Make Unicode-to-Unicode confusables a preview change (#8473)
1 parent b0f9a14 commit f2335fe

File tree

4 files changed

+225
-15
lines changed

4 files changed

+225
-15
lines changed

crates/ruff_linter/src/rules/ruff/mod.rs

Lines changed: 19 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@ mod tests {
1717
use crate::pyproject_toml::lint_pyproject_toml;
1818
use crate::registry::Rule;
1919
use crate::settings::resolve_per_file_ignores;
20-
use crate::settings::types::{PerFileIgnore, PythonVersion};
20+
use crate::settings::types::{PerFileIgnore, PreviewMode, PythonVersion};
2121
use crate::test::{test_path, test_resource_path};
2222
use crate::{assert_messages, settings};
2323

@@ -88,6 +88,24 @@ mod tests {
8888
Ok(())
8989
}
9090

91+
#[test]
92+
fn preview_confusables() -> Result<()> {
93+
let diagnostics = test_path(
94+
Path::new("ruff/confusables.py"),
95+
&settings::LinterSettings {
96+
preview: PreviewMode::Enabled,
97+
allowed_confusables: FxHashSet::from_iter(['−', 'ρ', '∗']),
98+
..settings::LinterSettings::for_rules(vec![
99+
Rule::AmbiguousUnicodeCharacterString,
100+
Rule::AmbiguousUnicodeCharacterDocstring,
101+
Rule::AmbiguousUnicodeCharacterComment,
102+
])
103+
},
104+
)?;
105+
assert_messages!(diagnostics);
106+
Ok(())
107+
}
108+
91109
#[test]
92110
fn noqa() -> Result<()> {
93111
let diagnostics = test_path(

crates/ruff_linter/src/rules/ruff/rules/ambiguous_unicode_character.rs

Lines changed: 42 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -13,12 +13,20 @@ use crate::rules::ruff::rules::Context;
1313
use crate::settings::LinterSettings;
1414

1515
/// ## What it does
16-
/// Checks for ambiguous unicode characters in strings.
16+
/// Checks for ambiguous Unicode characters in strings.
1717
///
1818
/// ## Why is this bad?
19-
/// The use of ambiguous unicode characters can confuse readers and cause
19+
/// Some Unicode characters are visually similar to ASCII characters, but have
20+
/// different code points. For example, `GREEK CAPITAL LETTER ALPHA` (`U+0391`) is
21+
/// visually similar, but not identical, to the ASCII character `A`.
22+
///
23+
/// The use of ambiguous Unicode characters can confuse readers and cause
2024
/// subtle bugs.
2125
///
26+
/// In [preview], this rule will also flag Unicode characters that are
27+
/// confusable with other, non-preferred Unicode characters. For example, the
28+
/// spec recommends `GREEK CAPITAL LETTER OMEGA` over `OHM SIGN`.
29+
///
2230
/// ## Example
2331
/// ```python
2432
/// print("Ηello, world!") # "Η" is the Greek eta (`U+0397`).
@@ -28,6 +36,8 @@ use crate::settings::LinterSettings;
2836
/// ```python
2937
/// print("Hello, world!") # "H" is the Latin capital H (`U+0048`).
3038
/// ```
39+
///
40+
/// [preview]: https://docs.astral.sh/ruff/preview/
3141
#[violation]
3242
pub struct AmbiguousUnicodeCharacterString {
3343
confusable: char,
@@ -50,12 +60,20 @@ impl Violation for AmbiguousUnicodeCharacterString {
5060
}
5161

5262
/// ## What it does
53-
/// Checks for ambiguous unicode characters in docstrings.
63+
/// Checks for ambiguous Unicode characters in docstrings.
5464
///
5565
/// ## Why is this bad?
56-
/// The use of ambiguous unicode characters can confuse readers and cause
66+
/// Some Unicode characters are visually similar to ASCII characters, but have
67+
/// different code points. For example, `GREEK CAPITAL LETTER ALPHA` (`U+0391`) is
68+
/// visually similar, but not identical, to the ASCII character `A`.
69+
///
70+
/// The use of ambiguous Unicode characters can confuse readers and cause
5771
/// subtle bugs.
5872
///
73+
/// In [preview], this rule will also flag Unicode characters that are
74+
/// confusable with other, non-preferred Unicode characters. For example, the
75+
/// spec recommends `GREEK CAPITAL LETTER OMEGA` over `OHM SIGN`.
76+
///
5977
/// ## Example
6078
/// ```python
6179
/// """A lovely docstring (with a `U+FF09` parenthesis)."""
@@ -65,6 +83,8 @@ impl Violation for AmbiguousUnicodeCharacterString {
6583
/// ```python
6684
/// """A lovely docstring (with no strange parentheses)."""
6785
/// ```
86+
///
87+
/// [preview]: https://docs.astral.sh/ruff/preview/
6888
#[violation]
6989
pub struct AmbiguousUnicodeCharacterDocstring {
7090
confusable: char,
@@ -87,12 +107,20 @@ impl Violation for AmbiguousUnicodeCharacterDocstring {
87107
}
88108

89109
/// ## What it does
90-
/// Checks for ambiguous unicode characters in comments.
110+
/// Checks for ambiguous Unicode characters in comments.
91111
///
92112
/// ## Why is this bad?
93-
/// The use of ambiguous unicode characters can confuse readers and cause
113+
/// Some Unicode characters are visually similar to ASCII characters, but have
114+
/// different code points. For example, `GREEK CAPITAL LETTER ALPHA` (`U+0391`) is
115+
/// visually similar, but not identical, to the ASCII character `A`.
116+
///
117+
/// The use of ambiguous Unicode characters can confuse readers and cause
94118
/// subtle bugs.
95119
///
120+
/// In [preview], this rule will also flag Unicode characters that are
121+
/// confusable with other, non-preferred Unicode characters. For example, the
122+
/// spec recommends `GREEK CAPITAL LETTER OMEGA` over `OHM SIGN`.
123+
///
96124
/// ## Example
97125
/// ```python
98126
/// foo() # nоqa # "о" is Cyrillic (`U+043E`)
@@ -102,6 +130,8 @@ impl Violation for AmbiguousUnicodeCharacterDocstring {
102130
/// ```python
103131
/// foo() # noqa # "o" is Latin (`U+006F`)
104132
/// ```
133+
///
134+
/// [preview]: https://docs.astral.sh/ruff/preview/
105135
#[violation]
106136
pub struct AmbiguousUnicodeCharacterComment {
107137
confusable: char,
@@ -159,7 +189,9 @@ pub(crate) fn ambiguous_unicode_character(
159189
// Check if the boundary character is itself an ambiguous unicode character, in which
160190
// case, it's always included as a diagnostic.
161191
if !current_char.is_ascii() {
162-
if let Some(representant) = confusable(current_char as u32) {
192+
if let Some(representant) = confusable(current_char as u32)
193+
.filter(|representant| settings.preview.is_enabled() || representant.is_ascii())
194+
{
163195
let candidate = Candidate::new(
164196
TextSize::try_from(relative_offset).unwrap() + range.start(),
165197
current_char,
@@ -173,7 +205,9 @@ pub(crate) fn ambiguous_unicode_character(
173205
} else if current_char.is_ascii() {
174206
// The current word contains at least one ASCII character.
175207
word_flags |= WordFlags::ASCII;
176-
} else if let Some(representant) = confusable(current_char as u32) {
208+
} else if let Some(representant) = confusable(current_char as u32)
209+
.filter(|representant| settings.preview.is_enabled() || representant.is_ascii())
210+
{
177211
// The current word contains an ambiguous unicode character.
178212
word_candidates.push(Candidate::new(
179213
TextSize::try_from(relative_offset).unwrap() + range.start(),

crates/ruff_linter/src/rules/ruff/snapshots/ruff_linter__rules__ruff__tests__confusables.snap

Lines changed: 0 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -155,10 +155,4 @@ confusables.py:46:62: RUF003 Comment contains ambiguous `᜵` (PHILIPPINE SINGLE
155155
47 | }"
156156
|
157157

158-
confusables.py:55:28: RUF001 String contains ambiguous `µ` (MICRO SIGN). Did you mean `μ` (GREEK SMALL LETTER MU)?
159-
|
160-
55 | assert getattr(Labware(), "µL") == 1.5
161-
| ^ RUF001
162-
|
163-
164158

Lines changed: 164 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,164 @@
1+
---
2+
source: crates/ruff_linter/src/rules/ruff/mod.rs
3+
---
4+
confusables.py:1:6: RUF001 String contains ambiguous `𝐁` (MATHEMATICAL BOLD CAPITAL B). Did you mean `B` (LATIN CAPITAL LETTER B)?
5+
|
6+
1 | x = "𝐁ad string"
7+
| ^ RUF001
8+
2 | y = ""
9+
|
10+
11+
confusables.py:6:56: RUF002 Docstring contains ambiguous `）` (FULLWIDTH RIGHT PARENTHESIS). Did you mean `)` (RIGHT PARENTHESIS)?
12+
|
13+
5 | def f():
14+
6 | """Here's a docstring with an unusual parenthesis: )"""
15+
| ^^ RUF002
16+
7 | # And here's a comment with an unusual punctuation mark:
17+
8 | ...
18+
|
19+
20+
confusables.py:7:62: RUF003 Comment contains ambiguous `᜵` (PHILIPPINE SINGLE PUNCTUATION). Did you mean `/` (SOLIDUS)?
21+
|
22+
5 | def f():
23+
6 | """Here's a docstring with an unusual parenthesis: )"""
24+
7 | # And here's a comment with an unusual punctuation mark:
25+
| ^ RUF003
26+
8 | ...
27+
|
28+
29+
confusables.py:17:6: RUF001 String contains ambiguous `𝐁` (MATHEMATICAL BOLD CAPITAL B). Did you mean `B` (LATIN CAPITAL LETTER B)?
30+
|
31+
17 | x = "𝐁ad string"
32+
| ^ RUF001
33+
18 | x = ""
34+
|
35+
36+
confusables.py:26:10: RUF001 String contains ambiguous `α` (GREEK SMALL LETTER ALPHA). Did you mean `a` (LATIN SMALL LETTER A)?
37+
|
38+
24 | # The first word should be ignored, while the second should be included, since it
39+
25 | # contains ASCII.
40+
26 | x = "βα Bαd"
41+
| ^ RUF001
42+
27 |
43+
28 | # The two characters should be flagged here. The first character is a "word"
44+
|
45+
46+
confusables.py:31:6: RUF001 String contains ambiguous `Р` (CYRILLIC CAPITAL LETTER ER). Did you mean `P` (LATIN CAPITAL LETTER P)?
47+
|
48+
29 | # consisting of a single ambiguous character, while the second character is a "word
49+
30 | # boundary" (whitespace) that it itself ambiguous.
50+
31 | x = "Р усский"
51+
| ^ RUF001
52+
32 |
53+
33 | # Same test cases as above but using f-strings instead:
54+
|
55+
56+
confusables.py:31:7: RUF001 String contains ambiguous ` ` (EN QUAD). Did you mean ` ` (SPACE)?
57+
|
58+
29 | # consisting of a single ambiguous character, while the second character is a "word
59+
30 | # boundary" (whitespace) that it itself ambiguous.
60+
31 | x = "Р усский"
61+
| ^ RUF001
62+
32 |
63+
33 | # Same test cases as above but using f-strings instead:
64+
|
65+
66+
confusables.py:34:7: RUF001 String contains ambiguous `𝐁` (MATHEMATICAL BOLD CAPITAL B). Did you mean `B` (LATIN CAPITAL LETTER B)?
67+
|
68+
33 | # Same test cases as above but using f-strings instead:
69+
34 | x = f"𝐁ad string"
70+
| ^ RUF001
71+
35 | x = f""
72+
36 | x = f"Русский"
73+
|
74+
75+
confusables.py:37:11: RUF001 String contains ambiguous `α` (GREEK SMALL LETTER ALPHA). Did you mean `a` (LATIN SMALL LETTER A)?
76+
|
77+
35 | x = f""
78+
36 | x = f"Русский"
79+
37 | x = f"βα Bαd"
80+
| ^ RUF001
81+
38 | x = f"Р усский"
82+
|
83+
84+
confusables.py:38:7: RUF001 String contains ambiguous `Р` (CYRILLIC CAPITAL LETTER ER). Did you mean `P` (LATIN CAPITAL LETTER P)?
85+
|
86+
36 | x = f"Русский"
87+
37 | x = f"βα Bαd"
88+
38 | x = f"Р усский"
89+
| ^ RUF001
90+
39 |
91+
40 | # Nested f-strings
92+
|
93+
94+
confusables.py:38:8: RUF001 String contains ambiguous ` ` (EN QUAD). Did you mean ` ` (SPACE)?
95+
|
96+
36 | x = f"Русский"
97+
37 | x = f"βα Bαd"
98+
38 | x = f"Р усский"
99+
| ^ RUF001
100+
39 |
101+
40 | # Nested f-strings
102+
|
103+
104+
confusables.py:41:7: RUF001 String contains ambiguous `𝐁` (MATHEMATICAL BOLD CAPITAL B). Did you mean `B` (LATIN CAPITAL LETTER B)?
105+
|
106+
40 | # Nested f-strings
107+
41 | x = f"𝐁ad string {f" {f"Р усский"}"}"
108+
| ^ RUF001
109+
42 |
110+
43 | # Comments inside f-strings
111+
|
112+
113+
confusables.py:41:21: RUF001 String contains ambiguous ` ` (EN QUAD). Did you mean ` ` (SPACE)?
114+
|
115+
40 | # Nested f-strings
116+
41 | x = f"𝐁ad string {f" {f"Р усский"}"}"
117+
| ^ RUF001
118+
42 |
119+
43 | # Comments inside f-strings
120+
|
121+
122+
confusables.py:41:25: RUF001 String contains ambiguous `Р` (CYRILLIC CAPITAL LETTER ER). Did you mean `P` (LATIN CAPITAL LETTER P)?
123+
|
124+
40 | # Nested f-strings
125+
41 | x = f"𝐁ad string {f" {f"Р усский"}"}"
126+
| ^ RUF001
127+
42 |
128+
43 | # Comments inside f-strings
129+
|
130+
131+
confusables.py:41:26: RUF001 String contains ambiguous ` ` (EN QUAD). Did you mean ` ` (SPACE)?
132+
|
133+
40 | # Nested f-strings
134+
41 | x = f"𝐁ad string {f" {f"Р усский"}"}"
135+
| ^ RUF001
136+
42 |
137+
43 | # Comments inside f-strings
138+
|
139+
140+
confusables.py:44:68: RUF003 Comment contains ambiguous `）` (FULLWIDTH RIGHT PARENTHESIS). Did you mean `)` (RIGHT PARENTHESIS)?
141+
|
142+
43 | # Comments inside f-strings
143+
44 | x = f"string { # And here's a comment with an unusual parenthesis:
144+
| ^^ RUF003
145+
45 | # And here's a comment with a greek alpha:
146+
46 | foo # And here's a comment with an unusual punctuation mark:
147+
|
148+
149+
confusables.py:46:62: RUF003 Comment contains ambiguous `᜵` (PHILIPPINE SINGLE PUNCTUATION). Did you mean `/` (SOLIDUS)?
150+
|
151+
44 | x = f"string { # And here's a comment with an unusual parenthesis:
152+
45 | # And here's a comment with a greek alpha:
153+
46 | foo # And here's a comment with an unusual punctuation mark:
154+
| ^ RUF003
155+
47 | }"
156+
|
157+
158+
confusables.py:55:28: RUF001 String contains ambiguous `µ` (MICRO SIGN). Did you mean `μ` (GREEK SMALL LETTER MU)?
159+
|
160+
55 | assert getattr(Labware(), "µL") == 1.5
161+
| ^ RUF001
162+
|
163+
164+

0 commit comments

Comments
 (0)