astral-sh · charliermarsh · Nov 2, 2024 · Oct 30, 2024 · Oct 31, 2024 · Oct 31, 2024
diff --git a/crates/ruff_linter/resources/test/fixtures/flake8_simplify/SIM905.py b/crates/ruff_linter/resources/test/fixtures/flake8_simplify/SIM905.py
@@ -0,0 +1,99 @@
+# setup
+sep = ","
+no_sep = None
+
+# positives
+"""
+	itemA
+	itemB
+	itemC
+""".split()
+
+"a,b,c,d".split(",")
+"a,b,c,d".split(None)
+"a,b,c,d".split(",", 1)
+"a,b,c,d".split(None, 1)
+"a,b,c,d".split(sep=",")
+"a,b,c,d".split(sep=None)
+"a,b,c,d".split(sep=",", maxsplit=1)
+"a,b,c,d".split(sep=None, maxsplit=1)
+"a,b,c,d".split(maxsplit=1, sep=",")
+"a,b,c,d".split(maxsplit=1, sep=None)
+"a,b,c,d".split(",", maxsplit=1)
+"a,b,c,d".split(None, maxsplit=1)
+"a,b,c,d".split(maxsplit=1)
+"a,b,c,d".split(maxsplit=1.0)
+"a,b,c,d".split(maxsplit=1)
+"a,b,c,d".split(maxsplit=0)
+"VERB AUX PRON ADP DET".split(" ")
+'   1   2   3   '.split()
+'1<>2<>3<4'.split('<>')
+
+" a*a a*a a ".split("*", -1)  # [' a', 'a a', 'a a ']
+"".split()  # []
+""" 	
+""".split()  # []
+"   	".split()  # []
+"/abc/".split() # ['/abc/']
+("a,b,c"
+# comment
+.split()
+)  # ['a,b,c']
+("a,b,c"
+# comment
+.split(",")
+) # ['a', 'b', 'c']
+("a,"
+# comment
+"b,"
+"c"
+.split(",")
+) # ['a', 'b', 'c']
+
+"hello "\
+	"world".split()
+# ['hello', 'world']
+
+# prefixes and isc
+u"a b".split()  # ['a', 'b']
+r"a \n b".split()  # ['a', '\\n', 'b']
+("a " "b").split()  # ['a', 'b']
+"a " "b".split()  # ['a', 'b']
+u"a " "b".split()  # ['a', 'b']
+"a " u"b".split()  # ['a', 'b']
+u"a " r"\n".split()  # ['a', '\\n']
+r"\n " u"\n".split()  # ['\\n']
+r"\n " "\n".split()  # ['\\n']
+"a " r"\n".split()  # ['a', '\\n']
+
+"a,b,c".split(',', maxsplit=0) # ['a,b,c']
+"a,b,c".split(',', maxsplit=-1)  # ['a', 'b', 'c']
+"a,b,c".split(',', maxsplit=-2)  # ['a', 'b', 'c']
+"a,b,c".split(',', maxsplit=-0)  # ['a,b,c']
+
+# negatives
+
+# invalid values should not cause panic 
+"a,b,c,d".split(maxsplit="hello")
+"a,b,c,d".split(maxsplit=-"hello")
+
+# variable names not implemented
+"a,b,c,d".split(sep)
+"a,b,c,d".split(no_sep)
+for n in range(3):
+	"a,b,c,d".split(",", maxsplit=n)
+
+# f-strings not yet implemented
+world = "world"
+_ = f"{world}_hello_world".split("_")
+
+hello = "hello_world"
+_ = f"{hello}_world".split("_")
+
+# split on bytes not yet implemented, much less frequent
+b"TesT.WwW.ExamplE.CoM".split(b".")
+
+# str.splitlines not yet implemented
+"hello\nworld".splitlines()
+"hello\nworld".splitlines(keepends=True)
+"hello\nworld".splitlines(keepends=False)
diff --git a/crates/ruff_linter/src/checkers/ast/analyze/expression.rs b/crates/ruff_linter/src/checkers/ast/analyze/expression.rs
@@ -384,6 +384,8 @@ pub(crate) fn expression(expr: &Expr, checker: &mut Checker) {
                 Rule::StaticJoinToFString,
                 // refurb
                 Rule::HashlibDigestHex,
+                // flake8-simplify
+                Rule::SplitStaticString,
             ]) {
                 if let Expr::Attribute(ast::ExprAttribute { value, attr, .. }) = func.as_ref() {
                     let attr = attr.as_str();
@@ -401,6 +403,16 @@ pub(crate) fn expression(expr: &Expr, checker: &mut Checker) {
                                     string_value.to_str(),
                                 );
                             }
+                        } else if matches!(attr, "split" | "rsplit") {
+                            // "...".split(...) call
+                            if checker.enabled(Rule::SplitStaticString) {
+                                flake8_simplify::rules::split_static_string(
+                                    checker,
+                                    attr,
+                                    call,
+                                    string_value.to_str(),
+                                );
+                            }
                         } else if attr == "format" {
                             // "...".format(...) call
                             let location = expr.range();

diff --git a/crates/ruff_linter/src/codes.rs b/crates/ruff_linter/src/codes.rs
@@ -480,6 +480,7 @@ pub fn code_to_rule(linter: Linter, code: &str) -> Option<(RuleGroup, Rule)> {
         (Flake8Simplify, "223") => (RuleGroup::Stable, rules::flake8_simplify::rules::ExprAndFalse),
         (Flake8Simplify, "300") => (RuleGroup::Stable, rules::flake8_simplify::rules::YodaConditions),
         (Flake8Simplify, "401") => (RuleGroup::Stable, rules::flake8_simplify::rules::IfElseBlockInsteadOfDictGet),
+        (Flake8Simplify, "905") => (RuleGroup::Preview, rules::flake8_simplify::rules::SplitStaticString),
         (Flake8Simplify, "910") => (RuleGroup::Stable, rules::flake8_simplify::rules::DictGetWithNoneDefault),
         (Flake8Simplify, "911") => (RuleGroup::Stable, rules::flake8_simplify::rules::ZipDictKeysAndValues),
 

diff --git a/crates/ruff_linter/src/rules/flake8_simplify/mod.rs b/crates/ruff_linter/src/rules/flake8_simplify/mod.rs
@@ -29,7 +29,9 @@ mod tests {
     #[test_case(Rule::ReimplementedBuiltin, Path::new("SIM111.py"))]
     #[test_case(Rule::UncapitalizedEnvironmentVariables, Path::new("SIM112.py"))]
     #[test_case(Rule::EnumerateForLoop, Path::new("SIM113.py"))]
+    #[test_case(Rule::IfWithSameArms, Path::new("SIM114.py"))]
     #[test_case(Rule::OpenFileWithContextHandler, Path::new("SIM115.py"))]
+    #[test_case(Rule::IfElseBlockInsteadOfDictLookup, Path::new("SIM116.py"))]
     #[test_case(Rule::MultipleWithStatements, Path::new("SIM117.py"))]
     #[test_case(Rule::InDictKeys, Path::new("SIM118.py"))]
     #[test_case(Rule::NegateEqualOp, Path::new("SIM201.py"))]
@@ -44,9 +46,8 @@ mod tests {
     #[test_case(Rule::ExprAndFalse, Path::new("SIM223.py"))]
     #[test_case(Rule::YodaConditions, Path::new("SIM300.py"))]
     #[test_case(Rule::IfElseBlockInsteadOfDictGet, Path::new("SIM401.py"))]
+    #[test_case(Rule::SplitStaticString, Path::new("SIM905.py"))]
     #[test_case(Rule::DictGetWithNoneDefault, Path::new("SIM910.py"))]
-    #[test_case(Rule::IfElseBlockInsteadOfDictLookup, Path::new("SIM116.py"))]
-    #[test_case(Rule::IfWithSameArms, Path::new("SIM114.py"))]
     fn rules(rule_code: Rule, path: &Path) -> Result<()> {
         let snapshot = format!("{}_{}", rule_code.noqa_code(), path.to_string_lossy());
         let diagnostics = test_path(

diff --git a/crates/ruff_linter/src/rules/flake8_simplify/rules/mod.rs b/crates/ruff_linter/src/rules/flake8_simplify/rules/mod.rs
@@ -14,6 +14,7 @@ pub(crate) use needless_bool::*;
 pub(crate) use open_file_with_context_handler::*;
 pub(crate) use reimplemented_builtin::*;
 pub(crate) use return_in_try_except_finally::*;
+pub(crate) use split_static_string::*;
 pub(crate) use suppressible_exception::*;
 pub(crate) use yoda_conditions::*;
 pub(crate) use zip_dict_keys_and_values::*;
@@ -35,6 +36,7 @@ mod needless_bool;
 mod open_file_with_context_handler;
 mod reimplemented_builtin;
 mod return_in_try_except_finally;
+mod split_static_string;
 mod suppressible_exception;
 mod yoda_conditions;
 mod zip_dict_keys_and_values;
diff --git a/crates/ruff_linter/src/rules/flake8_simplify/rules/split_static_string.rs b/crates/ruff_linter/src/rules/flake8_simplify/rules/split_static_string.rs
@@ -0,0 +1,198 @@
+use std::cmp::Ordering;
+
+use ruff_diagnostics::{Diagnostic, Edit, Fix, FixAvailability, Violation};
+use ruff_macros::{derive_message_formats, violation};
+use ruff_python_ast::{
+    Expr, ExprCall, ExprContext, ExprList, ExprStringLiteral, ExprUnaryOp, StringLiteral,
+    StringLiteralFlags, StringLiteralValue, UnaryOp,
+};
+use ruff_text_size::{Ranged, TextRange};
+
+use crate::checkers::ast::Checker;
+
+/// ## What it does
+/// Checks for `str.split` calls that can be replaced with a list literal.
+///
+/// ## Why is this bad?
+/// List literals are more readable and do not require the overhead of calling `str.split`.
+///
+/// ## Example
+/// ```python
+/// "a,b,c,d".split(",")
+/// ```
+///
+/// Use instead:
+/// ```python
+/// ["a", "b", "c", "d"]
+/// ```
+///
+/// ## References
+///
+/// - [Python documentation: `str.split`](https://docs.python.org/3/library/stdtypes.html#str.split)
+///
+/// ```
+#[violation]
+pub struct SplitStaticString;
+
+impl Violation for SplitStaticString {
+    const FIX_AVAILABILITY: FixAvailability = FixAvailability::Sometimes;
+
+    #[derive_message_formats]
+    fn message(&self) -> String {
+        format!("Consider using a list instead of `str.split`")
+    }
+
+    fn fix_title(&self) -> Option<String> {
+        Some(format!("Replace `str.split` with list literal"))
+    }
+}
+
+fn construct_replacement(list_items: &[&str]) -> Expr {
+    Expr::List(ExprList {
+        elts: list_items
+            .iter()
+            .map(|list_item| {
+                Expr::StringLiteral(ExprStringLiteral {
+                    value: StringLiteralValue::single(StringLiteral {
+                        value: (*list_item).to_string().into_boxed_str(),
+                        range: TextRange::default(),
+                        flags: StringLiteralFlags::default(),
+                    }),
+                    range: TextRange::default(),
+                })
+            })
+            .collect(),
+        ctx: ExprContext::Load,
+        range: TextRange::default(),
+    })
+}
+
+fn split_default(str_value: &str, max_split: i32) -> Option<Expr> {
+    // From the Python documentation:
+    // > If sep is not specified or is None, a different splitting algorithm is applied: runs of
+    // > consecutive whitespace are regarded as a single separator, and the result will contain
+    // > no empty strings at the start or end if the string has leading or trailing whitespace.
+    // > Consequently, splitting an empty string or a string consisting of just whitespace with
+    // > a None separator returns [].
+    // https://docs.python.org/3/library/stdtypes.html#str.split
+    match max_split.cmp(&0) {
+        Ordering::Greater => {
+            // Autofix for maxsplit without separator not yet implemented
+            // split_whitespace().remainder() is still experimental:
+            // https://doc.rust-lang.org/std/str/struct.SplitWhitespace.html#method.remainder
+            None
+        }
+        Ordering::Equal => {
+            let list_items: Vec<&str> = vec![str_value];
+            Some(construct_replacement(&list_items))
+        }
+        Ordering::Less => {
+            let list_items: Vec<&str> = str_value.split_whitespace().collect();
+            Some(construct_replacement(&list_items))
+        }
+    }
+}
+
+fn split_sep(str_value: &str, sep_value: &str, max_split: i32, direction_left: bool) -> Expr {
+    let list_items: Vec<&str> = if let Ok(split_n) = usize::try_from(max_split) {
+        if direction_left {
+            str_value.splitn(split_n + 1, sep_value).collect()
+        } else {
+            str_value.rsplitn(split_n + 1, sep_value).collect()
+        }
+    } else {
+        if direction_left {
+            str_value.split(sep_value).collect()
+        } else {
+            str_value.rsplit(sep_value).collect()
+        }
+    };
+
+    construct_replacement(&list_items)
+}
+
+fn get_maxsplit_value(maxsplit_arg: Option<&Expr>) -> Option<i32> {
+    let maxsplit_value = if let Some(maxsplit) = maxsplit_arg {
+        match maxsplit {
+            // Negative number
+            Expr::UnaryOp(ExprUnaryOp {
+                op: UnaryOp::USub,
+                operand,
+                ..
+            }) => {
+                match &**operand {
+                    Expr::NumberLiteral(maxsplit_val) => maxsplit_val
+                        .value
+                        .as_int()
+                        .and_then(ruff_python_ast::Int::as_i32)
+                        .map(|f| -f),
+                    // Ignore when `maxsplit` is not a numeric value
+                    _ => None,
+                }
+            }
+            // Positive number
+            Expr::NumberLiteral(maxsplit_val) => maxsplit_val
+                .value
+                .as_int()
+                .and_then(ruff_python_ast::Int::as_i32),
+            // Ignore when `maxsplit` is not a numeric value
+            _ => None,
+        }
+    } else {
+        // Default value is -1 (no splits)
+        Some(-1)
+    };
+    maxsplit_value
+}
+
+/// SIM905
+pub(crate) fn split_static_string(
+    checker: &mut Checker,
+    attr: &str,
+    call: &ExprCall,
+    str_value: &str,
+) {
+    let ExprCall { arguments, .. } = call;
+
+    let maxsplit_arg = arguments.find_argument("maxsplit", 1);
+    let Some(maxsplit_value) = get_maxsplit_value(maxsplit_arg) else {
+        return;
+    };
+
+    // `split` vs `rsplit`
+    let direction_left = attr == "split";
+
+    let sep_arg = arguments.find_argument("sep", 0);
+    let split_replacement = if let Some(sep) = sep_arg {
+        match sep {
+            Expr::NoneLiteral(_) => split_default(str_value, maxsplit_value),
+            Expr::StringLiteral(sep_value) => {
+                let sep_value_str = sep_value.value.to_str();
+                Some(split_sep(
+                    str_value,
+                    sep_value_str,
+                    maxsplit_value,
+                    direction_left,
+                ))
+            }
+            // Ignore names until type inference is available
+            _ => {
+                return;
+            }
+        }
+    } else {
+        split_default(str_value, maxsplit_value)
+    };
+
+    let mut diagnostic = Diagnostic::new(SplitStaticString, call.range());
+    if let Some(ref replacement_expr) = split_replacement {
+        // Construct replacement list
+        let replacement = checker.generator().expr(replacement_expr);
+        // Unsafe because the fix does not preserve comments within implicit string concatenation
+        diagnostic.set_fix(Fix::unsafe_edit(Edit::range_replacement(
+            replacement,
+            call.range(),
+        )));
+    }
+    checker.diagnostics.push(diagnostic);
+}