From 34df9e5df736191c5ba22f18606b2c81b055d752 Mon Sep 17 00:00:00 2001
From: Simon Hawkins <simonjayhawkins@gmail.com>
Date: Wed, 21 Apr 2021 20:28:52 +0100
Subject: [PATCH 01/15] [ArrowStringArray] implement
 ArrowStringArray._str_split

---
 pandas/core/arrays/string_arrow.py           |  32 +++
 pandas/tests/strings/test_split_partition.py | 277 +++++++++++++------
 2 files changed, 230 insertions(+), 79 deletions(-)

diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py
index dd09ef4e585ce..b0f6d1b7dd1b0 100644
--- a/pandas/core/arrays/string_arrow.py
+++ b/pandas/core/arrays/string_arrow.py
@@ -763,3 +763,35 @@ def _str_lower(self):
 
     def _str_upper(self):
         return type(self)(pc.utf8_upper(self._data))
+
+    def _str_split(self, pat=None, n=-1, expand=False):
+        if pat is None:
+            if hasattr(pc, "utf8_split_whitespace"):
+                if n is None or n == 0:
+                    n = -1
+                result = pc.utf8_split_whitespace(self._data, max_splits=n)
+            else:
+                return super()._str_split(pat=pat, n=n, expand=expand)
+        else:
+            if len(pat) == 1 and hasattr(pc, "split_pattern"):
+                if n is None or n == 0:
+                    n = -1
+                result = pc.split_pattern(self._data, pattern=pat, max_splits=n)
+            else:
+                return super()._str_split(pat=pat, n=n, expand=expand)
+
+        if result.null_count:
+            is_valid = np.array(result.is_valid())
+            result = np.array(result)
+            result[~is_valid] = self.dtype.na_value
+            valid = result[is_valid]
+            # we need to loop through to avoid numpy indexing assignment errors when
+            # the result is not a ragged array and interpreted as a 2 dimensional
+            # array
+            for i, val in enumerate(valid):
+                valid[i] = val.tolist()
+        else:
+            result = np.array(result)
+            for i, val in enumerate(result):
+                result[i] = val.tolist()
+        return result
diff --git a/pandas/tests/strings/test_split_partition.py b/pandas/tests/strings/test_split_partition.py
index 6df8fa805955d..842e9b5ccb2a0 100644
--- a/pandas/tests/strings/test_split_partition.py
+++ b/pandas/tests/strings/test_split_partition.py
@@ -3,6 +3,8 @@
 import numpy as np
 import pytest
 
+import pandas.util._test_decorators as td
+
 import pandas as pd
 from pandas import (
     DataFrame,
@@ -13,15 +15,37 @@
 )
 
 
-def test_split():
-    values = Series(["a_b_c", "c_d_e", np.nan, "f_g_h"])
+@pytest.fixture(
+    params=[
+        "object",
+        "string",
+        pytest.param(
+            "arrow_string", marks=td.skip_if_no("pyarrow", min_version="1.0.0")
+        ),
+    ]
+)
+def any_string_dtype(request):
+    """
+    Parametrized fixture for string dtypes.
+
+    * 'object'
+    * 'string'
+    * 'arrow_string'
+    """
+    from pandas.core.arrays.string_arrow import ArrowStringDtype  # noqa: F401
+
+    return request.param
+
+
+def test_split(any_string_dtype):
+    values = Series(["a_b_c", "c_d_e", np.nan, "f_g_h"], dtype=any_string_dtype)
 
     result = values.str.split("_")
     exp = Series([["a", "b", "c"], ["c", "d", "e"], np.nan, ["f", "g", "h"]])
     tm.assert_series_equal(result, exp)
 
     # more than one char
-    values = Series(["a__b__c", "c__d__e", np.nan, "f__g__h"])
+    values = Series(["a__b__c", "c__d__e", np.nan, "f__g__h"], dtype=any_string_dtype)
     result = values.str.split("__")
     tm.assert_series_equal(result, exp)
 
@@ -29,20 +53,46 @@ def test_split():
     tm.assert_series_equal(result, exp)
 
     # mixed
-    mixed = Series(["a_b_c", np.nan, "d_e_f", True, datetime.today(), None, 1, 2.0])
-    result = mixed.str.split("_")
-    exp = Series(
+    mixed = Series(
         [
-            ["a", "b", "c"],
-            np.nan,
-            ["d", "e", "f"],
-            np.nan,
-            np.nan,
-            np.nan,
-            np.nan,
+            "a_b_c",
             np.nan,
-        ]
+            "d_e_f",
+            True,
+            datetime(2021, 4, 21, 18, 7, 26, 633720),
+            None,
+            1,
+            2.0,
+        ],
+        dtype=any_string_dtype,
     )
+    result = mixed.str.split("_")
+    if any_string_dtype == "object":
+        exp = Series(
+            [
+                ["a", "b", "c"],
+                np.nan,
+                ["d", "e", "f"],
+                np.nan,
+                np.nan,
+                np.nan,
+                np.nan,
+                np.nan,
+            ]
+        )
+    else:
+        exp = Series(
+            [
+                ["a", "b", "c"],
+                pd.NA,
+                ["d", "e", "f"],
+                ["True"],
+                ["2021-04-21 18:07:26.633720"],
+                pd.NA,
+                ["1"],
+                ["2.0"],
+            ]
+        )
     assert isinstance(result, Series)
     tm.assert_almost_equal(result, exp)
 
@@ -51,16 +101,15 @@ def test_split():
     tm.assert_almost_equal(result, exp)
 
     # regex split
-    values = Series(["a,b_c", "c_d,e", np.nan, "f,g,h"])
+    values = Series(["a,b_c", "c_d,e", np.nan, "f,g,h"], dtype=any_string_dtype)
     result = values.str.split("[,_]")
     exp = Series([["a", "b", "c"], ["c", "d", "e"], np.nan, ["f", "g", "h"]])
     tm.assert_series_equal(result, exp)
 
 
-@pytest.mark.parametrize("dtype", [object, "string"])
 @pytest.mark.parametrize("method", ["split", "rsplit"])
-def test_split_n(dtype, method):
-    s = Series(["a b", pd.NA, "b c"], dtype=dtype)
+def test_split_n(any_string_dtype, method):
+    s = Series(["a b", pd.NA, "b c"], dtype=any_string_dtype)
     expected = Series([["a", "b"], pd.NA, ["b", "c"]])
 
     result = getattr(s.str, method)(" ", n=None)
@@ -70,14 +119,14 @@ def test_split_n(dtype, method):
     tm.assert_series_equal(result, expected)
 
 
-def test_rsplit():
-    values = Series(["a_b_c", "c_d_e", np.nan, "f_g_h"])
+def test_rsplit(any_string_dtype):
+    values = Series(["a_b_c", "c_d_e", np.nan, "f_g_h"], dtype=any_string_dtype)
     result = values.str.rsplit("_")
     exp = Series([["a", "b", "c"], ["c", "d", "e"], np.nan, ["f", "g", "h"]])
     tm.assert_series_equal(result, exp)
 
     # more than one char
-    values = Series(["a__b__c", "c__d__e", np.nan, "f__g__h"])
+    values = Series(["a__b__c", "c__d__e", np.nan, "f__g__h"], dtype=any_string_dtype)
     result = values.str.rsplit("__")
     tm.assert_series_equal(result, exp)
 
@@ -85,20 +134,46 @@ def test_rsplit():
     tm.assert_series_equal(result, exp)
 
     # mixed
-    mixed = Series(["a_b_c", np.nan, "d_e_f", True, datetime.today(), None, 1, 2.0])
-    result = mixed.str.rsplit("_")
-    exp = Series(
+    mixed = Series(
         [
-            ["a", "b", "c"],
-            np.nan,
-            ["d", "e", "f"],
-            np.nan,
-            np.nan,
+            "a_b_c",
             np.nan,
-            np.nan,
-            np.nan,
-        ]
+            "d_e_f",
+            True,
+            datetime(2021, 4, 21, 18, 7, 26, 633720),
+            None,
+            1,
+            2.0,
+        ],
+        dtype=any_string_dtype,
     )
+    result = mixed.str.rsplit("_")
+    if any_string_dtype == "object":
+        exp = Series(
+            [
+                ["a", "b", "c"],
+                np.nan,
+                ["d", "e", "f"],
+                np.nan,
+                np.nan,
+                np.nan,
+                np.nan,
+                np.nan,
+            ]
+        )
+    else:
+        exp = Series(
+            [
+                ["a", "b", "c"],
+                pd.NA,
+                ["d", "e", "f"],
+                ["True"],
+                ["2021-04-21 18:07:26.633720"],
+                pd.NA,
+                ["1"],
+                ["2.0"],
+            ]
+        )
     assert isinstance(result, Series)
     tm.assert_almost_equal(result, exp)
 
@@ -107,26 +182,31 @@ def test_rsplit():
     tm.assert_almost_equal(result, exp)
 
     # regex split is not supported by rsplit
-    values = Series(["a,b_c", "c_d,e", np.nan, "f,g,h"])
+    values = Series(["a,b_c", "c_d,e", np.nan, "f,g,h"], dtype=any_string_dtype)
     result = values.str.rsplit("[,_]")
     exp = Series([["a,b_c"], ["c_d,e"], np.nan, ["f,g,h"]])
     tm.assert_series_equal(result, exp)
 
     # setting max number of splits, make sure it's from reverse
-    values = Series(["a_b_c", "c_d_e", np.nan, "f_g_h"])
+    values = Series(["a_b_c", "c_d_e", np.nan, "f_g_h"], dtype=any_string_dtype)
     result = values.str.rsplit("_", n=1)
     exp = Series([["a_b", "c"], ["c_d", "e"], np.nan, ["f_g", "h"]])
     tm.assert_series_equal(result, exp)
 
 
-def test_split_blank_string():
+def test_split_blank_string(any_string_dtype, request):
+    if any_string_dtype == "arrow_string":
+        reason = "AssertionError: DataFrame are different"
+        mark = pytest.mark.xfail(reason=reason, raises=AssertionError)
+        request.node.add_marker(mark)
+
     # expand blank split GH 20067
-    values = Series([""], name="test")
+    values = Series([""], name="test", dtype=any_string_dtype)
     result = values.str.split(expand=True)
-    exp = DataFrame([[]])  # NOTE: this is NOT an empty DataFrame
+    exp = DataFrame([[]], dtype=any_string_dtype)  # NOTE: this is NOT an empty df
     tm.assert_frame_equal(result, exp)
 
-    values = Series(["a b c", "a b", "", " "], name="test")
+    values = Series(["a b c", "a b", "", " "], name="test", dtype=any_string_dtype)
     result = values.str.split(expand=True)
     exp = DataFrame(
         [
@@ -134,14 +214,15 @@ def test_split_blank_string():
             ["a", "b", np.nan],
             [np.nan, np.nan, np.nan],
             [np.nan, np.nan, np.nan],
-        ]
+        ],
+        dtype=any_string_dtype,
     )
     tm.assert_frame_equal(result, exp)
 
 
-def test_split_noargs():
+def test_split_noargs(any_string_dtype):
     # #1859
-    s = Series(["Wes McKinney", "Travis  Oliphant"])
+    s = Series(["Wes McKinney", "Travis  Oliphant"], dtype=any_string_dtype)
     result = s.str.split()
     expected = ["Travis", "Oliphant"]
     assert result[1] == expected
@@ -149,44 +230,64 @@ def test_split_noargs():
     assert result[1] == expected
 
 
-def test_split_maxsplit():
+@pytest.mark.parametrize(
+    "data, pat",
+    [
+        (["bd asdf jfg", "kjasdflqw asdfnfk"], None),
+        (["bd asdf jfg", "kjasdflqw asdfnfk"], "asdf"),
+        (["bd_asdf_jfg", "kjasdflqw_asdfnfk"], "_"),
+    ],
+)
+def test_split_maxsplit(data, pat, any_string_dtype):
     # re.split 0, str.split -1
-    s = Series(["bd asdf jfg", "kjasdflqw asdfnfk"])
+    s = Series(data, dtype=any_string_dtype)
 
-    result = s.str.split(n=-1)
-    xp = s.str.split()
+    result = s.str.split(pat=pat, n=-1)
+    xp = s.str.split(pat=pat)
     tm.assert_series_equal(result, xp)
 
-    result = s.str.split(n=0)
-    tm.assert_series_equal(result, xp)
-
-    xp = s.str.split("asdf")
-    result = s.str.split("asdf", n=0)
-    tm.assert_series_equal(result, xp)
-
-    result = s.str.split("asdf", n=-1)
+    result = s.str.split(pat=pat, n=0)
     tm.assert_series_equal(result, xp)
 
 
-def test_split_no_pat_with_nonzero_n():
-    s = Series(["split once", "split once too!"])
-    result = s.str.split(n=1)
-    expected = Series({0: ["split", "once"], 1: ["split", "once too!"]})
+@pytest.mark.parametrize(
+    "data, pat, expected",
+    [
+        (
+            ["split once", "split once too!"],
+            None,
+            Series({0: ["split", "once"], 1: ["split", "once too!"]}),
+        ),
+        (
+            ["split_once", "split_once_too!"],
+            "_",
+            Series({0: ["split", "once"], 1: ["split", "once_too!"]}),
+        ),
+    ],
+)
+def test_split_no_pat_with_nonzero_n(data, pat, expected, any_string_dtype):
+    s = Series(data, dtype=any_string_dtype)
+    result = s.str.split(pat=pat, n=1)
     tm.assert_series_equal(expected, result, check_index_type=False)
 
 
-def test_split_to_dataframe():
-    s = Series(["nosplit", "alsonosplit"])
+def test_split_to_dataframe(any_string_dtype):
+    s = Series(["nosplit", "alsonosplit"], dtype=any_string_dtype)
     result = s.str.split("_", expand=True)
-    exp = DataFrame({0: Series(["nosplit", "alsonosplit"])})
+    exp = DataFrame({0: Series(["nosplit", "alsonosplit"], dtype=any_string_dtype)})
     tm.assert_frame_equal(result, exp)
 
-    s = Series(["some_equal_splits", "with_no_nans"])
+    s = Series(["some_equal_splits", "with_no_nans"], dtype=any_string_dtype)
     result = s.str.split("_", expand=True)
-    exp = DataFrame({0: ["some", "with"], 1: ["equal", "no"], 2: ["splits", "nans"]})
+    exp = DataFrame(
+        {0: ["some", "with"], 1: ["equal", "no"], 2: ["splits", "nans"]},
+        dtype=any_string_dtype,
+    )
     tm.assert_frame_equal(result, exp)
 
-    s = Series(["some_unequal_splits", "one_of_these_things_is_not"])
+    s = Series(
+        ["some_unequal_splits", "one_of_these_things_is_not"], dtype=any_string_dtype
+    )
     result = s.str.split("_", expand=True)
     exp = DataFrame(
         {
@@ -196,14 +297,19 @@ def test_split_to_dataframe():
             3: [np.nan, "things"],
             4: [np.nan, "is"],
             5: [np.nan, "not"],
-        }
+        },
+        dtype=any_string_dtype,
     )
     tm.assert_frame_equal(result, exp)
 
-    s = Series(["some_splits", "with_index"], index=["preserve", "me"])
+    s = Series(
+        ["some_splits", "with_index"], index=["preserve", "me"], dtype=any_string_dtype
+    )
     result = s.str.split("_", expand=True)
     exp = DataFrame(
-        {0: ["some", "with"], 1: ["splits", "index"]}, index=["preserve", "me"]
+        {0: ["some", "with"], 1: ["splits", "index"]},
+        index=["preserve", "me"],
+        dtype=any_string_dtype,
     )
     tm.assert_frame_equal(result, exp)
 
@@ -250,15 +356,23 @@ def test_split_to_multiindex_expand():
         idx.str.split("_", expand="not_a_boolean")
 
 
-def test_rsplit_to_dataframe_expand():
-    s = Series(["nosplit", "alsonosplit"])
+def test_rsplit_to_dataframe_expand(any_string_dtype, request):
+    if any_string_dtype != "object":
+        reason = 'Attribute "dtype" are different'
+        mark = pytest.mark.xfail(reason=reason, raises=AssertionError)
+        request.node.add_marker(mark)
+
+    s = Series(["nosplit", "alsonosplit"], dtype=any_string_dtype)
     result = s.str.rsplit("_", expand=True)
-    exp = DataFrame({0: Series(["nosplit", "alsonosplit"])})
+    exp = DataFrame({0: Series(["nosplit", "alsonosplit"])}, dtype=any_string_dtype)
     tm.assert_frame_equal(result, exp)
 
-    s = Series(["some_equal_splits", "with_no_nans"])
+    s = Series(["some_equal_splits", "with_no_nans"], dtype=any_string_dtype)
     result = s.str.rsplit("_", expand=True)
-    exp = DataFrame({0: ["some", "with"], 1: ["equal", "no"], 2: ["splits", "nans"]})
+    exp = DataFrame(
+        {0: ["some", "with"], 1: ["equal", "no"], 2: ["splits", "nans"]},
+        dtype=any_string_dtype,
+    )
     tm.assert_frame_equal(result, exp)
 
     result = s.str.rsplit("_", expand=True, n=2)
@@ -297,30 +411,35 @@ def test_rsplit_to_multiindex_expand():
     assert result.nlevels == 2
 
 
-def test_split_nan_expand():
+def test_split_nan_expand(any_string_dtype):
     # gh-18450
-    s = Series(["foo,bar,baz", np.nan])
+    s = Series(["foo,bar,baz", np.nan], dtype=any_string_dtype)
     result = s.str.split(",", expand=True)
-    exp = DataFrame([["foo", "bar", "baz"], [np.nan, np.nan, np.nan]])
+    exp = DataFrame(
+        [["foo", "bar", "baz"], [np.nan, np.nan, np.nan]], dtype=any_string_dtype
+    )
     tm.assert_frame_equal(result, exp)
 
-    # check that these are actually np.nan and not None
+    # check that these are actually np.nan/pd.NA and not None
     # TODO see GH 18463
     # tm.assert_frame_equal does not differentiate
-    assert all(np.isnan(x) for x in result.iloc[1])
+    if any_string_dtype == "object":
+        assert all(np.isnan(x) for x in result.iloc[1])
+    else:
+        assert all(x is pd.NA for x in result.iloc[1])
 
 
-def test_split_with_name():
+def test_split_with_name(any_string_dtype):
     # GH 12617
 
     # should preserve name
-    s = Series(["a,b", "c,d"], name="xxx")
+    s = Series(["a,b", "c,d"], name="xxx", dtype=any_string_dtype)
     res = s.str.split(",")
     exp = Series([["a", "b"], ["c", "d"]], name="xxx")
     tm.assert_series_equal(res, exp)
 
     res = s.str.split(",", expand=True)
-    exp = DataFrame([["a", "b"], ["c", "d"]])
+    exp = DataFrame([["a", "b"], ["c", "d"]], dtype=any_string_dtype)
     tm.assert_frame_equal(res, exp)
 
     idx = Index(["a,b", "c,d"], name="xxx")

From 427eff71b4d12b20ab8b97c30fd143c1cea7d668 Mon Sep 17 00:00:00 2001
From: Simon Hawkins <simonjayhawkins@gmail.com>
Date: Thu, 29 Apr 2021 12:50:23 +0100
Subject: [PATCH 02/15] move fixture to conftest.py

---
 pandas/tests/strings/conftest.py             | 24 ++++++++++++++++++++
 pandas/tests/strings/test_split_partition.py | 24 --------------------
 2 files changed, 24 insertions(+), 24 deletions(-)

diff --git a/pandas/tests/strings/conftest.py b/pandas/tests/strings/conftest.py
index 4fedbee91f649..379036932c580 100644
--- a/pandas/tests/strings/conftest.py
+++ b/pandas/tests/strings/conftest.py
@@ -1,6 +1,8 @@
 import numpy as np
 import pytest
 
+import pandas.util._test_decorators as td
+
 from pandas import Series
 from pandas.core import strings as strings
 
@@ -173,3 +175,25 @@ def any_allowed_skipna_inferred_dtype(request):
 
     # correctness of inference tested in tests/dtypes/test_inference.py
     return inferred_dtype, values
+
+
+@pytest.fixture(
+    params=[
+        "object",
+        "string",
+        pytest.param(
+            "arrow_string", marks=td.skip_if_no("pyarrow", min_version="1.0.0")
+        ),
+    ]
+)
+def any_string_dtype(request):
+    """
+    Parametrized fixture for string dtypes.
+
+    * 'object'
+    * 'string'
+    * 'arrow_string'
+    """
+    from pandas.core.arrays.string_arrow import ArrowStringDtype  # noqa: F401
+
+    return request.param
diff --git a/pandas/tests/strings/test_split_partition.py b/pandas/tests/strings/test_split_partition.py
index 842e9b5ccb2a0..3635997e6001c 100644
--- a/pandas/tests/strings/test_split_partition.py
+++ b/pandas/tests/strings/test_split_partition.py
@@ -3,8 +3,6 @@
 import numpy as np
 import pytest
 
-import pandas.util._test_decorators as td
-
 import pandas as pd
 from pandas import (
     DataFrame,
@@ -15,28 +13,6 @@
 )
 
 
-@pytest.fixture(
-    params=[
-        "object",
-        "string",
-        pytest.param(
-            "arrow_string", marks=td.skip_if_no("pyarrow", min_version="1.0.0")
-        ),
-    ]
-)
-def any_string_dtype(request):
-    """
-    Parametrized fixture for string dtypes.
-
-    * 'object'
-    * 'string'
-    * 'arrow_string'
-    """
-    from pandas.core.arrays.string_arrow import ArrowStringDtype  # noqa: F401
-
-    return request.param
-
-
 def test_split(any_string_dtype):
     values = Series(["a_b_c", "c_d_e", np.nan, "f_g_h"], dtype=any_string_dtype)
 

From 09ad85e4300887a2b8c5cba65585d57646897605 Mon Sep 17 00:00:00 2001
From: Simon Hawkins <simonjayhawkins@gmail.com>
Date: Thu, 29 Apr 2021 12:57:21 +0100
Subject: [PATCH 03/15] mixed object to separate test

---
 pandas/tests/strings/test_split_partition.py | 135 ++++++-------------
 1 file changed, 43 insertions(+), 92 deletions(-)

diff --git a/pandas/tests/strings/test_split_partition.py b/pandas/tests/strings/test_split_partition.py
index 3635997e6001c..284be3bdb38b7 100644
--- a/pandas/tests/strings/test_split_partition.py
+++ b/pandas/tests/strings/test_split_partition.py
@@ -28,47 +28,28 @@ def test_split(any_string_dtype):
     result = values.str.split("__", expand=False)
     tm.assert_series_equal(result, exp)
 
-    # mixed
-    mixed = Series(
+    # regex split
+    values = Series(["a,b_c", "c_d,e", np.nan, "f,g,h"], dtype=any_string_dtype)
+    result = values.str.split("[,_]")
+    exp = Series([["a", "b", "c"], ["c", "d", "e"], np.nan, ["f", "g", "h"]])
+    tm.assert_series_equal(result, exp)
+
+
+def test_split_object_mixed():
+    mixed = Series(["a_b_c", np.nan, "d_e_f", True, datetime.today(), None, 1, 2.0])
+    result = mixed.str.split("_")
+    exp = Series(
         [
-            "a_b_c",
+            ["a", "b", "c"],
             np.nan,
-            "d_e_f",
-            True,
-            datetime(2021, 4, 21, 18, 7, 26, 633720),
-            None,
-            1,
-            2.0,
-        ],
-        dtype=any_string_dtype,
+            ["d", "e", "f"],
+            np.nan,
+            np.nan,
+            np.nan,
+            np.nan,
+            np.nan,
+        ]
     )
-    result = mixed.str.split("_")
-    if any_string_dtype == "object":
-        exp = Series(
-            [
-                ["a", "b", "c"],
-                np.nan,
-                ["d", "e", "f"],
-                np.nan,
-                np.nan,
-                np.nan,
-                np.nan,
-                np.nan,
-            ]
-        )
-    else:
-        exp = Series(
-            [
-                ["a", "b", "c"],
-                pd.NA,
-                ["d", "e", "f"],
-                ["True"],
-                ["2021-04-21 18:07:26.633720"],
-                pd.NA,
-                ["1"],
-                ["2.0"],
-            ]
-        )
     assert isinstance(result, Series)
     tm.assert_almost_equal(result, exp)
 
@@ -76,12 +57,6 @@ def test_split(any_string_dtype):
     assert isinstance(result, Series)
     tm.assert_almost_equal(result, exp)
 
-    # regex split
-    values = Series(["a,b_c", "c_d,e", np.nan, "f,g,h"], dtype=any_string_dtype)
-    result = values.str.split("[,_]")
-    exp = Series([["a", "b", "c"], ["c", "d", "e"], np.nan, ["f", "g", "h"]])
-    tm.assert_series_equal(result, exp)
-
 
 @pytest.mark.parametrize("method", ["split", "rsplit"])
 def test_split_n(any_string_dtype, method):
@@ -109,54 +84,6 @@ def test_rsplit(any_string_dtype):
     result = values.str.rsplit("__", expand=False)
     tm.assert_series_equal(result, exp)
 
-    # mixed
-    mixed = Series(
-        [
-            "a_b_c",
-            np.nan,
-            "d_e_f",
-            True,
-            datetime(2021, 4, 21, 18, 7, 26, 633720),
-            None,
-            1,
-            2.0,
-        ],
-        dtype=any_string_dtype,
-    )
-    result = mixed.str.rsplit("_")
-    if any_string_dtype == "object":
-        exp = Series(
-            [
-                ["a", "b", "c"],
-                np.nan,
-                ["d", "e", "f"],
-                np.nan,
-                np.nan,
-                np.nan,
-                np.nan,
-                np.nan,
-            ]
-        )
-    else:
-        exp = Series(
-            [
-                ["a", "b", "c"],
-                pd.NA,
-                ["d", "e", "f"],
-                ["True"],
-                ["2021-04-21 18:07:26.633720"],
-                pd.NA,
-                ["1"],
-                ["2.0"],
-            ]
-        )
-    assert isinstance(result, Series)
-    tm.assert_almost_equal(result, exp)
-
-    result = mixed.str.rsplit("_", expand=False)
-    assert isinstance(result, Series)
-    tm.assert_almost_equal(result, exp)
-
     # regex split is not supported by rsplit
     values = Series(["a,b_c", "c_d,e", np.nan, "f,g,h"], dtype=any_string_dtype)
     result = values.str.rsplit("[,_]")
@@ -170,6 +97,30 @@ def test_rsplit(any_string_dtype):
     tm.assert_series_equal(result, exp)
 
 
+def test_rsplit_object_mixed():
+    # mixed
+    mixed = Series(["a_b_c", np.nan, "d_e_f", True, datetime.today(), None, 1, 2.0])
+    result = mixed.str.rsplit("_")
+    exp = Series(
+        [
+            ["a", "b", "c"],
+            np.nan,
+            ["d", "e", "f"],
+            np.nan,
+            np.nan,
+            np.nan,
+            np.nan,
+            np.nan,
+        ]
+    )
+    assert isinstance(result, Series)
+    tm.assert_almost_equal(result, exp)
+
+    result = mixed.str.rsplit("_", expand=False)
+    assert isinstance(result, Series)
+    tm.assert_almost_equal(result, exp)
+
+
 def test_split_blank_string(any_string_dtype, request):
     if any_string_dtype == "arrow_string":
         reason = "AssertionError: DataFrame are different"

From 39dd30a183323101549f3f84479b823b76181d34 Mon Sep 17 00:00:00 2001
From: Simon Hawkins <simonjayhawkins@gmail.com>
Date: Thu, 29 Apr 2021 13:22:08 +0100
Subject: [PATCH 04/15] add benchmark

---
 asv_bench/benchmarks/strings.py | 23 +++++++++++++++--------
 1 file changed, 15 insertions(+), 8 deletions(-)

diff --git a/asv_bench/benchmarks/strings.py b/asv_bench/benchmarks/strings.py
index 45a9053954569..e77f74b90d6c8 100644
--- a/asv_bench/benchmarks/strings.py
+++ b/asv_bench/benchmarks/strings.py
@@ -230,17 +230,24 @@ def time_contains(self, dtype, regex):
 
 class Split:
 
-    params = [True, False]
-    param_names = ["expand"]
+    params = (["str", "string", "arrow_string"], [None, "-", "--"], [True, False])
+    param_names = ["dtype", "pat", "expand"]
 
-    def setup(self, expand):
-        self.s = Series(tm.makeStringIndex(10 ** 5)).str.join("--")
+    def setup(self, dtype, pat, expand):
+        from pandas.core.arrays.string_arrow import ArrowStringDtype  # noqa: F401
+
+        if pat is None:
+            pat = "   "
+        try:
+            self.s = Series(tm.makeStringIndex(10 ** 5), dtype=dtype).str.join(pat)
+        except ImportError:
+            raise NotImplementedError
 
-    def time_split(self, expand):
-        self.s.str.split("--", expand=expand)
+    def time_split(self, dtype, pat, expand):
+        self.s.str.split(pat, expand=expand)
 
-    def time_rsplit(self, expand):
-        self.s.str.rsplit("--", expand=expand)
+    def time_rsplit(self, dtype, pat, expand):
+        self.s.str.rsplit(pat, expand=expand)
 
 
 class Dummies:

From c9511d94463cf135ac3c96a9574e6c685f19b5f5 Mon Sep 17 00:00:00 2001
From: Simon Hawkins <simonjayhawkins@gmail.com>
Date: Thu, 29 Apr 2021 13:47:16 +0100
Subject: [PATCH 05/15] wip

---
 pandas/core/arrays/string_arrow.py | 18 ++++++++++--------
 1 file changed, 10 insertions(+), 8 deletions(-)

diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py
index 24dd5c6814148..d75fb65885122 100644
--- a/pandas/core/arrays/string_arrow.py
+++ b/pandas/core/arrays/string_arrow.py
@@ -852,14 +852,16 @@ def _str_split(self, pat=None, n=-1, expand=False):
             is_valid = np.array(result.is_valid())
             result = np.array(result)
             result[~is_valid] = self.dtype.na_value
-            valid = result[is_valid]
-            # we need to loop through to avoid numpy indexing assignment errors when
-            # the result is not a ragged array and interpreted as a 2 dimensional
-            # array
-            for i, val in enumerate(valid):
-                valid[i] = val.tolist()
+            # if not expand:
+            #     valid = result[is_valid]
+            #     # we need to loop through to avoid numpy indexing assignment errors when
+            #     # the result is not a ragged array and interpreted as a 2 dimensional
+            #     # array
+            #     for i, val in enumerate(valid):
+            #         valid[i] = val.tolist()
         else:
             result = np.array(result)
-            for i, val in enumerate(result):
-                result[i] = val.tolist()
+            # if not expand:
+            #     for i, val in enumerate(result):
+            #         result[i] = val.tolist()
         return result

From 5c2ab242f65942b3e70029310dedfbd083f7c70f Mon Sep 17 00:00:00 2001
From: Simon Hawkins <simonjayhawkins@gmail.com>
Date: Wed, 12 May 2021 12:57:48 +0100
Subject: [PATCH 06/15] post merge fix-up

---
 pandas/core/arrays/string_arrow.py           | 20 ++++++++++----------
 pandas/tests/strings/test_split_partition.py |  7 +------
 2 files changed, 11 insertions(+), 16 deletions(-)

diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py
index bf7fef1a2ee16..8a49b7829ce25 100644
--- a/pandas/core/arrays/string_arrow.py
+++ b/pandas/core/arrays/string_arrow.py
@@ -919,18 +919,18 @@ def _str_split(self, pat=None, n=-1, expand=False):
             is_valid = np.array(result.is_valid())
             result = np.array(result)
             result[~is_valid] = self.dtype.na_value
-            # if not expand:
-            #     valid = result[is_valid]
-            #     # we need to loop through to avoid numpy indexing assignment errors when
-            #     # the result is not a ragged array and interpreted as a 2 dimensional
-            #     # array
-            #     for i, val in enumerate(valid):
-            #         valid[i] = val.tolist()
+            if not expand:
+                valid = result[is_valid]
+                # we need to loop through to avoid numpy indexing assignment errors when
+                # the result is not a ragged array and interpreted as a 2 dimensional
+                # array
+                for i, val in enumerate(valid):
+                    valid[i] = val.tolist()
         else:
             result = np.array(result)
-            # if not expand:
-            #     for i, val in enumerate(result):
-            #         result[i] = val.tolist()
+            if not expand:
+                for i, val in enumerate(result):
+                    result[i] = val.tolist()
         return result
 
     def _str_strip(self, to_strip=None):
diff --git a/pandas/tests/strings/test_split_partition.py b/pandas/tests/strings/test_split_partition.py
index 2e71f86807606..358bab1fbe661 100644
--- a/pandas/tests/strings/test_split_partition.py
+++ b/pandas/tests/strings/test_split_partition.py
@@ -283,12 +283,7 @@ def test_split_to_multiindex_expand():
         idx.str.split("_", expand="not_a_boolean")
 
 
-def test_rsplit_to_dataframe_expand(any_string_dtype, request):
-    if any_string_dtype != "object":
-        reason = 'Attribute "dtype" are different'
-        mark = pytest.mark.xfail(reason=reason, raises=AssertionError)
-        request.node.add_marker(mark)
-
+def test_rsplit_to_dataframe_expand(any_string_dtype):
     s = Series(["nosplit", "alsonosplit"], dtype=any_string_dtype)
     result = s.str.rsplit("_", expand=True)
     exp = DataFrame({0: Series(["nosplit", "alsonosplit"])}, dtype=any_string_dtype)

From 12407fbe5dc9a996d64e899f92e5dc01ac4c2646 Mon Sep 17 00:00:00 2001
From: Simon Hawkins <simonjayhawkins@gmail.com>
Date: Wed, 12 May 2021 13:02:25 +0100
Subject: [PATCH 07/15] remove fixture

---
 pandas/tests/strings/conftest.py | 24 ------------------------
 1 file changed, 24 deletions(-)

diff --git a/pandas/tests/strings/conftest.py b/pandas/tests/strings/conftest.py
index 379036932c580..4fedbee91f649 100644
--- a/pandas/tests/strings/conftest.py
+++ b/pandas/tests/strings/conftest.py
@@ -1,8 +1,6 @@
 import numpy as np
 import pytest
 
-import pandas.util._test_decorators as td
-
 from pandas import Series
 from pandas.core import strings as strings
 
@@ -175,25 +173,3 @@ def any_allowed_skipna_inferred_dtype(request):
 
     # correctness of inference tested in tests/dtypes/test_inference.py
     return inferred_dtype, values
-
-
-@pytest.fixture(
-    params=[
-        "object",
-        "string",
-        pytest.param(
-            "arrow_string", marks=td.skip_if_no("pyarrow", min_version="1.0.0")
-        ),
-    ]
-)
-def any_string_dtype(request):
-    """
-    Parametrized fixture for string dtypes.
-
-    * 'object'
-    * 'string'
-    * 'arrow_string'
-    """
-    from pandas.core.arrays.string_arrow import ArrowStringDtype  # noqa: F401
-
-    return request.param

From 24d23951a245de0317a877eda070337fd7aa9b78 Mon Sep 17 00:00:00 2001
From: Simon Hawkins <simonjayhawkins@gmail.com>
Date: Wed, 12 May 2021 13:04:48 +0100
Subject: [PATCH 08/15] remove xfail (need to fix failing test on blank string
 before merge)

---
 pandas/tests/strings/test_split_partition.py | 7 +------
 1 file changed, 1 insertion(+), 6 deletions(-)

diff --git a/pandas/tests/strings/test_split_partition.py b/pandas/tests/strings/test_split_partition.py
index 358bab1fbe661..e59105eccc67c 100644
--- a/pandas/tests/strings/test_split_partition.py
+++ b/pandas/tests/strings/test_split_partition.py
@@ -121,12 +121,7 @@ def test_rsplit_object_mixed():
     tm.assert_almost_equal(result, exp)
 
 
-def test_split_blank_string(any_string_dtype, request):
-    if any_string_dtype == "arrow_string":
-        reason = "AssertionError: DataFrame are different"
-        mark = pytest.mark.xfail(reason=reason, raises=AssertionError)
-        request.node.add_marker(mark)
-
+def test_split_blank_string(any_string_dtype):
     # expand blank split GH 20067
     values = Series([""], name="test", dtype=any_string_dtype)
     result = values.str.split(expand=True)

From 3d9297dee776f87309821f2547c1a96b69d16dd2 Mon Sep 17 00:00:00 2001
From: Simon Hawkins <simonjayhawkins@gmail.com>
Date: Mon, 17 May 2021 15:53:17 +0100
Subject: [PATCH 09/15] separate benchmark for pattern

---
 asv_bench/benchmarks/strings.py | 34 ++++++++++++++++++++++++++-------
 1 file changed, 27 insertions(+), 7 deletions(-)

diff --git a/asv_bench/benchmarks/strings.py b/asv_bench/benchmarks/strings.py
index 277f2ec91e58c..a5b39c9da5eee 100644
--- a/asv_bench/benchmarks/strings.py
+++ b/asv_bench/benchmarks/strings.py
@@ -230,10 +230,30 @@ def time_contains(self, dtype, regex):
 
 class Split:
 
-    params = (["str", "string", "arrow_string"], [None, "-", "--"], [True, False])
-    param_names = ["dtype", "pat", "expand"]
+    params = (["str", "string", "arrow_string"], [True, False])
+    param_names = ["dtype", "expand"]
+
+    def setup(self, dtype, expand):
+        from pandas.core.arrays.string_arrow import ArrowStringDtype  # noqa: F401
+
+        try:
+            self.s = Series(tm.makeStringIndex(10 ** 5), dtype=dtype).str.join("--")
+        except ImportError:
+            raise NotImplementedError
+
+    def time_split(self, dtype, expand):
+        self.s.str.split("--", expand=expand)
+
+    def time_rsplit(self, dtype, expand):
+        self.s.str.rsplit("--", expand=expand)
+
+
+class SplitPattern:
+
+    params = (["str", "string", "arrow_string"], [None, "-"])
+    param_names = ["dtype", "pat"]
 
-    def setup(self, dtype, pat, expand):
+    def setup(self, dtype, pat):
         from pandas.core.arrays.string_arrow import ArrowStringDtype  # noqa: F401
 
         if pat is None:
@@ -243,11 +263,11 @@ def setup(self, dtype, pat, expand):
         except ImportError:
             raise NotImplementedError
 
-    def time_split(self, dtype, pat, expand):
-        self.s.str.split(pat, expand=expand)
+    def time_split(self, dtype, pat):
+        self.s.str.split(pat)
 
-    def time_rsplit(self, dtype, pat, expand):
-        self.s.str.rsplit(pat, expand=expand)
+    def time_rsplit(self, dtype, pat):
+        self.s.str.rsplit(pat)
 
 
 class Dummies:

From a574ccb4c2c0f20385b059b26ed542fed595adba Mon Sep 17 00:00:00 2001
From: Simon Hawkins <simonjayhawkins@gmail.com>
Date: Mon, 17 May 2021 16:35:37 +0100
Subject: [PATCH 10/15] use pa_version_under3p0 instead of hasattr

---
 pandas/core/arrays/string_arrow.py | 20 ++++++++------------
 1 file changed, 8 insertions(+), 12 deletions(-)

diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py
index 8a49b7829ce25..108520551bd85 100644
--- a/pandas/core/arrays/string_arrow.py
+++ b/pandas/core/arrays/string_arrow.py
@@ -900,20 +900,16 @@ def _str_upper(self):
         return type(self)(pc.utf8_upper(self._data))
 
     def _str_split(self, pat=None, n=-1, expand=False):
+        if pa_version_under3p0 or (pat is not None and len(pat) > 1):
+            return super()._str_split(pat=pat, n=n, expand=expand)
+
+        if n is None or n == 0:
+            n = -1
+
         if pat is None:
-            if hasattr(pc, "utf8_split_whitespace"):
-                if n is None or n == 0:
-                    n = -1
-                result = pc.utf8_split_whitespace(self._data, max_splits=n)
-            else:
-                return super()._str_split(pat=pat, n=n, expand=expand)
+            result = pc.utf8_split_whitespace(self._data, max_splits=n)
         else:
-            if len(pat) == 1 and hasattr(pc, "split_pattern"):
-                if n is None or n == 0:
-                    n = -1
-                result = pc.split_pattern(self._data, pattern=pat, max_splits=n)
-            else:
-                return super()._str_split(pat=pat, n=n, expand=expand)
+            result = pc.split_pattern(self._data, pattern=pat, max_splits=n)
 
         if result.null_count:
             is_valid = np.array(result.is_valid())

From 9fc0144bf251dfb1677546c8c88561521e753554 Mon Sep 17 00:00:00 2001
From: Simon Hawkins <simonjayhawkins@gmail.com>
Date: Tue, 18 May 2021 22:34:58 +0100
Subject: [PATCH 11/15] add test case

---
 pandas/core/arrays/string_arrow.py           | 10 +++++-----
 pandas/tests/strings/test_split_partition.py |  7 +++++--
 2 files changed, 10 insertions(+), 7 deletions(-)

diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py
index a708b9d8d57a6..17537408cbae1 100644
--- a/pandas/core/arrays/string_arrow.py
+++ b/pandas/core/arrays/string_arrow.py
@@ -917,16 +917,16 @@ def _str_split(self, pat=None, n=-1, expand=False):
             result = pc.split_pattern(self._data, pattern=pat, max_splits=n)
 
         if result.null_count:
-            is_valid = np.array(result.is_valid())
+            mask = np.array(result.is_null())
             result = np.array(result)
-            result[~is_valid] = self.dtype.na_value
+            result[mask] = self.dtype.na_value
             if not expand:
-                valid = result[is_valid]
                 # we need to loop through to avoid numpy indexing assignment errors when
                 # the result is not a ragged array and interpreted as a 2 dimensional
                 # array
-                for i, val in enumerate(valid):
-                    valid[i] = val.tolist()
+                for idx in np.argwhere(~mask):
+                    idx = idx[0]
+                    result[idx] = result[idx].tolist()
         else:
             result = np.array(result)
             if not expand:
diff --git a/pandas/tests/strings/test_split_partition.py b/pandas/tests/strings/test_split_partition.py
index f3f5acd0d2f1c..b80656f52dbf0 100644
--- a/pandas/tests/strings/test_split_partition.py
+++ b/pandas/tests/strings/test_split_partition.py
@@ -144,11 +144,14 @@ def test_split_blank_string(any_string_dtype):
 
 def test_split_noargs(any_string_dtype):
     # #1859
+    expected = ["Travis", "Oliphant"]
+
     s = Series(["Wes McKinney", "Travis  Oliphant"], dtype=any_string_dtype)
     result = s.str.split()
-    expected = ["Travis", "Oliphant"]
     assert result[1] == expected
-    result = s.str.rsplit()
+
+    s = Series(["Wes McKinney", "Travis  Oliphant", np.nan], dtype=any_string_dtype)
+    result = s.str.split()
     assert result[1] == expected
 
 

From 885510013f7a95cde91e879468974c0602a19211 Mon Sep 17 00:00:00 2001
From: Simon Hawkins <simonjayhawkins@gmail.com>
Date: Tue, 18 May 2021 23:02:41 +0100
Subject: [PATCH 12/15] use ObjectStringArrayMixin._str_map

---
 pandas/core/arrays/string_arrow.py | 21 +++++----------------
 1 file changed, 5 insertions(+), 16 deletions(-)

diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py
index 17537408cbae1..458b08cf9d4f7 100644
--- a/pandas/core/arrays/string_arrow.py
+++ b/pandas/core/arrays/string_arrow.py
@@ -916,22 +916,11 @@ def _str_split(self, pat=None, n=-1, expand=False):
         else:
             result = pc.split_pattern(self._data, pattern=pat, max_splits=n)
 
-        if result.null_count:
-            mask = np.array(result.is_null())
-            result = np.array(result)
-            result[mask] = self.dtype.na_value
-            if not expand:
-                # we need to loop through to avoid numpy indexing assignment errors when
-                # the result is not a ragged array and interpreted as a 2 dimensional
-                # array
-                for idx in np.argwhere(~mask):
-                    idx = idx[0]
-                    result[idx] = result[idx].tolist()
-        else:
-            result = np.array(result)
-            if not expand:
-                for i, val in enumerate(result):
-                    result[i] = val.tolist()
+        result = np.array(result)
+        if not expand:
+            result = ObjectStringArrayMixin._str_map(
+                result, lambda x: x.tolist(), na_value=self.dtype.na_value, dtype=object
+            )
         return result
 
     def _str_strip(self, to_strip=None):

From ad3480fc8ea08264c8a6d8bff514db047ab2de10 Mon Sep 17 00:00:00 2001
From: Simon Hawkins <simonjayhawkins@gmail.com>
Date: Wed, 19 May 2021 09:47:52 +0100
Subject: [PATCH 13/15] use lib.map_infer_mask

---
 pandas/core/arrays/string_arrow.py | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py
index 458b08cf9d4f7..e1cc9051b847c 100644
--- a/pandas/core/arrays/string_arrow.py
+++ b/pandas/core/arrays/string_arrow.py
@@ -916,10 +916,15 @@ def _str_split(self, pat=None, n=-1, expand=False):
         else:
             result = pc.split_pattern(self._data, pattern=pat, max_splits=n)
 
+        mask = np.array(result.is_null())
         result = np.array(result)
         if not expand:
-            result = ObjectStringArrayMixin._str_map(
-                result, lambda x: x.tolist(), na_value=self.dtype.na_value, dtype=object
+            result = lib.map_infer_mask(
+                result,
+                lambda x: x.tolist(),
+                mask.view(np.uint8),
+                na_value=self.dtype.na_value,
+                dtype=np.dtype(object),
             )
         return result
 

From 70677c4992a3cb224fc25c059525872fac03ca49 Mon Sep 17 00:00:00 2001
From: Simon Hawkins <simonjayhawkins@gmail.com>
Date: Wed, 19 May 2021 10:06:01 +0100
Subject: [PATCH 14/15] update benchmark

---
 asv_bench/benchmarks/strings.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/asv_bench/benchmarks/strings.py b/asv_bench/benchmarks/strings.py
index 83f80c930f007..700393cc72492 100644
--- a/asv_bench/benchmarks/strings.py
+++ b/asv_bench/benchmarks/strings.py
@@ -235,18 +235,18 @@ class Split(Dtypes):
 
     def setup(self, dtype, expand):
         super().setup(dtype)
-        self.s = self.s.str.join("--")
+        self.s = self.s.str.join("-")
 
     def time_split(self, dtype, expand):
-        self.s.str.split("--", expand=expand)
+        self.s.str.split("-", expand=expand)
 
     def time_rsplit(self, dtype, expand):
-        self.s.str.rsplit("--", expand=expand)
+        self.s.str.rsplit("-", expand=expand)
 
 
 class SplitPattern(Dtypes):
 
-    params = (Dtypes.params, [None, "-"])
+    params = (Dtypes.params, [None, "--"])
     param_names = ["dtype", "pat"]
 
     def setup(self, dtype, pat):

From af580558c5f3ef01b3a82aa811cc193aad78189c Mon Sep 17 00:00:00 2001
From: Simon Hawkins <simonjayhawkins@gmail.com>
Date: Wed, 19 May 2021 10:48:26 +0100
Subject: [PATCH 15/15] always convert to lists

---
 pandas/core/arrays/string_arrow.py | 15 +++++++--------
 1 file changed, 7 insertions(+), 8 deletions(-)

diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py
index e1cc9051b847c..3b20df5fdf82b 100644
--- a/pandas/core/arrays/string_arrow.py
+++ b/pandas/core/arrays/string_arrow.py
@@ -918,14 +918,13 @@ def _str_split(self, pat=None, n=-1, expand=False):
 
         mask = np.array(result.is_null())
         result = np.array(result)
-        if not expand:
-            result = lib.map_infer_mask(
-                result,
-                lambda x: x.tolist(),
-                mask.view(np.uint8),
-                na_value=self.dtype.na_value,
-                dtype=np.dtype(object),
-            )
+        result = lib.map_infer_mask(
+            result,
+            lambda x: x.tolist(),
+            mask.view(np.uint8),
+            na_value=self.dtype.na_value,
+            dtype=np.dtype(object),
+        )
         return result
 
     def _str_strip(self, to_strip=None):