From 34df9e5df736191c5ba22f18606b2c81b055d752 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Wed, 21 Apr 2021 20:28:52 +0100 Subject: [PATCH 01/15] [ArrowStringArray] implement ArrowStringArray._str_split --- pandas/core/arrays/string_arrow.py | 32 +++ pandas/tests/strings/test_split_partition.py | 277 +++++++++++++------ 2 files changed, 230 insertions(+), 79 deletions(-) diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index dd09ef4e585ce..b0f6d1b7dd1b0 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -763,3 +763,35 @@ def _str_lower(self): def _str_upper(self): return type(self)(pc.utf8_upper(self._data)) + + def _str_split(self, pat=None, n=-1, expand=False): + if pat is None: + if hasattr(pc, "utf8_split_whitespace"): + if n is None or n == 0: + n = -1 + result = pc.utf8_split_whitespace(self._data, max_splits=n) + else: + return super()._str_split(pat=pat, n=n, expand=expand) + else: + if len(pat) == 1 and hasattr(pc, "split_pattern"): + if n is None or n == 0: + n = -1 + result = pc.split_pattern(self._data, pattern=pat, max_splits=n) + else: + return super()._str_split(pat=pat, n=n, expand=expand) + + if result.null_count: + is_valid = np.array(result.is_valid()) + result = np.array(result) + result[~is_valid] = self.dtype.na_value + valid = result[is_valid] + # we need to loop through to avoid numpy indexing assignment errors when + # the result is not a ragged array and interpreted as a 2 dimensional + # array + for i, val in enumerate(valid): + valid[i] = val.tolist() + else: + result = np.array(result) + for i, val in enumerate(result): + result[i] = val.tolist() + return result diff --git a/pandas/tests/strings/test_split_partition.py b/pandas/tests/strings/test_split_partition.py index 6df8fa805955d..842e9b5ccb2a0 100644 --- a/pandas/tests/strings/test_split_partition.py +++ b/pandas/tests/strings/test_split_partition.py @@ -3,6 +3,8 @@ import numpy as np import pytest +import pandas.util._test_decorators as td + import pandas as pd from pandas import ( DataFrame, @@ -13,15 +15,37 @@ ) -def test_split(): - values = Series(["a_b_c", "c_d_e", np.nan, "f_g_h"]) +@pytest.fixture( + params=[ + "object", + "string", + pytest.param( + "arrow_string", marks=td.skip_if_no("pyarrow", min_version="1.0.0") + ), + ] +) +def any_string_dtype(request): + """ + Parametrized fixture for string dtypes. + + * 'object' + * 'string' + * 'arrow_string' + """ + from pandas.core.arrays.string_arrow import ArrowStringDtype # noqa: F401 + + return request.param + + +def test_split(any_string_dtype): + values = Series(["a_b_c", "c_d_e", np.nan, "f_g_h"], dtype=any_string_dtype) result = values.str.split("_") exp = Series([["a", "b", "c"], ["c", "d", "e"], np.nan, ["f", "g", "h"]]) tm.assert_series_equal(result, exp) # more than one char - values = Series(["a__b__c", "c__d__e", np.nan, "f__g__h"]) + values = Series(["a__b__c", "c__d__e", np.nan, "f__g__h"], dtype=any_string_dtype) result = values.str.split("__") tm.assert_series_equal(result, exp) @@ -29,20 +53,46 @@ def test_split(): tm.assert_series_equal(result, exp) # mixed - mixed = Series(["a_b_c", np.nan, "d_e_f", True, datetime.today(), None, 1, 2.0]) - result = mixed.str.split("_") - exp = Series( + mixed = Series( [ - ["a", "b", "c"], - np.nan, - ["d", "e", "f"], - np.nan, - np.nan, - np.nan, - np.nan, + "a_b_c", np.nan, - ] + "d_e_f", + True, + datetime(2021, 4, 21, 18, 7, 26, 633720), + None, + 1, + 2.0, + ], + dtype=any_string_dtype, ) + result = mixed.str.split("_") + if any_string_dtype == "object": + exp = Series( + [ + ["a", "b", "c"], + np.nan, + ["d", "e", "f"], + np.nan, + np.nan, + np.nan, + np.nan, + np.nan, + ] + ) + else: + exp = Series( + [ + ["a", "b", "c"], + pd.NA, + ["d", "e", "f"], + ["True"], + ["2021-04-21 18:07:26.633720"], + pd.NA, + ["1"], + ["2.0"], + ] + ) assert isinstance(result, Series) tm.assert_almost_equal(result, exp) @@ -51,16 +101,15 @@ def test_split(): tm.assert_almost_equal(result, exp) # regex split - values = Series(["a,b_c", "c_d,e", np.nan, "f,g,h"]) + values = Series(["a,b_c", "c_d,e", np.nan, "f,g,h"], dtype=any_string_dtype) result = values.str.split("[,_]") exp = Series([["a", "b", "c"], ["c", "d", "e"], np.nan, ["f", "g", "h"]]) tm.assert_series_equal(result, exp) -@pytest.mark.parametrize("dtype", [object, "string"]) @pytest.mark.parametrize("method", ["split", "rsplit"]) -def test_split_n(dtype, method): - s = Series(["a b", pd.NA, "b c"], dtype=dtype) +def test_split_n(any_string_dtype, method): + s = Series(["a b", pd.NA, "b c"], dtype=any_string_dtype) expected = Series([["a", "b"], pd.NA, ["b", "c"]]) result = getattr(s.str, method)(" ", n=None) @@ -70,14 +119,14 @@ def test_split_n(dtype, method): tm.assert_series_equal(result, expected) -def test_rsplit(): - values = Series(["a_b_c", "c_d_e", np.nan, "f_g_h"]) +def test_rsplit(any_string_dtype): + values = Series(["a_b_c", "c_d_e", np.nan, "f_g_h"], dtype=any_string_dtype) result = values.str.rsplit("_") exp = Series([["a", "b", "c"], ["c", "d", "e"], np.nan, ["f", "g", "h"]]) tm.assert_series_equal(result, exp) # more than one char - values = Series(["a__b__c", "c__d__e", np.nan, "f__g__h"]) + values = Series(["a__b__c", "c__d__e", np.nan, "f__g__h"], dtype=any_string_dtype) result = values.str.rsplit("__") tm.assert_series_equal(result, exp) @@ -85,20 +134,46 @@ def test_rsplit(): tm.assert_series_equal(result, exp) # mixed - mixed = Series(["a_b_c", np.nan, "d_e_f", True, datetime.today(), None, 1, 2.0]) - result = mixed.str.rsplit("_") - exp = Series( + mixed = Series( [ - ["a", "b", "c"], - np.nan, - ["d", "e", "f"], - np.nan, - np.nan, + "a_b_c", np.nan, - np.nan, - np.nan, - ] + "d_e_f", + True, + datetime(2021, 4, 21, 18, 7, 26, 633720), + None, + 1, + 2.0, + ], + dtype=any_string_dtype, ) + result = mixed.str.rsplit("_") + if any_string_dtype == "object": + exp = Series( + [ + ["a", "b", "c"], + np.nan, + ["d", "e", "f"], + np.nan, + np.nan, + np.nan, + np.nan, + np.nan, + ] + ) + else: + exp = Series( + [ + ["a", "b", "c"], + pd.NA, + ["d", "e", "f"], + ["True"], + ["2021-04-21 18:07:26.633720"], + pd.NA, + ["1"], + ["2.0"], + ] + ) assert isinstance(result, Series) tm.assert_almost_equal(result, exp) @@ -107,26 +182,31 @@ def test_rsplit(): tm.assert_almost_equal(result, exp) # regex split is not supported by rsplit - values = Series(["a,b_c", "c_d,e", np.nan, "f,g,h"]) + values = Series(["a,b_c", "c_d,e", np.nan, "f,g,h"], dtype=any_string_dtype) result = values.str.rsplit("[,_]") exp = Series([["a,b_c"], ["c_d,e"], np.nan, ["f,g,h"]]) tm.assert_series_equal(result, exp) # setting max number of splits, make sure it's from reverse - values = Series(["a_b_c", "c_d_e", np.nan, "f_g_h"]) + values = Series(["a_b_c", "c_d_e", np.nan, "f_g_h"], dtype=any_string_dtype) result = values.str.rsplit("_", n=1) exp = Series([["a_b", "c"], ["c_d", "e"], np.nan, ["f_g", "h"]]) tm.assert_series_equal(result, exp) -def test_split_blank_string(): +def test_split_blank_string(any_string_dtype, request): + if any_string_dtype == "arrow_string": + reason = "AssertionError: DataFrame are different" + mark = pytest.mark.xfail(reason=reason, raises=AssertionError) + request.node.add_marker(mark) + # expand blank split GH 20067 - values = Series([""], name="test") + values = Series([""], name="test", dtype=any_string_dtype) result = values.str.split(expand=True) - exp = DataFrame([[]]) # NOTE: this is NOT an empty DataFrame + exp = DataFrame([[]], dtype=any_string_dtype) # NOTE: this is NOT an empty df tm.assert_frame_equal(result, exp) - values = Series(["a b c", "a b", "", " "], name="test") + values = Series(["a b c", "a b", "", " "], name="test", dtype=any_string_dtype) result = values.str.split(expand=True) exp = DataFrame( [ @@ -134,14 +214,15 @@ def test_split_blank_string(): ["a", "b", np.nan], [np.nan, np.nan, np.nan], [np.nan, np.nan, np.nan], - ] + ], + dtype=any_string_dtype, ) tm.assert_frame_equal(result, exp) -def test_split_noargs(): +def test_split_noargs(any_string_dtype): # #1859 - s = Series(["Wes McKinney", "Travis Oliphant"]) + s = Series(["Wes McKinney", "Travis Oliphant"], dtype=any_string_dtype) result = s.str.split() expected = ["Travis", "Oliphant"] assert result[1] == expected @@ -149,44 +230,64 @@ def test_split_noargs(): assert result[1] == expected -def test_split_maxsplit(): +@pytest.mark.parametrize( + "data, pat", + [ + (["bd asdf jfg", "kjasdflqw asdfnfk"], None), + (["bd asdf jfg", "kjasdflqw asdfnfk"], "asdf"), + (["bd_asdf_jfg", "kjasdflqw_asdfnfk"], "_"), + ], +) +def test_split_maxsplit(data, pat, any_string_dtype): # re.split 0, str.split -1 - s = Series(["bd asdf jfg", "kjasdflqw asdfnfk"]) + s = Series(data, dtype=any_string_dtype) - result = s.str.split(n=-1) - xp = s.str.split() + result = s.str.split(pat=pat, n=-1) + xp = s.str.split(pat=pat) tm.assert_series_equal(result, xp) - result = s.str.split(n=0) - tm.assert_series_equal(result, xp) - - xp = s.str.split("asdf") - result = s.str.split("asdf", n=0) - tm.assert_series_equal(result, xp) - - result = s.str.split("asdf", n=-1) + result = s.str.split(pat=pat, n=0) tm.assert_series_equal(result, xp) -def test_split_no_pat_with_nonzero_n(): - s = Series(["split once", "split once too!"]) - result = s.str.split(n=1) - expected = Series({0: ["split", "once"], 1: ["split", "once too!"]}) +@pytest.mark.parametrize( + "data, pat, expected", + [ + ( + ["split once", "split once too!"], + None, + Series({0: ["split", "once"], 1: ["split", "once too!"]}), + ), + ( + ["split_once", "split_once_too!"], + "_", + Series({0: ["split", "once"], 1: ["split", "once_too!"]}), + ), + ], +) +def test_split_no_pat_with_nonzero_n(data, pat, expected, any_string_dtype): + s = Series(data, dtype=any_string_dtype) + result = s.str.split(pat=pat, n=1) tm.assert_series_equal(expected, result, check_index_type=False) -def test_split_to_dataframe(): - s = Series(["nosplit", "alsonosplit"]) +def test_split_to_dataframe(any_string_dtype): + s = Series(["nosplit", "alsonosplit"], dtype=any_string_dtype) result = s.str.split("_", expand=True) - exp = DataFrame({0: Series(["nosplit", "alsonosplit"])}) + exp = DataFrame({0: Series(["nosplit", "alsonosplit"], dtype=any_string_dtype)}) tm.assert_frame_equal(result, exp) - s = Series(["some_equal_splits", "with_no_nans"]) + s = Series(["some_equal_splits", "with_no_nans"], dtype=any_string_dtype) result = s.str.split("_", expand=True) - exp = DataFrame({0: ["some", "with"], 1: ["equal", "no"], 2: ["splits", "nans"]}) + exp = DataFrame( + {0: ["some", "with"], 1: ["equal", "no"], 2: ["splits", "nans"]}, + dtype=any_string_dtype, + ) tm.assert_frame_equal(result, exp) - s = Series(["some_unequal_splits", "one_of_these_things_is_not"]) + s = Series( + ["some_unequal_splits", "one_of_these_things_is_not"], dtype=any_string_dtype + ) result = s.str.split("_", expand=True) exp = DataFrame( { @@ -196,14 +297,19 @@ def test_split_to_dataframe(): 3: [np.nan, "things"], 4: [np.nan, "is"], 5: [np.nan, "not"], - } + }, + dtype=any_string_dtype, ) tm.assert_frame_equal(result, exp) - s = Series(["some_splits", "with_index"], index=["preserve", "me"]) + s = Series( + ["some_splits", "with_index"], index=["preserve", "me"], dtype=any_string_dtype + ) result = s.str.split("_", expand=True) exp = DataFrame( - {0: ["some", "with"], 1: ["splits", "index"]}, index=["preserve", "me"] + {0: ["some", "with"], 1: ["splits", "index"]}, + index=["preserve", "me"], + dtype=any_string_dtype, ) tm.assert_frame_equal(result, exp) @@ -250,15 +356,23 @@ def test_split_to_multiindex_expand(): idx.str.split("_", expand="not_a_boolean") -def test_rsplit_to_dataframe_expand(): - s = Series(["nosplit", "alsonosplit"]) +def test_rsplit_to_dataframe_expand(any_string_dtype, request): + if any_string_dtype != "object": + reason = 'Attribute "dtype" are different' + mark = pytest.mark.xfail(reason=reason, raises=AssertionError) + request.node.add_marker(mark) + + s = Series(["nosplit", "alsonosplit"], dtype=any_string_dtype) result = s.str.rsplit("_", expand=True) - exp = DataFrame({0: Series(["nosplit", "alsonosplit"])}) + exp = DataFrame({0: Series(["nosplit", "alsonosplit"])}, dtype=any_string_dtype) tm.assert_frame_equal(result, exp) - s = Series(["some_equal_splits", "with_no_nans"]) + s = Series(["some_equal_splits", "with_no_nans"], dtype=any_string_dtype) result = s.str.rsplit("_", expand=True) - exp = DataFrame({0: ["some", "with"], 1: ["equal", "no"], 2: ["splits", "nans"]}) + exp = DataFrame( + {0: ["some", "with"], 1: ["equal", "no"], 2: ["splits", "nans"]}, + dtype=any_string_dtype, + ) tm.assert_frame_equal(result, exp) result = s.str.rsplit("_", expand=True, n=2) @@ -297,30 +411,35 @@ def test_rsplit_to_multiindex_expand(): assert result.nlevels == 2 -def test_split_nan_expand(): +def test_split_nan_expand(any_string_dtype): # gh-18450 - s = Series(["foo,bar,baz", np.nan]) + s = Series(["foo,bar,baz", np.nan], dtype=any_string_dtype) result = s.str.split(",", expand=True) - exp = DataFrame([["foo", "bar", "baz"], [np.nan, np.nan, np.nan]]) + exp = DataFrame( + [["foo", "bar", "baz"], [np.nan, np.nan, np.nan]], dtype=any_string_dtype + ) tm.assert_frame_equal(result, exp) - # check that these are actually np.nan and not None + # check that these are actually np.nan/pd.NA and not None # TODO see GH 18463 # tm.assert_frame_equal does not differentiate - assert all(np.isnan(x) for x in result.iloc[1]) + if any_string_dtype == "object": + assert all(np.isnan(x) for x in result.iloc[1]) + else: + assert all(x is pd.NA for x in result.iloc[1]) -def test_split_with_name(): +def test_split_with_name(any_string_dtype): # GH 12617 # should preserve name - s = Series(["a,b", "c,d"], name="xxx") + s = Series(["a,b", "c,d"], name="xxx", dtype=any_string_dtype) res = s.str.split(",") exp = Series([["a", "b"], ["c", "d"]], name="xxx") tm.assert_series_equal(res, exp) res = s.str.split(",", expand=True) - exp = DataFrame([["a", "b"], ["c", "d"]]) + exp = DataFrame([["a", "b"], ["c", "d"]], dtype=any_string_dtype) tm.assert_frame_equal(res, exp) idx = Index(["a,b", "c,d"], name="xxx") From 427eff71b4d12b20ab8b97c30fd143c1cea7d668 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Thu, 29 Apr 2021 12:50:23 +0100 Subject: [PATCH 02/15] move fixture to conftest.py --- pandas/tests/strings/conftest.py | 24 ++++++++++++++++++++ pandas/tests/strings/test_split_partition.py | 24 -------------------- 2 files changed, 24 insertions(+), 24 deletions(-) diff --git a/pandas/tests/strings/conftest.py b/pandas/tests/strings/conftest.py index 4fedbee91f649..379036932c580 100644 --- a/pandas/tests/strings/conftest.py +++ b/pandas/tests/strings/conftest.py @@ -1,6 +1,8 @@ import numpy as np import pytest +import pandas.util._test_decorators as td + from pandas import Series from pandas.core import strings as strings @@ -173,3 +175,25 @@ def any_allowed_skipna_inferred_dtype(request): # correctness of inference tested in tests/dtypes/test_inference.py return inferred_dtype, values + + +@pytest.fixture( + params=[ + "object", + "string", + pytest.param( + "arrow_string", marks=td.skip_if_no("pyarrow", min_version="1.0.0") + ), + ] +) +def any_string_dtype(request): + """ + Parametrized fixture for string dtypes. + + * 'object' + * 'string' + * 'arrow_string' + """ + from pandas.core.arrays.string_arrow import ArrowStringDtype # noqa: F401 + + return request.param diff --git a/pandas/tests/strings/test_split_partition.py b/pandas/tests/strings/test_split_partition.py index 842e9b5ccb2a0..3635997e6001c 100644 --- a/pandas/tests/strings/test_split_partition.py +++ b/pandas/tests/strings/test_split_partition.py @@ -3,8 +3,6 @@ import numpy as np import pytest -import pandas.util._test_decorators as td - import pandas as pd from pandas import ( DataFrame, @@ -15,28 +13,6 @@ ) -@pytest.fixture( - params=[ - "object", - "string", - pytest.param( - "arrow_string", marks=td.skip_if_no("pyarrow", min_version="1.0.0") - ), - ] -) -def any_string_dtype(request): - """ - Parametrized fixture for string dtypes. - - * 'object' - * 'string' - * 'arrow_string' - """ - from pandas.core.arrays.string_arrow import ArrowStringDtype # noqa: F401 - - return request.param - - def test_split(any_string_dtype): values = Series(["a_b_c", "c_d_e", np.nan, "f_g_h"], dtype=any_string_dtype) From 09ad85e4300887a2b8c5cba65585d57646897605 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Thu, 29 Apr 2021 12:57:21 +0100 Subject: [PATCH 03/15] mixed object to seperate test --- pandas/tests/strings/test_split_partition.py | 135 ++++++------------- 1 file changed, 43 insertions(+), 92 deletions(-) diff --git a/pandas/tests/strings/test_split_partition.py b/pandas/tests/strings/test_split_partition.py index 3635997e6001c..284be3bdb38b7 100644 --- a/pandas/tests/strings/test_split_partition.py +++ b/pandas/tests/strings/test_split_partition.py @@ -28,47 +28,28 @@ def test_split(any_string_dtype): result = values.str.split("__", expand=False) tm.assert_series_equal(result, exp) - # mixed - mixed = Series( + # regex split + values = Series(["a,b_c", "c_d,e", np.nan, "f,g,h"], dtype=any_string_dtype) + result = values.str.split("[,_]") + exp = Series([["a", "b", "c"], ["c", "d", "e"], np.nan, ["f", "g", "h"]]) + tm.assert_series_equal(result, exp) + + +def test_split_object_mixed(): + mixed = Series(["a_b_c", np.nan, "d_e_f", True, datetime.today(), None, 1, 2.0]) + result = mixed.str.split("_") + exp = Series( [ - "a_b_c", + ["a", "b", "c"], np.nan, - "d_e_f", - True, - datetime(2021, 4, 21, 18, 7, 26, 633720), - None, - 1, - 2.0, - ], - dtype=any_string_dtype, + ["d", "e", "f"], + np.nan, + np.nan, + np.nan, + np.nan, + np.nan, + ] ) - result = mixed.str.split("_") - if any_string_dtype == "object": - exp = Series( - [ - ["a", "b", "c"], - np.nan, - ["d", "e", "f"], - np.nan, - np.nan, - np.nan, - np.nan, - np.nan, - ] - ) - else: - exp = Series( - [ - ["a", "b", "c"], - pd.NA, - ["d", "e", "f"], - ["True"], - ["2021-04-21 18:07:26.633720"], - pd.NA, - ["1"], - ["2.0"], - ] - ) assert isinstance(result, Series) tm.assert_almost_equal(result, exp) @@ -76,12 +57,6 @@ def test_split(any_string_dtype): assert isinstance(result, Series) tm.assert_almost_equal(result, exp) - # regex split - values = Series(["a,b_c", "c_d,e", np.nan, "f,g,h"], dtype=any_string_dtype) - result = values.str.split("[,_]") - exp = Series([["a", "b", "c"], ["c", "d", "e"], np.nan, ["f", "g", "h"]]) - tm.assert_series_equal(result, exp) - @pytest.mark.parametrize("method", ["split", "rsplit"]) def test_split_n(any_string_dtype, method): @@ -109,54 +84,6 @@ def test_rsplit(any_string_dtype): result = values.str.rsplit("__", expand=False) tm.assert_series_equal(result, exp) - # mixed - mixed = Series( - [ - "a_b_c", - np.nan, - "d_e_f", - True, - datetime(2021, 4, 21, 18, 7, 26, 633720), - None, - 1, - 2.0, - ], - dtype=any_string_dtype, - ) - result = mixed.str.rsplit("_") - if any_string_dtype == "object": - exp = Series( - [ - ["a", "b", "c"], - np.nan, - ["d", "e", "f"], - np.nan, - np.nan, - np.nan, - np.nan, - np.nan, - ] - ) - else: - exp = Series( - [ - ["a", "b", "c"], - pd.NA, - ["d", "e", "f"], - ["True"], - ["2021-04-21 18:07:26.633720"], - pd.NA, - ["1"], - ["2.0"], - ] - ) - assert isinstance(result, Series) - tm.assert_almost_equal(result, exp) - - result = mixed.str.rsplit("_", expand=False) - assert isinstance(result, Series) - tm.assert_almost_equal(result, exp) - # regex split is not supported by rsplit values = Series(["a,b_c", "c_d,e", np.nan, "f,g,h"], dtype=any_string_dtype) result = values.str.rsplit("[,_]") @@ -170,6 +97,30 @@ def test_rsplit(any_string_dtype): tm.assert_series_equal(result, exp) +def test_rsplit_object_mixed(): + # mixed + mixed = Series(["a_b_c", np.nan, "d_e_f", True, datetime.today(), None, 1, 2.0]) + result = mixed.str.rsplit("_") + exp = Series( + [ + ["a", "b", "c"], + np.nan, + ["d", "e", "f"], + np.nan, + np.nan, + np.nan, + np.nan, + np.nan, + ] + ) + assert isinstance(result, Series) + tm.assert_almost_equal(result, exp) + + result = mixed.str.rsplit("_", expand=False) + assert isinstance(result, Series) + tm.assert_almost_equal(result, exp) + + def test_split_blank_string(any_string_dtype, request): if any_string_dtype == "arrow_string": reason = "AssertionError: DataFrame are different" From 39dd30a183323101549f3f84479b823b76181d34 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Thu, 29 Apr 2021 13:22:08 +0100 Subject: [PATCH 04/15] add benchmark --- asv_bench/benchmarks/strings.py | 23 +++++++++++++++-------- 1 file changed, 15 insertions(+), 8 deletions(-) diff --git a/asv_bench/benchmarks/strings.py b/asv_bench/benchmarks/strings.py index 45a9053954569..e77f74b90d6c8 100644 --- a/asv_bench/benchmarks/strings.py +++ b/asv_bench/benchmarks/strings.py @@ -230,17 +230,24 @@ def time_contains(self, dtype, regex): class Split: - params = [True, False] - param_names = ["expand"] + params = (["str", "string", "arrow_string"], [None, "-", "--"], [True, False]) + param_names = ["dtype", "pat", "expand"] - def setup(self, expand): - self.s = Series(tm.makeStringIndex(10 ** 5)).str.join("--") + def setup(self, dtype, pat, expand): + from pandas.core.arrays.string_arrow import ArrowStringDtype # noqa: F401 + + if pat is None: + pat = " " + try: + self.s = Series(tm.makeStringIndex(10 ** 5), dtype=dtype).str.join(pat) + except ImportError: + raise NotImplementedError - def time_split(self, expand): - self.s.str.split("--", expand=expand) + def time_split(self, dtype, pat, expand): + self.s.str.split(pat, expand=expand) - def time_rsplit(self, expand): - self.s.str.rsplit("--", expand=expand) + def time_rsplit(self, dtype, pat, expand): + self.s.str.rsplit(pat, expand=expand) class Dummies: From c9511d94463cf135ac3c96a9574e6c685f19b5f5 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Thu, 29 Apr 2021 13:47:16 +0100 Subject: [PATCH 05/15] wip --- pandas/core/arrays/string_arrow.py | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index 24dd5c6814148..d75fb65885122 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -852,14 +852,16 @@ def _str_split(self, pat=None, n=-1, expand=False): is_valid = np.array(result.is_valid()) result = np.array(result) result[~is_valid] = self.dtype.na_value - valid = result[is_valid] - # we need to loop through to avoid numpy indexing assignment errors when - # the result is not a ragged array and interpreted as a 2 dimensional - # array - for i, val in enumerate(valid): - valid[i] = val.tolist() + # if not expand: + # valid = result[is_valid] + # # we need to loop through to avoid numpy indexing assignment errors when + # # the result is not a ragged array and interpreted as a 2 dimensional + # # array + # for i, val in enumerate(valid): + # valid[i] = val.tolist() else: result = np.array(result) - for i, val in enumerate(result): - result[i] = val.tolist() + # if not expand: + # for i, val in enumerate(result): + # result[i] = val.tolist() return result From 5c2ab242f65942b3e70029310dedfbd083f7c70f Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Wed, 12 May 2021 12:57:48 +0100 Subject: [PATCH 06/15] post merge fix-up --- pandas/core/arrays/string_arrow.py | 20 ++++++++++---------- pandas/tests/strings/test_split_partition.py | 7 +------ 2 files changed, 11 insertions(+), 16 deletions(-) diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index bf7fef1a2ee16..8a49b7829ce25 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -919,18 +919,18 @@ def _str_split(self, pat=None, n=-1, expand=False): is_valid = np.array(result.is_valid()) result = np.array(result) result[~is_valid] = self.dtype.na_value - # if not expand: - # valid = result[is_valid] - # # we need to loop through to avoid numpy indexing assignment errors when - # # the result is not a ragged array and interpreted as a 2 dimensional - # # array - # for i, val in enumerate(valid): - # valid[i] = val.tolist() + if not expand: + valid = result[is_valid] + # we need to loop through to avoid numpy indexing assignment errors when + # the result is not a ragged array and interpreted as a 2 dimensional + # array + for i, val in enumerate(valid): + valid[i] = val.tolist() else: result = np.array(result) - # if not expand: - # for i, val in enumerate(result): - # result[i] = val.tolist() + if not expand: + for i, val in enumerate(result): + result[i] = val.tolist() return result def _str_strip(self, to_strip=None): diff --git a/pandas/tests/strings/test_split_partition.py b/pandas/tests/strings/test_split_partition.py index 2e71f86807606..358bab1fbe661 100644 --- a/pandas/tests/strings/test_split_partition.py +++ b/pandas/tests/strings/test_split_partition.py @@ -283,12 +283,7 @@ def test_split_to_multiindex_expand(): idx.str.split("_", expand="not_a_boolean") -def test_rsplit_to_dataframe_expand(any_string_dtype, request): - if any_string_dtype != "object": - reason = 'Attribute "dtype" are different' - mark = pytest.mark.xfail(reason=reason, raises=AssertionError) - request.node.add_marker(mark) - +def test_rsplit_to_dataframe_expand(any_string_dtype): s = Series(["nosplit", "alsonosplit"], dtype=any_string_dtype) result = s.str.rsplit("_", expand=True) exp = DataFrame({0: Series(["nosplit", "alsonosplit"])}, dtype=any_string_dtype) From 12407fbe5dc9a996d64e899f92e5dc01ac4c2646 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Wed, 12 May 2021 13:02:25 +0100 Subject: [PATCH 07/15] remove fixture --- pandas/tests/strings/conftest.py | 24 ------------------------ 1 file changed, 24 deletions(-) diff --git a/pandas/tests/strings/conftest.py b/pandas/tests/strings/conftest.py index 379036932c580..4fedbee91f649 100644 --- a/pandas/tests/strings/conftest.py +++ b/pandas/tests/strings/conftest.py @@ -1,8 +1,6 @@ import numpy as np import pytest -import pandas.util._test_decorators as td - from pandas import Series from pandas.core import strings as strings @@ -175,25 +173,3 @@ def any_allowed_skipna_inferred_dtype(request): # correctness of inference tested in tests/dtypes/test_inference.py return inferred_dtype, values - - -@pytest.fixture( - params=[ - "object", - "string", - pytest.param( - "arrow_string", marks=td.skip_if_no("pyarrow", min_version="1.0.0") - ), - ] -) -def any_string_dtype(request): - """ - Parametrized fixture for string dtypes. - - * 'object' - * 'string' - * 'arrow_string' - """ - from pandas.core.arrays.string_arrow import ArrowStringDtype # noqa: F401 - - return request.param From 24d23951a245de0317a877eda070337fd7aa9b78 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Wed, 12 May 2021 13:04:48 +0100 Subject: [PATCH 08/15] remove xfail (need to fix failing test on blank string before merge) --- pandas/tests/strings/test_split_partition.py | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/pandas/tests/strings/test_split_partition.py b/pandas/tests/strings/test_split_partition.py index 358bab1fbe661..e59105eccc67c 100644 --- a/pandas/tests/strings/test_split_partition.py +++ b/pandas/tests/strings/test_split_partition.py @@ -121,12 +121,7 @@ def test_rsplit_object_mixed(): tm.assert_almost_equal(result, exp) -def test_split_blank_string(any_string_dtype, request): - if any_string_dtype == "arrow_string": - reason = "AssertionError: DataFrame are different" - mark = pytest.mark.xfail(reason=reason, raises=AssertionError) - request.node.add_marker(mark) - +def test_split_blank_string(any_string_dtype): # expand blank split GH 20067 values = Series([""], name="test", dtype=any_string_dtype) result = values.str.split(expand=True) From 3d9297dee776f87309821f2547c1a96b69d16dd2 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Mon, 17 May 2021 15:53:17 +0100 Subject: [PATCH 09/15] seperate benchmark for pattern --- asv_bench/benchmarks/strings.py | 34 ++++++++++++++++++++++++++------- 1 file changed, 27 insertions(+), 7 deletions(-) diff --git a/asv_bench/benchmarks/strings.py b/asv_bench/benchmarks/strings.py index 277f2ec91e58c..a5b39c9da5eee 100644 --- a/asv_bench/benchmarks/strings.py +++ b/asv_bench/benchmarks/strings.py @@ -230,10 +230,30 @@ def time_contains(self, dtype, regex): class Split: - params = (["str", "string", "arrow_string"], [None, "-", "--"], [True, False]) - param_names = ["dtype", "pat", "expand"] + params = (["str", "string", "arrow_string"], [True, False]) + param_names = ["dtype", "expand"] + + def setup(self, dtype, expand): + from pandas.core.arrays.string_arrow import ArrowStringDtype # noqa: F401 + + try: + self.s = Series(tm.makeStringIndex(10 ** 5), dtype=dtype).str.join("--") + except ImportError: + raise NotImplementedError + + def time_split(self, dtype, expand): + self.s.str.split("--", expand=expand) + + def time_rsplit(self, dtype, expand): + self.s.str.rsplit("--", expand=expand) + + +class SplitPattern: + + params = (["str", "string", "arrow_string"], [None, "-"]) + param_names = ["dtype", "pat"] - def setup(self, dtype, pat, expand): + def setup(self, dtype, pat): from pandas.core.arrays.string_arrow import ArrowStringDtype # noqa: F401 if pat is None: @@ -243,11 +263,11 @@ def setup(self, dtype, pat, expand): except ImportError: raise NotImplementedError - def time_split(self, dtype, pat, expand): - self.s.str.split(pat, expand=expand) + def time_split(self, dtype, pat): + self.s.str.split(pat) - def time_rsplit(self, dtype, pat, expand): - self.s.str.rsplit(pat, expand=expand) + def time_rsplit(self, dtype, pat): + self.s.str.rsplit(pat) class Dummies: From a574ccb4c2c0f20385b059b26ed542fed595adba Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Mon, 17 May 2021 16:35:37 +0100 Subject: [PATCH 10/15] use pa_version_under3p0 instead of hasattr --- pandas/core/arrays/string_arrow.py | 20 ++++++++------------ 1 file changed, 8 insertions(+), 12 deletions(-) diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index 8a49b7829ce25..108520551bd85 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -900,20 +900,16 @@ def _str_upper(self): return type(self)(pc.utf8_upper(self._data)) def _str_split(self, pat=None, n=-1, expand=False): + if pa_version_under3p0 or (pat is not None and len(pat) > 1): + return super()._str_split(pat=pat, n=n, expand=expand) + + if n is None or n == 0: + n = -1 + if pat is None: - if hasattr(pc, "utf8_split_whitespace"): - if n is None or n == 0: - n = -1 - result = pc.utf8_split_whitespace(self._data, max_splits=n) - else: - return super()._str_split(pat=pat, n=n, expand=expand) + result = pc.utf8_split_whitespace(self._data, max_splits=n) else: - if len(pat) == 1 and hasattr(pc, "split_pattern"): - if n is None or n == 0: - n = -1 - result = pc.split_pattern(self._data, pattern=pat, max_splits=n) - else: - return super()._str_split(pat=pat, n=n, expand=expand) + result = pc.split_pattern(self._data, pattern=pat, max_splits=n) if result.null_count: is_valid = np.array(result.is_valid()) From 9fc0144bf251dfb1677546c8c88561521e753554 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Tue, 18 May 2021 22:34:58 +0100 Subject: [PATCH 11/15] add test case --- pandas/core/arrays/string_arrow.py | 10 +++++----- pandas/tests/strings/test_split_partition.py | 7 +++++-- 2 files changed, 10 insertions(+), 7 deletions(-) diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index a708b9d8d57a6..17537408cbae1 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -917,16 +917,16 @@ def _str_split(self, pat=None, n=-1, expand=False): result = pc.split_pattern(self._data, pattern=pat, max_splits=n) if result.null_count: - is_valid = np.array(result.is_valid()) + mask = np.array(result.is_null()) result = np.array(result) - result[~is_valid] = self.dtype.na_value + result[mask] = self.dtype.na_value if not expand: - valid = result[is_valid] # we need to loop through to avoid numpy indexing assignment errors when # the result is not a ragged array and interpreted as a 2 dimensional # array - for i, val in enumerate(valid): - valid[i] = val.tolist() + for idx in np.argwhere(~mask): + idx = idx[0] + result[idx] = result[idx].tolist() else: result = np.array(result) if not expand: diff --git a/pandas/tests/strings/test_split_partition.py b/pandas/tests/strings/test_split_partition.py index f3f5acd0d2f1c..b80656f52dbf0 100644 --- a/pandas/tests/strings/test_split_partition.py +++ b/pandas/tests/strings/test_split_partition.py @@ -144,11 +144,14 @@ def test_split_blank_string(any_string_dtype): def test_split_noargs(any_string_dtype): # #1859 + expected = ["Travis", "Oliphant"] + s = Series(["Wes McKinney", "Travis Oliphant"], dtype=any_string_dtype) result = s.str.split() - expected = ["Travis", "Oliphant"] assert result[1] == expected - result = s.str.rsplit() + + s = Series(["Wes McKinney", "Travis Oliphant", np.nan], dtype=any_string_dtype) + result = s.str.split() assert result[1] == expected From 885510013f7a95cde91e879468974c0602a19211 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Tue, 18 May 2021 23:02:41 +0100 Subject: [PATCH 12/15] use ObjectStringArrayMixin._str_map --- pandas/core/arrays/string_arrow.py | 21 +++++---------------- 1 file changed, 5 insertions(+), 16 deletions(-) diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index 17537408cbae1..458b08cf9d4f7 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -916,22 +916,11 @@ def _str_split(self, pat=None, n=-1, expand=False): else: result = pc.split_pattern(self._data, pattern=pat, max_splits=n) - if result.null_count: - mask = np.array(result.is_null()) - result = np.array(result) - result[mask] = self.dtype.na_value - if not expand: - # we need to loop through to avoid numpy indexing assignment errors when - # the result is not a ragged array and interpreted as a 2 dimensional - # array - for idx in np.argwhere(~mask): - idx = idx[0] - result[idx] = result[idx].tolist() - else: - result = np.array(result) - if not expand: - for i, val in enumerate(result): - result[i] = val.tolist() + result = np.array(result) + if not expand: + result = ObjectStringArrayMixin._str_map( + result, lambda x: x.tolist(), na_value=self.dtype.na_value, dtype=object + ) return result def _str_strip(self, to_strip=None): From ad3480fc8ea08264c8a6d8bff514db047ab2de10 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Wed, 19 May 2021 09:47:52 +0100 Subject: [PATCH 13/15] use lib.map_infer_mask --- pandas/core/arrays/string_arrow.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index 458b08cf9d4f7..e1cc9051b847c 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -916,10 +916,15 @@ def _str_split(self, pat=None, n=-1, expand=False): else: result = pc.split_pattern(self._data, pattern=pat, max_splits=n) + mask = np.array(result.is_null()) result = np.array(result) if not expand: - result = ObjectStringArrayMixin._str_map( - result, lambda x: x.tolist(), na_value=self.dtype.na_value, dtype=object + result = lib.map_infer_mask( + result, + lambda x: x.tolist(), + mask.view(np.uint8), + na_value=self.dtype.na_value, + dtype=np.dtype(object), ) return result From 70677c4992a3cb224fc25c059525872fac03ca49 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Wed, 19 May 2021 10:06:01 +0100 Subject: [PATCH 14/15] update benchmark --- asv_bench/benchmarks/strings.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/asv_bench/benchmarks/strings.py b/asv_bench/benchmarks/strings.py index 83f80c930f007..700393cc72492 100644 --- a/asv_bench/benchmarks/strings.py +++ b/asv_bench/benchmarks/strings.py @@ -235,18 +235,18 @@ class Split(Dtypes): def setup(self, dtype, expand): super().setup(dtype) - self.s = self.s.str.join("--") + self.s = self.s.str.join("-") def time_split(self, dtype, expand): - self.s.str.split("--", expand=expand) + self.s.str.split("-", expand=expand) def time_rsplit(self, dtype, expand): - self.s.str.rsplit("--", expand=expand) + self.s.str.rsplit("-", expand=expand) class SplitPattern(Dtypes): - params = (Dtypes.params, [None, "-"]) + params = (Dtypes.params, [None, "--"]) param_names = ["dtype", "pat"] def setup(self, dtype, pat): From af580558c5f3ef01b3a82aa811cc193aad78189c Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Wed, 19 May 2021 10:48:26 +0100 Subject: [PATCH 15/15] always convert to lists --- pandas/core/arrays/string_arrow.py | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index e1cc9051b847c..3b20df5fdf82b 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -918,14 +918,13 @@ def _str_split(self, pat=None, n=-1, expand=False): mask = np.array(result.is_null()) result = np.array(result) - if not expand: - result = lib.map_infer_mask( - result, - lambda x: x.tolist(), - mask.view(np.uint8), - na_value=self.dtype.na_value, - dtype=np.dtype(object), - ) + result = lib.map_infer_mask( + result, + lambda x: x.tolist(), + mask.view(np.uint8), + na_value=self.dtype.na_value, + dtype=np.dtype(object), + ) return result def _str_strip(self, to_strip=None):