From c548c67f15066638eae4469af39999d5e6255f11 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Thu, 12 Dec 2019 09:04:54 -0600 Subject: [PATCH 1/5] StringArray comparisions return BooleanArray xref https://github.com/pandas-dev/pandas/issues/29556 --- doc/source/user_guide/text.rst | 4 +++ pandas/core/arrays/string_.py | 31 +++++++++++++++----- pandas/core/ops/__init__.py | 33 +++++++++++++++++++++- pandas/tests/arrays/string_/test_string.py | 33 ++++++++++++++++++++++ pandas/tests/extension/test_string.py | 2 +- 5 files changed, 94 insertions(+), 9 deletions(-) diff --git a/doc/source/user_guide/text.rst b/doc/source/user_guide/text.rst index 072871f89bdae..fcbe0a5b35fb5 100644 --- a/doc/source/user_guide/text.rst +++ b/doc/source/user_guide/text.rst @@ -94,6 +94,10 @@ l. For ``StringDtype``, :ref:`string accessor methods` 2. Some string methods, like :meth:`Series.str.decode` are not available on ``StringArray`` because ``StringArray`` only holds strings, not bytes. +3. In comparision operations, :class:`StringArray` and ``Series`` backed + by a ``StringArray`` will return a :class:`BooleanArray`, rather than + a ``bool`` or ``object`` dtype array, depending on whether there are + missing values. Everything else that follows in the rest of this document applies equally to diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index 2de19a3319cc5..8eea825a7fe2a 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -120,6 +120,10 @@ class StringArray(PandasArray): copy : bool, default False Whether to copy the array of data. + Notes + ----- + StringArray returns a BooleanArray for comparison methods. + Attributes ---------- None @@ -148,6 +152,13 @@ class StringArray(PandasArray): Traceback (most recent call last): ... ValueError: StringArray requires an object-dtype ndarray of strings. + + For comparision methods, this returns a :class:`pandas.BooleanArray` + + >>> pd.array(["a", None, "c"], dtype="string") == "a" + + [True, NA, False] + Length: 3, dtype: boolean """ # undo the PandasArray hack @@ -255,7 +266,12 @@ def value_counts(self, dropna=False): # Overrride parent because we have different return types. @classmethod def _create_arithmetic_method(cls, op): + # Note: this handles both arithmetic and comparison methods. def method(self, other): + from pandas.arrays import BooleanArray + + assert op.__name__ in ops.ARITHMETIC_BINOPS | ops.COMPARISON_BINOPS + if isinstance(other, (ABCIndexClass, ABCSeries, ABCDataFrame)): return NotImplemented @@ -275,15 +291,16 @@ def method(self, other): other = np.asarray(other) other = other[valid] - result = np.empty_like(self._ndarray, dtype="object") - result[mask] = StringDtype.na_value - result[valid] = op(self._ndarray[valid], other) - - if op.__name__ in {"add", "radd", "mul", "rmul"}: + if op.__name__ in ops.ARITHMETIC_BINOPS: + result = np.empty_like(self._ndarray, dtype="object") + result[mask] = StringDtype.na_value + result[valid] = op(self._ndarray[valid], other) return StringArray(result) else: - dtype = "object" if mask.any() else "bool" - return np.asarray(result, dtype=dtype) + # logical + result = np.zeros(len(self._ndarray), dtype="bool") + result[valid] = op(self._ndarray[valid], other) + return BooleanArray(result, mask) return compat.set_function_name(method, f"__{op.__name__}__", cls) diff --git a/pandas/core/ops/__init__.py b/pandas/core/ops/__init__.py index f3c01efed6d43..af43ea8a8bc30 100644 --- a/pandas/core/ops/__init__.py +++ b/pandas/core/ops/__init__.py @@ -5,7 +5,7 @@ """ import datetime import operator -from typing import Tuple, Union +from typing import Set, Tuple, Union import numpy as np @@ -59,6 +59,37 @@ rxor, ) +# ----------------------------------------------------------------------------- +# constants +ARITHMETIC_BINOPS: Set[str] = { + "add", + "sub", + "mul", + "pow", + "mod", + "floordiv", + "truediv", + "divmod", + "radd", + "rsub", + "rmul", + "rpow", + "rmod", + "rfloordiv", + "rtruediv", + "rdivmod", +} + + +COMPARISON_BINOPS: Set[str] = { + "eq", + "ne", + "lt", + "gt", + "le", + "ge", +} + # ----------------------------------------------------------------------------- # Ops Wrapping Utilities diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py index 0dfd75a2042b0..03f3e9828d6f9 100644 --- a/pandas/tests/arrays/string_/test_string.py +++ b/pandas/tests/arrays/string_/test_string.py @@ -154,6 +154,39 @@ def test_add_frame(): tm.assert_frame_equal(result, expected) +def test_comparison_methods_scalar(all_compare_operators): + op_name = all_compare_operators + + a = pd.array(["a", None, "c"], dtype="string") + other = "a" + result = getattr(a, op_name)(other) + expected = np.array([getattr(item, op_name)(other) for item in a], dtype=object) + expected[1] = None + expected = pd.array(expected, dtype="boolean") + tm.assert_extension_array_equal(result, expected) + + result = getattr(a, op_name)(pd.NA) + expected = pd.array([None, None, None], dtype="boolean") + tm.assert_extension_array_equal(result, expected) + + +def test_comparison_methods_array(all_compare_operators): + op_name = all_compare_operators + + a = pd.array(["a", None, "c"], dtype="string") + other = [None, None, "c"] + result = getattr(a, op_name)(other) + expected = np.empty_like(a, dtype="object") + expected[:2] = None + expected[-1] = getattr(other[-1], op_name)(a[-1]) + expected = pd.array(expected, dtype="boolean") + tm.assert_extension_array_equal(result, expected) + + result = getattr(a, op_name)(pd.NA) + expected = pd.array([None, None, None], dtype="boolean") + tm.assert_extension_array_equal(result, expected) + + def test_constructor_raises(): with pytest.raises(ValueError, match="sequence of strings"): pd.arrays.StringArray(np.array(["a", "b"], dtype="S1")) diff --git a/pandas/tests/extension/test_string.py b/pandas/tests/extension/test_string.py index 471a1b79d23bc..8519c2999ade3 100644 --- a/pandas/tests/extension/test_string.py +++ b/pandas/tests/extension/test_string.py @@ -91,7 +91,7 @@ class TestCasting(base.BaseCastingTests): class TestComparisonOps(base.BaseComparisonOpsTests): def _compare_other(self, s, data, op_name, other): result = getattr(s, op_name)(other) - expected = getattr(s.astype(object), op_name)(other) + expected = getattr(s.astype(object), op_name)(other).astype("boolean") self.assert_series_equal(result, expected) def test_compare_scalar(self, data, all_compare_operators): From f4ed269518361ccdd0a51930ee78193df24786c7 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Thu, 12 Dec 2019 09:15:49 -0600 Subject: [PATCH 2/5] update docs --- doc/source/user_guide/text.rst | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/doc/source/user_guide/text.rst b/doc/source/user_guide/text.rst index fcbe0a5b35fb5..8ea7f547076f5 100644 --- a/doc/source/user_guide/text.rst +++ b/doc/source/user_guide/text.rst @@ -95,9 +95,9 @@ l. For ``StringDtype``, :ref:`string accessor methods` on ``StringArray`` because ``StringArray`` only holds strings, not bytes. 3. In comparision operations, :class:`StringArray` and ``Series`` backed - by a ``StringArray`` will return a :class:`BooleanArray`, rather than - a ``bool`` or ``object`` dtype array, depending on whether there are - missing values. + by a ``StringArray`` will return an object with :class:`BooleanDtype`, + rather than a ``bool`` or ``object`` dtype object, depending on whether + there are missing values. Everything else that follows in the rest of this document applies equally to From 240c6ee8045f361de98f9a534e34ed2e4559e2f0 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Thu, 12 Dec 2019 09:57:30 -0600 Subject: [PATCH 3/5] fixups --- doc/source/user_guide/text.rst | 11 ++++++----- pandas/tests/arrays/string_/test_string.py | 2 -- 2 files changed, 6 insertions(+), 7 deletions(-) diff --git a/doc/source/user_guide/text.rst b/doc/source/user_guide/text.rst index 8ea7f547076f5..c9cf5669621c8 100644 --- a/doc/source/user_guide/text.rst +++ b/doc/source/user_guide/text.rst @@ -94,11 +94,12 @@ l. For ``StringDtype``, :ref:`string accessor methods` 2. Some string methods, like :meth:`Series.str.decode` are not available on ``StringArray`` because ``StringArray`` only holds strings, not bytes. -3. In comparision operations, :class:`StringArray` and ``Series`` backed - by a ``StringArray`` will return an object with :class:`BooleanDtype`, - rather than a ``bool`` or ``object`` dtype object, depending on whether - there are missing values. - +3. In comparision operations, :class:`arrays.StringArray` and ``Series`` backed + by a ``StringArray`` will return an object with :class:`arrays.BooleanDtype`, + rather than a ``bool`` dtype object, depending on whether + there are missing values. Missing values in a ``StringArray`` will propagate + in comparision operations, rather than always comparing unequal like + :attr:`numpy.nan`. Everything else that follows in the rest of this document applies equally to ``string`` and ``object`` dtype. diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py index 03f3e9828d6f9..0544ee4002890 100644 --- a/pandas/tests/arrays/string_/test_string.py +++ b/pandas/tests/arrays/string_/test_string.py @@ -161,7 +161,6 @@ def test_comparison_methods_scalar(all_compare_operators): other = "a" result = getattr(a, op_name)(other) expected = np.array([getattr(item, op_name)(other) for item in a], dtype=object) - expected[1] = None expected = pd.array(expected, dtype="boolean") tm.assert_extension_array_equal(result, expected) @@ -177,7 +176,6 @@ def test_comparison_methods_array(all_compare_operators): other = [None, None, "c"] result = getattr(a, op_name)(other) expected = np.empty_like(a, dtype="object") - expected[:2] = None expected[-1] = getattr(other[-1], op_name)(a[-1]) expected = pd.array(expected, dtype="boolean") tm.assert_extension_array_equal(result, expected) From 5a860e10cea7780758639658f9e323520269a1a1 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Thu, 12 Dec 2019 12:08:36 -0600 Subject: [PATCH 4/5] fixup --- pandas/core/arrays/string_.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index 8eea825a7fe2a..3bad7f0162f44 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -120,10 +120,6 @@ class StringArray(PandasArray): copy : bool, default False Whether to copy the array of data. - Notes - ----- - StringArray returns a BooleanArray for comparison methods. - Attributes ---------- None @@ -138,6 +134,10 @@ class StringArray(PandasArray): The string methods are available on Series backed by a StringArray. + Notes + ----- + StringArray returns a BooleanArray for comparison methods. + Examples -------- >>> pd.array(['This is', 'some text', None, 'data.'], dtype="string") From e3b10ac62c5531843e45b5cec5dbace801f61455 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Mon, 16 Dec 2019 08:16:22 -0600 Subject: [PATCH 5/5] fix docs --- doc/source/user_guide/text.rst | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/doc/source/user_guide/text.rst b/doc/source/user_guide/text.rst index c9cf5669621c8..ff0474dbecbb4 100644 --- a/doc/source/user_guide/text.rst +++ b/doc/source/user_guide/text.rst @@ -95,11 +95,10 @@ l. For ``StringDtype``, :ref:`string accessor methods` on ``StringArray`` because ``StringArray`` only holds strings, not bytes. 3. In comparision operations, :class:`arrays.StringArray` and ``Series`` backed - by a ``StringArray`` will return an object with :class:`arrays.BooleanDtype`, - rather than a ``bool`` dtype object, depending on whether - there are missing values. Missing values in a ``StringArray`` will propagate - in comparision operations, rather than always comparing unequal like - :attr:`numpy.nan`. + by a ``StringArray`` will return an object with :class:`BooleanDtype`, + rather than a ``bool`` dtype object. Missing values in a ``StringArray`` + will propagate in comparision operations, rather than always comparing + unequal like :attr:`numpy.nan`. Everything else that follows in the rest of this document applies equally to ``string`` and ``object`` dtype.