diff --git a/asv_bench/benchmarks/pandas_vb_common.py b/asv_bench/benchmarks/pandas_vb_common.py index 4bd56ccb1b5ce..d1dcbbb0063f1 100644 --- a/asv_bench/benchmarks/pandas_vb_common.py +++ b/asv_bench/benchmarks/pandas_vb_common.py @@ -44,7 +44,7 @@ pd.UInt16Dtype, pd.UInt32Dtype, pd.UInt64Dtype, - pd.CategoricalDtype, + pd.CategoricalDtype(), pd.IntervalDtype, pd.DatetimeTZDtype("ns", "UTC"), pd.PeriodDtype("D"), diff --git a/doc/source/user_guide/categorical.rst b/doc/source/user_guide/categorical.rst index e486235f044f5..92881d8a3c550 100644 --- a/doc/source/user_guide/categorical.rst +++ b/doc/source/user_guide/categorical.rst @@ -138,7 +138,18 @@ behavior: 1. Categories are inferred from the data. 2. Categories are unordered. -To control those behaviors, instead of passing ``'category'``, use an instance +It is also possible to give a dtype inside square brackets to specify the dtype of the categories, like this: + +.. ipython:: python + + s = pd.Series(["a", "b", "c", "a"], dtype="category[string]") + s.dtype.categories + +.. versionadded:: 2.1.0 + + The ability to specify the categories dtype in the dtype string was added in :ref:`v2.1.0 <whatsnew_210.enhancements.category_subtype>`. + +To control those behaviors even more, instead of passing ``'category'``, use an instance of :class:`~pandas.api.types.CategoricalDtype`. .. ipython:: python @@ -146,7 +157,7 @@ of :class:`~pandas.api.types.CategoricalDtype`. from pandas.api.types import CategoricalDtype s = pd.Series(["a", "b", "c", "a"]) - cat_type = CategoricalDtype(categories=["b", "c", "d"], ordered=True) + cat_type = CategoricalDtype(["b", "c", "d"], ordered=True, categories_dtype="string") s_cat = s.astype(cat_type) s_cat @@ -257,11 +268,18 @@ unordered categoricals, the order of the ``categories`` is not considered. # Unequal, since the second CategoricalDtype is ordered c1 == CategoricalDtype(["a", "b", "c"], ordered=True) -All instances of ``CategoricalDtype`` compare equal to the string ``'category'``. 
+All instances of ``CategoricalDtype`` compare equal to the string ``'category'`` and to the +string ``'category'`` followed by their categories dtype in square brackets. .. ipython:: python c1 == "category" + c1 == "category[object]" + +.. versionadded:: 2.1.0 + + The ability to specify the categories dtype inside square brackets in the dtype + string was added in :ref:`v2.1.0 <whatsnew_210.enhancements.category_subtype>`. Description ----------- diff --git a/doc/source/whatsnew/v2.1.0.rst b/doc/source/whatsnew/v2.1.0.rst index 2c5263f447951..c7b26e2a28d3c 100644 --- a/doc/source/whatsnew/v2.1.0.rst +++ b/doc/source/whatsnew/v2.1.0.rst @@ -14,12 +14,39 @@ including other versions of pandas. Enhancements ~~~~~~~~~~~~ -.. _whatsnew_210.enhancements.enhancement1: +.. _whatsnew_210.enhancements.category_subtype: + +Specific categorical string dtypes like ``dtype="category[string]"`` now work +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +When giving the string ``"category"`` as a dtype it is now possible to specify the dtype +of the categories as part of the dtype string: + +.. ipython:: python + + ser = pd.Series(["a", "b", np.nan], dtype="category[string]") + ser + +The expression inside the brackets can be any string that pandas accepts for a dtype and +whose data can be stored in an :class:`Index` (:issue:`48515`). + +The categories dtype will also now be part of the dtype repr: + +.. ipython:: python + + df = pd.DataFrame({"a": ser, "b": pd.array([1, 2, 3], dtype="category[Int8]")}) + df.dtypes + +We can now also compare categorical dtypes to a string with the dtype of the categories inside brackets in order to get more precise comparisons: + +.. ipython:: python + + ser.dtype == "category[string]" + ser.dtype == "category" # also works, but doesn't check the categories dtype + ser.dtype == "category[object]" # False, wrong categories dtype -enhancement1 -^^^^^^^^^^^^ -.. _whatsnew_210.enhancements.enhancement2: +.. 
_whatsnew_210.enhancements.map_na_action: ``map(func, na_action="ignore")`` now works for all array types ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -96,6 +123,7 @@ Other enhancements - Improved error message when creating a DataFrame with empty data (0 rows), no index and an incorrect number of columns. (:issue:`52084`) - Let :meth:`DataFrame.to_feather` accept a non-default :class:`Index` and non-string column names (:issue:`51787`) - Performance improvement in :func:`read_csv` (:issue:`52632`) with ``engine="c"`` +- :class:`CategoricalDtype` has gotten a new parameter and attribute named :attr:`CategoricalDtype.categories_dtype` (:issue:`48515`) - :meth:`Categorical.from_codes` has gotten a ``validate`` parameter (:issue:`50975`) - Added ``engine_kwargs`` parameter to :meth:`DataFrame.to_excel` (:issue:`53220`) - Performance improvement in :func:`concat` with homogeneous ``np.float64`` or ``np.float32`` dtypes (:issue:`52685`) diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index bc2886e5b531c..a634290fd3d62 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -1388,6 +1388,12 @@ cdef object _try_infer_map(object dtype): val = getattr(dtype, attr, None) if val in _TYPE_MAP: return _TYPE_MAP[val] + + # CategoricalDtype may have name category[dtype], so not caught above; name may be absent (None) + name = getattr(dtype, "name", None) + if isinstance(name, str) and name.startswith("category["): + return _TYPE_MAP["category"] + return None diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 6eb21fae29612..c85cab583b883 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -453,7 +453,7 @@ def __init__( ) from err # we're inferring from values - dtype = CategoricalDtype(categories, dtype.ordered) + dtype = CategoricalDtype(categories, dtype.ordered, dtype.categories_dtype) elif isinstance(values.dtype, CategoricalDtype): old_codes = extract_array(values)._codes @@ -1999,7 +1999,7 @@ def 
_repr_categories_info(self) -> str: def _repr_footer(self) -> str: info = self._repr_categories_info() - return f"Length: {len(self)}\n{info}" + return f"Length: {len(self)}, dtype: {self.dtype}\n{info}" def _get_repr( self, length: bool = True, na_rep: str = "NaN", footer: bool = True @@ -2524,7 +2524,7 @@ class CategoricalAccessor(PandasDelegate, PandasObject, NoNewAttributesMixin): 3 c 4 c 5 c - dtype: category + dtype: category[object] Categories (3, object): ['a', 'b', 'c'] >>> s.cat.categories @@ -2537,7 +2537,7 @@ class CategoricalAccessor(PandasDelegate, PandasObject, NoNewAttributesMixin): 3 a 4 a 5 a - dtype: category + dtype: category[object] Categories (3, object): ['c', 'b', 'a'] >>> s.cat.reorder_categories(list("cba")) @@ -2547,7 +2547,7 @@ class CategoricalAccessor(PandasDelegate, PandasObject, NoNewAttributesMixin): 3 c 4 c 5 c - dtype: category + dtype: category[object] Categories (3, object): ['c', 'b', 'a'] >>> s.cat.add_categories(["d", "e"]) @@ -2557,7 +2557,7 @@ class CategoricalAccessor(PandasDelegate, PandasObject, NoNewAttributesMixin): 3 c 4 c 5 c - dtype: category + dtype: category[object] Categories (5, object): ['a', 'b', 'c', 'd', 'e'] >>> s.cat.remove_categories(["a", "c"]) @@ -2567,7 +2567,7 @@ class CategoricalAccessor(PandasDelegate, PandasObject, NoNewAttributesMixin): 3 NaN 4 NaN 5 NaN - dtype: category + dtype: category[object] Categories (1, object): ['b'] >>> s1 = s.cat.add_categories(["d", "e"]) @@ -2578,7 +2578,7 @@ class CategoricalAccessor(PandasDelegate, PandasObject, NoNewAttributesMixin): 3 c 4 c 5 c - dtype: category + dtype: category[object] Categories (3, object): ['a', 'b', 'c'] >>> s.cat.set_categories(list("abcde")) @@ -2588,7 +2588,7 @@ class CategoricalAccessor(PandasDelegate, PandasObject, NoNewAttributesMixin): 3 c 4 c 5 c - dtype: category + dtype: category[object] Categories (5, object): ['a', 'b', 'c', 'd', 'e'] >>> s.cat.as_ordered() @@ -2598,7 +2598,7 @@ class CategoricalAccessor(PandasDelegate, 
PandasObject, NoNewAttributesMixin): 3 c 4 c 5 c - dtype: category + dtype: category[object] Categories (3, object): ['a' < 'b' < 'c'] >>> s.cat.as_unordered() @@ -2608,7 +2608,7 @@ class CategoricalAccessor(PandasDelegate, PandasObject, NoNewAttributesMixin): 3 c 4 c 5 c - dtype: category + dtype: category[object] Categories (3, object): ['a', 'b', 'c'] """ diff --git a/pandas/core/dtypes/common.py b/pandas/core/dtypes/common.py index 3931b12e06f9b..376a6ee61079c 100644 --- a/pandas/core/dtypes/common.py +++ b/pandas/core/dtypes/common.py @@ -502,9 +502,8 @@ def is_categorical_dtype(arr_or_dtype) -> bool: FutureWarning, stacklevel=find_stack_level(), ) - if isinstance(arr_or_dtype, ExtensionDtype): - # GH#33400 fastpath for dtype object - return arr_or_dtype.name == "category" + if isinstance(arr_or_dtype, CategoricalDtype): + return True if arr_or_dtype is None: return False diff --git a/pandas/core/dtypes/dtypes.py b/pandas/core/dtypes/dtypes.py index 2d0ec66dbc9cb..0c4f87550e883 100644 --- a/pandas/core/dtypes/dtypes.py +++ b/pandas/core/dtypes/dtypes.py @@ -152,13 +152,17 @@ class CategoricalDtype(PandasExtensionDtype, ExtensionDtype): ---------- categories : sequence, optional Must be unique, and must not contain any nulls. - The categories are stored in an Index, - and if an index is provided the dtype of that index will be used. + The categories are stored in an Index. ordered : bool or None, default False Whether or not this categorical is treated as a ordered categorical. None can be used to maintain the ordered value of existing categoricals when used in operations that combine categoricals, e.g. astype, and will resolve to False if there is no existing ordered to maintain. + categories_dtype : dtype, optional + If given, will be the dtype of the categories. + If not given, the categories dtype will be inferred. + + .. 
versionadded:: 2.1.0 Attributes ---------- @@ -187,7 +191,7 @@ class CategoricalDtype(PandasExtensionDtype, ExtensionDtype): 1 b 2 a 3 NaN - dtype: category + dtype: category[object] Categories (2, object): ['b' < 'a'] An empty CategoricalDtype with a specific dtype can be created @@ -198,15 +202,25 @@ class CategoricalDtype(PandasExtensionDtype, ExtensionDtype): """ # TODO: Document public vs. private API - name = "category" type: type[CategoricalDtypeType] = CategoricalDtypeType kind: str_type = "O" str = "|O08" base = np.dtype("O") _metadata = ("categories", "ordered") _cache_dtypes: dict[str_type, PandasExtensionDtype] = {} + _categories_dtype: Dtype + _match = re.compile(r"category\[(?P<categories_dtype>.+)\]") + + def __init__( + self, + categories=None, + ordered: Ordered = False, + categories_dtype: Dtype | None = None, + ) -> None: + if categories_dtype is not None: + from pandas.core.dtypes.common import pandas_dtype - def __init__(self, categories=None, ordered: Ordered = False) -> None: + self._categories_dtype = pandas_dtype(categories_dtype) self._finalize(categories, ordered, fastpath=False) @classmethod @@ -300,12 +314,12 @@ def _from_values_or_dtype( if dtype is not None: # The dtype argument takes precedence over values.dtype (if any) if isinstance(dtype, str): - if dtype == "category": + if dtype == "category" or cls._match.fullmatch(dtype): if ordered is None and cls.is_dtype(values): # GH#49309 preserve orderedness ordered = values.dtype.ordered - - dtype = CategoricalDtype(categories, ordered) + cat_dtype = cls._get_categories_dtype_from_string(dtype) + dtype = CategoricalDtype(categories, ordered, cat_dtype) else: raise ValueError(f"Unknown dtype {repr(dtype)}") elif categories is not None or ordered is not None: @@ -352,12 +366,31 @@ def construct_from_string(cls, string: str_type) -> CategoricalDtype: raise TypeError( f"'construct_from_string' expects a string, got {type(string)}" ) - if string != cls.name: - raise TypeError(f"Cannot construct a 'CategoricalDtype' from 
'{string}'") # need ordered=None to ensure that operations specifying dtype="category" don't # override the ordered value for existing categoricals - return cls(ordered=None) + + if string == "category": + return cls(ordered=None) + + msg = f"Cannot construct a '{cls.__name__}' from '{string}'" + categories_dtype = cls._get_categories_dtype_from_string(string) + if categories_dtype is None: + raise TypeError(msg) + try: + return cls(categories_dtype=categories_dtype) + except (KeyError, TypeError, ValueError) as err: + # keyError is if "categories_dtype" key is not found + # TypeError if we pass a nonsense; + raise TypeError(msg) from err + + @classmethod + def _get_categories_dtype_from_string(cls, string: str_type) -> str_type | None: + match = cls._match.match(string) + if match is None: + return None + d = match.groupdict() + return d.get("categories_dtype") def _finalize(self, categories, ordered: Ordered, fastpath: bool = False) -> None: if ordered is not None: @@ -401,7 +434,7 @@ def __eq__(self, other: Any) -> bool: 6) Any other comparison returns False """ if isinstance(other, str): - return other == self.name + return other in [self.name, "category"] elif other is self: return True elif not (hasattr(other, "ordered") and hasattr(other, "categories")): @@ -451,20 +484,27 @@ def __eq__(self, other: Any) -> bool: def __repr__(self) -> str_type: if self.categories is None: data = "None" - dtype = "None" else: data = self.categories._format_data(name=type(self).__name__) if data is None: # self.categories is RangeIndex data = str(self.categories._range) data = data.rstrip(", ") - dtype = self.categories.dtype return ( f"CategoricalDtype(categories={data}, ordered={self.ordered}, " - f"categories_dtype={dtype})" + f"categories_dtype={self.categories_dtype})" ) + @property + def name(self) -> str_type: + if self.categories is not None: + return f"category[{self.categories.dtype}]" + elif self.categories_dtype is not None: + return 
f"category[{self.categories_dtype}]" + else: + return "category" + @cache_readonly def _hash_categories(self) -> int: from pandas.core.util.hashing import ( @@ -537,8 +577,7 @@ def validate_ordered(ordered: Ordered) -> None: if not is_bool(ordered): raise TypeError("'ordered' must either be 'True' or 'False'") - @staticmethod - def validate_categories(categories, fastpath: bool = False) -> Index: + def validate_categories(self, categories, fastpath: bool = False) -> Index: """ Validates that we have good categories @@ -558,8 +597,11 @@ def validate_categories(categories, fastpath: bool = False) -> Index: raise TypeError( f"Parameter 'categories' must be list-like, was {repr(categories)}" ) + dtype = self.categories_dtype if not isinstance(categories, ABCIndex): - categories = Index._with_infer(categories, tupleize_cols=False) + categories = Index._with_infer(categories, dtype=dtype, tupleize_cols=False) + elif dtype is not None: + categories = categories.astype(dtype) if not fastpath: if categories.hasnans: @@ -613,6 +655,13 @@ def categories(self) -> Index: """ return self._categories + @property + def categories_dtype(self) -> Dtype | None: + try: + return self.categories.dtype + except AttributeError: + return getattr(self, "_categories_dtype", None) + @property def ordered(self) -> Ordered: """ diff --git a/pandas/core/generic.py b/pandas/core/generic.py index bcfbfa1a2b713..5a712797f7a4d 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -6343,7 +6343,7 @@ def astype( >>> ser.astype('category') 0 1 1 2 - dtype: category + dtype: category[int32] Categories (2, int32): [1, 2] Convert to ordered categorical type with custom ordering: @@ -6354,7 +6354,7 @@ def astype( >>> ser.astype(cat_dtype) 0 1 1 2 - dtype: category + dtype: category[int64] Categories (2, int64): [2 < 1] Create a series of dates: diff --git a/pandas/core/indexes/category.py b/pandas/core/indexes/category.py index d2ef607635abb..fb68f207d287c 100644 --- 
a/pandas/core/indexes/category.py +++ b/pandas/core/indexes/category.py @@ -147,16 +147,16 @@ class CategoricalIndex(NDArrayBackedExtensionIndex): Examples -------- - >>> pd.CategoricalIndex(["a", "b", "c", "a", "b", "c"]) + >>> pd.CategoricalIndex(["a", "b", "c", "a", "b", "c"], ordered=True) CategoricalIndex(['a', 'b', 'c', 'a', 'b', 'c'], - categories=['a', 'b', 'c'], ordered=False, dtype='category') + categories=['a', 'b', 'c'], ordered=True, dtype='category[object]') ``CategoricalIndex`` can also be instantiated from a ``Categorical``: - >>> c = pd.Categorical(["a", "b", "c", "a", "b", "c"]) + >>> c = pd.Categorical(["a", "b", "c", "a", "b", "c"], ordered=True) >>> pd.CategoricalIndex(c) CategoricalIndex(['a', 'b', 'c', 'a', 'b', 'c'], - categories=['a', 'b', 'c'], ordered=False, dtype='category') + categories=['a', 'b', 'c'], ordered=True, dtype='category[object]') Ordered ``CategoricalIndex`` can have a min and max value. @@ -165,7 +165,7 @@ class CategoricalIndex(NDArrayBackedExtensionIndex): ... 
) >>> ci CategoricalIndex(['a', 'b', 'c', 'a', 'b', 'c'], - categories=['c', 'b', 'a'], ordered=True, dtype='category') + categories=['c', 'b', 'a'], ordered=True, dtype='category[object]') >>> ci.min() 'c' """ @@ -438,13 +438,13 @@ def map(self, mapper, na_action: Literal["ignore"] | None = None): >>> idx = pd.CategoricalIndex(['a', 'b', 'c']) >>> idx CategoricalIndex(['a', 'b', 'c'], categories=['a', 'b', 'c'], - ordered=False, dtype='category') + ordered=False, dtype='category[object]') >>> idx.map(lambda x: x.upper()) CategoricalIndex(['A', 'B', 'C'], categories=['A', 'B', 'C'], - ordered=False, dtype='category') + ordered=False, dtype='category[object]') >>> idx.map({'a': 'first', 'b': 'second', 'c': 'third'}) CategoricalIndex(['first', 'second', 'third'], categories=['first', - 'second', 'third'], ordered=False, dtype='category') + 'second', 'third'], ordered=False, dtype='category[object]') If the mapping is one-to-one the ordering of the categories is preserved: @@ -452,10 +452,10 @@ def map(self, mapper, na_action: Literal["ignore"] | None = None): >>> idx = pd.CategoricalIndex(['a', 'b', 'c'], ordered=True) >>> idx CategoricalIndex(['a', 'b', 'c'], categories=['a', 'b', 'c'], - ordered=True, dtype='category') + ordered=True, dtype='category[object]') >>> idx.map({'a': 3, 'b': 2, 'c': 1}) CategoricalIndex([3, 2, 1], categories=[3, 2, 1], ordered=True, - dtype='category') + dtype='category[int64]') If the mapping is not one-to-one an :class:`~pandas.Index` is returned: diff --git a/pandas/core/reshape/tile.py b/pandas/core/reshape/tile.py index 83d004c8b8e3e..2e44e8eba16d5 100644 --- a/pandas/core/reshape/tile.py +++ b/pandas/core/reshape/tile.py @@ -199,7 +199,7 @@ def cut( c (4.667, 7.333] d (7.333, 10.0] e (7.333, 10.0] - dtype: category + dtype: category[interval[float64, right]] Categories (3, interval[float64, right]): [(1.992, 4.667] < (4.667, ... Passing a Series as an input returns a Series with mapping value. 
diff --git a/pandas/tests/arrays/categorical/test_operators.py b/pandas/tests/arrays/categorical/test_operators.py index de88960280102..13f67b0d9ea6c 100644 --- a/pandas/tests/arrays/categorical/test_operators.py +++ b/pandas/tests/arrays/categorical/test_operators.py @@ -169,7 +169,7 @@ def test_comparison_with_unknown_scalars(self): # for unequal comps, but not for equal/not equal cat = Categorical([1, 2, 3], ordered=True) - msg = "Invalid comparison between dtype=category and int" + msg = r"Invalid comparison between dtype=category\[int64\] and int" with pytest.raises(TypeError, match=msg): cat < 4 with pytest.raises(TypeError, match=msg): @@ -398,6 +398,6 @@ def test_numeric_like_ops_series_arith(self, op, str_rep): def test_numeric_like_ops_series_invalid(self): # invalid ufunc s = Series(Categorical([1, 2, 3, 4])) - msg = "Object with dtype category cannot perform the numpy op log" + msg = r"Object with dtype category\[int64\] cannot perform the numpy op log" with pytest.raises(TypeError, match=msg): np.log(s) diff --git a/pandas/tests/arrays/categorical/test_repr.py b/pandas/tests/arrays/categorical/test_repr.py index cdf5d967d9c3d..101efc28279a4 100644 --- a/pandas/tests/arrays/categorical/test_repr.py +++ b/pandas/tests/arrays/categorical/test_repr.py @@ -30,7 +30,7 @@ def test_big_print(self): factor = Categorical.from_codes(codes, dtype=dtype) expected = [ "['a', 'b', 'c', 'a', 'b', ..., 'b', 'c', 'a', 'b', 'c']", - "Length: 600", + "Length: 600, dtype: category[object]", "Categories (3, object): ['a', 'b', 'c']", ] expected = "\n".join(expected) @@ -60,7 +60,7 @@ def test_print_none_width(self): a = Series(Categorical([1, 2, 3, 4])) exp = ( "0 1\n1 2\n2 3\n3 4\n" - "dtype: category\nCategories (4, int64): [1, 2, 3, 4]" + "dtype: category[int64]\nCategories (4, int64): [1, 2, 3, 4]" ) with option_context("display.width", None): @@ -70,7 +70,7 @@ def test_unicode_print(self): c = Categorical(["aaaaa", "bb", "cccc"] * 20) expected = """\ ['aaaaa', 'bb', 
'cccc', 'aaaaa', 'bb', ..., 'bb', 'cccc', 'aaaaa', 'bb', 'cccc'] -Length: 60 +Length: 60, dtype: category[object] Categories (3, object): ['aaaaa', 'bb', 'cccc']""" assert repr(c) == expected @@ -78,7 +78,7 @@ def test_unicode_print(self): c = Categorical(["ああああ", "いいいいい", "ううううううう"] * 20) expected = """\ ['ああああ', 'いいいいい', 'ううううううう', 'ああああ', 'いいいいい', ..., 'いいいいい', 'ううううううう', 'ああああ', 'いいいいい', 'ううううううう'] -Length: 60 +Length: 60, dtype: category[object] Categories (3, object): ['ああああ', 'いいいいい', 'ううううううう']""" # noqa: E501 assert repr(c) == expected @@ -88,7 +88,7 @@ def test_unicode_print(self): with option_context("display.unicode.east_asian_width", True): c = Categorical(["ああああ", "いいいいい", "ううううううう"] * 20) expected = """['ああああ', 'いいいいい', 'ううううううう', 'ああああ', 'いいいいい', ..., 'いいいいい', 'ううううううう', 'ああああ', 'いいいいい', 'ううううううう'] -Length: 60 +Length: 60, dtype: category[object] Categories (3, object): ['ああああ', 'いいいいい', 'ううううううう']""" # noqa: E501 assert repr(c) == expected @@ -108,14 +108,14 @@ def test_categorical_repr(self): c = Categorical([1, 2, 3, 4, 5] * 10) exp = """[1, 2, 3, 4, 5, ..., 1, 2, 3, 4, 5] -Length: 50 +Length: 50, dtype: category[int64] Categories (5, int64): [1, 2, 3, 4, 5]""" assert repr(c) == exp c = Categorical(np.arange(20, dtype=np.int64)) exp = """[0, 1, 2, 3, 4, ..., 15, 16, 17, 18, 19] -Length: 20 +Length: 20, dtype: category[int64] Categories (20, int64): [0, 1, 2, 3, ..., 16, 17, 18, 19]""" assert repr(c) == exp @@ -135,14 +135,14 @@ def test_categorical_repr_ordered(self): c = Categorical([1, 2, 3, 4, 5] * 10, ordered=True) exp = """[1, 2, 3, 4, 5, ..., 1, 2, 3, 4, 5] -Length: 50 +Length: 50, dtype: category[int64] Categories (5, int64): [1 < 2 < 3 < 4 < 5]""" assert repr(c) == exp c = Categorical(np.arange(20, dtype=np.int64), ordered=True) exp = """[0, 1, 2, 3, 4, ..., 15, 16, 17, 18, 19] -Length: 20 +Length: 20, dtype: category[int64] Categories (20, int64): [0 < 1 < 2 < 3 ... 
16 < 17 < 18 < 19]""" assert repr(c) == exp @@ -249,7 +249,7 @@ def test_categorical_repr_int_with_nan(self): s = Series([1, 2, np.nan], dtype="object").astype("category") s_exp = """0 1\n1 2\n2 NaN -dtype: category +dtype: category[int64] Categories (2, int64): [1, 2]""" assert repr(s) == s_exp @@ -328,7 +328,7 @@ def test_categorical_repr_timedelta(self): idx = timedelta_range("1 hours", periods=20) c = Categorical(idx) exp = """[0 days 01:00:00, 1 days 01:00:00, 2 days 01:00:00, 3 days 01:00:00, 4 days 01:00:00, ..., 15 days 01:00:00, 16 days 01:00:00, 17 days 01:00:00, 18 days 01:00:00, 19 days 01:00:00] -Length: 20 +Length: 20, dtype: category[timedelta64[ns]] Categories (20, timedelta64[ns]): [0 days 01:00:00, 1 days 01:00:00, 2 days 01:00:00, 3 days 01:00:00, ..., 16 days 01:00:00, 17 days 01:00:00, 18 days 01:00:00, 19 days 01:00:00]""" # noqa: E501 @@ -337,7 +337,7 @@ def test_categorical_repr_timedelta(self): c = Categorical(idx.append(idx), categories=idx) exp = """[0 days 01:00:00, 1 days 01:00:00, 2 days 01:00:00, 3 days 01:00:00, 4 days 01:00:00, ..., 15 days 01:00:00, 16 days 01:00:00, 17 days 01:00:00, 18 days 01:00:00, 19 days 01:00:00] -Length: 40 +Length: 40, dtype: category[timedelta64[ns]] Categories (20, timedelta64[ns]): [0 days 01:00:00, 1 days 01:00:00, 2 days 01:00:00, 3 days 01:00:00, ..., 16 days 01:00:00, 17 days 01:00:00, 18 days 01:00:00, 19 days 01:00:00]""" # noqa: E501 @@ -361,7 +361,7 @@ def test_categorical_repr_timedelta_ordered(self): idx = timedelta_range("1 hours", periods=20) c = Categorical(idx, ordered=True) exp = """[0 days 01:00:00, 1 days 01:00:00, 2 days 01:00:00, 3 days 01:00:00, 4 days 01:00:00, ..., 15 days 01:00:00, 16 days 01:00:00, 17 days 01:00:00, 18 days 01:00:00, 19 days 01:00:00] -Length: 20 +Length: 20, dtype: category[timedelta64[ns]] Categories (20, timedelta64[ns]): [0 days 01:00:00 < 1 days 01:00:00 < 2 days 01:00:00 < 3 days 01:00:00 ... 
16 days 01:00:00 < 17 days 01:00:00 < 18 days 01:00:00 < 19 days 01:00:00]""" # noqa: E501 @@ -370,7 +370,7 @@ def test_categorical_repr_timedelta_ordered(self): c = Categorical(idx.append(idx), categories=idx, ordered=True) exp = """[0 days 01:00:00, 1 days 01:00:00, 2 days 01:00:00, 3 days 01:00:00, 4 days 01:00:00, ..., 15 days 01:00:00, 16 days 01:00:00, 17 days 01:00:00, 18 days 01:00:00, 19 days 01:00:00] -Length: 40 +Length: 40, dtype: category[timedelta64[ns]] Categories (20, timedelta64[ns]): [0 days 01:00:00 < 1 days 01:00:00 < 2 days 01:00:00 < 3 days 01:00:00 ... 16 days 01:00:00 < 17 days 01:00:00 < 18 days 01:00:00 < 19 days 01:00:00]""" # noqa: E501 @@ -379,20 +379,20 @@ def test_categorical_repr_timedelta_ordered(self): def test_categorical_index_repr(self): idx = CategoricalIndex(Categorical([1, 2, 3])) - exp = """CategoricalIndex([1, 2, 3], categories=[1, 2, 3], ordered=False, dtype='category')""" # noqa: E501 + exp = """CategoricalIndex([1, 2, 3], categories=[1, 2, 3], ordered=False, dtype='category[int64]')""" # noqa: E501 assert repr(idx) == exp i = CategoricalIndex(Categorical(np.arange(10, dtype=np.int64))) - exp = """CategoricalIndex([0, 1, 2, 3, 4, 5, 6, 7, 8, 9], categories=[0, 1, 2, 3, ..., 6, 7, 8, 9], ordered=False, dtype='category')""" # noqa: E501 + exp = """CategoricalIndex([0, 1, 2, 3, 4, 5, 6, 7, 8, 9], categories=[0, 1, 2, 3, ..., 6, 7, 8, 9], ordered=False, dtype='category[int64]')""" # noqa: E501 assert repr(i) == exp def test_categorical_index_repr_ordered(self): i = CategoricalIndex(Categorical([1, 2, 3], ordered=True)) - exp = """CategoricalIndex([1, 2, 3], categories=[1, 2, 3], ordered=True, dtype='category')""" # noqa: E501 + exp = """CategoricalIndex([1, 2, 3], categories=[1, 2, 3], ordered=True, dtype='category[int64]')""" # noqa: E501 assert repr(i) == exp i = CategoricalIndex(Categorical(np.arange(10, dtype=np.int64), ordered=True)) - exp = """CategoricalIndex([0, 1, 2, 3, 4, 5, 6, 7, 8, 9], categories=[0, 1, 2, 3, ..., 
6, 7, 8, 9], ordered=True, dtype='category')""" # noqa: E501 + exp = """CategoricalIndex([0, 1, 2, 3, 4, 5, 6, 7, 8, 9], categories=[0, 1, 2, 3, ..., 6, 7, 8, 9], ordered=True, dtype='category[int64]')""" # noqa: E501 assert repr(i) == exp def test_categorical_index_repr_datetime(self): @@ -401,7 +401,7 @@ def test_categorical_index_repr_datetime(self): exp = """CategoricalIndex(['2011-01-01 09:00:00', '2011-01-01 10:00:00', '2011-01-01 11:00:00', '2011-01-01 12:00:00', '2011-01-01 13:00:00'], - categories=[2011-01-01 09:00:00, 2011-01-01 10:00:00, 2011-01-01 11:00:00, 2011-01-01 12:00:00, 2011-01-01 13:00:00], ordered=False, dtype='category')""" # noqa: E501 + categories=[2011-01-01 09:00:00, 2011-01-01 10:00:00, 2011-01-01 11:00:00, 2011-01-01 12:00:00, 2011-01-01 13:00:00], ordered=False, dtype='category[datetime64[ns]]')""" # noqa: E501 assert repr(i) == exp @@ -410,7 +410,7 @@ def test_categorical_index_repr_datetime(self): exp = """CategoricalIndex(['2011-01-01 09:00:00-05:00', '2011-01-01 10:00:00-05:00', '2011-01-01 11:00:00-05:00', '2011-01-01 12:00:00-05:00', '2011-01-01 13:00:00-05:00'], - categories=[2011-01-01 09:00:00-05:00, 2011-01-01 10:00:00-05:00, 2011-01-01 11:00:00-05:00, 2011-01-01 12:00:00-05:00, 2011-01-01 13:00:00-05:00], ordered=False, dtype='category')""" # noqa: E501 + categories=[2011-01-01 09:00:00-05:00, 2011-01-01 10:00:00-05:00, 2011-01-01 11:00:00-05:00, 2011-01-01 12:00:00-05:00, 2011-01-01 13:00:00-05:00], ordered=False, dtype='category[datetime64[ns, US/Eastern]]')""" # noqa: E501 assert repr(i) == exp @@ -420,7 +420,7 @@ def test_categorical_index_repr_datetime_ordered(self): exp = """CategoricalIndex(['2011-01-01 09:00:00', '2011-01-01 10:00:00', '2011-01-01 11:00:00', '2011-01-01 12:00:00', '2011-01-01 13:00:00'], - categories=[2011-01-01 09:00:00, 2011-01-01 10:00:00, 2011-01-01 11:00:00, 2011-01-01 12:00:00, 2011-01-01 13:00:00], ordered=True, dtype='category')""" # noqa: E501 + categories=[2011-01-01 09:00:00, 2011-01-01 
10:00:00, 2011-01-01 11:00:00, 2011-01-01 12:00:00, 2011-01-01 13:00:00], ordered=True, dtype='category[datetime64[ns]]')""" # noqa: E501 assert repr(i) == exp @@ -429,7 +429,7 @@ def test_categorical_index_repr_datetime_ordered(self): exp = """CategoricalIndex(['2011-01-01 09:00:00-05:00', '2011-01-01 10:00:00-05:00', '2011-01-01 11:00:00-05:00', '2011-01-01 12:00:00-05:00', '2011-01-01 13:00:00-05:00'], - categories=[2011-01-01 09:00:00-05:00, 2011-01-01 10:00:00-05:00, 2011-01-01 11:00:00-05:00, 2011-01-01 12:00:00-05:00, 2011-01-01 13:00:00-05:00], ordered=True, dtype='category')""" # noqa: E501 + categories=[2011-01-01 09:00:00-05:00, 2011-01-01 10:00:00-05:00, 2011-01-01 11:00:00-05:00, 2011-01-01 12:00:00-05:00, 2011-01-01 13:00:00-05:00], ordered=True, dtype='category[datetime64[ns, US/Eastern]]')""" # noqa: E501 assert repr(i) == exp @@ -439,7 +439,7 @@ def test_categorical_index_repr_datetime_ordered(self): '2011-01-01 13:00:00-05:00', '2011-01-01 09:00:00-05:00', '2011-01-01 10:00:00-05:00', '2011-01-01 11:00:00-05:00', '2011-01-01 12:00:00-05:00', '2011-01-01 13:00:00-05:00'], - categories=[2011-01-01 09:00:00-05:00, 2011-01-01 10:00:00-05:00, 2011-01-01 11:00:00-05:00, 2011-01-01 12:00:00-05:00, 2011-01-01 13:00:00-05:00], ordered=True, dtype='category')""" # noqa: E501 + categories=[2011-01-01 09:00:00-05:00, 2011-01-01 10:00:00-05:00, 2011-01-01 11:00:00-05:00, 2011-01-01 12:00:00-05:00, 2011-01-01 13:00:00-05:00], ordered=True, dtype='category[datetime64[ns, US/Eastern]]')""" # noqa: E501 assert repr(i) == exp @@ -447,24 +447,24 @@ def test_categorical_index_repr_period(self): # test all length idx = period_range("2011-01-01 09:00", freq="H", periods=1) i = CategoricalIndex(Categorical(idx)) - exp = """CategoricalIndex(['2011-01-01 09:00'], categories=[2011-01-01 09:00], ordered=False, dtype='category')""" # noqa: E501 + exp = """CategoricalIndex(['2011-01-01 09:00'], categories=[2011-01-01 09:00], ordered=False, dtype='category[period[H]]')""" # 
noqa: E501 assert repr(i) == exp idx = period_range("2011-01-01 09:00", freq="H", periods=2) i = CategoricalIndex(Categorical(idx)) - exp = """CategoricalIndex(['2011-01-01 09:00', '2011-01-01 10:00'], categories=[2011-01-01 09:00, 2011-01-01 10:00], ordered=False, dtype='category')""" # noqa: E501 + exp = """CategoricalIndex(['2011-01-01 09:00', '2011-01-01 10:00'], categories=[2011-01-01 09:00, 2011-01-01 10:00], ordered=False, dtype='category[period[H]]')""" # noqa: E501 assert repr(i) == exp idx = period_range("2011-01-01 09:00", freq="H", periods=3) i = CategoricalIndex(Categorical(idx)) - exp = """CategoricalIndex(['2011-01-01 09:00', '2011-01-01 10:00', '2011-01-01 11:00'], categories=[2011-01-01 09:00, 2011-01-01 10:00, 2011-01-01 11:00], ordered=False, dtype='category')""" # noqa: E501 + exp = """CategoricalIndex(['2011-01-01 09:00', '2011-01-01 10:00', '2011-01-01 11:00'], categories=[2011-01-01 09:00, 2011-01-01 10:00, 2011-01-01 11:00], ordered=False, dtype='category[period[H]]')""" # noqa: E501 assert repr(i) == exp idx = period_range("2011-01-01 09:00", freq="H", periods=5) i = CategoricalIndex(Categorical(idx)) exp = """CategoricalIndex(['2011-01-01 09:00', '2011-01-01 10:00', '2011-01-01 11:00', '2011-01-01 12:00', '2011-01-01 13:00'], - categories=[2011-01-01 09:00, 2011-01-01 10:00, 2011-01-01 11:00, 2011-01-01 12:00, 2011-01-01 13:00], ordered=False, dtype='category')""" # noqa: E501 + categories=[2011-01-01 09:00, 2011-01-01 10:00, 2011-01-01 11:00, 2011-01-01 12:00, 2011-01-01 13:00], ordered=False, dtype='category[period[H]]')""" # noqa: E501 assert repr(i) == exp @@ -473,13 +473,13 @@ def test_categorical_index_repr_period(self): '2011-01-01 12:00', '2011-01-01 13:00', '2011-01-01 09:00', '2011-01-01 10:00', '2011-01-01 11:00', '2011-01-01 12:00', '2011-01-01 13:00'], - categories=[2011-01-01 09:00, 2011-01-01 10:00, 2011-01-01 11:00, 2011-01-01 12:00, 2011-01-01 13:00], ordered=False, dtype='category')""" # noqa: E501 + 
categories=[2011-01-01 09:00, 2011-01-01 10:00, 2011-01-01 11:00, 2011-01-01 12:00, 2011-01-01 13:00], ordered=False, dtype='category[period[H]]')""" # noqa: E501 assert repr(i) == exp idx = period_range("2011-01", freq="M", periods=5) i = CategoricalIndex(Categorical(idx)) - exp = """CategoricalIndex(['2011-01', '2011-02', '2011-03', '2011-04', '2011-05'], categories=[2011-01, 2011-02, 2011-03, 2011-04, 2011-05], ordered=False, dtype='category')""" # noqa: E501 + exp = """CategoricalIndex(['2011-01', '2011-02', '2011-03', '2011-04', '2011-05'], categories=[2011-01, 2011-02, 2011-03, 2011-04, 2011-05], ordered=False, dtype='category[period[M]]')""" # noqa: E501 assert repr(i) == exp def test_categorical_index_repr_period_ordered(self): @@ -487,19 +487,19 @@ def test_categorical_index_repr_period_ordered(self): i = CategoricalIndex(Categorical(idx, ordered=True)) exp = """CategoricalIndex(['2011-01-01 09:00', '2011-01-01 10:00', '2011-01-01 11:00', '2011-01-01 12:00', '2011-01-01 13:00'], - categories=[2011-01-01 09:00, 2011-01-01 10:00, 2011-01-01 11:00, 2011-01-01 12:00, 2011-01-01 13:00], ordered=True, dtype='category')""" # noqa: E501 + categories=[2011-01-01 09:00, 2011-01-01 10:00, 2011-01-01 11:00, 2011-01-01 12:00, 2011-01-01 13:00], ordered=True, dtype='category[period[H]]')""" # noqa: E501 assert repr(i) == exp idx = period_range("2011-01", freq="M", periods=5) i = CategoricalIndex(Categorical(idx, ordered=True)) - exp = """CategoricalIndex(['2011-01', '2011-02', '2011-03', '2011-04', '2011-05'], categories=[2011-01, 2011-02, 2011-03, 2011-04, 2011-05], ordered=True, dtype='category')""" # noqa: E501 + exp = """CategoricalIndex(['2011-01', '2011-02', '2011-03', '2011-04', '2011-05'], categories=[2011-01, 2011-02, 2011-03, 2011-04, 2011-05], ordered=True, dtype='category[period[M]]')""" # noqa: E501 assert repr(i) == exp def test_categorical_index_repr_timedelta(self): idx = timedelta_range("1 days", periods=5) i = CategoricalIndex(Categorical(idx)) - exp = 
"""CategoricalIndex(['1 days', '2 days', '3 days', '4 days', '5 days'], categories=[1 days, 2 days, 3 days, 4 days, 5 days], ordered=False, dtype='category')""" # noqa: E501 + exp = """CategoricalIndex(['1 days', '2 days', '3 days', '4 days', '5 days'], categories=[1 days, 2 days, 3 days, 4 days, 5 days], ordered=False, dtype='category[timedelta64[ns]]')""" # noqa: E501 assert repr(i) == exp idx = timedelta_range("1 hours", periods=10) @@ -508,14 +508,14 @@ def test_categorical_index_repr_timedelta(self): '3 days 01:00:00', '4 days 01:00:00', '5 days 01:00:00', '6 days 01:00:00', '7 days 01:00:00', '8 days 01:00:00', '9 days 01:00:00'], - categories=[0 days 01:00:00, 1 days 01:00:00, 2 days 01:00:00, 3 days 01:00:00, ..., 6 days 01:00:00, 7 days 01:00:00, 8 days 01:00:00, 9 days 01:00:00], ordered=False, dtype='category')""" # noqa: E501 + categories=[0 days 01:00:00, 1 days 01:00:00, 2 days 01:00:00, 3 days 01:00:00, ..., 6 days 01:00:00, 7 days 01:00:00, 8 days 01:00:00, 9 days 01:00:00], ordered=False, dtype='category[timedelta64[ns]]')""" # noqa: E501 assert repr(i) == exp def test_categorical_index_repr_timedelta_ordered(self): idx = timedelta_range("1 days", periods=5) i = CategoricalIndex(Categorical(idx, ordered=True)) - exp = """CategoricalIndex(['1 days', '2 days', '3 days', '4 days', '5 days'], categories=[1 days, 2 days, 3 days, 4 days, 5 days], ordered=True, dtype='category')""" # noqa: E501 + exp = """CategoricalIndex(['1 days', '2 days', '3 days', '4 days', '5 days'], categories=[1 days, 2 days, 3 days, 4 days, 5 days], ordered=True, dtype='category[timedelta64[ns]]')""" # noqa: E501 assert repr(i) == exp idx = timedelta_range("1 hours", periods=10) @@ -524,7 +524,7 @@ def test_categorical_index_repr_timedelta_ordered(self): '3 days 01:00:00', '4 days 01:00:00', '5 days 01:00:00', '6 days 01:00:00', '7 days 01:00:00', '8 days 01:00:00', '9 days 01:00:00'], - categories=[0 days 01:00:00, 1 days 01:00:00, 2 days 01:00:00, 3 days 01:00:00, ..., 6 days 
01:00:00, 7 days 01:00:00, 8 days 01:00:00, 9 days 01:00:00], ordered=True, dtype='category')""" # noqa: E501 + categories=[0 days 01:00:00, 1 days 01:00:00, 2 days 01:00:00, 3 days 01:00:00, ..., 6 days 01:00:00, 7 days 01:00:00, 8 days 01:00:00, 9 days 01:00:00], ordered=True, dtype='category[timedelta64[ns]]')""" # noqa: E501 assert repr(i) == exp diff --git a/pandas/tests/dtypes/test_dtypes.py b/pandas/tests/dtypes/test_dtypes.py index 155c61508b706..52c331cbab70e 100644 --- a/pandas/tests/dtypes/test_dtypes.py +++ b/pandas/tests/dtypes/test_dtypes.py @@ -1019,7 +1019,7 @@ def test_from_categorical_dtype_both(self): def test_str_vs_repr(self, ordered): c1 = CategoricalDtype(["a", "b"], ordered=ordered) - assert str(c1) == "category" + assert str(c1) == "category[object]" # Py2 will have unicode prefixes pat = ( r"CategoricalDtype\(categories=\[.*\], ordered={ordered}, " diff --git a/pandas/tests/frame/methods/test_to_records.py b/pandas/tests/frame/methods/test_to_records.py index f2eea452764a6..97230c63af2ab 100644 --- a/pandas/tests/frame/methods/test_to_records.py +++ b/pandas/tests/frame/methods/test_to_records.py @@ -374,7 +374,10 @@ def test_to_records_with_categorical(self): "index": False, "column_dtypes": {"A": "int32", "B": CategoricalDtype(["a", "b"])}, }, - (ValueError, "Invalid dtype category specified for column B"), + ( + ValueError, + "Invalid dtype category\\[object\\] specified for column B", + ), ), # Check that bad types raise ( diff --git a/pandas/tests/frame/test_reductions.py b/pandas/tests/frame/test_reductions.py index b4a4324593d22..068e5549cf5d7 100644 --- a/pandas/tests/frame/test_reductions.py +++ b/pandas/tests/frame/test_reductions.py @@ -1322,15 +1322,12 @@ def test_any_all_np_func(self, func, data, expected): data = DataFrame(data) if any(isinstance(x, CategoricalDtype) for x in data.dtypes): - with pytest.raises( - TypeError, match="dtype category does not support reduction" - ): + msg = "dtype category\\[int64\\] does not 
support reduction" + with pytest.raises(TypeError, match=msg): func(data) # method version - with pytest.raises( - TypeError, match="dtype category does not support reduction" - ): + with pytest.raises(TypeError, match=msg): getattr(DataFrame(data), func.__name__)(axis=None) else: msg = "'(any|all)' with datetime64 dtypes is deprecated" diff --git a/pandas/tests/groupby/test_function.py b/pandas/tests/groupby/test_function.py index 98fce9d668e44..21d232dfe515a 100644 --- a/pandas/tests/groupby/test_function.py +++ b/pandas/tests/groupby/test_function.py @@ -258,7 +258,7 @@ def _check(self, df, method, expected_columns, expected_columns_numeric): elif method in ("sum", "mean", "median", "prod"): msg = "|".join( [ - "category type does not support sum operations", + "category\\[object\\] type does not support sum operations", "[Cc]ould not convert", "can't multiply sequence by non-int of type 'str'", ] @@ -276,7 +276,7 @@ def _check(self, df, method, expected_columns, expected_columns_numeric): [ "[Cc]ould not convert", "Categorical is not ordered", - "category type does not support", + "category\\[object\\] type does not support", "can't multiply sequence", "function is not implemented for this dtype", f"Cannot perform {method} with non-ordered Categorical", diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index 0c6661b49d917..c7268173b2eb8 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -1999,7 +1999,7 @@ def get_categorical_invalid_expected(): elif is_per: msg = "Period type does not support" else: - msg = "category type does not support" + msg = "category\\[.+\\] type does not support" if op == "skew": msg = "|".join([msg, "does not support reduction 'skew'"]) with pytest.raises(TypeError, match=msg): diff --git a/pandas/tests/groupby/test_raises.py b/pandas/tests/groupby/test_raises.py index 6fb903b02b62f..348cc1510b8fb 100644 --- a/pandas/tests/groupby/test_raises.py +++ 
b/pandas/tests/groupby/test_raises.py @@ -379,25 +379,25 @@ def test_groupby_raises_category( "cumcount": (None, ""), "cummax": ( (NotImplementedError, TypeError), - "(category type does not support cummax operations|" + "(category\\[object\\] type does not support cummax operations|" "category dtype not supported|" "cummax is not supported for category dtype)", ), "cummin": ( (NotImplementedError, TypeError), - "(category type does not support cummin operations|" + "(category\\[object\\] type does not support cummin operations|" "category dtype not supported|" "cummin is not supported for category dtype)", ), "cumprod": ( (NotImplementedError, TypeError), - "(category type does not support cumprod operations|" + "(category\\[object\\] type does not support cumprod operations|" "category dtype not supported|" "cumprod is not supported for category dtype)", ), "cumsum": ( (NotImplementedError, TypeError), - "(category type does not support cumsum operations|" + "(category\\[object\\] type does not support cumsum operations|" "category dtype not supported|" "cumsum is not supported for category dtype)", ), @@ -423,7 +423,7 @@ def test_groupby_raises_category( "|".join( [ "'Categorical' .* does not support reduction 'mean'", - "category dtype does not support aggregation 'mean'", + r"category\[object\] dtype does not support aggregation 'mean'", ] ), ), @@ -432,7 +432,7 @@ def test_groupby_raises_category( "|".join( [ "'Categorical' .* does not support reduction 'median'", - "category dtype does not support aggregation 'median'", + r"category\[object\] dtype does not support aggregation 'median'", ] ), ), @@ -443,7 +443,10 @@ def test_groupby_raises_category( TypeError, r"unsupported operand type\(s\) for /: 'Categorical' and 'Categorical'", ), - "prod": (TypeError, "category type does not support prod operations"), + "prod": ( + TypeError, + r"category\[object\] type does not support prod operations", + ), "quantile": (TypeError, "No matching signature found"), 
"rank": (None, ""), "sem": ( @@ -451,7 +454,7 @@ def test_groupby_raises_category( "|".join( [ "'Categorical' .* does not support reduction 'sem'", - "category dtype does not support aggregation 'sem'", + r"category\[object\] dtype does not support aggregation 'sem'", ] ), ), @@ -462,7 +465,7 @@ def test_groupby_raises_category( "|".join( [ "dtype category does not support reduction 'skew'", - "category type does not support skew operations", + r"category\[object\] type does not support skew operations", ] ), ), @@ -471,17 +474,17 @@ def test_groupby_raises_category( "|".join( [ "'Categorical' .* does not support reduction 'std'", - "category dtype does not support aggregation 'std'", + r"category\[object\] dtype does not support aggregation 'std'", ] ), ), - "sum": (TypeError, "category type does not support sum operations"), + "sum": (TypeError, r"category\[object\] type does not support sum operations"), "var": ( TypeError, "|".join( [ "'Categorical' .* does not support reduction 'var'", - "category dtype does not support aggregation 'var'", + r"category\[object\] dtype does not support aggregation 'var'", ] ), ), @@ -519,10 +522,10 @@ def test_groupby_raises_category_np( gb = gb["d"] klass, msg = { - np.sum: (TypeError, "category type does not support sum operations"), + np.sum: (TypeError, r"category\[object\] type does not support sum operations"), np.mean: ( TypeError, - "category dtype does not support aggregation 'mean'", + r"category\[object\] dtype does not support aggregation 'mean'", ), }[groupby_func_np] @@ -572,25 +575,25 @@ def test_groupby_raises_category_on_category( (NotImplementedError, TypeError), "(cummax is not supported for category dtype|" "category dtype not supported|" - "category type does not support cummax operations)", + r"category\[object\] type does not support cummax operations)", ), "cummin": ( (NotImplementedError, TypeError), "(cummin is not supported for category dtype|" "category dtype not supported|" - "category type does not 
support cummin operations)", + r"category\[object\] type does not support cummin operations)", ), "cumprod": ( (NotImplementedError, TypeError), "(cumprod is not supported for category dtype|" "category dtype not supported|" - "category type does not support cumprod operations)", + r"category\[object\] type does not support cumprod operations)", ), "cumsum": ( (NotImplementedError, TypeError), "(cumsum is not supported for category dtype|" "category dtype not supported|" - "category type does not support cumsum operations)", + r"category\[object\] type does not support cumsum operations)", ), "diff": (TypeError, "unsupported operand type"), "ffill": (None, ""), @@ -610,13 +613,22 @@ def test_groupby_raises_category_on_category( else (None, ""), "last": (None, ""), "max": (None, ""), - "mean": (TypeError, "category dtype does not support aggregation 'mean'"), - "median": (TypeError, "category dtype does not support aggregation 'median'"), + "mean": ( + TypeError, + r"category\[object\] dtype does not support aggregation 'mean'", + ), + "median": ( + TypeError, + r"category\[object\] dtype does not support aggregation 'median'", + ), "min": (None, ""), "ngroup": (None, ""), "nunique": (None, ""), "pct_change": (TypeError, "unsupported operand type"), - "prod": (TypeError, "category type does not support prod operations"), + "prod": ( + TypeError, + r"category\[object\] type does not support prod operations", + ), "quantile": (TypeError, ""), "rank": (None, ""), "sem": ( @@ -624,7 +636,7 @@ def test_groupby_raises_category_on_category( "|".join( [ "'Categorical' .* does not support reduction 'sem'", - "category dtype does not support aggregation 'sem'", + r"category\[object\] dtype does not support aggregation 'sem'", ] ), ), @@ -634,7 +646,7 @@ def test_groupby_raises_category_on_category( TypeError, "|".join( [ - "category type does not support skew operations", + r"category\[object\] type does not support skew operations", "dtype category does not support reduction 
'skew'", ] ), @@ -644,17 +656,17 @@ def test_groupby_raises_category_on_category( "|".join( [ "'Categorical' .* does not support reduction 'std'", - "category dtype does not support aggregation 'std'", + r"category\[object\] dtype does not support aggregation 'std'", ] ), ), - "sum": (TypeError, "category type does not support sum operations"), + "sum": (TypeError, r"category\[object\] type does not support sum operations"), "var": ( TypeError, "|".join( [ "'Categorical' .* does not support reduction 'var'", - "category dtype does not support aggregation 'var'", + r"category\[object\] dtype does not support aggregation 'var'", ] ), ), diff --git a/pandas/tests/indexes/categorical/test_formats.py b/pandas/tests/indexes/categorical/test_formats.py index 7dbcaaa8d4ba6..19a10153c5391 100644 --- a/pandas/tests/indexes/categorical/test_formats.py +++ b/pandas/tests/indexes/categorical/test_formats.py @@ -16,7 +16,7 @@ def test_format_different_scalar_lengths(self): def test_string_categorical_index_repr(self): # short idx = CategoricalIndex(["a", "bb", "ccc"]) - expected = """CategoricalIndex(['a', 'bb', 'ccc'], categories=['a', 'bb', 'ccc'], ordered=False, dtype='category')""" # noqa: E501 + expected = """CategoricalIndex(['a', 'bb', 'ccc'], categories=['a', 'bb', 'ccc'], ordered=False, dtype='category[object]')""" # noqa: E501 assert repr(idx) == expected # multiple lines @@ -24,7 +24,7 @@ def test_string_categorical_index_repr(self): expected = """CategoricalIndex(['a', 'bb', 'ccc', 'a', 'bb', 'ccc', 'a', 'bb', 'ccc', 'a', 'bb', 'ccc', 'a', 'bb', 'ccc', 'a', 'bb', 'ccc', 'a', 'bb', 'ccc', 'a', 'bb', 'ccc', 'a', 'bb', 'ccc', 'a', 'bb', 'ccc'], - categories=['a', 'bb', 'ccc'], ordered=False, dtype='category')""" # noqa: E501 + categories=['a', 'bb', 'ccc'], ordered=False, dtype='category[object]')""" # noqa: E501 assert repr(idx) == expected @@ -33,7 +33,7 @@ def test_string_categorical_index_repr(self): expected = """CategoricalIndex(['a', 'bb', 'ccc', 'a', 'bb', 
'ccc', 'a', 'bb', 'ccc', 'a', ... 'ccc', 'a', 'bb', 'ccc', 'a', 'bb', 'ccc', 'a', 'bb', 'ccc'], - categories=['a', 'bb', 'ccc'], ordered=False, dtype='category', length=300)""" # noqa: E501 + categories=['a', 'bb', 'ccc'], ordered=False, dtype='category[object]', length=300)""" # noqa: E501 assert repr(idx) == expected @@ -41,13 +41,13 @@ def test_string_categorical_index_repr(self): idx = CategoricalIndex(list("abcdefghijklmmo")) expected = """CategoricalIndex(['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'm', 'o'], - categories=['a', 'b', 'c', 'd', ..., 'k', 'l', 'm', 'o'], ordered=False, dtype='category')""" # noqa: E501 + categories=['a', 'b', 'c', 'd', ..., 'k', 'l', 'm', 'o'], ordered=False, dtype='category[object]')""" # noqa: E501 assert repr(idx) == expected # short idx = CategoricalIndex(["あ", "いい", "ううう"]) - expected = """CategoricalIndex(['あ', 'いい', 'ううう'], categories=['あ', 'いい', 'ううう'], ordered=False, dtype='category')""" # noqa: E501 + expected = """CategoricalIndex(['あ', 'いい', 'ううう'], categories=['あ', 'いい', 'ううう'], ordered=False, dtype='category[object]')""" # noqa: E501 assert repr(idx) == expected # multiple lines @@ -55,7 +55,7 @@ def test_string_categorical_index_repr(self): expected = """CategoricalIndex(['あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう'], - categories=['あ', 'いい', 'ううう'], ordered=False, dtype='category')""" # noqa: E501 + categories=['あ', 'いい', 'ううう'], ordered=False, dtype='category[object]')""" # noqa: E501 assert repr(idx) == expected @@ -64,7 +64,7 @@ def test_string_categorical_index_repr(self): expected = """CategoricalIndex(['あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', 'あ', ... 
'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう'], - categories=['あ', 'いい', 'ううう'], ordered=False, dtype='category', length=300)""" # noqa: E501 + categories=['あ', 'いい', 'ううう'], ordered=False, dtype='category[object]', length=300)""" # noqa: E501 assert repr(idx) == expected @@ -72,7 +72,7 @@ def test_string_categorical_index_repr(self): idx = CategoricalIndex(list("あいうえおかきくけこさしすせそ")) expected = """CategoricalIndex(['あ', 'い', 'う', 'え', 'お', 'か', 'き', 'く', 'け', 'こ', 'さ', 'し', 'す', 'せ', 'そ'], - categories=['あ', 'い', 'う', 'え', ..., 'し', 'す', 'せ', 'そ'], ordered=False, dtype='category')""" # noqa: E501 + categories=['あ', 'い', 'う', 'え', ..., 'し', 'す', 'せ', 'そ'], ordered=False, dtype='category[object]')""" # noqa: E501 assert repr(idx) == expected @@ -80,7 +80,7 @@ def test_string_categorical_index_repr(self): with cf.option_context("display.unicode.east_asian_width", True): # short idx = CategoricalIndex(["あ", "いい", "ううう"]) - expected = """CategoricalIndex(['あ', 'いい', 'ううう'], categories=['あ', 'いい', 'ううう'], ordered=False, dtype='category')""" # noqa: E501 + expected = """CategoricalIndex(['あ', 'いい', 'ううう'], categories=['あ', 'いい', 'ううう'], ordered=False, dtype='category[object]')""" # noqa: E501 assert repr(idx) == expected # multiple lines @@ -89,7 +89,7 @@ def test_string_categorical_index_repr(self): 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう'], - categories=['あ', 'いい', 'ううう'], ordered=False, dtype='category')""" # noqa: E501 + categories=['あ', 'いい', 'ううう'], ordered=False, dtype='category[object]')""" # noqa: E501 assert repr(idx) == expected @@ -100,7 +100,7 @@ def test_string_categorical_index_repr(self): ... 
'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう'], - categories=['あ', 'いい', 'ううう'], ordered=False, dtype='category', length=300)""" # noqa: E501 + categories=['あ', 'いい', 'ううう'], ordered=False, dtype='category[object]', length=300)""" # noqa: E501 assert repr(idx) == expected @@ -108,6 +108,6 @@ def test_string_categorical_index_repr(self): idx = CategoricalIndex(list("あいうえおかきくけこさしすせそ")) expected = """CategoricalIndex(['あ', 'い', 'う', 'え', 'お', 'か', 'き', 'く', 'け', 'こ', 'さ', 'し', 'す', 'せ', 'そ'], - categories=['あ', 'い', 'う', 'え', ..., 'し', 'す', 'せ', 'そ'], ordered=False, dtype='category')""" # noqa: E501 + categories=['あ', 'い', 'う', 'え', ..., 'し', 'す', 'せ', 'そ'], ordered=False, dtype='category[object]')""" # noqa: E501 assert repr(idx) == expected diff --git a/pandas/tests/series/methods/test_nlargest.py b/pandas/tests/series/methods/test_nlargest.py index ecc5d3060c0a2..99ae73df68418 100644 --- a/pandas/tests/series/methods/test_nlargest.py +++ b/pandas/tests/series/methods/test_nlargest.py @@ -93,12 +93,11 @@ class TestSeriesNLargestNSmallest: # Series([3., 2, 1, 2, 5], dtype='complex256'), Series([3.0, 2, 1, 2, 5], dtype="complex128"), Series(list("abcde")), - Series(list("abcde"), dtype="category"), ], ) def test_nlargest_error(self, r): dt = r.dtype - msg = f"Cannot use method 'n(largest|smallest)' with dtype {dt}" + msg = rf"Cannot use method 'n(largest|smallest)' with dtype {dt}" args = 2, len(r), 0, -1 methods = r.nlargest, r.nsmallest for method, arg in product(methods, args): diff --git a/pandas/tests/series/test_arithmetic.py b/pandas/tests/series/test_arithmetic.py index a0edfae606e3f..795bc8847b4aa 100644 --- a/pandas/tests/series/test_arithmetic.py +++ b/pandas/tests/series/test_arithmetic.py @@ -576,7 +576,7 @@ def test_unequal_categorical_comparison_raises_type_error(self): # for unequal comps, but not for equal/not equal cat = Series(Categorical(list("abc"), ordered=True)) - msg = "Invalid comparison between dtype=category and str" + msg = 
r"Invalid comparison between dtype=category\[object\] and str" with pytest.raises(TypeError, match=msg): cat < "d" with pytest.raises(TypeError, match=msg): diff --git a/pandas/tests/series/test_repr.py b/pandas/tests/series/test_repr.py index c42b9f056878d..0fdffb8ee6271 100644 --- a/pandas/tests/series/test_repr.py +++ b/pandas/tests/series/test_repr.py @@ -290,7 +290,7 @@ def test_categorical_repr(self): a = Series(Categorical([1, 2, 3, 4])) exp = ( "0 1\n1 2\n2 3\n3 4\n" - "dtype: category\nCategories (4, int64): [1, 2, 3, 4]" + "dtype: category[int64]\nCategories (4, int64): [1, 2, 3, 4]" ) assert exp == a.__str__() @@ -300,7 +300,7 @@ def test_categorical_repr(self): "0 a\n1 b\n" " ..\n" "48 a\n49 b\n" - "Length: 50, dtype: category\nCategories (2, object): ['a', 'b']" + "Length: 50, dtype: category[object]\nCategories (2, object): ['a', 'b']" ) with option_context("display.max_rows", 5): assert exp == repr(a) @@ -309,7 +309,7 @@ def test_categorical_repr(self): a = Series(Categorical(["a", "b"], categories=levs, ordered=True)) exp = ( "0 a\n1 b\n" - "dtype: category\n" + "dtype: category[object]\n" "Categories (26, object): ['a' < 'b' < 'c' < 'd' ... 
'w' < 'x' < 'y' < 'z']" ) assert exp == a.__str__() @@ -319,7 +319,7 @@ def test_categorical_series_repr(self): exp = """0 1 1 2 2 3 -dtype: category +dtype: category[int64] Categories (3, int64): [1, 2, 3]""" assert repr(s) == exp @@ -335,7 +335,7 @@ def test_categorical_series_repr(self): 7 7 8 8 9 9 -dtype: category +dtype: category[{np.int_().dtype}] Categories (10, {np.int_().dtype}): [0, 1, 2, 3, ..., 6, 7, 8, 9]""" assert repr(s) == exp @@ -345,7 +345,7 @@ def test_categorical_series_repr_ordered(self): exp = """0 1 1 2 2 3 -dtype: category +dtype: category[int64] Categories (3, int64): [1 < 2 < 3]""" assert repr(s) == exp @@ -361,7 +361,7 @@ def test_categorical_series_repr_ordered(self): 7 7 8 8 9 9 -dtype: category +dtype: category[{np.int_().dtype}] Categories (10, {np.int_().dtype}): [0 < 1 < 2 < 3 ... 6 < 7 < 8 < 9]""" assert repr(s) == exp @@ -374,7 +374,7 @@ def test_categorical_series_repr_datetime(self): 2 2011-01-01 11:00:00 3 2011-01-01 12:00:00 4 2011-01-01 13:00:00 -dtype: category +dtype: category[datetime64[ns]] Categories (5, datetime64[ns]): [2011-01-01 09:00:00, 2011-01-01 10:00:00, 2011-01-01 11:00:00, 2011-01-01 12:00:00, 2011-01-01 13:00:00]""" # noqa: E501 @@ -387,7 +387,7 @@ def test_categorical_series_repr_datetime(self): 2 2011-01-01 11:00:00-05:00 3 2011-01-01 12:00:00-05:00 4 2011-01-01 13:00:00-05:00 -dtype: category +dtype: category[datetime64[ns, US/Eastern]] Categories (5, datetime64[ns, US/Eastern]): [2011-01-01 09:00:00-05:00, 2011-01-01 10:00:00-05:00, 2011-01-01 11:00:00-05:00, 2011-01-01 12:00:00-05:00, 2011-01-01 13:00:00-05:00]""" # noqa: E501 @@ -402,7 +402,7 @@ def test_categorical_series_repr_datetime_ordered(self): 2 2011-01-01 11:00:00 3 2011-01-01 12:00:00 4 2011-01-01 13:00:00 -dtype: category +dtype: category[datetime64[ns]] Categories (5, datetime64[ns]): [2011-01-01 09:00:00 < 2011-01-01 10:00:00 < 2011-01-01 11:00:00 < 2011-01-01 12:00:00 < 2011-01-01 13:00:00]""" # noqa: E501 @@ -415,7 +415,7 @@ def 
test_categorical_series_repr_datetime_ordered(self): 2 2011-01-01 11:00:00-05:00 3 2011-01-01 12:00:00-05:00 4 2011-01-01 13:00:00-05:00 -dtype: category +dtype: category[datetime64[ns, US/Eastern]] Categories (5, datetime64[ns, US/Eastern]): [2011-01-01 09:00:00-05:00 < 2011-01-01 10:00:00-05:00 < 2011-01-01 11:00:00-05:00 < 2011-01-01 12:00:00-05:00 < 2011-01-01 13:00:00-05:00]""" # noqa: E501 @@ -430,7 +430,7 @@ def test_categorical_series_repr_period(self): 2 2011-01-01 11:00 3 2011-01-01 12:00 4 2011-01-01 13:00 -dtype: category +dtype: category[period[H]] Categories (5, period[H]): [2011-01-01 09:00, 2011-01-01 10:00, 2011-01-01 11:00, 2011-01-01 12:00, 2011-01-01 13:00]""" # noqa: E501 @@ -443,7 +443,7 @@ def test_categorical_series_repr_period(self): 2 2011-03 3 2011-04 4 2011-05 -dtype: category +dtype: category[period[M]] Categories (5, period[M]): [2011-01, 2011-02, 2011-03, 2011-04, 2011-05]""" assert repr(s) == exp @@ -456,7 +456,7 @@ def test_categorical_series_repr_period_ordered(self): 2 2011-01-01 11:00 3 2011-01-01 12:00 4 2011-01-01 13:00 -dtype: category +dtype: category[period[H]] Categories (5, period[H]): [2011-01-01 09:00 < 2011-01-01 10:00 < 2011-01-01 11:00 < 2011-01-01 12:00 < 2011-01-01 13:00]""" # noqa: E501 @@ -469,7 +469,7 @@ def test_categorical_series_repr_period_ordered(self): 2 2011-03 3 2011-04 4 2011-05 -dtype: category +dtype: category[period[M]] Categories (5, period[M]): [2011-01 < 2011-02 < 2011-03 < 2011-04 < 2011-05]""" assert repr(s) == exp @@ -482,7 +482,7 @@ def test_categorical_series_repr_timedelta(self): 2 3 days 3 4 days 4 5 days -dtype: category +dtype: category[timedelta64[ns]] Categories (5, timedelta64[ns]): [1 days, 2 days, 3 days, 4 days, 5 days]""" assert repr(s) == exp @@ -499,7 +499,7 @@ def test_categorical_series_repr_timedelta(self): 7 7 days 01:00:00 8 8 days 01:00:00 9 9 days 01:00:00 -dtype: category +dtype: category[timedelta64[ns]] Categories (10, timedelta64[ns]): [0 days 01:00:00, 1 days 01:00:00, 
2 days 01:00:00, 3 days 01:00:00, ..., 6 days 01:00:00, 7 days 01:00:00, 8 days 01:00:00, 9 days 01:00:00]""" # noqa: E501 @@ -514,7 +514,7 @@ def test_categorical_series_repr_timedelta_ordered(self): 2 3 days 3 4 days 4 5 days -dtype: category +dtype: category[timedelta64[ns]] Categories (5, timedelta64[ns]): [1 days < 2 days < 3 days < 4 days < 5 days]""" assert repr(s) == exp @@ -531,7 +531,7 @@ def test_categorical_series_repr_timedelta_ordered(self): 7 7 days 01:00:00 8 8 days 01:00:00 9 9 days 01:00:00 -dtype: category +dtype: category[timedelta64[ns]] Categories (10, timedelta64[ns]): [0 days 01:00:00 < 1 days 01:00:00 < 2 days 01:00:00 < 3 days 01:00:00 ... 6 days 01:00:00 < 7 days 01:00:00 < 8 days 01:00:00 < 9 days 01:00:00]""" # noqa: E501