From fe30db4740f05bdf433712122107cd43267d32ae Mon Sep 17 00:00:00 2001 From: jschendel Date: Sat, 9 Dec 2017 21:25:32 -0700 Subject: [PATCH 1/3] BUG: Fix Series.astype and Categorical.astype to update existing Categorical data --- doc/source/whatsnew/v0.22.0.txt | 1 + pandas/core/categorical.py | 23 ++++++++-- pandas/core/internals.py | 34 +++++---------- pandas/tests/categorical/test_dtypes.py | 56 ++++++++++++++++++++++--- pandas/tests/series/test_dtypes.py | 39 +++++++++++++++++ 5 files changed, 120 insertions(+), 33 deletions(-) diff --git a/doc/source/whatsnew/v0.22.0.txt b/doc/source/whatsnew/v0.22.0.txt index c2da0c420f643..f02c389252f47 100644 --- a/doc/source/whatsnew/v0.22.0.txt +++ b/doc/source/whatsnew/v0.22.0.txt @@ -263,6 +263,7 @@ Conversion - Adding a ``Period`` object to a ``datetime`` or ``Timestamp`` object will now correctly raise a ``TypeError`` (:issue:`17983`) - Fixed a bug where ``FY5253`` date offsets could incorrectly raise an ``AssertionError`` in arithmetic operatons (:issue:`14774`) - Bug in :meth:`Index.astype` with a categorical dtype where the resultant index is not converted to a :class:`CategoricalIndex` for all types of index (:issue:`18630`) +- Bug in :meth:`Series.astype` and ``Categorical.astype()`` where an existing categorical data does not get updated (:issue:`10696`, :issue:`18593`) Indexing diff --git a/pandas/core/categorical.py b/pandas/core/categorical.py index e34755e665f8d..20005c2ad7a12 100644 --- a/pandas/core/categorical.py +++ b/pandas/core/categorical.py @@ -27,7 +27,8 @@ is_categorical_dtype, is_list_like, is_sequence, is_scalar, - is_dict_like) + is_dict_like, + pandas_dtype) from pandas.core.common import is_null_slice, _maybe_box_datetimelike from pandas.core.algorithms import factorize, take_1d, unique1d @@ -435,10 +436,24 @@ def astype(self, dtype, copy=True): .. versionadded:: 0.19.0 """ + if isinstance(dtype, compat.string_types) and dtype == 'category': + # GH 18593: astype('category') should not change anything + return self.copy() if copy else self + + dtype = pandas_dtype(dtype) if is_categorical_dtype(dtype): - if copy is True: - return self.copy() - return self + # GH 18593: keep current categories if None (ordered can't be None) + if dtype.categories is None: + new_categories = self.categories + else: + new_categories = dtype.categories + dtype = CategoricalDtype(new_categories, dtype.ordered) + + self = self.copy() if copy else self + if dtype == self.dtype: + # fastpath if dtypes are equal + return self + return self._set_dtype(dtype) return np.array(self, dtype=dtype, copy=copy) @cache_readonly diff --git a/pandas/core/internals.py b/pandas/core/internals.py index 4169a001655cb..6785354c08fce 100644 --- a/pandas/core/internals.py +++ b/pandas/core/internals.py @@ -54,7 +54,7 @@ import pandas.core.dtypes.concat as _concat from pandas.core.dtypes.generic import ABCSeries, ABCDatetimeIndex -from pandas.core.common import is_null_slice +from pandas.core.common import is_null_slice, _any_not_none import pandas.core.algorithms as algos from pandas.core.index import Index, MultiIndex, _ensure_index @@ -589,13 +589,16 @@ def _astype(self, dtype, copy=False, errors='raise', values=None, "CategoricalDtype instead", FutureWarning, stacklevel=7) - kwargs = kwargs.copy() - categories = getattr(dtype, 'categories', None) - ordered = getattr(dtype, 'ordered', False) + categories = kwargs.get('categories', None) + ordered = kwargs.get('ordered', None) + if _any_not_none(categories, ordered): + dtype = CategoricalDtype(categories, ordered) - kwargs.setdefault('categories', categories) - kwargs.setdefault('ordered', ordered) - return self.make_block(Categorical(self.values, **kwargs)) + if is_categorical_dtype(self.values): + # GH 10696/18593: update an existing categorical efficiently + return self.make_block(self.values.astype(dtype, copy=copy)) + + return self.make_block(Categorical(self.values, dtype=dtype)) # astype processing dtype = np.dtype(dtype) @@ -2427,23 +2430,6 @@ def take_nd(self, indexer, axis=0, new_mgr_locs=None, fill_tuple=None): return self.make_block_same_class(new_values, new_mgr_locs) - def _astype(self, dtype, copy=False, errors='raise', values=None, - klass=None, mgr=None): - """ - Coerce to the new type (if copy=True, return a new copy) - raise on an except if raise == True - """ - - if self.is_categorical_astype(dtype): - values = self.values - else: - values = np.asarray(self.values).astype(dtype, copy=False) - - if copy: - values = values.copy() - - return self.make_block(values) - def to_native_types(self, slicer=None, na_rep='', quoting=None, **kwargs): """ convert to our native types format, slicing if desired """ diff --git a/pandas/tests/categorical/test_dtypes.py b/pandas/tests/categorical/test_dtypes.py index 0a41b628bc057..356c5e6589ff9 100644 --- a/pandas/tests/categorical/test_dtypes.py +++ b/pandas/tests/categorical/test_dtypes.py @@ -99,10 +99,56 @@ def test_codes_dtypes(self): result = result.remove_categories(['foo%05d' % i for i in range(300)]) assert result.codes.dtype == 'int8' - def test_astype_categorical(self): + @pytest.mark.parametrize('ordered', [True, False]) + @pytest.mark.parametrize('copy', [True, False]) + def test_astype(self, copy, ordered): + # string + cat = Categorical(list('abbaaccc'), ordered=ordered) + result = cat.astype(object, copy=copy) + expected = np.array(cat) + tm.assert_numpy_array_equal(result, expected) + + msg = 'could not convert string to float' + with tm.assert_raises_regex(ValueError, msg): + cat.astype(float, copy=copy) + + # numeric + cat = Categorical([0, 1, 2, 2, 1, 0, 1, 0, 2], ordered=ordered) + result = cat.astype(object, copy=copy) + expected = np.array(cat, dtype=object) + tm.assert_numpy_array_equal(result, expected) + + result = cat.astype(int, copy=copy) + expected = np.array(cat, dtype=np.int) + tm.assert_numpy_array_equal(result, expected) + + result = cat.astype(float, copy=copy) + expected = np.array(cat, dtype=np.float) + tm.assert_numpy_array_equal(result, expected) + + @pytest.mark.parametrize('copy', [True, False]) + @pytest.mark.parametrize('dtype_ordered', [True, False]) + @pytest.mark.parametrize('cat_ordered', [True, False]) + def test_astype_category(self, copy, dtype_ordered, cat_ordered): + # GH 10696/18593 + data = list('abcaacbab') + cat = Categorical(data, categories=list('bac'), ordered=cat_ordered) + + # standard categories + dtype = CategoricalDtype(ordered=dtype_ordered) + result = cat.astype(dtype, copy=copy) + expected = Categorical( + data, categories=cat.categories, ordered=dtype_ordered) + tm.assert_categorical_equal(result, expected) - cat = Categorical(['a', 'b', 'b', 'a', 'a', 'c', 'c', 'c']) - tm.assert_categorical_equal(cat, cat.astype('category')) - tm.assert_almost_equal(np.array(cat), cat.astype('object')) + # non-standard categories + dtype = CategoricalDtype(list('adc'), dtype_ordered) + result = cat.astype(dtype, copy=copy) + expected = Categorical(data, dtype=dtype) + tm.assert_categorical_equal(result, expected) - pytest.raises(ValueError, lambda: cat.astype(float)) + if dtype_ordered is False: + # dtype='category' can't specify ordered, so only test once + result = cat.astype('category', copy=copy) + expected = cat + tm.assert_categorical_equal(result, expected) diff --git a/pandas/tests/series/test_dtypes.py b/pandas/tests/series/test_dtypes.py index 12d0267005f19..077ba6d5dfdba 100644 --- a/pandas/tests/series/test_dtypes.py +++ b/pandas/tests/series/test_dtypes.py @@ -322,6 +322,45 @@ def cmp(a, b): lambda x: x.astype('object').astype(Categorical)]: pytest.raises(TypeError, lambda: invalid(s)) + @pytest.mark.parametrize('copy', [True, False]) + @pytest.mark.parametrize('name', [None, 'foo']) + @pytest.mark.parametrize('dtype_ordered', [True, False]) + @pytest.mark.parametrize('series_ordered', [True, False]) + def test_astype_categorical_to_categorical(self, copy, name, dtype_ordered, + series_ordered): + # GH 10696/18593 + s_data = list('abcaacbab') + s_dtype = CategoricalDtype(list('bac'), ordered=series_ordered) + s = Series(s_data, dtype=s_dtype, name=name) + + # unspecified categories + dtype = CategoricalDtype(ordered=dtype_ordered) + result = s.astype(dtype, copy=copy) + exp_dtype = CategoricalDtype(s_dtype.categories, dtype_ordered) + expected = Series(s_data, name=name, dtype=exp_dtype) + tm.assert_series_equal(result, expected) + + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + result = s.astype('category', ordered=dtype_ordered) + tm.assert_series_equal(result, expected) + + # different categories + dtype = CategoricalDtype(list('adc'), dtype_ordered) + result = s.astype(dtype, copy=copy) + expected = Series(s_data, name=name, dtype=dtype) + tm.assert_series_equal(result, expected) + + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + result = s.astype( + 'category', categories=list('adc'), ordered=dtype_ordered) + tm.assert_series_equal(result, expected) + + if dtype_ordered is False: + # not specifying ordered, so only test once + expected = s + result = s.astype('category', copy=copy) + tm.assert_series_equal(result, expected) + def test_astype_categoricaldtype(self): s = Series(['a', 'b', 'a']) result = s.astype(CategoricalDtype(['a', 'b'], ordered=True)) From 821d79d3b1ca0ea29aa9b206dfd3c8d2c4ed3944 Mon Sep 17 00:00:00 2001 From: jschendel Date: Mon, 11 Dec 2017 23:52:04 -0700 Subject: [PATCH 2/3] rebase and use _update_dtype --- pandas/core/categorical.py | 18 +++--------------- pandas/core/internals.py | 1 - 2 files changed, 3 insertions(+), 16 deletions(-) diff --git a/pandas/core/categorical.py b/pandas/core/categorical.py index 20005c2ad7a12..356e76df366b4 100644 --- a/pandas/core/categorical.py +++ b/pandas/core/categorical.py @@ -27,8 +27,7 @@ is_categorical_dtype, is_list_like, is_sequence, is_scalar, - is_dict_like, - pandas_dtype) + is_dict_like) from pandas.core.common import is_null_slice, _maybe_box_datetimelike from pandas.core.algorithms import factorize, take_1d, unique1d @@ -436,22 +435,11 @@ def astype(self, dtype, copy=True): .. versionadded:: 0.19.0 """ - if isinstance(dtype, compat.string_types) and dtype == 'category': - # GH 18593: astype('category') should not change anything - return self.copy() if copy else self - - dtype = pandas_dtype(dtype) if is_categorical_dtype(dtype): - # GH 18593: keep current categories if None (ordered can't be None) - if dtype.categories is None: - new_categories = self.categories - else: - new_categories = dtype.categories - dtype = CategoricalDtype(new_categories, dtype.ordered) - + # GH 10696/18593 + dtype = self.dtype._update_dtype(dtype) self = self.copy() if copy else self if dtype == self.dtype: - # fastpath if dtypes are equal return self return self._set_dtype(dtype) return np.array(self, dtype=dtype, copy=copy) diff --git a/pandas/core/internals.py b/pandas/core/internals.py index 6785354c08fce..3a64a0ef84e3d 100644 --- a/pandas/core/internals.py +++ b/pandas/core/internals.py @@ -573,7 +573,6 @@ def _astype(self, dtype, copy=False, errors='raise', values=None, raise TypeError(msg) # may need to convert to categorical - # this is only called for non-categoricals if self.is_categorical_astype(dtype): # deprecated 17636 From 6702f909d2843cf6c0923c9392af59eaf43981d2 Mon Sep 17 00:00:00 2001 From: jschendel Date: Tue, 12 Dec 2017 20:35:55 -0700 Subject: [PATCH 3/3] remove copy from tests --- pandas/tests/categorical/test_dtypes.py | 22 ++++++++++------------ pandas/tests/indexes/test_category.py | 9 ++++----- pandas/tests/series/test_dtypes.py | 9 ++++----- 3 files changed, 18 insertions(+), 22 deletions(-) diff --git a/pandas/tests/categorical/test_dtypes.py b/pandas/tests/categorical/test_dtypes.py index 356c5e6589ff9..bad2c27026b31 100644 --- a/pandas/tests/categorical/test_dtypes.py +++ b/pandas/tests/categorical/test_dtypes.py @@ -100,55 +100,53 @@ def test_codes_dtypes(self): assert result.codes.dtype == 'int8' @pytest.mark.parametrize('ordered', [True, False]) - @pytest.mark.parametrize('copy', [True, False]) - def test_astype(self, copy, ordered): + def test_astype(self, ordered): # string cat = Categorical(list('abbaaccc'), ordered=ordered) - result = cat.astype(object, copy=copy) + result = cat.astype(object) expected = np.array(cat) tm.assert_numpy_array_equal(result, expected) msg = 'could not convert string to float' with tm.assert_raises_regex(ValueError, msg): - cat.astype(float, copy=copy) + cat.astype(float) # numeric cat = Categorical([0, 1, 2, 2, 1, 0, 1, 0, 2], ordered=ordered) - result = cat.astype(object, copy=copy) + result = cat.astype(object) expected = np.array(cat, dtype=object) tm.assert_numpy_array_equal(result, expected) - result = cat.astype(int, copy=copy) + result = cat.astype(int) expected = np.array(cat, dtype=np.int) tm.assert_numpy_array_equal(result, expected) - result = cat.astype(float, copy=copy) + result = cat.astype(float) expected = np.array(cat, dtype=np.float) tm.assert_numpy_array_equal(result, expected) - @pytest.mark.parametrize('copy', [True, False]) @pytest.mark.parametrize('dtype_ordered', [True, False]) @pytest.mark.parametrize('cat_ordered', [True, False]) - def test_astype_category(self, copy, dtype_ordered, cat_ordered): + def test_astype_category(self, dtype_ordered, cat_ordered): # GH 10696/18593 data = list('abcaacbab') cat = Categorical(data, categories=list('bac'), ordered=cat_ordered) # standard categories dtype = CategoricalDtype(ordered=dtype_ordered) - result = cat.astype(dtype, copy=copy) + result = cat.astype(dtype) expected = Categorical( data, categories=cat.categories, ordered=dtype_ordered) tm.assert_categorical_equal(result, expected) # non-standard categories dtype = CategoricalDtype(list('adc'), dtype_ordered) - result = cat.astype(dtype, copy=copy) + result = cat.astype(dtype) expected = Categorical(data, dtype=dtype) tm.assert_categorical_equal(result, expected) if dtype_ordered is False: # dtype='category' can't specify ordered, so only test once - result = cat.astype('category', copy=copy) + result = cat.astype('category') expected = cat tm.assert_categorical_equal(result, expected) diff --git a/pandas/tests/indexes/test_category.py b/pandas/tests/indexes/test_category.py index ae9e011d76597..543f59013ff12 100644 --- a/pandas/tests/indexes/test_category.py +++ b/pandas/tests/indexes/test_category.py @@ -411,11 +411,10 @@ def test_astype(self): result = IntervalIndex.from_intervals(result.values) tm.assert_index_equal(result, expected) - @pytest.mark.parametrize('copy', [True, False]) @pytest.mark.parametrize('name', [None, 'foo']) @pytest.mark.parametrize('dtype_ordered', [True, False]) @pytest.mark.parametrize('index_ordered', [True, False]) - def test_astype_category(self, copy, name, dtype_ordered, index_ordered): + def test_astype_category(self, name, dtype_ordered, index_ordered): # GH 18630 index = self.create_index(ordered=index_ordered) if name: @@ -423,7 +422,7 @@ def test_astype_category(self, copy, name, dtype_ordered, index_ordered): # standard categories dtype = CategoricalDtype(ordered=dtype_ordered) - result = index.astype(dtype, copy=copy) + result = index.astype(dtype) expected = CategoricalIndex(index.tolist(), name=name, categories=index.categories, @@ -432,13 +431,13 @@ def test_astype_category(self, copy, name, dtype_ordered, index_ordered): # non-standard categories dtype = CategoricalDtype(index.unique().tolist()[:-1], dtype_ordered) - result = index.astype(dtype, copy=copy) + result = index.astype(dtype) expected = CategoricalIndex(index.tolist(), name=name, dtype=dtype) tm.assert_index_equal(result, expected) if dtype_ordered is False: # dtype='category' can't specify ordered, so only test once - result = index.astype('category', copy=copy) + result = index.astype('category') expected = index tm.assert_index_equal(result, expected) diff --git a/pandas/tests/series/test_dtypes.py b/pandas/tests/series/test_dtypes.py index 077ba6d5dfdba..441e811706487 100644 --- a/pandas/tests/series/test_dtypes.py +++ b/pandas/tests/series/test_dtypes.py @@ -322,11 +322,10 @@ def cmp(a, b): lambda x: x.astype('object').astype(Categorical)]: pytest.raises(TypeError, lambda: invalid(s)) - @pytest.mark.parametrize('copy', [True, False]) @pytest.mark.parametrize('name', [None, 'foo']) @pytest.mark.parametrize('dtype_ordered', [True, False]) @pytest.mark.parametrize('series_ordered', [True, False]) - def test_astype_categorical_to_categorical(self, copy, name, dtype_ordered, + def test_astype_categorical_to_categorical(self, name, dtype_ordered, series_ordered): # GH 10696/18593 s_data = list('abcaacbab') @@ -335,7 +334,7 @@ def test_astype_categorical_to_categorical(self, copy, name, dtype_ordered, # unspecified categories dtype = CategoricalDtype(ordered=dtype_ordered) - result = s.astype(dtype, copy=copy) + result = s.astype(dtype) exp_dtype = CategoricalDtype(s_dtype.categories, dtype_ordered) expected = Series(s_data, name=name, dtype=exp_dtype) tm.assert_series_equal(result, expected) @@ -346,7 +345,7 @@ def test_astype_categorical_to_categorical(self, copy, name, dtype_ordered, # different categories dtype = CategoricalDtype(list('adc'), dtype_ordered) - result = s.astype(dtype, copy=copy) + result = s.astype(dtype) expected = Series(s_data, name=name, dtype=dtype) tm.assert_series_equal(result, expected) @@ -358,7 +357,7 @@ def test_astype_categorical_to_categorical(self, copy, name, dtype_ordered, if dtype_ordered is False: # not specifying ordered, so only test once expected = s - result = s.astype('category', copy=copy) + result = s.astype('category') tm.assert_series_equal(result, expected) def test_astype_categoricaldtype(self):