diff --git a/doc/source/v0.15.0.txt b/doc/source/v0.15.0.txt index af47ee878b1c3..ee573da00abdb 100644 --- a/doc/source/v0.15.0.txt +++ b/doc/source/v0.15.0.txt @@ -656,7 +656,7 @@ Categoricals in Series/DataFrame :class:`~pandas.Categorical` can now be included in `Series` and `DataFrames` and gained new methods to manipulate. Thanks to Jan Schulz for much of this API/implementation. (:issue:`3943`, :issue:`5313`, :issue:`5314`, :issue:`7444`, :issue:`7839`, :issue:`7848`, :issue:`7864`, :issue:`7914`, :issue:`7768`, :issue:`8006`, :issue:`3678`, -:issue:`8075`, :issue:`8076`, :issue:`8143`, :issue:`8453`). +:issue:`8075`, :issue:`8076`, :issue:`8143`, :issue:`8453`, :issue:`8518`). For full docs, see the :ref:`categorical introduction ` and the :ref:`API documentation `. diff --git a/pandas/core/categorical.py b/pandas/core/categorical.py index aa5fa29784912..b35cfdcf7c8f1 100644 --- a/pandas/core/categorical.py +++ b/pandas/core/categorical.py @@ -187,6 +187,8 @@ class Categorical(PandasObject): # For comparisons, so that numpy uses our implementation if the compare ops, which raise __array_priority__ = 1000 + ordered = False + name = None def __init__(self, values, categories=None, ordered=None, name=None, fastpath=False, levels=None): @@ -718,6 +720,21 @@ def __array__(self, dtype=None): return np.asarray(ret, dtype) return ret + def __setstate__(self, state): + """Necessary for making this object picklable""" + if not isinstance(state, dict): + raise Exception('invalid pickle state') + + # Provide compatibility with pre-0.15.0 Categoricals. + if '_codes' not in state and 'labels' in state: + state['_codes'] = state.pop('labels') + if '_categories' not in state and '_levels' in state: + state['_categories'] = \ + self._validate_categories(state.pop('_levels')) + + for k, v in compat.iteritems(state): + setattr(self, k, v) + @property def T(self): return self diff --git a/pandas/core/internals.py b/pandas/core/internals.py index c88d799a54fed..9be680d998216 100644 --- a/pandas/core/internals.py +++ b/pandas/core/internals.py @@ -1070,16 +1070,19 @@ class NonConsolidatableMixIn(object): def __init__(self, values, placement, ndim=None, fastpath=False,): + # Placement must be converted to BlockPlacement via property setter + # before ndim logic, because placement may be a slice which doesn't + # have a length. + self.mgr_locs = placement + # kludgetastic if ndim is None: - if len(placement) != 1: + if len(self.mgr_locs) != 1: ndim = 1 else: ndim = 2 self.ndim = ndim - self.mgr_locs = placement - if not isinstance(values, self._holder): raise TypeError("values must be {0}".format(self._holder.__name__)) @@ -1852,6 +1855,7 @@ def get_values(self, dtype=None): .reshape(self.values.shape) return self.values + class SparseBlock(NonConsolidatableMixIn, Block): """ implement as a list of sparse arrays of the same dtype """ __slots__ = () @@ -1861,27 +1865,6 @@ class SparseBlock(NonConsolidatableMixIn, Block): _ftype = 'sparse' _holder = SparseArray - def __init__(self, values, placement, - ndim=None, fastpath=False,): - - # Placement must be converted to BlockPlacement via property setter - # before ndim logic, because placement may be a slice which doesn't - # have a length. - self.mgr_locs = placement - - # kludgetastic - if ndim is None: - if len(self.mgr_locs) != 1: - ndim = 1 - else: - ndim = 2 - self.ndim = ndim - - if not isinstance(values, SparseArray): - raise TypeError("values must be SparseArray") - - self.values = values - @property def shape(self): return (len(self.mgr_locs), self.sp_index.length) diff --git a/pandas/io/tests/data/legacy_pickle/0.15.0/0.15.0_x86_64_linux_2.7.8.pickle b/pandas/io/tests/data/legacy_pickle/0.15.0/0.15.0_x86_64_linux_2.7.8.pickle new file mode 100644 index 0000000000000..d7d20b06df305 Binary files /dev/null and b/pandas/io/tests/data/legacy_pickle/0.15.0/0.15.0_x86_64_linux_2.7.8.pickle differ diff --git a/pandas/io/tests/generate_legacy_pickles.py b/pandas/io/tests/generate_legacy_pickles.py index b20a1e5b60b86..56ef1aa9b0f19 100644 --- a/pandas/io/tests/generate_legacy_pickles.py +++ b/pandas/io/tests/generate_legacy_pickles.py @@ -60,7 +60,7 @@ def create_data(): from pandas import (Series,TimeSeries,DataFrame,Panel, SparseSeries,SparseTimeSeries,SparseDataFrame,SparsePanel, Index,MultiIndex,PeriodIndex, - date_range,period_range,bdate_range,Timestamp) + date_range,period_range,bdate_range,Timestamp,Categorical) nan = np.nan data = { @@ -85,7 +85,8 @@ def create_data(): mi = Series(np.arange(5).astype(np.float64),index=MultiIndex.from_tuples(tuple(zip(*[[1,1,2,2,2], [3,4,3,4,5]])), names=['one','two'])), - dup=Series(np.arange(5).astype(np.float64), index=['A', 'B', 'C', 'D', 'A'])) + dup=Series(np.arange(5).astype(np.float64), index=['A', 'B', 'C', 'D', 'A']), + cat=Series(Categorical(['foo', 'bar', 'baz']))) frame = dict(float = DataFrame(dict(A = series['float'], B = series['float'] + 1)), int = DataFrame(dict(A = series['int'] , B = series['int'] + 1)), @@ -95,7 +96,11 @@ def create_data(): ['one','two','one','two','three']])), names=['first','second'])), dup=DataFrame(np.arange(15).reshape(5, 3).astype(np.float64), - columns=['A', 'B', 'A'])) + columns=['A', 'B', 'A']), + cat_onecol=DataFrame(dict(A=Categorical(['foo', 'bar']))), + cat_and_float=DataFrame(dict(A=Categorical(['foo', 'bar', 'baz']), + B=np.arange(3))), + ) panel = dict(float = Panel(dict(ItemA = frame['float'], ItemB = frame['float']+1)), dup = Panel(np.arange(30).reshape(3, 5, 2).astype(np.float64), items=['A', 'B', 'A'])) diff --git a/pandas/tests/data/categorical_0_14_1.pickle b/pandas/tests/data/categorical_0_14_1.pickle new file mode 100644 index 0000000000000..94f882b2f3027 --- /dev/null +++ b/pandas/tests/data/categorical_0_14_1.pickle @@ -0,0 +1,94 @@ +ccopy_reg +_reconstructor +p0 +(cpandas.core.categorical +Categorical +p1 +c__builtin__ +object +p2 +Ntp3 +Rp4 +(dp5 +S'_levels' +p6 +cnumpy.core.multiarray +_reconstruct +p7 +(cpandas.core.index +Index +p8 +(I0 +tp9 +S'b' +p10 +tp11 +Rp12 +((I1 +(I4 +tp13 +cnumpy +dtype +p14 +(S'O8' +p15 +I0 +I1 +tp16 +Rp17 +(I3 +S'|' +p18 +NNNI-1 +I-1 +I63 +tp19 +bI00 +(lp20 +S'a' +p21 +ag10 +aS'c' +p22 +aS'd' +p23 +atp24 +(Ntp25 +tp26 +bsS'labels' +p27 +g7 +(cnumpy +ndarray +p28 +(I0 +tp29 +g10 +tp30 +Rp31 +(I1 +(I3 +tp32 +g14 +(S'i8' +p33 +I0 +I1 +tp34 +Rp35 +(I3 +S'<' +p36 +NNNI-1 +I-1 +I0 +tp37 +bI00 +S'\x00\x00\x00\x00\x00\x00\x00\x00\x01\x00\x00\x00\x00\x00\x00\x00\x02\x00\x00\x00\x00\x00\x00\x00' +p38 +tp39 +bsS'name' +p40 +S'foobar' +p41 +sb. \ No newline at end of file diff --git a/pandas/tests/test_categorical.py b/pandas/tests/test_categorical.py index a2643b38e4133..03c73232f13bb 100644 --- a/pandas/tests/test_categorical.py +++ b/pandas/tests/test_categorical.py @@ -2,6 +2,8 @@ from datetime import datetime from pandas.compat import range, lrange, u +import os +import pickle import re from distutils.version import LooseVersion @@ -21,16 +23,6 @@ def setUp(self): self.factor = Categorical.from_array(['a', 'b', 'b', 'a', 'a', 'c', 'c', 'c']) - def assert_categorical_equal(self, res, exp): - if not com.array_equivalent(res.categories, exp.categories): - raise AssertionError('categories not equivalent: {0} vs {1}.'.format(res.categories, - exp.categories)) - if not com.array_equivalent(res.codes, exp.codes): - raise AssertionError('codes not equivalent: {0} vs {1}.'.format(res.codes, - exp.codes)) - self.assertEqual(res.ordered, exp.ordered, "ordered not the same") - self.assertEqual(res.name, exp.name, "name not the same") - def test_getitem(self): self.assertEqual(self.factor[0], 'a') self.assertEqual(self.factor[-1], 'c') @@ -2268,6 +2260,21 @@ def get_dir(s): results = get_dir(s) tm.assert_almost_equal(results,list(sorted(set(ok_for_cat)))) + def test_pickle_v0_14_1(self): + cat = pd.Categorical(values=['a', 'b', 'c'], + levels=['a', 'b', 'c', 'd'], + name='foobar', ordered=False) + pickle_path = os.path.join(tm.get_data_path(), + 'categorical_0_14_1.pickle') + # This code was executed once on v0.14.1 to generate the pickle: + # + # cat = Categorical(labels=np.arange(3), levels=['a', 'b', 'c', 'd'], + # name='foobar') + # with open(pickle_path, 'wb') as f: pickle.dump(cat, f) + # + self.assert_categorical_equal(cat, pd.read_pickle(pickle_path)) + + if __name__ == '__main__': import nose nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], diff --git a/pandas/tests/test_internals.py b/pandas/tests/test_internals.py index a523df4cc2461..5bc7558efb471 100644 --- a/pandas/tests/test_internals.py +++ b/pandas/tests/test_internals.py @@ -11,7 +11,7 @@ import pandas.util.testing as tm import pandas as pd from pandas.util.testing import ( - assert_almost_equal, assert_frame_equal, randn) + assert_almost_equal, assert_frame_equal, randn, assert_series_equal) from pandas.compat import zip, u @@ -363,6 +363,15 @@ def test_non_unique_pickle(self): mgr2 = self.round_trip_pickle(mgr) assert_frame_equal(DataFrame(mgr), DataFrame(mgr2)) + def test_categorical_block_pickle(self): + mgr = create_mgr('a: category') + mgr2 = self.round_trip_pickle(mgr) + assert_frame_equal(DataFrame(mgr), DataFrame(mgr2)) + + smgr = create_single_mgr('category') + smgr2 = self.round_trip_pickle(smgr) + assert_series_equal(Series(smgr), Series(smgr2)) + def test_get_scalar(self): for item in self.mgr.items: for i, index in enumerate(self.mgr.axes[1]): diff --git a/pandas/util/testing.py b/pandas/util/testing.py index d8cc39908a31f..b34bcc3c12890 100644 --- a/pandas/util/testing.py +++ b/pandas/util/testing.py @@ -121,6 +121,16 @@ def assert_numpy_array_equivalent(self, np_array, assert_equal, strict_nan=False return raise AssertionError('{0} is not equivalent to {1}.'.format(np_array, assert_equal)) + def assert_categorical_equal(self, res, exp): + if not array_equivalent(res.categories, exp.categories): + raise AssertionError('categories not equivalent: {0} vs {1}.'.format(res.categories, + exp.categories)) + if not array_equivalent(res.codes, exp.codes): + raise AssertionError('codes not equivalent: {0} vs {1}.'.format(res.codes, + exp.codes)) + self.assertEqual(res.ordered, exp.ordered, "ordered not the same") + self.assertEqual(res.name, exp.name, "name not the same") + def assertIs(self, first, second, msg=''): """Checks that 'first' is 'second'""" a, b = first, second