Skip to content

BUG: fix CategoricalBlock pickling #8519

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Oct 10, 2014
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion doc/source/v0.15.0.txt
Original file line number Diff line number Diff line change
Expand Up @@ -656,7 +656,7 @@ Categoricals in Series/DataFrame
:class:`~pandas.Categorical` can now be included in `Series` and `DataFrames` and gained new
methods to manipulate. Thanks to Jan Schulz for much of this API/implementation. (:issue:`3943`, :issue:`5313`, :issue:`5314`,
:issue:`7444`, :issue:`7839`, :issue:`7848`, :issue:`7864`, :issue:`7914`, :issue:`7768`, :issue:`8006`, :issue:`3678`,
:issue:`8075`, :issue:`8076`, :issue:`8143`, :issue:`8453`).
:issue:`8075`, :issue:`8076`, :issue:`8143`, :issue:`8453`, :issue:`8518`).

For full docs, see the :ref:`categorical introduction <categorical>` and the
:ref:`API documentation <api.categorical>`.
Expand Down
17 changes: 17 additions & 0 deletions pandas/core/categorical.py
Original file line number Diff line number Diff line change
Expand Up @@ -187,6 +187,8 @@ class Categorical(PandasObject):

# For comparisons, so that numpy uses our implementation of the compare ops, which raise
__array_priority__ = 1000
ordered = False
name = None

def __init__(self, values, categories=None, ordered=None, name=None, fastpath=False,
levels=None):
Expand Down Expand Up @@ -718,6 +720,21 @@ def __array__(self, dtype=None):
return np.asarray(ret, dtype)
return ret

def __setstate__(self, state):
"""Necessary for making this object picklable

Restores the instance from a pickled state dict. The legacy keys
'labels' and '_levels' are renamed to '_codes' and '_categories'
before every entry of the dict is set as an attribute on self.

Raises Exception('invalid pickle state') if state is not a dict.
"""
# Anything other than a dict is rejected before any attribute is touched.
if not isinstance(state, dict):
raise Exception('invalid pickle state')

# Provide compatibility with pre-0.15.0 Categoricals.
# A legacy key is only migrated when its new-style counterpart is absent,
# so state that already carries '_codes' / '_categories' is left as-is.
if '_codes' not in state and 'labels' in state:
state['_codes'] = state.pop('labels')
if '_categories' not in state and '_levels' in state:
# Legacy levels go through _validate_categories before being stored.
state['_categories'] = \
self._validate_categories(state.pop('_levels'))

# Every (possibly renamed) entry of the state dict becomes an attribute.
for k, v in compat.iteritems(state):
setattr(self, k, v)

@property
def T(self):
return self
Expand Down
31 changes: 7 additions & 24 deletions pandas/core/internals.py
Original file line number Diff line number Diff line change
Expand Up @@ -1070,16 +1070,19 @@ class NonConsolidatableMixIn(object):
def __init__(self, values, placement,
ndim=None, fastpath=False,):

# Placement must be converted to BlockPlacement via property setter
# before ndim logic, because placement may be a slice which doesn't
# have a length.
self.mgr_locs = placement

# kludgetastic
if ndim is None:
if len(placement) != 1:
if len(self.mgr_locs) != 1:
ndim = 1
else:
ndim = 2
self.ndim = ndim

self.mgr_locs = placement

if not isinstance(values, self._holder):
raise TypeError("values must be {0}".format(self._holder.__name__))

Expand Down Expand Up @@ -1852,6 +1855,7 @@ def get_values(self, dtype=None):
.reshape(self.values.shape)
return self.values


class SparseBlock(NonConsolidatableMixIn, Block):
""" implement as a list of sparse arrays of the same dtype """
__slots__ = ()
Expand All @@ -1861,27 +1865,6 @@ class SparseBlock(NonConsolidatableMixIn, Block):
_ftype = 'sparse'
_holder = SparseArray

def __init__(self, values, placement,
ndim=None, fastpath=False,):
"""Set the placement, infer ndim if it was not given, and store values.

values must be a SparseArray, otherwise TypeError is raised.
placement is assigned to self.mgr_locs.
ndim, when None, is inferred from the number of placement locations.
fastpath is accepted but not read anywhere in this body.
"""

# Placement must be converted to BlockPlacement via property setter
# before ndim logic, because placement may be a slice which doesn't
# have a length.
self.mgr_locs = placement

# kludgetastic
# With no explicit ndim: exactly one location means ndim 2, anything
# else means ndim 1.
if ndim is None:
if len(self.mgr_locs) != 1:
ndim = 1
else:
ndim = 2
self.ndim = ndim

# Type check happens after mgr_locs/ndim are set but before values is stored.
if not isinstance(values, SparseArray):
raise TypeError("values must be SparseArray")

self.values = values

@property
def shape(self):
return (len(self.mgr_locs), self.sp_index.length)
Expand Down
Binary file not shown.
11 changes: 8 additions & 3 deletions pandas/io/tests/generate_legacy_pickles.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,7 @@ def create_data():
from pandas import (Series,TimeSeries,DataFrame,Panel,
SparseSeries,SparseTimeSeries,SparseDataFrame,SparsePanel,
Index,MultiIndex,PeriodIndex,
date_range,period_range,bdate_range,Timestamp)
date_range,period_range,bdate_range,Timestamp,Categorical)
nan = np.nan

data = {
Expand All @@ -85,7 +85,8 @@ def create_data():
mi = Series(np.arange(5).astype(np.float64),index=MultiIndex.from_tuples(tuple(zip(*[[1,1,2,2,2],
[3,4,3,4,5]])),
names=['one','two'])),
dup=Series(np.arange(5).astype(np.float64), index=['A', 'B', 'C', 'D', 'A']))
dup=Series(np.arange(5).astype(np.float64), index=['A', 'B', 'C', 'D', 'A']),
cat=Series(Categorical(['foo', 'bar', 'baz'])))

frame = dict(float = DataFrame(dict(A = series['float'], B = series['float'] + 1)),
int = DataFrame(dict(A = series['int'] , B = series['int'] + 1)),
Expand All @@ -95,7 +96,11 @@ def create_data():
['one','two','one','two','three']])),
names=['first','second'])),
dup=DataFrame(np.arange(15).reshape(5, 3).astype(np.float64),
columns=['A', 'B', 'A']))
columns=['A', 'B', 'A']),
cat_onecol=DataFrame(dict(A=Categorical(['foo', 'bar']))),
cat_and_float=DataFrame(dict(A=Categorical(['foo', 'bar', 'baz']),
B=np.arange(3))),
)
panel = dict(float = Panel(dict(ItemA = frame['float'], ItemB = frame['float']+1)),
dup = Panel(np.arange(30).reshape(3, 5, 2).astype(np.float64),
items=['A', 'B', 'A']))
Expand Down
94 changes: 94 additions & 0 deletions pandas/tests/data/categorical_0_14_1.pickle
Original file line number Diff line number Diff line change
@@ -0,0 +1,94 @@
ccopy_reg
_reconstructor
p0
(cpandas.core.categorical
Categorical
p1
c__builtin__
object
p2
Ntp3
Rp4
(dp5
S'_levels'
p6
cnumpy.core.multiarray
_reconstruct
p7
(cpandas.core.index
Index
p8
(I0
tp9
S'b'
p10
tp11
Rp12
((I1
(I4
tp13
cnumpy
dtype
p14
(S'O8'
p15
I0
I1
tp16
Rp17
(I3
S'|'
p18
NNNI-1
I-1
I63
tp19
bI00
(lp20
S'a'
p21
ag10
aS'c'
p22
aS'd'
p23
atp24
(Ntp25
tp26
bsS'labels'
p27
g7
(cnumpy
ndarray
p28
(I0
tp29
g10
tp30
Rp31
(I1
(I3
tp32
g14
(S'i8'
p33
I0
I1
tp34
Rp35
(I3
S'<'
p36
NNNI-1
I-1
I0
tp37
bI00
S'\x00\x00\x00\x00\x00\x00\x00\x00\x01\x00\x00\x00\x00\x00\x00\x00\x02\x00\x00\x00\x00\x00\x00\x00'
p38
tp39
bsS'name'
p40
S'foobar'
p41
sb.
27 changes: 17 additions & 10 deletions pandas/tests/test_categorical.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@

from datetime import datetime
from pandas.compat import range, lrange, u
import os
import pickle
import re
from distutils.version import LooseVersion

Expand All @@ -21,16 +23,6 @@ def setUp(self):
self.factor = Categorical.from_array(['a', 'b', 'b', 'a',
'a', 'c', 'c', 'c'])

def assert_categorical_equal(self, res, exp):
"""Assert that two categoricals match.

Raises AssertionError if the categories or the codes of res and exp
are not equivalent according to com.array_equivalent, then checks
that the ordered and name attributes are equal via self.assertEqual.
"""
# Categories are compared first so a mismatch there is reported before codes.
if not com.array_equivalent(res.categories, exp.categories):
raise AssertionError('categories not equivalent: {0} vs {1}.'.format(res.categories,
exp.categories))
if not com.array_equivalent(res.codes, exp.codes):
raise AssertionError('codes not equivalent: {0} vs {1}.'.format(res.codes,
exp.codes))
# Scalar metadata is compared with the TestCase equality assertion.
self.assertEqual(res.ordered, exp.ordered, "ordered not the same")
self.assertEqual(res.name, exp.name, "name not the same")

def test_getitem(self):
self.assertEqual(self.factor[0], 'a')
self.assertEqual(self.factor[-1], 'c')
Expand Down Expand Up @@ -2268,6 +2260,21 @@ def get_dir(s):
results = get_dir(s)
tm.assert_almost_equal(results,list(sorted(set(ok_for_cat))))

def test_pickle_v0_14_1(self):
# Backward-compatibility check: a Categorical pickled under v0.14.1 must
# load as a Categorical equal to the one constructed here.
cat = pd.Categorical(values=['a', 'b', 'c'],
levels=['a', 'b', 'c', 'd'],
name='foobar', ordered=False)
# The stored pickle is looked up in the test data directory.
pickle_path = os.path.join(tm.get_data_path(),
'categorical_0_14_1.pickle')
# This code was executed once on v0.14.1 to generate the pickle:
#
# cat = Categorical(labels=np.arange(3), levels=['a', 'b', 'c', 'd'],
# name='foobar')
# with open(pickle_path, 'wb') as f: pickle.dump(cat, f)
#
# Compares categories, codes, ordered and name of the loaded object.
self.assert_categorical_equal(cat, pd.read_pickle(pickle_path))


if __name__ == '__main__':
import nose
nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'],
Expand Down
11 changes: 10 additions & 1 deletion pandas/tests/test_internals.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
import pandas.util.testing as tm
import pandas as pd
from pandas.util.testing import (
assert_almost_equal, assert_frame_equal, randn)
assert_almost_equal, assert_frame_equal, randn, assert_series_equal)
from pandas.compat import zip, u


Expand Down Expand Up @@ -363,6 +363,15 @@ def test_non_unique_pickle(self):
mgr2 = self.round_trip_pickle(mgr)
assert_frame_equal(DataFrame(mgr), DataFrame(mgr2))

def test_categorical_block_pickle(self):
# Round-trip a manager built from the spec 'a: category' through pickle
# and require the frames wrapping the two managers to be equal.
# NOTE(review): presumably this yields one categorical block for item 'a' -
# confirm against create_mgr.
mgr = create_mgr('a: category')
mgr2 = self.round_trip_pickle(mgr)
assert_frame_equal(DataFrame(mgr), DataFrame(mgr2))

# Same round-trip for a single-block manager, compared as Series.
smgr = create_single_mgr('category')
smgr2 = self.round_trip_pickle(smgr)
assert_series_equal(Series(smgr), Series(smgr2))

def test_get_scalar(self):
for item in self.mgr.items:
for i, index in enumerate(self.mgr.axes[1]):
Expand Down
10 changes: 10 additions & 0 deletions pandas/util/testing.py
Original file line number Diff line number Diff line change
Expand Up @@ -121,6 +121,16 @@ def assert_numpy_array_equivalent(self, np_array, assert_equal, strict_nan=False
return
raise AssertionError('{0} is not equivalent to {1}.'.format(np_array, assert_equal))

def assert_categorical_equal(self, res, exp):
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

ahh, this is defined in tests/test_categorical.py, pls remove from there and change the usages to use this one (or I can do after), lmk

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I've moved it from there to here.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

ok, except the tests in categorical use self.assert_categorical_equal (easiest is maybe to define this in utils/testing/TestCase; calling this one is ok too)

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

easiest to maybe define this in utils/testing/TestCase

which is what this patch does, no? :)

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

duh! wasn't really looking at it! (side issue - we have BOTH module level and TestCase level functions for some of the comparators...I think there is an issue out there somewhere)

e.g. you can do tm.assert_series_equal but not self.assert_series_equal...but ok with the cat test changes

if not array_equivalent(res.categories, exp.categories):
raise AssertionError('categories not equivalent: {0} vs {1}.'.format(res.categories,
exp.categories))
if not array_equivalent(res.codes, exp.codes):
raise AssertionError('codes not equivalent: {0} vs {1}.'.format(res.codes,
exp.codes))
self.assertEqual(res.ordered, exp.ordered, "ordered not the same")
self.assertEqual(res.name, exp.name, "name not the same")

def assertIs(self, first, second, msg=''):
"""Checks that 'first' is 'second'"""
a, b = first, second
Expand Down