diff --git a/asv_bench/benchmarks/categoricals.py b/asv_bench/benchmarks/categoricals.py index 244af3a577fe2..bf1e1b3f40ab0 100644 --- a/asv_bench/benchmarks/categoricals.py +++ b/asv_bench/benchmarks/categoricals.py @@ -1,4 +1,8 @@ from .pandas_vb_common import * +try: + from pandas.types.concat import union_categoricals +except ImportError: + pass import string @@ -12,6 +16,17 @@ def time_concat_categorical(self): concat([self.s, self.s]) +class union_categorical(object): + goal_time = 0.2 + + def setup(self): + self.a = pd.Categorical((list('aabbcd') * 1000000)) + self.b = pd.Categorical((list('bbcdjk') * 1000000)) + + def time_union_categorical(self): + union_categoricals([self.a, self.b]) + + class categorical_value_counts(object): goal_time = 1 diff --git a/doc/source/categorical.rst b/doc/source/categorical.rst index b518bc947c2da..e971f1f28903f 100644 --- a/doc/source/categorical.rst +++ b/doc/source/categorical.rst @@ -648,6 +648,31 @@ In this case the categories are not the same and so an error is raised: The same applies to ``df.append(df_different)``. +.. _categorical.union: + +Unioning +~~~~~~~~ + +.. versionadded:: 0.18.2 + +If you want to combine categoricals that do not necessarily have +the same categories, the `union_categoricals` function will +combine a list-like of categoricals. The new categories +will be the union of the categories being combined. + +.. ipython:: python + + from pandas.types.concat import union_categoricals + a = pd.Categorical(["b", "c"]) + b = pd.Categorical(["a", "b"]) + union_categoricals([a, b]) + +.. note:: + + `union_categoricals` only works with unordered categoricals + and will raise if any are ordered. 
+ + Getting Data In/Out ------------------- diff --git a/doc/source/whatsnew/v0.18.2.txt b/doc/source/whatsnew/v0.18.2.txt index 7493150370e9f..c45a1704e228a 100644 --- a/doc/source/whatsnew/v0.18.2.txt +++ b/doc/source/whatsnew/v0.18.2.txt @@ -90,7 +90,7 @@ Other enhancements - The ``DataFrame`` constructor will now respect key ordering if a list of ``OrderedDict`` objects are passed in (:issue:`13304`) - ``pd.read_html()`` has gained support for the ``decimal`` option (:issue:`12907`) - +- A ``union_categoricals`` function has been added for combining categoricals, see :ref:`Unioning Categoricals<categorical.union>` (:issue:`13361`) - ``eval``'s upcasting rules for ``float32`` types have been updated to be more consistent with NumPy's rules. New behavior will not upcast to ``float64`` if you multiply a pandas ``float32`` object by a scalar float64. (:issue:`12388`) - ``Series`` has gained the properties ``.is_monotonic``, ``.is_monotonic_increasing``, ``.is_monotonic_decreasing``, similar to ``Index`` (:issue:`13336`) diff --git a/pandas/tools/tests/test_concat.py b/pandas/tools/tests/test_concat.py index 9d9b0635e0f35..a8c86657a48cc 100644 --- a/pandas/tools/tests/test_concat.py +++ b/pandas/tools/tests/test_concat.py @@ -9,7 +9,8 @@ from pandas import (DataFrame, concat, read_csv, isnull, Series, date_range, Index, Panel, MultiIndex, Timestamp, - DatetimeIndex) + DatetimeIndex, Categorical) +from pandas.types.concat import union_categoricals from pandas.util import testing as tm from pandas.util.testing import (assert_frame_equal, makeCustomDataframe as mkdf, @@ -919,6 +920,54 @@ def test_concat_keys_with_none(self): keys=['b', 'c', 'd', 'e']) tm.assert_frame_equal(result, expected) + def test_union_categorical(self): + # GH 13361 + data = [ + (list('abc'), list('abd'), list('abcabd')), + ([0, 1, 2], [2, 3, 4], [0, 1, 2, 2, 3, 4]), + ([0, 1.2, 2], [2, 3.4, 4], [0, 1.2, 2, 2, 3.4, 4]), + + (pd.date_range('2014-01-01', '2014-01-05'), + pd.date_range('2014-01-06', '2014-01-07'), + 
pd.date_range('2014-01-01', '2014-01-07')), + + (pd.date_range('2014-01-01', '2014-01-05', tz='US/Central'), + pd.date_range('2014-01-06', '2014-01-07', tz='US/Central'), + pd.date_range('2014-01-01', '2014-01-07', tz='US/Central')), + + (pd.period_range('2014-01-01', '2014-01-05'), + pd.period_range('2014-01-06', '2014-01-07'), + pd.period_range('2014-01-01', '2014-01-07')), + ] + + for a, b, combined in data: + result = union_categoricals([Categorical(a), Categorical(b)]) + expected = Categorical(combined) + tm.assert_categorical_equal(result, expected, + check_category_order=True) + + # new categories ordered by appearance + s = Categorical(['x', 'y', 'z']) + s2 = Categorical(['a', 'b', 'c']) + result = union_categoricals([s, s2]).categories + expected = Index(['x', 'y', 'z', 'a', 'b', 'c']) + tm.assert_index_equal(result, expected) + + # can't be ordered + s = Categorical([0, 1.2, 2], ordered=True) + s2 = Categorical([0, 1.2, 2], ordered=True) + with tm.assertRaises(TypeError): + union_categoricals([s, s2]) + + # must exactly match types + s = Categorical([0, 1.2, 2]) + s2 = Categorical([2, 3, 4]) + with tm.assertRaises(TypeError): + union_categoricals([s, s2]) + + with tm.assertRaises(ValueError): + union_categoricals([]) + def test_concat_bug_1719(self): ts1 = tm.makeTimeSeries() ts2 = tm.makeTimeSeries()[::2] diff --git a/pandas/types/concat.py b/pandas/types/concat.py index 5cd7abb6889b7..53db9ddf79a5c 100644 --- a/pandas/types/concat.py +++ b/pandas/types/concat.py @@ -201,6 +201,57 @@ def convert_categorical(x): return Categorical(concatted, rawcats) +def union_categoricals(to_union): + """ + Combine list-like of Categoricals, unioning categories. All + must have the same dtype, and none can be ordered. + + .. 
versionadded:: 0.18.2 + + Parameters + ---------- + to_union : list-like of Categoricals + + Returns + ------- + Categorical + A single array, categories will be ordered as they + appear in the list + + Raises + ------ + TypeError + If any of the categoricals are ordered or all do not + have the same dtype + ValueError + Empty list of categoricals passed + """ + from pandas import Index, Categorical + + if len(to_union) == 0: + raise ValueError('No Categoricals to union') + + first = to_union[0] + if any(c.ordered for c in to_union): + raise TypeError("Can only combine unordered Categoricals") + + if not all(com.is_dtype_equal(c.categories.dtype, first.categories.dtype) + for c in to_union): + raise TypeError("dtype of categories must be the same") + + cats = first.categories + unique_cats = cats.append([c.categories for c in to_union[1:]]).unique() + categories = Index(unique_cats) + + new_codes = [] + for c in to_union: + indexer = categories.get_indexer(c.categories) + new_codes.append(indexer.take(c.codes)) + codes = np.concatenate(new_codes) + return Categorical(codes, categories=categories, ordered=False, + fastpath=True) + + def _concat_datetime(to_concat, axis=0, typs=None): """ provide concatenation of an datetimelike array of arrays each of which is a diff --git a/pandas/util/testing.py b/pandas/util/testing.py index 03ccfcab24f58..d13873fcf2c84 100644 --- a/pandas/util/testing.py +++ b/pandas/util/testing.py @@ -963,14 +963,40 @@ def assertNotIsInstance(obj, cls, msg=''): def assert_categorical_equal(left, right, check_dtype=True, - obj='Categorical'): + obj='Categorical', check_category_order=True): + """Test that categoricals are equivalent + + Parameters + ---------- + left, right : Categorical + Categoricals to compare + check_dtype : bool, default True + Check that integer dtype of the codes are the same + obj : str, default 'Categorical' + Specify object name being compared, internally used to show appropriate + assertion message + 
check_category_order : bool, default True + Whether the order of the categories should be compared, which + implies identical integer codes. If False, only the resulting + values are compared. The ordered attribute is + checked regardless. + """ assertIsInstance(left, pd.Categorical, '[Categorical] ') assertIsInstance(right, pd.Categorical, '[Categorical] ') - assert_index_equal(left.categories, right.categories, - obj='{0}.categories'.format(obj)) - assert_numpy_array_equal(left.codes, right.codes, check_dtype=check_dtype, - obj='{0}.codes'.format(obj)) + if check_category_order: + assert_index_equal(left.categories, right.categories, + obj='{0}.categories'.format(obj)) + assert_numpy_array_equal(left.codes, right.codes, + check_dtype=check_dtype, + obj='{0}.codes'.format(obj)) + else: + assert_index_equal(left.categories.sort_values(), + right.categories.sort_values(), + obj='{0}.categories'.format(obj)) + assert_index_equal(left.categories.take(left.codes), + right.categories.take(right.codes), + obj='{0}.values'.format(obj)) assert_attr_equal('ordered', left, right, obj=obj)