diff --git a/doc/source/whatsnew/v0.21.0.txt b/doc/source/whatsnew/v0.21.0.txt index 5a5ea827e74ad..40908b4fc3b02 100644 --- a/doc/source/whatsnew/v0.21.0.txt +++ b/doc/source/whatsnew/v0.21.0.txt @@ -311,6 +311,8 @@ Reshaping - Bug in merging with categorical dtypes with datetimelikes incorrectly raised a ``TypeError`` (:issue:`16900`) - Bug when using :func:`isin` on a large object series and large comparison array (:issue:`16012`) - Fixes regression from 0.20, :func:`Series.aggregate` and :func:`DataFrame.aggregate` allow dictionaries as return values again (:issue:`16741`) +- Bug in ``pd.crosstab(normalize=True, margins=True)`` when at least one axis has a multi-index (:issue:`15150`) + Numeric ^^^^^^^ diff --git a/pandas/core/reshape/pivot.py b/pandas/core/reshape/pivot.py index c2fb81178433e..fcfb78974f16f 100644 --- a/pandas/core/reshape/pivot.py +++ b/pandas/core/reshape/pivot.py @@ -200,22 +200,11 @@ def pivot_table(data, values=None, index=None, columns=None, aggfunc='mean', def _add_margins(table, data, values, rows, cols, aggfunc, margins_name='All'): - if not isinstance(margins_name, compat.string_types): - raise ValueError('margins_name argument must be a string') - exception_msg = 'Conflicting name "{0}" in margins'.format(margins_name) - for level in table.index.names: - if margins_name in table.index.get_level_values(level): - raise ValueError(exception_msg) + _check_margins_name(margins_name, table) grand_margin = _compute_grand_margin(data, values, aggfunc, margins_name) - # could be passed a Series object with no 'columns' - if hasattr(table, 'columns'): - for level in table.columns.names[1:]: - if margins_name in table.columns.get_level_values(level): - raise ValueError(exception_msg) - if len(rows) > 1: key = (margins_name,) + ('',) * (len(rows) - 1) else: @@ -264,6 +253,35 @@ def _add_margins(table, data, values, rows, cols, aggfunc, return result +def 
_check_margins_name(margins_name, table): + """ + Checks if margins_name is a correct input argument for pivot_table + or crosstab. + + Parameters + ---------- + + margins_name : string, default 'All' + Name of the row / column that will contain the totals + when margins is True. + table : DataFrame + """ + + if not isinstance(margins_name, compat.string_types): + raise ValueError('margins_name argument must be a string') + + exception_msg = 'Conflicting name "{0}" in margins'.format(margins_name) + for level in table.index.names: + if margins_name in table.index.get_level_values(level): + raise ValueError(exception_msg) + + # could be passed a Series object with no 'columns' + if hasattr(table, 'columns'): + for level in table.columns.names[1:]: + if margins_name in table.columns.get_level_values(level): + raise ValueError(exception_msg) + + def _compute_grand_margin(data, values, aggfunc, margins_name='All'): @@ -521,13 +539,31 @@ def crosstab(index, columns, values=None, rownames=None, colnames=None, kwargs = {'aggfunc': aggfunc} table = df.pivot_table('__dummy__', index=rownames, columns=colnames, - margins=margins, margins_name=margins_name, + margins=False, margins_name=margins_name, dropna=dropna, **kwargs) # GH 17013: if values is None and margins: table = table.fillna(0).astype(np.int64) + if margins: + _check_margins_name(margins_name, table) + + if normalize != 'index': + # add margin column + table[margins_name] = table.sum(axis=1) + + if normalize != 'columns': + # add margin row + if isinstance(table.index, MultiIndex): + # workaround for adding a margins row to a MultiIndex object + # to be removed when GH 17024 is fixed + new_index = _add_margins_to_multiindex(table.index, + margins_name) + table.loc[margins_name] = table.sum(axis=0) + table.index = new_index + else: + table.loc[margins_name] = table.sum(axis=0) # Post-process if normalize is not False: table = _normalize(table, normalize=normalize, margins=margins, @@ -536,6 +572,25 @@ def 
crosstab(index, columns, values=None, rownames=None, colnames=None, return table +def _add_margins_to_multiindex(index, margins_name): + # workaround for adding a margins row to a MultiIndex object + # to be removed when GH 17024 is fixed + levels = list(index.levels) + labels = list(index.labels) + + levels[0] = levels[0].append(Index([margins_name])) + for i in range(1, len(levels)): + levels[i] = levels[i].append(Index([''])) + for i in range(len(labels)): + lbl = list(labels[i]) + lbl.append(max(labels[i] + 1)) + labels[i] = lbl + + return MultiIndex(levels=levels, + labels=labels, + names=index.names) + + def _normalize(table, normalize, margins, margins_name='All'): if not isinstance(normalize, bool) and not isinstance(normalize, @@ -544,67 +599,32 @@ def _normalize(table, normalize, margins, margins_name='All'): try: normalize = axis_subs[normalize] except KeyError: - raise ValueError("Not a valid normalize argument") + raise ValueError( + "Not a valid normalize argument: {!r}".format(normalize)) - if margins is False: + # Actual Normalizations + normalizers = { + 'columns': lambda x: x / x.sum(), + 'index': lambda x: x.div(x.sum(axis=1), axis=0) + } - # Actual Normalizations - normalizers = { - 'all': lambda x: x / x.sum(axis=1).sum(axis=0), - 'columns': lambda x: x / x.sum(), - 'index': lambda x: x.div(x.sum(axis=1), axis=0) - } - - normalizers[True] = normalizers['all'] - - try: - f = normalizers[normalize] - except KeyError: - raise ValueError("Not a valid normalize argument") - - table = f(table) - table = table.fillna(0) + if margins is False: + normalizers['all'] = lambda x: x / x.sum(axis=1).sum(axis=0) elif margins is True: + # skip margin rows and cols for normalization + normalizers['all'] = lambda x: x / x.iloc[:-1, :-1].sum(axis=1)\ + .sum(axis=0) - column_margin = table.loc[:, margins_name].drop(margins_name) - index_margin = table.loc[margins_name, :].drop(margins_name) - table = table.drop(margins_name, axis=1).drop(margins_name) - # to keep 
index and columns names - table_index_names = table.index.names - table_columns_names = table.columns.names - - # Normalize core - table = _normalize(table, normalize=normalize, margins=False) - - # Fix Margins - if normalize == 'columns': - column_margin = column_margin / column_margin.sum() - table = concat([table, column_margin], axis=1) - table = table.fillna(0) - - elif normalize == 'index': - index_margin = index_margin / index_margin.sum() - table = table.append(index_margin) - table = table.fillna(0) - - elif normalize == "all" or normalize is True: - column_margin = column_margin / column_margin.sum() - index_margin = index_margin / index_margin.sum() - index_margin.loc[margins_name] = 1 - table = concat([table, column_margin], axis=1) - table = table.append(index_margin) - - table = table.fillna(0) + else: + raise ValueError("Not a valid margins argument: {!r}".format(margins)) - else: - raise ValueError("Not a valid normalize argument") + normalizers[True] = normalizers['all'] - table.index.names = table_index_names - table.columns.names = table_columns_names + f = normalizers[normalize] - else: - raise ValueError("Not a valid margins argument") + table = f(table) + table = table.fillna(0) return table diff --git a/pandas/tests/reshape/test_pivot.py b/pandas/tests/reshape/test_pivot.py index 5e5852ac5381d..534d66414185a 100644 --- a/pandas/tests/reshape/test_pivot.py +++ b/pandas/tests/reshape/test_pivot.py @@ -1218,7 +1218,7 @@ def test_margin_dropna(self): df = pd.DataFrame({'a': [1, 2, 2, 2, 2, np.nan], 'b': [3, 3, 4, 4, 4, 4]}) actual = pd.crosstab(df.a, df.b, margins=True, dropna=False) - expected = pd.DataFrame([[1, 0, 1], [1, 3, 4], [2, 4, 6]]) + expected = pd.DataFrame([[1, 0, 1], [1, 3, 4], [2, 3, 5]]) expected.index = Index([1.0, 2.0, 'All'], name='a') expected.columns = Index([3, 4, 'All'], name='b') tm.assert_frame_equal(actual, expected) @@ -1226,7 +1226,7 @@ def test_margin_dropna(self): df = DataFrame({'a': [1, np.nan, np.nan, np.nan, 2, 
np.nan], 'b': [3, np.nan, 4, 4, 4, 4]}) actual = pd.crosstab(df.a, df.b, margins=True, dropna=False) - expected = pd.DataFrame([[1, 0, 1], [0, 1, 1], [1, 4, 6]]) + expected = pd.DataFrame([[1, 0, 1], [0, 1, 1], [1, 1, 2]]) expected.index = Index([1.0, 2.0, 'All'], name='a') expected.columns = Index([3.0, 4.0, 'All'], name='b') tm.assert_frame_equal(actual, expected) @@ -1243,8 +1243,8 @@ def test_margin_dropna(self): m = MultiIndex.from_arrays([['one', 'one', 'two', 'two', 'All'], ['dull', 'shiny', 'dull', 'shiny', '']], names=['b', 'c']) - expected = DataFrame([[1, 0, 1, 0, 2], [2, 0, 1, 1, 5], - [3, 0, 2, 1, 7]], columns=m) + expected = DataFrame([[1, 0, 1, 0, 2], [2, 0, 1, 1, 4], + [3, 0, 2, 1, 6]], columns=m) expected.index = Index(['bar', 'foo', 'All'], name='a') tm.assert_frame_equal(actual, expected) @@ -1254,7 +1254,7 @@ def test_margin_dropna(self): ['one', 'two', 'one', 'two', '']], names=['a', 'b']) expected = DataFrame([[1, 0, 1], [1, 0, 1], [2, 0, 2], [1, 1, 2], - [5, 2, 7]], index=m) + [5, 1, 6]], index=m) expected.columns = Index(['dull', 'shiny', 'All'], name='c') tm.assert_frame_equal(actual, expected) @@ -1300,12 +1300,10 @@ def test_crosstab_normalize(self): [0.25, 0.75], [0.4, 0.6]], index=pd.Index([1, 2, 'All'], - name='a', - dtype='object'), + name='a'), columns=pd.Index([3, 4], name='b')) col_normal_margins = pd.DataFrame([[0.5, 0, 0.2], [0.5, 1.0, 0.8]], - index=pd.Index([1, 2], name='a', - dtype='object'), + index=pd.Index([1, 2], name='a'), columns=pd.Index([3, 4, 'All'], name='b')) @@ -1313,8 +1311,7 @@ def test_crosstab_normalize(self): [0.2, 0.6, 0.8], [0.4, 0.6, 1]], index=pd.Index([1, 2, 'All'], - name='a', - dtype='object'), + name='a'), columns=pd.Index([3, 4, 'All'], name='b')) tm.assert_frame_equal(pd.crosstab(df.a, df.b, normalize='index', @@ -1359,6 +1356,87 @@ def test_crosstab_normalize(self): margins=True) tm.assert_frame_equal(test_case, norm_sum) + def test_crosstab_norm_margins_with_multiindex(self): + # GH 15150 + a = 
np.array(['foo', 'bar', 'foo', 'bar', 'bar', 'foo']) + b = np.array(['one', 'one', 'two', 'one', 'two', 'two']) + c = np.array(['dull', 'shiny', 'dull', 'dull', 'dull', 'shiny']) + d = np.array(['a', 'a', 'b', 'a', 'b', 'b']) + + # test for normalize == 'columns' + expected_columns = MultiIndex(levels=[['All', 'dull', 'shiny'], + ['', 'a', 'b']], + labels=[[1, 1, 2, 2, 0], + [1, 2, 1, 2, 0]], + names=['col_0', 'col_1']) + expected_index = MultiIndex(levels=[['All', 'bar', 'foo'], + ['', 'one', 'two']], + labels=[[1, 1, 2, 2], + [1, 2, 1, 2]], + names=['row_0', 'row_1']) + expected_data = np.array([[.5, 0., 1., 0., .333333], + [0., .5, 0., 0., .166667], + [.5, 0., 0., 0., .166667], + [0., .5, 0., 1., .333333]]) + expected = pd.DataFrame(expected_data, + index=expected_index, + columns=expected_columns) + result = pd.crosstab([a, b], [c, d], normalize='columns', + margins=True) + tm.assert_frame_equal(result, expected) + + # test for normalize == 'index' + expected_columns = MultiIndex(levels=[['All', 'dull', 'shiny'], + ['', 'a', 'b']], + labels=[[1, 1, 2, 2], + [1, 2, 1, 2]], + names=['col_0', 'col_1']) + expected_index = MultiIndex(levels=[['All', 'bar', 'foo'], + ['', 'one', 'two']], + labels=[[1, 1, 2, 2, 0], + [1, 2, 1, 2, 0]], + names=['row_0', 'row_1']) + expected_data = np.array([[.5, 0., .5, 0.], + [0., 1., 0., 0.], + [1., 0., 0., 0.], + [0., .5, 0., .5], + [.33333333, .33333333, + .16666667, .16666667]]) + expected = pd.DataFrame(expected_data, + index=expected_index, + columns=expected_columns) + result = pd.crosstab([a, b], [c, d], normalize='index', + margins=True) + tm.assert_frame_equal(result, expected) + + # test for normalize == 'all' + expected_columns = MultiIndex(levels=[['All', 'dull', 'shiny'], + ['', 'a', 'b']], + labels=[[1, 1, 2, 2, 0], + [1, 2, 1, 2, 0]], + names=['col_0', 'col_1']) + expected_index = MultiIndex(levels=[['All', 'bar', 'foo'], + ['', 'one', 'two']], + labels=[[1, 1, 2, 2, 0], + [1, 2, 1, 2, 0]], + names=['row_0', 'row_1']) 
+ expected_data = np.array([[0.16666667, 0., .16666667, + 0., .33333333], + [0., .16666667, 0., + 0., .16666667], + [.16666667, 0., 0., + 0., .16666667], + [0., .16666667, 0., + .16666667, .33333333], + [0.33333333, .33333333, .16666667, + .16666667, 1.]]) + expected = pd.DataFrame(expected_data, + index=expected_index, + columns=expected_columns) + result = pd.crosstab([a, b], [c, d], normalize='all', + margins=True) + tm.assert_frame_equal(result, expected) + def test_crosstab_with_empties(self): # Check handling of empties df = pd.DataFrame({'a': [1, 2, 2, 2, 2], 'b': [3, 3, 4, 4, 4], @@ -1391,22 +1469,23 @@ def test_crosstab_errors(self): df = pd.DataFrame({'a': [1, 2, 2, 2, 2], 'b': [3, 3, 4, 4, 4], 'c': [1, 1, np.nan, 1, 1]}) - error = 'values cannot be used without an aggfunc.' + error = "values cannot be used without an aggfunc." with tm.assert_raises_regex(ValueError, error): pd.crosstab(df.a, df.b, values=df.c) - error = 'aggfunc cannot be used without values' + error = "aggfunc cannot be used without values" with tm.assert_raises_regex(ValueError, error): pd.crosstab(df.a, df.b, aggfunc=np.mean) - error = 'Not a valid normalize argument' - with tm.assert_raises_regex(ValueError, error): + error = "'42'" + with tm.assert_raises_regex(KeyError, error): pd.crosstab(df.a, df.b, normalize='42') + error = "Not a valid normalize argument: 42" with tm.assert_raises_regex(ValueError, error): pd.crosstab(df.a, df.b, normalize=42) - error = 'Not a valid margins argument' + error = "Not a valid margins argument: 42" with tm.assert_raises_regex(ValueError, error): pd.crosstab(df.a, df.b, normalize='all', margins=42) @@ -1465,6 +1544,7 @@ def test_crosstab_with_numpy_size(self): expected = pd.DataFrame(expected_data, index=expected_index, columns=expected_column) + tm.assert_frame_equal(result, expected)