Skip to content

BUG #15150: normalization of crosstab with MultiIndex and margins #16599

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 7 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions doc/source/whatsnew/v0.21.0.txt
Original file line number Diff line number Diff line change
Expand Up @@ -311,6 +311,9 @@ Reshaping
- Bug in merging with categorical dtypes with datetimelikes incorrectly raised a ``TypeError`` (:issue:`16900`)
- Bug when using :func:`isin` on a large object series and large comparison array (:issue:`16012`)
- Fixes regression from 0.20, :func:`Series.aggregate` and :func:`DataFrame.aggregate` allow dictionaries as return values again (:issue:`16741`)
- Bug in :func:`crosstab` with ``normalize=True`` and ``margins=True`` when at least one axis has a ``MultiIndex`` (:issue:`15150`)

>>>>>>> added whatsnew and reformatted tests to be more readable
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Remove this.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Oh, I missed that.


Numeric
^^^^^^^
Expand Down
152 changes: 86 additions & 66 deletions pandas/core/reshape/pivot.py
Original file line number Diff line number Diff line change
Expand Up @@ -200,22 +200,11 @@ def pivot_table(data, values=None, index=None, columns=None, aggfunc='mean',

def _add_margins(table, data, values, rows, cols, aggfunc,
margins_name='All'):
if not isinstance(margins_name, compat.string_types):
raise ValueError('margins_name argument must be a string')

exception_msg = 'Conflicting name "{0}" in margins'.format(margins_name)
for level in table.index.names:
if margins_name in table.index.get_level_values(level):
raise ValueError(exception_msg)
_check_margins_name(margins_name, table)

grand_margin = _compute_grand_margin(data, values, aggfunc, margins_name)

# could be passed a Series object with no 'columns'
if hasattr(table, 'columns'):
for level in table.columns.names[1:]:
if margins_name in table.columns.get_level_values(level):
raise ValueError(exception_msg)

if len(rows) > 1:
key = (margins_name,) + ('',) * (len(rows) - 1)
else:
Expand Down Expand Up @@ -264,6 +253,35 @@ def _add_margins(table, data, values, rows, cols, aggfunc,
return result


def _check_margins_name(margins_name, table):
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

A docstring here would be good (for developers).

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

yes, I can do that

"""
Checks if margins_name is a correct input argument for pivot_table
or crosstab.

Parameters
----------

margins_name : string, default 'All'
Name of the row / column that will contain the totals
when margins is True.
table : DataFrame
"""

if not isinstance(margins_name, compat.string_types):
raise ValueError('margins_name argument must be a string')

exception_msg = 'Conflicting name "{0}" in margins'.format(margins_name)
for level in table.index.names:
if margins_name in table.index.get_level_values(level):
raise ValueError(exception_msg)

# could be passed a Series object with no 'columns'
if hasattr(table, 'columns'):
for level in table.columns.names[1:]:
if margins_name in table.columns.get_level_values(level):
raise ValueError(exception_msg)


def _compute_grand_margin(data, values, aggfunc,
margins_name='All'):

Expand Down Expand Up @@ -521,13 +539,31 @@ def crosstab(index, columns, values=None, rownames=None, colnames=None,
kwargs = {'aggfunc': aggfunc}

table = df.pivot_table('__dummy__', index=rownames, columns=colnames,
margins=margins, margins_name=margins_name,
margins=False, margins_name=margins_name,
dropna=dropna, **kwargs)

# GH 17013:
if values is None and margins:
table = table.fillna(0).astype(np.int64)

if margins:
_check_margins_name(margins_name, table)

if normalize != 'index':
# add margin column
table[margins_name] = table.sum(axis=1)

if normalize != 'columns':
# add margin row
if isinstance(table.index, MultiIndex):
# workaround for adding a margins row to a MultiIndex object
# to be removed when GH 17024 is fixed
new_index = _add_margins_to_multiindex(table.index,
margins_name)
table.loc[margins_name] = table.sum(axis=0)
table.index = new_index
else:
table.loc[margins_name] = table.sum(axis=0)
# Post-process
if normalize is not False:
table = _normalize(table, normalize=normalize, margins=margins,
Expand All @@ -536,6 +572,25 @@ def crosstab(index, columns, values=None, rownames=None, colnames=None,
return table


def _add_margins_to_multiindex(index, margins_name):
# workaround for adding a margins row to a MultiIndex object
# to be removed when GH 17024 is fixed
levels = list(index.levels)
labels = list(index.labels)

levels[0] = levels[0].append(Index([margins_name]))
for i in range(1, len(levels)):
levels[i] = levels[i].append(Index(['']))
for i in range(len(labels)):
lbl = list(labels[i])
lbl.append(max(labels[i] + 1))
labels[i] = lbl

return MultiIndex(levels=levels,
labels=labels,
names=index.names)


def _normalize(table, normalize, margins, margins_name='All'):

if not isinstance(normalize, bool) and not isinstance(normalize,
Expand All @@ -544,67 +599,32 @@ def _normalize(table, normalize, margins, margins_name='All'):
try:
normalize = axis_subs[normalize]
except KeyError:
raise ValueError("Not a valid normalize argument")
raise ValueError(
"Not a valid normalize argument: {!r}".format(normalize))

if margins is False:
# Actual Normalizations
normalizers = {
'columns': lambda x: x / x.sum(),
'index': lambda x: x.div(x.sum(axis=1), axis=0)
}

# Actual Normalizations
normalizers = {
'all': lambda x: x / x.sum(axis=1).sum(axis=0),
'columns': lambda x: x / x.sum(),
'index': lambda x: x.div(x.sum(axis=1), axis=0)
}

normalizers[True] = normalizers['all']

try:
f = normalizers[normalize]
except KeyError:
raise ValueError("Not a valid normalize argument")

table = f(table)
table = table.fillna(0)
if margins is False:
normalizers['all'] = lambda x: x / x.sum(axis=1).sum(axis=0)

elif margins is True:
# skip margin rows and cols for normalization
normalizers['all'] = lambda x: x / x.iloc[:-1, :-1].sum(axis=1)\
.sum(axis=0)

column_margin = table.loc[:, margins_name].drop(margins_name)
index_margin = table.loc[margins_name, :].drop(margins_name)
table = table.drop(margins_name, axis=1).drop(margins_name)
# to keep index and columns names
table_index_names = table.index.names
table_columns_names = table.columns.names

# Normalize core
table = _normalize(table, normalize=normalize, margins=False)

# Fix Margins
if normalize == 'columns':
column_margin = column_margin / column_margin.sum()
table = concat([table, column_margin], axis=1)
table = table.fillna(0)

elif normalize == 'index':
index_margin = index_margin / index_margin.sum()
table = table.append(index_margin)
table = table.fillna(0)

elif normalize == "all" or normalize is True:
column_margin = column_margin / column_margin.sum()
index_margin = index_margin / index_margin.sum()
index_margin.loc[margins_name] = 1
table = concat([table, column_margin], axis=1)
table = table.append(index_margin)

table = table.fillna(0)
else:
raise ValueError("Not a valid margins argument: {!r}".format(margins))

else:
raise ValueError("Not a valid normalize argument")
normalizers[True] = normalizers['all']

table.index.names = table_index_names
table.columns.names = table_columns_names
f = normalizers[normalize]

else:
raise ValueError("Not a valid margins argument")
table = f(table)
table = table.fillna(0)

return table

Expand Down
Loading