Skip to content

ENH: support for removing unused levels of a MultiIndex (internally) #15700

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 3 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 4 additions & 1 deletion asv_bench/benchmarks/timeseries.py
Original file line number Diff line number Diff line change
Expand Up @@ -292,7 +292,10 @@ def setup(self):
self.rng3 = date_range(start='1/1/2000', periods=1500000, freq='S')
self.ts3 = Series(1, index=self.rng3)

def time_sort_index(self):
def time_sort_index_monotonic(self):
    """Benchmark ``sort_index`` on ``self.ts2``; the result is discarded."""
    series = self.ts2
    series.sort_index()

def time_sort_index_non_monotonic(self):
    """Benchmark ``sort_index`` on ``self.ts``; the result is discarded."""
    series = self.ts
    series.sort_index()

def time_timeseries_slice_minutely(self):
Expand Down
74 changes: 73 additions & 1 deletion doc/source/whatsnew/v0.20.0.txt
Original file line number Diff line number Diff line change
Expand Up @@ -656,6 +656,78 @@ If indicated, a deprecation warning will be issued if you reference that module.
"pandas._hash", "pandas.tools.libhash", ""
"pandas._window", "pandas.core.libwindow", ""

.. _whatsnew_0200.api_breaking.sort_index:

DataFrame.sort_index changes
^^^^^^^^^^^^^^^^^^^^^^^^^^^^

In certain cases, calling ``.sort_index()`` on a MultiIndexed DataFrame would return the *same* DataFrame without seeming to sort.
This would happen with ``lexsorted``, but non-monotonic levels. (:issue:`15622`, :issue:`15687`, :issue:`14015`, :issue:`13431`)

This is *unchanged* between versions, but is shown for illustration purposes:

.. ipython:: python

df = DataFrame(np.arange(6), columns=['value'], index=MultiIndex.from_product([list('BA'), range(3)]))
df

.. ipython:: python

df.index.is_lexsorted()
df.index.is_monotonic

Sorting works as expected

.. ipython:: python

df.sort_index()

.. ipython:: python

df.sort_index().index.is_lexsorted()
df.sort_index().index.is_monotonic

However, this example, which has a non-monotonic 2nd level, doesn't behave as desired.

.. ipython:: python

df = pd.DataFrame({'value': [1, 2, 3, 4]},
index=pd.MultiIndex(levels=[['a', 'b'], ['bb', 'aa']],
labels=[[0, 0, 1, 1], [0, 1, 0, 1]]))

Previous Behavior:

.. code-block:: ipython

In [11]: df.sort_index()
Out[11]:
value
a bb 1
aa 2
b bb 3
aa 4

In [14]: df.sort_index().index.is_lexsorted()
Out[14]: True

In [15]: df.sort_index().index.is_monotonic
Out[15]: False

New Behavior:

.. ipython:: python

df.sort_index()
df.sort_index().index.is_lexsorted()
df.sort_index().index.is_monotonic

Previous Behavior:

.. code-block:: ipython

New Behavior:

.. ipython:: python


.. _whatsnew_0200.api_breaking.groupby_describe:

Expand Down Expand Up @@ -830,7 +902,7 @@ Performance Improvements
- Improved performance when using ``.unstack()`` (:issue:`15503`)
- Improved performance of merge/join on ``category`` columns (:issue:`10409`)
- Improved performance of ``drop_duplicates()`` on ``bool`` columns (:issue:`12963`)

- Improved performance of ``Series.sort_index()`` with a monotonic index (:issue:`15694`)

.. _whatsnew_0200.bug_fixes:

Expand Down
18 changes: 10 additions & 8 deletions pandas/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -3308,6 +3308,10 @@ def trans(v):
def sort_index(self, axis=0, level=None, ascending=True, inplace=False,
kind='quicksort', na_position='last', sort_remaining=True,
by=None):

# TODO: this can be combined with Series.sort_index impl as
# almost identical

inplace = validate_bool_kwarg(inplace, 'inplace')
# 10726
if by is not None:
Expand All @@ -3321,8 +3325,7 @@ def sort_index(self, axis=0, level=None, ascending=True, inplace=False,
axis = self._get_axis_number(axis)
labels = self._get_axis(axis)

# sort by the index
if level is not None:
if level:

new_axis, indexer = labels.sortlevel(level, ascending=ascending,
sort_remaining=sort_remaining)
Expand All @@ -3332,17 +3335,15 @@ def sort_index(self, axis=0, level=None, ascending=True, inplace=False,

# make sure that the axis is lexsorted to start
# if not we need to reconstruct to get the correct indexer
if not labels.is_lexsorted():
labels = MultiIndex.from_tuples(labels.values)
labels = labels._reconstruct(sort=True)

indexer = lexsort_indexer(labels.labels, orders=ascending,
na_position=na_position)
else:
from pandas.core.sorting import nargsort

# GH11080 - Check monotonic-ness before sort an index
# if monotonic (already sorted), return None or copy() according
# to 'inplace'
# Check monotonic-ness before sort an index
# GH11080
if ((ascending and labels.is_monotonic_increasing) or
(not ascending and labels.is_monotonic_decreasing)):
if inplace:
Expand All @@ -3353,8 +3354,9 @@ def sort_index(self, axis=0, level=None, ascending=True, inplace=False,
indexer = nargsort(labels, kind=kind, ascending=ascending,
na_position=na_position)

baxis = self._get_block_manager_axis(axis)
new_data = self._data.take(indexer,
axis=self._get_block_manager_axis(axis),
axis=baxis,
convert=False, verify=False)

if inplace:
Expand Down
9 changes: 8 additions & 1 deletion pandas/core/groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -1808,6 +1808,13 @@ def get_group_levels(self):
'ohlc': lambda *args: ['open', 'high', 'low', 'close']
}

def _is_builtin_func(self, arg):
    """
    Look ``arg`` up in ``SelectionMixin._builtin_table``.

    If the table defines a replacement for this argument, return it;
    otherwise return ``arg`` itself, unchanged.
    """
    builtin_table = SelectionMixin._builtin_table
    return builtin_table.get(arg, arg)

def _get_cython_function(self, kind, how, values, is_numeric):

dtype_str = values.dtype.name
Expand Down Expand Up @@ -2033,7 +2040,7 @@ def _aggregate_series_fast(self, obj, func):
# avoids object / Series creation overhead
dummy = obj._get_values(slice(None, 0)).to_dense()
indexer = get_group_index_sorter(group_index, ngroups)
obj = obj.take(indexer, convert=False)
obj = obj.take(indexer, convert=False).to_dense()
group_index = algorithms.take_nd(
group_index, indexer, allow_fill=False)
grouper = lib.SeriesGrouper(obj, func, group_index, ngroups,
Expand Down
9 changes: 2 additions & 7 deletions pandas/core/reshape.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,8 +22,8 @@
from pandas.sparse.libsparse import IntIndex

from pandas.core.categorical import Categorical, _factorize_from_iterable
from pandas.core.sorting import (get_group_index, compress_group_index,
decons_obs_group_ids)
from pandas.core.sorting import (get_group_index, get_compressed_ids,
compress_group_index, decons_obs_group_ids)

import pandas.core.algorithms as algos
from pandas._libs import algos as _algos, reshape as _reshape
Expand Down Expand Up @@ -494,11 +494,6 @@ def _unstack_frame(obj, level, fill_value=None):
return unstacker.get_result()


def get_compressed_ids(labels, sizes):
    """
    Build group ids for ``labels`` / ``sizes`` and compress them.

    The ids come from ``get_group_index`` (``sort=True``, ``xnull=False``)
    and are handed to ``compress_group_index`` (``sort=True``); whatever
    that call returns is returned unchanged.
    """
    group_ids = get_group_index(labels, sizes, sort=True, xnull=False)
    compressed = compress_group_index(group_ids, sort=True)
    return compressed


def stack(frame, level=-1, dropna=True):
"""
Convert DataFrame to Series with multi-level Index. Columns become the
Expand Down
18 changes: 16 additions & 2 deletions pandas/core/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -1752,17 +1752,31 @@ def _try_kind_sort(arr):
def sort_index(self, axis=0, level=None, ascending=True, inplace=False,
kind='quicksort', na_position='last', sort_remaining=True):

# TODO: this can be combined with DataFrame.sort_index impl as
# almost identical
inplace = validate_bool_kwarg(inplace, 'inplace')
axis = self._get_axis_number(axis)
index = self.index
if level is not None:

if level:
new_index, indexer = index.sortlevel(level, ascending=ascending,
sort_remaining=sort_remaining)
elif isinstance(index, MultiIndex):
from pandas.core.sorting import lexsort_indexer
indexer = lexsort_indexer(index.labels, orders=ascending)
labels = index._reconstruct(sort=True)
indexer = lexsort_indexer(labels.labels, orders=ascending)
else:
from pandas.core.sorting import nargsort

# Check monotonic-ness before sort an index
# GH11080
if ((ascending and index.is_monotonic_increasing) or
(not ascending and index.is_monotonic_decreasing)):
if inplace:
return
else:
return self.copy()

indexer = nargsort(index, kind=kind, ascending=ascending,
na_position=na_position)

Expand Down
5 changes: 5 additions & 0 deletions pandas/core/sorting.py
Original file line number Diff line number Diff line change
Expand Up @@ -93,6 +93,11 @@ def maybe_lift(lab, size): # pormote nan values
return loop(list(labels), list(shape))


def get_compressed_ids(labels, sizes):
    """
    Build group ids for ``labels`` / ``sizes`` and compress them.

    The ids come from ``get_group_index`` (``sort=True``, ``xnull=False``)
    and are handed to ``compress_group_index`` (``sort=True``); whatever
    that call returns is returned unchanged.
    """
    return compress_group_index(
        get_group_index(labels, sizes, sort=True, xnull=False),
        sort=True)


def is_int64_overflow_possible(shape):
the_prod = long(1)
for x in shape:
Expand Down
102 changes: 97 additions & 5 deletions pandas/indexes/multi.py
Original file line number Diff line number Diff line change
Expand Up @@ -1173,9 +1173,100 @@ def from_product(cls, iterables, sortorder=None, names=None):

labels, levels = _factorize_from_iterables(iterables)
labels = cartesian_product(labels)
return MultiIndex(levels, labels, sortorder=sortorder, names=names)

return MultiIndex(levels=levels, labels=labels, sortorder=sortorder,
names=names)
def _reconstruct(self, sort=False, remove_unused=False):
    """
    Reconstruct the MultiIndex with reordered or pruned levels.

    The MultiIndex will have the same outward appearance (e.g. values)
    and will also .equals() the original; only the internal
    levels / labels representation is rebuilt.

    Parameters
    ----------
    sort : boolean, default False
        monotonically sort the levels
    remove_unused : boolean, default False
        remove unused levels

    Returns
    -------
    MultiIndex
        ``self`` when nothing had to change, otherwise a new MultiIndex

    Raises
    ------
    ValueError
        if both, or neither, of ``sort`` / ``remove_unused`` are requested
    """

    if sort and remove_unused:
        raise ValueError("only support one of sort / remove_unused")

    if not (sort or remove_unused):
        raise ValueError("must supply one of sort / remove_unused")

    new_levels = []
    new_labels = []

    if sort:

        # already in the desired state, nothing to rebuild
        if self.is_lexsorted() and self.is_monotonic:
            return self

        for lev, lab in zip(self.levels, self.labels):

            if lev.is_monotonic:
                new_levels.append(lev)
                new_labels.append(lab)
                continue

            # indexer to reorder the levels
            indexer = lev.argsort()
            lev = lev.take(indexer)

            # indexer to reorder the labels: the reverse indexer maps
            # each old level position to its position after sorting
            ri = lib.get_reverse_indexer(indexer, len(indexer))
            lab = algos.take_1d(ri, lab)

            new_levels.append(lev)
            new_labels.append(lab)

    else:

        # set only when a level is actually pruned, so that an index
        # with nothing unused is returned as-is
        changed = False
        for lev, lab in zip(self.levels, self.labels):

            positions = np.asarray(lab).astype(np.intp)

            # level positions that are referenced by the labels; -1 is
            # the missing-value marker and is not a level position
            used = np.sort(algos.unique(positions))
            used = used[used >= 0]

            # nothing unused
            if len(used) == len(lev):
                new_levels.append(lev)
                new_labels.append(lab)
                continue

            changed = True

            # old position -> new position in a single pass; slot 0 is
            # reserved for the -1 marker (hence the +1 offsets), so
            # missing labels stay -1
            remap = -np.ones(len(lev) + 1, dtype=np.intp)
            remap[used + 1] = np.arange(len(used))

            # new levels are simply the used entries, in original order
            new_levels.append(lev.take(used))
            new_labels.append(remap[positions + 1])

        # nothing changed
        if not changed:
            return self

    return MultiIndex(new_levels, new_labels,
                      names=self.names, sortorder=self.sortorder,
                      verify_integrity=False)

@property
def nlevels(self):
Expand Down Expand Up @@ -1746,9 +1837,10 @@ def slice_locs(self, start=None, end=None, step=None, kind=None):

def _partial_tup_index(self, tup, side='left'):
if len(tup) > self.lexsort_depth:
raise KeyError('Key length (%d) was greater than MultiIndex'
' lexsort depth (%d)' %
(len(tup), self.lexsort_depth))
raise UnsortedIndexError(
'Key length (%d) was greater than MultiIndex'
' lexsort depth (%d)' %
(len(tup), self.lexsort_depth))

n = len(tup)
start, end = 0, len(self)
Expand Down
Loading