Skip to content

ENH: Sorting of ExtensionArrays #19957

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 20 commits into from
Mar 22, 2018
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
52 changes: 52 additions & 0 deletions pandas/core/arrays/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
import numpy as np

from pandas.errors import AbstractMethodError
from pandas.compat.numpy import function as nv

_not_implemented_message = "{} does not implement {}."

Expand Down Expand Up @@ -236,6 +237,57 @@ def isna(self):
"""
raise AbstractMethodError(self)

def _values_for_argsort(self):
# type: () -> ndarray
"""Return values for sorting.

Returns
-------
ndarray
The transformed values should maintain the ordering between values
within the array.

See Also
--------
ExtensionArray.argsort
"""
# Note: this is used in `ExtensionArray.argsort`.
return np.array(self)

def argsort(self, ascending=True, kind='quicksort', *args, **kwargs):
"""
Return the indices that would sort this array.

Parameters
----------
ascending : bool, default True
Whether the indices should result in an ascending
or descending sort.
kind : {'quicksort', 'mergesort', 'heapsort'}, optional
Sorting algorithm.
*args, **kwargs:
passed through to :func:`numpy.argsort`.

Returns
-------
index_array : ndarray
Array of indices that sort ``self``.

See Also
--------
numpy.argsort : Sorting implementation used internally.
"""
# Implementor note: You have two places to override the behavior of
# argsort.
# 1. _values_for_argsort : construct the values passed to np.argsort
# 2. argsort : total control over sorting.
ascending = nv.validate_argsort_with_ascending(ascending, args, kwargs)
values = self._values_for_argsort()
result = np.argsort(values, kind=kind, **kwargs)
if not ascending:
result = result[::-1]
return result

def fillna(self, value=None, method=None, limit=None):
""" Fill NA/NaN values using the specified method.

Expand Down
53 changes: 38 additions & 15 deletions pandas/core/arrays/categorical.py
Original file line number Diff line number Diff line change
Expand Up @@ -1378,17 +1378,24 @@ def check_for_ordered(self, op):
"you can use .as_ordered() to change the "
"Categorical to an ordered one\n".format(op=op))

def argsort(self, ascending=True, kind='quicksort', *args, **kwargs):
"""
Returns the indices that would sort the Categorical instance if
'sort_values' was called. This function is implemented to provide
compatibility with numpy ndarray objects.
def _values_for_argsort(self):
return self._codes.copy()

While an ordering is applied to the category values, arg-sorting
in this context refers more to organizing and grouping together
based on matching category values. Thus, this function can be
called on an unordered Categorical instance unlike the functions
'Categorical.min' and 'Categorical.max'.
def argsort(self, *args, **kwargs):
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Not sure if this changes our opinion on _values_for_argsort, but the apparently Python2 has issues with passing through the arguments correctly to the super() call.

____________________ TestCategoricalSort.test_numpy_argsort ____________________

self = <pandas.tests.categorical.test_sorting.TestCategoricalSort object at 0x7efcb391f950>

    def test_numpy_argsort(self):
        c = Categorical([5, 3, 1, 4, 2], ordered=True)
    
        expected = np.array([2, 4, 1, 3, 0])
>       tm.assert_numpy_array_equal(np.argsort(c), expected,
                                    check_dtype=False)

pandas/tests/categorical/test_sorting.py:26: 
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 
../miniconda3/envs/pandas/lib/python2.7/site-packages/numpy/core/fromnumeric.py:886: in argsort
    return argsort(axis, kind, order)
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 

self = [5, 3, 1, 4, 2]
Categories (5, int64): [1 < 2 < 3 < 4 < 5]
ascending = -1, kind = 'quicksort', args = (None,), kwargs = {}

    def argsort(self, ascending=True, kind='quicksort', *args, **kwargs):
        """
            Returns the indices that would sort the Categorical instance if
            'sort_values' was called. This function is implemented to provide
            compatibility with numpy ndarray objects.
    
            While an ordering is applied to the category values, arg-sorting
            in this context refers more to organizing and grouping together
            based on matching category values. Thus, this function can be
            called on an unordered Categorical instance unlike the functions
            'Categorical.min' and 'Categorical.max'.
    
            Returns
            -------
            argsorted : numpy array
    
            See also
            --------
            numpy.ndarray.argsort
            """
        # Keep the implementation here just for the docstring.
        return super(Categorical, self).argsort(ascending=ascending, kind=kind,
>                                               *args, **kwargs)
E       TypeError: argsort() got multiple values for keyword argument 'ascending'

Changing the Categorical.argsort to accept just *args, **kwargs fixes things, since ExtensionArray does the argument validation, but it's a bit unfortunate.

# TODO(PY2): use correct signature
# We have to do *args, **kwargs to avoid a a py2-only signature
# issue since np.argsort differs from argsort.
"""Return the indicies that would sort the Categorical.

Parameters
----------
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

do these doc-strings meet the new standards?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

7bbe796 does, aside from examples which isn't really possible.

ascending : bool, default True
Whether the indices should result in an ascending
or descending sort.
kind : {'quicksort', 'mergesort', 'heapsort'}, optional
Sorting algorithm.
*args, **kwargs:
passed through to :func:`numpy.argsort`.

Returns
-------
Expand All @@ -1397,12 +1404,28 @@ def argsort(self, ascending=True, kind='quicksort', *args, **kwargs):
See also
--------
numpy.ndarray.argsort

Notes
-----
While an ordering is applied to the category values, arg-sorting
in this context refers more to organizing and grouping together
based on matching category values. Thus, this function can be
called on an unordered Categorical instance unlike the functions
'Categorical.min' and 'Categorical.max'.

Examples
--------
>>> pd.Categorical(['b', 'b', 'a', 'c']).argsort()
array([2, 0, 1, 3])

>>> cat = pd.Categorical(['b', 'b', 'a', 'c'],
... categories=['c', 'b', 'a'],
... ordered=True)
>>> cat.argsort()
array([3, 0, 1, 2])
"""
ascending = nv.validate_argsort_with_ascending(ascending, args, kwargs)
result = np.argsort(self._codes.copy(), kind=kind, **kwargs)
if not ascending:
result = result[::-1]
return result
# Keep the implementation here just for the docstring.
return super(Categorical, self).argsort(*args, **kwargs)

def sort_values(self, inplace=False, ascending=True, na_position='last'):
""" Sorts the Categorical by category value returning a new
Expand Down
40 changes: 40 additions & 0 deletions pandas/tests/extension/base/methods.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,46 @@ def test_apply_simple_series(self, data):
result = pd.Series(data).apply(id)
assert isinstance(result, pd.Series)

def test_argsort(self, data_for_sorting):
result = pd.Series(data_for_sorting).argsort()
expected = pd.Series(np.array([2, 0, 1], dtype=np.int64))
self.assert_series_equal(result, expected)

def test_argsort_missing(self, data_missing_for_sorting):
result = pd.Series(data_missing_for_sorting).argsort()
expected = pd.Series(np.array([1, -1, 0], dtype=np.int64))
self.assert_series_equal(result, expected)

@pytest.mark.parametrize('ascending', [True, False])
def test_sort_values(self, data_for_sorting, ascending):
ser = pd.Series(data_for_sorting)
result = ser.sort_values(ascending=ascending)
expected = ser.iloc[[2, 0, 1]]
if not ascending:
expected = expected[::-1]

self.assert_series_equal(result, expected)

@pytest.mark.parametrize('ascending', [True, False])
def test_sort_values_missing(self, data_missing_for_sorting, ascending):
ser = pd.Series(data_missing_for_sorting)
result = ser.sort_values(ascending=ascending)
if ascending:
expected = ser.iloc[[2, 0, 1]]
else:
expected = ser.iloc[[0, 2, 1]]
self.assert_series_equal(result, expected)

@pytest.mark.parametrize('ascending', [True, False])
def test_sort_values_frame(self, data_for_sorting, ascending):
df = pd.DataFrame({"A": [1, 2, 1],
"B": data_for_sorting})
result = df.sort_values(['A', 'B'])
expected = pd.DataFrame({"A": [1, 1, 2],
'B': data_for_sorting.take([2, 0, 1])},
index=[2, 0, 1])
self.assert_frame_equal(result, expected)

@pytest.mark.parametrize('box', [pd.Series, lambda x: x])
@pytest.mark.parametrize('method', [lambda x: x.unique(), pd.unique])
def test_unique(self, data, box, method):
Expand Down
12 changes: 12 additions & 0 deletions pandas/tests/extension/category/test_categorical.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,18 @@ def data_missing():
return Categorical([np.nan, 'A'])


@pytest.fixture
def data_for_sorting():
return Categorical(['A', 'B', 'C'], categories=['C', 'A', 'B'],
ordered=True)


@pytest.fixture
def data_missing_for_sorting():
return Categorical(['A', None, 'B'], categories=['B', 'A'],
ordered=True)


@pytest.fixture
def na_value():
return np.nan
Expand Down
20 changes: 20 additions & 0 deletions pandas/tests/extension/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,26 @@ def all_data(request, data, data_missing):
return data_missing


@pytest.fixture
def data_for_sorting():
"""Length-3 array with a known sort order.

This should be three items [B, C, A] with
A < B < C
"""
raise NotImplementedError


@pytest.fixture
def data_missing_for_sorting():
"""Length-3 array with a known sort order.

This should be three items [B, NA, A] with
A < B and NA missing.
"""
raise NotImplementedError


@pytest.fixture
def na_cmp():
"""Binary operator for comparing NA values.
Expand Down
24 changes: 22 additions & 2 deletions pandas/tests/extension/decimal/test_decimal.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,20 @@ def data_missing():
return DecimalArray([decimal.Decimal('NaN'), decimal.Decimal(1)])


@pytest.fixture
def data_for_sorting():
return DecimalArray([decimal.Decimal('1'),
decimal.Decimal('2'),
decimal.Decimal('0')])


@pytest.fixture
def data_missing_for_sorting():
return DecimalArray([decimal.Decimal('1'),
decimal.Decimal('NaN'),
decimal.Decimal('0')])


@pytest.fixture
def na_cmp():
return lambda x, y: x.is_nan() and y.is_nan()
Expand All @@ -48,11 +62,17 @@ def assert_series_equal(self, left, right, *args, **kwargs):
*args, **kwargs)

def assert_frame_equal(self, left, right, *args, **kwargs):
self.assert_series_equal(left.dtypes, right.dtypes)
for col in left.columns:
# TODO(EA): select_dtypes
decimals = (left.dtypes == 'decimal').index

for col in decimals:
self.assert_series_equal(left[col], right[col],
*args, **kwargs)

left = left.drop(columns=decimals)
right = right.drop(columns=decimals)
tm.assert_frame_equal(left, right, *args, **kwargs)


class TestDtype(BaseDecimal, base.BaseDtypeTests):
pass
Expand Down
11 changes: 11 additions & 0 deletions pandas/tests/extension/json/array.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,11 @@ def __getitem__(self, item):
return self._constructor_from_sequence([
x for x, m in zip(self, item) if m
])
elif isinstance(item, collections.Iterable):
# fancy indexing
return type(self)([self.data[i] for i in item])
else:
# slice
return type(self)(self.data[item])

def __setitem__(self, key, value):
Expand Down Expand Up @@ -104,6 +108,13 @@ def _concat_same_type(cls, to_concat):
data = list(itertools.chain.from_iterable([x.data for x in to_concat]))
return cls(data)

def _values_for_argsort(self):
# Disable NumPy's shape inference by including an empty tuple...
# If all the elemnts of self are the same size P, NumPy will
# cast them to an (N, P) array, instead of an (N,) array of tuples.
frozen = [()] + list(tuple(x.items()) for x in self)
return np.array(frozen, dtype=object)[1:]


def make_data():
# TODO: Use a regular dict. See _NDFrameIndexer._setitem_with_indexer
Expand Down
41 changes: 40 additions & 1 deletion pandas/tests/extension/json/test_json.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,16 @@ def data_missing():
return JSONArray([{}, {'a': 10}])


@pytest.fixture
def data_for_sorting():
return JSONArray([{'b': 1}, {'c': 4}, {'a': 2, 'c': 3}])


@pytest.fixture
def data_missing_for_sorting():
return JSONArray([{'b': 1}, {}, {'a': 4}])


@pytest.fixture
def na_value():
return {}
Expand Down Expand Up @@ -70,10 +80,39 @@ def test_fillna_frame(self):


class TestMethods(base.BaseMethodsTests):
@pytest.mark.skip(reason="Unhashable")
unhashable = pytest.mark.skip(reason="Unhashable")
unstable = pytest.mark.skipif(sys.version_info <= (3, 5),
reason="Dictionary order unstable")

@unhashable
def test_value_counts(self, all_data, dropna):
pass

@unhashable
def test_sort_values_frame(self):
# TODO (EA.factorize): see if _values_for_factorize allows this.
pass

@unstable
def test_argsort(self, data_for_sorting):
super(TestMethods, self).test_argsort(data_for_sorting)

@unstable
def test_argsort_missing(self, data_missing_for_sorting):
super(TestMethods, self).test_argsort_missing(
data_missing_for_sorting)

@unstable
@pytest.mark.parametrize('ascending', [True, False])
def test_sort_values(self, data_for_sorting, ascending):
super(TestMethods, self).test_sort_values(
data_for_sorting, ascending)

@pytest.mark.parametrize('ascending', [True, False])
def test_sort_values_missing(self, data_missing_for_sorting, ascending):
super(TestMethods, self).test_sort_values_missing(
data_missing_for_sorting, ascending)


class TestCasting(base.BaseCastingTests):
pass