From 0ec36001a4284c21b7830ade629ce4544f426f2c Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Mon, 19 Feb 2018 15:12:55 -0600 Subject: [PATCH 01/14] ENH: Sorting of ExtensionArrays This enables {Series,DataFrame}.sort_values and {Series,DataFrame}.argsort --- pandas/core/arrays/base.py | 21 +++++++++ pandas/tests/extension/base/methods.py | 40 +++++++++++++++++ .../extension/category/test_categorical.py | 12 ++++++ pandas/tests/extension/conftest.py | 20 +++++++++ .../tests/extension/decimal/test_decimal.py | 43 +++++++++++++++---- pandas/tests/extension/json/test_json.py | 30 +++++++++++++ 6 files changed, 158 insertions(+), 8 deletions(-) diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index cec881394a021..8a49b673c4145 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -216,6 +216,27 @@ def isna(self): """ raise AbstractMethodError(self) + def argsort(self, axis=-1, kind='quicksort', order=None): + """Returns the indices that would sort this array. + + Parameters + ---------- + axis : int or None, optional + Axis along which to sort. ExtensionArrays are 1-dimensional, + so this is only included for compatibility with NumPy. + kind : {'quicksort', 'mergesort', 'heapsort'}, optional + Sorting algorithm. + order : str or list of str, optional + Included for NumPy compatibility. + + Returns + ------- + index_array : ndarray + Array of indices that sort ``self``. + + """ + return np.array(self).argsort(kind=kind) + # ------------------------------------------------------------------------ # Indexing methods # ------------------------------------------------------------------------ diff --git a/pandas/tests/extension/base/methods.py b/pandas/tests/extension/base/methods.py index 74e5d180b1aa3..5f4604f8f17af 100644 --- a/pandas/tests/extension/base/methods.py +++ b/pandas/tests/extension/base/methods.py @@ -31,3 +31,43 @@ def test_count(self, data_missing): def test_apply_simple_series(self, data): result = pd.Series(data).apply(id) assert isinstance(result, pd.Series) + + def test_argsort(self, data_for_sorting): + result = pd.Series(data_for_sorting).argsort() + expected = pd.Series(np.array([2, 0, 1])) + self.assert_series_equal(result, expected) + + def test_argsort_missing(self, data_missing_for_sorting): + result = pd.Series(data_missing_for_sorting).argsort() + expected = pd.Series(np.array([1, -1, 0])) + self.assert_series_equal(result, expected) + + @pytest.mark.parametrize('ascending', [True, False]) + def test_sort_values(self, data_for_sorting, ascending): + ser = pd.Series(data_for_sorting) + result = ser.sort_values(ascending=ascending) + expected = ser.iloc[[2, 0, 1]] + if not ascending: + expected = expected[::-1] + + self.assert_series_equal(result, expected) + + @pytest.mark.parametrize('ascending', [True, False]) + def test_sort_values_missing(self, data_missing_for_sorting, ascending): + ser = pd.Series(data_missing_for_sorting) + result = ser.sort_values(ascending=ascending) + if ascending: + expected = ser.iloc[[2, 0, 1]] + else: + expected = ser.iloc[[0, 2, 1]] + self.assert_series_equal(result, expected) + + @pytest.mark.parametrize('ascending', [True, False]) + def test_sort_values_frame(self, data_for_sorting, ascending): + df = pd.DataFrame({"A": [1, 2, 1], + "B": data_for_sorting}) + result = df.sort_values(['A', 'B']) + expected = pd.DataFrame({"A": [1, 1, 2], + 'B': data_for_sorting.take([2, 0, 1])}, + index=[2, 0, 1]) + self.assert_frame_equal(result, expected) diff --git a/pandas/tests/extension/category/test_categorical.py b/pandas/tests/extension/category/test_categorical.py index 8f413b4a19730..2400e7654e294 100644 --- a/pandas/tests/extension/category/test_categorical.py +++ b/pandas/tests/extension/category/test_categorical.py @@ -29,6 +29,18 @@ def data_missing(): return Categorical([np.nan, 'A']) +@pytest.fixture +def data_for_sorting(): + return Categorical(['A', 'B', 'C'], categories=['C', 'A', 'B'], + ordered=True) + + +@pytest.fixture +def data_missing_for_sorting(): + return Categorical(['A', None, 'B'], categories=['B', 'A'], + ordered=True) + + @pytest.fixture def na_value(): return np.nan diff --git a/pandas/tests/extension/conftest.py b/pandas/tests/extension/conftest.py index 21ed8894e8ebb..04dfb408fc378 100644 --- a/pandas/tests/extension/conftest.py +++ b/pandas/tests/extension/conftest.py @@ -30,6 +30,26 @@ def all_data(request, data, data_missing): return data_missing +@pytest.fixture +def data_for_sorting(): + """Length-3 array with a known sort order. + + This should be three items [B, C, A] with + A < B < C + """ + raise NotImplementedError + + +@pytest.fixture +def data_missing_for_sorting(): + """Length-3 array with a known sort order. + + This should be three items [B, NA, A] with + A < B and NA missing. + """ + raise NotImplementedError + + @pytest.fixture def na_cmp(): """Binary operator for comparing NA values. diff --git a/pandas/tests/extension/decimal/test_decimal.py b/pandas/tests/extension/decimal/test_decimal.py index 7b4d079ecad87..ca496a014651c 100644 --- a/pandas/tests/extension/decimal/test_decimal.py +++ b/pandas/tests/extension/decimal/test_decimal.py @@ -25,6 +25,20 @@ def data_missing(): return DecimalArray([decimal.Decimal('NaN'), decimal.Decimal(1)]) +@pytest.fixture +def data_for_sorting(): + return DecimalArray([decimal.Decimal('1'), + decimal.Decimal('2'), + decimal.Decimal('0')]) + + +@pytest.fixture +def data_missing_for_sorting(): + return DecimalArray([decimal.Decimal('1'), + decimal.Decimal('NaN'), + decimal.Decimal('0')]) + + @pytest.fixture def na_cmp(): return lambda x, y: x.is_nan() and y.is_nan() @@ -35,19 +49,32 @@ def na_value(): return decimal.Decimal("NaN") -class TestDtype(base.BaseDtypeTests): +class BaseDecimal(object): + @staticmethod + def assert_series_equal(left, right, *args, **kwargs): + + left_na = left.isna() + right_na = right.isna() + + tm.assert_series_equal(left_na, right_na) + return tm.assert_series_equal(left[~left_na], + right[~right_na], + *args, **kwargs) + + +class TestDtype(BaseDecimal, base.BaseDtypeTests): pass -class TestInterface(base.BaseInterfaceTests): +class TestInterface(BaseDecimal, base.BaseInterfaceTests): pass -class TestConstructors(base.BaseConstructorsTests): +class TestConstructors(BaseDecimal, base.BaseConstructorsTests): pass -class TestReshaping(base.BaseReshapingTests): +class TestReshaping(BaseDecimal, base.BaseReshapingTests): def test_align(self, data, na_value): # Have to override since assert_series_equal doesn't @@ -88,15 +115,15 @@ def test_align_frame(self, data, na_value): assert e2.loc[0, 'A'].is_nan() -class TestGetitem(base.BaseGetitemTests): +class TestGetitem(BaseDecimal, base.BaseGetitemTests): pass -class TestMissing(base.BaseMissingTests): +class TestMissing(BaseDecimal, base.BaseMissingTests): pass -class TestMethods(base.BaseMethodsTests): +class TestMethods(BaseDecimal, base.BaseMethodsTests): @pytest.mark.parametrize('dropna', [True, False]) @pytest.mark.xfail(reason="value_counts not implemented yet.") def test_value_counts(self, all_data, dropna): @@ -112,7 +139,7 @@ def test_value_counts(self, all_data, dropna): tm.assert_series_equal(result, expected) -class TestCasting(base.BaseCastingTests): +class TestCasting(BaseDecimal, base.BaseCastingTests): pass diff --git a/pandas/tests/extension/json/test_json.py b/pandas/tests/extension/json/test_json.py index e0721bb1d8d1a..8356adf42c6d7 100644 --- a/pandas/tests/extension/json/test_json.py +++ b/pandas/tests/extension/json/test_json.py @@ -29,6 +29,16 @@ def data_missing(): return JSONArray([{}, {'a': 10}]) +@pytest.fixture +def data_for_sorting(): + return JSONArray([{'b': 1}, {'c': 4}, {'a': 2, 'c': 3}]) + + +@pytest.fixture +def data_missing_for_sorting(): + return JSONArray([{'b': 1}, {}, {'c': 4}]) + + @pytest.fixture def na_value(): return {} @@ -68,6 +78,26 @@ class TestMethods(base.BaseMethodsTests): def test_value_counts(self, all_data, dropna): pass + @pytest.mark.skip(reason="Dictionaries are not orderable.") + def test_argsort(self): + pass + + @pytest.mark.skip(reason="Dictionaries are not orderable.") + def test_argsort_missing(self): + pass + + @pytest.mark.skip(reason="Dictionaries are not orderable.") + def test_sort_values(self): + pass + + @pytest.mark.skip(reason="Dictionaries are not orderable.") + def test_sort_values_missing(self): + pass + + @pytest.mark.skip(reason="Dictionaries are not orderable.") + def test_sort_values_frame(self): + pass + class TestCasting(base.BaseCastingTests): pass From 47072735cc68a1538fb1ef953dd359056e5c5af9 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Fri, 2 Mar 2018 06:29:01 -0600 Subject: [PATCH 02/14] REF: Split argsort into two parts --- pandas/core/arrays/base.py | 29 ++++++++++++++++++++++++++--- pandas/core/arrays/categorical.py | 11 ++++++----- 2 files changed, 32 insertions(+), 8 deletions(-) diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index 8a49b673c4145..01ebf9b95d71a 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -2,6 +2,7 @@ import numpy as np from pandas.errors import AbstractMethodError +from pandas.compat.numpy import function as nv _not_implemented_message = "{} does not implement {}." @@ -216,11 +217,24 @@ def isna(self): """ raise AbstractMethodError(self) - def argsort(self, axis=-1, kind='quicksort', order=None): + def _values_for_argsort(self): + # type: () -> ndarray + """Get the ndarray to be passed to np.argsort. + + This is called from within 'ExtensionArray.argsort'. + + Returns + ------- + values : ndarray + """ + return np.array(self) + + def argsort(self, ascending=True, kind='quicksort', *args, **kwargs): """Returns the indices that would sort this array. Parameters ---------- + ascending : bool, default True axis : int or None, optional Axis along which to sort. ExtensionArrays are 1-dimensional, so this is only included for compatibility with NumPy. @@ -233,9 +247,18 @@ def argsort(self, axis=-1, kind='quicksort', order=None): ------- index_array : ndarray Array of indices that sort ``self``. - """ - return np.array(self).argsort(kind=kind) + # Implementor note: You have two places to override the behavior of + # argsort. + # 1. _values_for_argsort : construct the values passed to np.argsort + # 2. argsort : total control over sorting. + + ascending = nv.validate_argsort_with_ascending(ascending, args, kwargs) + values = self._values_for_argsort() + result = np.argsort(values, kind=kind, **kwargs) + if not ascending: + result = result[::-1] + return result # ------------------------------------------------------------------------ # Indexing methods diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index c6eeabf0148d0..d2a357fa22d55 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -1386,6 +1386,9 @@ def check_for_ordered(self, op): "you can use .as_ordered() to change the " "Categorical to an ordered one\n".format(op=op)) + def _values_for_argsort(self): + return self._codes.copy() + def argsort(self, ascending=True, kind='quicksort', *args, **kwargs): """ Returns the indices that would sort the Categorical instance if @@ -1406,11 +1409,9 @@ def argsort(self, ascending=True, kind='quicksort', *args, **kwargs): -------- numpy.ndarray.argsort """ - ascending = nv.validate_argsort_with_ascending(ascending, args, kwargs) - result = np.argsort(self._codes.copy(), kind=kind, **kwargs) - if not ascending: - result = result[::-1] - return result + # Keep the implementation here just for the docstring. + return super(Categorical, self).argsort(ascending=ascending, kind=kind, + *args, **kwargs) def sort_values(self, inplace=False, ascending=True, na_position='last'): """ Sorts the Categorical by category value returning a new From b61fb8d5971d1accc2f306ae9f2cef986e885f20 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Fri, 2 Mar 2018 10:03:57 -0600 Subject: [PATCH 03/14] Fixed docstring --- pandas/core/arrays/base.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index 01ebf9b95d71a..479e821586067 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -235,18 +235,21 @@ def argsort(self, ascending=True, kind='quicksort', *args, **kwargs): Parameters ---------- ascending : bool, default True - axis : int or None, optional - Axis along which to sort. ExtensionArrays are 1-dimensional, - so this is only included for compatibility with NumPy. + Whether the indices should result in an ascending + or descending sort. kind : {'quicksort', 'mergesort', 'heapsort'}, optional Sorting algorithm. - order : str or list of str, optional - Included for NumPy compatibility. + args, kwargs: + passed through to :func:`numpy.argsort`. Returns ------- index_array : ndarray Array of indices that sort ``self``. + + See Also + -------- + numpy.argsort """ # Implementor note: You have two places to override the behavior of # argsort. From 44b6d72be5b2431bbe38454a7fc7439244575925 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Fri, 2 Mar 2018 13:23:18 -0600 Subject: [PATCH 04/14] Remove _values_for_argsort --- pandas/core/arrays/base.py | 21 ++------------------- pandas/core/arrays/categorical.py | 11 +++++------ 2 files changed, 7 insertions(+), 25 deletions(-) diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index 479e821586067..cfc9d54c31c55 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -217,20 +217,8 @@ def isna(self): """ raise AbstractMethodError(self) - def _values_for_argsort(self): - # type: () -> ndarray - """Get the ndarray to be passed to np.argsort. - - This is called from within 'ExtensionArray.argsort'. - - Returns - ------- - values : ndarray - """ - return np.array(self) - def argsort(self, ascending=True, kind='quicksort', *args, **kwargs): - """Returns the indices that would sort this array. + """Return the indices that would sort this array. Parameters ---------- @@ -251,13 +239,8 @@ def argsort(self, ascending=True, kind='quicksort', *args, **kwargs): -------- numpy.argsort """ - # Implementor note: You have two places to override the behavior of - # argsort. - # 1. _values_for_argsort : construct the values passed to np.argsort - # 2. argsort : total control over sorting. - ascending = nv.validate_argsort_with_ascending(ascending, args, kwargs) - values = self._values_for_argsort() + values = self.astype(object) result = np.argsort(values, kind=kind, **kwargs) if not ascending: result = result[::-1] diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index d2a357fa22d55..c6eeabf0148d0 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -1386,9 +1386,6 @@ def check_for_ordered(self, op): "you can use .as_ordered() to change the " "Categorical to an ordered one\n".format(op=op)) - def _values_for_argsort(self): - return self._codes.copy() - def argsort(self, ascending=True, kind='quicksort', *args, **kwargs): """ Returns the indices that would sort the Categorical instance if @@ -1409,9 +1406,11 @@ def argsort(self, ascending=True, kind='quicksort', *args, **kwargs): -------- numpy.ndarray.argsort """ - # Keep the implementation here just for the docstring. - return super(Categorical, self).argsort(ascending=ascending, kind=kind, - *args, **kwargs) + ascending = nv.validate_argsort_with_ascending(ascending, args, kwargs) + result = np.argsort(self._codes.copy(), kind=kind, **kwargs) + if not ascending: + result = result[::-1] + return result def sort_values(self, inplace=False, ascending=True, na_position='last'): """ Sorts the Categorical by category value returning a new From 5be3917db210ff5f8c644e11411597d190aa4270 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Fri, 2 Mar 2018 15:12:15 -0600 Subject: [PATCH 05/14] Revert "Remove _values_for_argsort" This reverts commit 44b6d72be5b2431bbe38454a7fc7439244575925. --- pandas/core/arrays/base.py | 21 +++++++++++++++++++-- pandas/core/arrays/categorical.py | 11 ++++++----- 2 files changed, 25 insertions(+), 7 deletions(-) diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index cfc9d54c31c55..479e821586067 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -217,8 +217,20 @@ def isna(self): """ raise AbstractMethodError(self) + def _values_for_argsort(self): + # type: () -> ndarray + """Get the ndarray to be passed to np.argsort. + + This is called from within 'ExtensionArray.argsort'. + + Returns + ------- + values : ndarray + """ + return np.array(self) + def argsort(self, ascending=True, kind='quicksort', *args, **kwargs): - """Return the indices that would sort this array. + """Returns the indices that would sort this array. Parameters ---------- @@ -239,8 +251,13 @@ def argsort(self, ascending=True, kind='quicksort', *args, **kwargs): -------- numpy.argsort """ + # Implementor note: You have two places to override the behavior of + # argsort. + # 1. _values_for_argsort : construct the values passed to np.argsort + # 2. argsort : total control over sorting. + ascending = nv.validate_argsort_with_ascending(ascending, args, kwargs) - values = self.astype(object) + values = self._values_for_argsort() result = np.argsort(values, kind=kind, **kwargs) if not ascending: result = result[::-1] diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index c6eeabf0148d0..d2a357fa22d55 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -1386,6 +1386,9 @@ def check_for_ordered(self, op): "you can use .as_ordered() to change the " "Categorical to an ordered one\n".format(op=op)) + def _values_for_argsort(self): + return self._codes.copy() + def argsort(self, ascending=True, kind='quicksort', *args, **kwargs): """ Returns the indices that would sort the Categorical instance if @@ -1406,11 +1409,9 @@ def argsort(self, ascending=True, kind='quicksort', *args, **kwargs): -------- numpy.ndarray.argsort """ - ascending = nv.validate_argsort_with_ascending(ascending, args, kwargs) - result = np.argsort(self._codes.copy(), kind=kind, **kwargs) - if not ascending: - result = result[::-1] - return result + # Keep the implementation here just for the docstring. + return super(Categorical, self).argsort(ascending=ascending, kind=kind, + *args, **kwargs) def sort_values(self, inplace=False, ascending=True, na_position='last'): """ Sorts the Categorical by category value returning a new From c2578c3942d37ea88dbddf8324a9e75c1f4fe637 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Fri, 2 Mar 2018 16:28:19 -0600 Subject: [PATCH 06/14] Workaround Py2 --- pandas/core/arrays/categorical.py | 46 +++++++++++++++++++++++-------- 1 file changed, 34 insertions(+), 12 deletions(-) diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index e98dbb750884a..dcb6945a196fa 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -1393,17 +1393,21 @@ def check_for_ordered(self, op): def _values_for_argsort(self): return self._codes.copy() - def argsort(self, ascending=True, kind='quicksort', *args, **kwargs): - """ - Returns the indices that would sort the Categorical instance if - 'sort_values' was called. This function is implemented to provide - compatibility with numpy ndarray objects. + def argsort(self, *args, **kwargs): + # TODO(PY2): use correct signature + # We have to do *args, **kwargs to avoid a a py2-only signature + # issue since np.argsort differs from argsort. + """Return the indicies that would sort the Categorical. - While an ordering is applied to the category values, arg-sorting - in this context refers more to organizing and grouping together - based on matching category values. Thus, this function can be - called on an unordered Categorical instance unlike the functions - 'Categorical.min' and 'Categorical.max'. + Parameters + ---------- + ascending : bool, default True + Whether the indices should result in an ascending + or descending sort. + kind : {'quicksort', 'mergesort', 'heapsort'}, optional + Sorting algorithm. + args, kwargs: + passed through to :func:`numpy.argsort`. Returns ------- @@ -1412,10 +1416,28 @@ def argsort(self, ascending=True, kind='quicksort', *args, **kwargs): See also -------- numpy.ndarray.argsort + + Notes + ----- + While an ordering is applied to the category values, arg-sorting + in this context refers more to organizing and grouping together + based on matching category values. Thus, this function can be + called on an unordered Categorical instance unlike the functions + 'Categorical.min' and 'Categorical.max'. + + Examples + -------- + >>> pd.Categorical(['b', 'b', 'a', 'c']).argsort() + array([2, 0, 1, 3]) + + >>> cat = pd.Categorical(['b', 'b', 'a', 'c'], + ... categories=['c', 'b', 'a'], + ... ordered=True) + >>> cat.argsort() + array([3, 0, 1, 2]) """ # Keep the implementation here just for the docstring. - return super(Categorical, self).argsort(ascending=ascending, kind=kind, - *args, **kwargs) + return super(Categorical, self).argsort(*args, **kwargs) def sort_values(self, inplace=False, ascending=True, na_position='last'): """ Sorts the Categorical by category value returning a new From b73e303b247f19c3fa7849fa9a37e44e63e7a64f Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Fri, 2 Mar 2018 16:31:01 -0600 Subject: [PATCH 07/14] Indexer as array --- pandas/tests/extension/decimal/array.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/tests/extension/decimal/array.py b/pandas/tests/extension/decimal/array.py index 736556e4be20d..f1852542088ff 100644 --- a/pandas/tests/extension/decimal/array.py +++ b/pandas/tests/extension/decimal/array.py @@ -71,6 +71,7 @@ def isna(self): return np.array([x.is_nan() for x in self.values]) def take(self, indexer, allow_fill=True, fill_value=None): + indexer = np.asarray(indexer) mask = indexer == -1 indexer = _ensure_platform_int(indexer) From 0db9e97a3f60114f757b0cb9a849749f26308fbb Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Sun, 4 Mar 2018 06:58:01 -0600 Subject: [PATCH 08/14] Fixed dtypes --- pandas/tests/extension/base/methods.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/tests/extension/base/methods.py b/pandas/tests/extension/base/methods.py index 5f4604f8f17af..d3348549db0bd 100644 --- a/pandas/tests/extension/base/methods.py +++ b/pandas/tests/extension/base/methods.py @@ -34,12 +34,12 @@ def test_apply_simple_series(self, data): def test_argsort(self, data_for_sorting): result = pd.Series(data_for_sorting).argsort() - expected = pd.Series(np.array([2, 0, 1])) + expected = pd.Series(np.array([2, 0, 1], dtype=np.int64)) self.assert_series_equal(result, expected) def test_argsort_missing(self, data_missing_for_sorting): result = pd.Series(data_missing_for_sorting).argsort() - expected = pd.Series(np.array([1, -1, 0])) + expected = pd.Series(np.array([1, -1, 0], dtype=np.int64)) self.assert_series_equal(result, expected) @pytest.mark.parametrize('ascending', [True, False]) From baf624c1148393bcb42eda4c6ad3c1270963258d Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Mon, 12 Mar 2018 09:05:41 -0500 Subject: [PATCH 09/14] Fixed docstring --- pandas/core/arrays/categorical.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index dcb6945a196fa..4b1238f4fbf05 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -1406,7 +1406,7 @@ def argsort(self, *args, **kwargs): or descending sort. kind : {'quicksort', 'mergesort', 'heapsort'}, optional Sorting algorithm. - args, kwargs: + *args, **kwargs: passed through to :func:`numpy.argsort`. Returns From 7bbe796f45331444d7c3f038e2bcbf16b9f6247f Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Tue, 13 Mar 2018 21:14:30 -0500 Subject: [PATCH 10/14] Update docs --- pandas/core/arrays/base.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index 7f78ca6e29da2..322f71ad83580 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -250,7 +250,8 @@ def _values_for_argsort(self): return np.array(self) def argsort(self, ascending=True, kind='quicksort', *args, **kwargs): - """Returns the indices that would sort this array. + """ + Return the indices that would sort this array. Parameters ---------- @@ -259,7 +260,7 @@ def argsort(self, ascending=True, kind='quicksort', *args, **kwargs): or descending sort. kind : {'quicksort', 'mergesort', 'heapsort'}, optional Sorting algorithm. - args, kwargs: + *args, **kwargs: passed through to :func:`numpy.argsort`. Returns @@ -269,7 +270,7 @@ def argsort(self, ascending=True, kind='quicksort', *args, **kwargs): See Also -------- - numpy.argsort + numpy.argsort : Sorting implementation used internally. """ # Implementor note: You have two places to override the behavior of # argsort. From 35a8977381559571f1eeabe52523b3ddc2791fcb Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Sat, 17 Mar 2018 06:55:22 -0500 Subject: [PATCH 11/14] Change name --- pandas/core/arrays/base.py | 25 ++++++++++++++++++++----- pandas/core/arrays/categorical.py | 2 +- 2 files changed, 21 insertions(+), 6 deletions(-) diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index 322f71ad83580..7bb6165d0b760 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -237,16 +237,31 @@ def isna(self): """ raise AbstractMethodError(self) - def _values_for_argsort(self): + def _simple_ndarray(self): # type: () -> ndarray - """Get the ndarray to be passed to np.argsort. + """Convert the array to a simple ndarray representaiton. - This is called from within 'ExtensionArray.argsort'. + Many methods can operate indirectly on a cheap-to-compute array that + is somehow representative of the extension array. For example, rather + than sorting an ExtensionArray directly, which might be expensive, + we could convert the ExtensionArray to a representative ndarray of + integers, sort the integers, and perform a ``take``. + + The coversion between ExtensionArray and the simple ndarray should be + strictly monotonic https://en.wikipedia.org/wiki/Monotonic_function, + and as cheap to compute as possible. Returns ------- values : ndarray + + See Also + -------- + ExtensionArray.argsort """ + # Implemnetor note: This method is currently used in + # - ExtensionArray.argsort + return np.array(self) def argsort(self, ascending=True, kind='quicksort', *args, **kwargs): @@ -274,11 +289,11 @@ def argsort(self, ascending=True, kind='quicksort', *args, **kwargs): """ # Implementor note: You have two places to override the behavior of # argsort. - # 1. _values_for_argsort : construct the values passed to np.argsort + # 1. _simple_ndarray : construct the values passed to np.argsort # 2. argsort : total control over sorting. ascending = nv.validate_argsort_with_ascending(ascending, args, kwargs) - values = self._values_for_argsort() + values = self._simple_ndarray() result = np.argsort(values, kind=kind, **kwargs) if not ascending: result = result[::-1] diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 4b1238f4fbf05..814bf70c102e6 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -1390,7 +1390,7 @@ def check_for_ordered(self, op): "you can use .as_ordered() to change the " "Categorical to an ordered one\n".format(op=op)) - def _values_for_argsort(self): + def _simple_ndarray(self): return self._codes.copy() def argsort(self, *args, **kwargs): From d5e819822d9e980021ea17bfd197a300422d7015 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Mon, 19 Mar 2018 11:18:41 -0500 Subject: [PATCH 12/14] Back to _values_for_argsort --- pandas/core/arrays/base.py | 27 ++++++++------------------- pandas/core/arrays/categorical.py | 2 +- 2 files changed, 9 insertions(+), 20 deletions(-) diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index 8b539e20894a9..f4c77263bf21b 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -237,31 +237,21 @@ def isna(self): """ raise AbstractMethodError(self) - def _simple_ndarray(self): + def _values_for_argsort(self): # type: () -> ndarray - """Convert the array to a simple ndarray representaiton. - - Many methods can operate indirectly on a cheap-to-compute array that - is somehow representative of the extension array. For example, rather - than sorting an ExtensionArray directly, which might be expensive, - we could convert the ExtensionArray to a representative ndarray of - integers, sort the integers, and perform a ``take``. - - The coversion between ExtensionArray and the simple ndarray should be - strictly monotonic https://en.wikipedia.org/wiki/Monotonic_function, - and as cheap to compute as possible. + """Return values for sorting. Returns ------- - values : ndarray + ndarray + The transformed values should maintain the ordering between values + within the array. See Also -------- ExtensionArray.argsort """ - # Implemnetor note: This method is currently used in - # - ExtensionArray.argsort - + # Note: this is used in `ExtensionArray.argsort`. return np.array(self) def argsort(self, ascending=True, kind='quicksort', *args, **kwargs): @@ -289,11 +279,10 @@ def argsort(self, ascending=True, kind='quicksort', *args, **kwargs): """ # Implementor note: You have two places to override the behavior of # argsort. - # 1. _simple_ndarray : construct the values passed to np.argsort + # 1. _values_for_argsort : construct the values passed to np.argsort # 2. argsort : total control over sorting. - ascending = nv.validate_argsort_with_ascending(ascending, args, kwargs) - values = self._simple_ndarray() + values = self._values_for_argsort() result = np.argsort(values, kind=kind, **kwargs) if not ascending: result = result[::-1] diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 43b985686af89..13384dd56a9c1 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -1378,7 +1378,7 @@ def check_for_ordered(self, op): "you can use .as_ordered() to change the " "Categorical to an ordered one\n".format(op=op)) - def _simple_ndarray(self): + def _values_for_argsort(self): return self._codes.copy() def argsort(self, *args, **kwargs): From c776133d42beb81d94ab5000da8bb2ad56f53e90 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Tue, 20 Mar 2018 13:58:25 -0500 Subject: [PATCH 13/14] Unskip most JSON tests --- pandas/tests/extension/json/array.py | 11 +++++++++++ pandas/tests/extension/json/test_json.py | 25 ++++++------------------ 2 files changed, 17 insertions(+), 19 deletions(-) diff --git a/pandas/tests/extension/json/array.py b/pandas/tests/extension/json/array.py index 322944129146a..ee0951812b8f0 100644 --- a/pandas/tests/extension/json/array.py +++ b/pandas/tests/extension/json/array.py @@ -44,7 +44,11 @@ def __getitem__(self, item): return self._constructor_from_sequence([ x for x, m in zip(self, item) if m ]) + elif isinstance(item, collections.Iterable): + # fancy indexing + return type(self)([self.data[i] for i in item]) else: + # slice return type(self)(self.data[item]) def __setitem__(self, key, value): @@ -104,6 +108,13 @@ def _concat_same_type(cls, to_concat): data = list(itertools.chain.from_iterable([x.data for x in to_concat])) return cls(data) + def _values_for_argsort(self): + # Disable NumPy's shape inference by including an empty tuple... + # If all the elemnts of self are the same size P, NumPy will + # cast them to an (N, P) array, instead of an (N,) array of tuples. + frozen = [()] + list(tuple(x.items()) for x in self) + return np.array(frozen, dtype=object)[1:] + def make_data(): # TODO: Use a regular dict. See _NDFrameIndexer._setitem_with_indexer diff --git a/pandas/tests/extension/json/test_json.py b/pandas/tests/extension/json/test_json.py index eab75c306e02a..339be8a2633cb 100644 --- a/pandas/tests/extension/json/test_json.py +++ b/pandas/tests/extension/json/test_json.py @@ -36,7 +36,7 @@ def data_for_sorting(): @pytest.fixture def data_missing_for_sorting(): - return JSONArray([{'b': 1}, {}, {'c': 4}]) + return JSONArray([{'b': 1}, {}, {'a': 4}]) @pytest.fixture @@ -80,28 +80,15 @@ def test_fillna_frame(self): class TestMethods(base.BaseMethodsTests): - @pytest.mark.skip(reason="Unhashable") - def test_value_counts(self, all_data, dropna): - pass + unhashable = pytest.mark.skip(reason="Unhashable") - @pytest.mark.skip(reason="Dictionaries are not orderable.") - def test_argsort(self): - pass - - @pytest.mark.skip(reason="Dictionaries are not orderable.") - def test_argsort_missing(self): - pass - - @pytest.mark.skip(reason="Dictionaries are not orderable.") - def test_sort_values(self): - pass - - @pytest.mark.skip(reason="Dictionaries are not orderable.") - def test_sort_values_missing(self): + @unhashable + def test_value_counts(self, all_data, dropna): pass - @pytest.mark.skip(reason="Dictionaries are not orderable.") + @unhashable def test_sort_values_frame(self): + # TODO (EA.factorize): see if _values_for_factorize allows this. pass From 48852457373eff62847ff976bbdcced12923cc94 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Tue, 20 Mar 2018 14:36:08 -0500 Subject: [PATCH 14/14] Skip tests on 3.5 and lower Require stable dictionary order --- pandas/tests/extension/json/test_json.py | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/pandas/tests/extension/json/test_json.py b/pandas/tests/extension/json/test_json.py index 339be8a2633cb..aec561ece8573 100644 --- a/pandas/tests/extension/json/test_json.py +++ b/pandas/tests/extension/json/test_json.py @@ -81,6 +81,8 @@ def test_fillna_frame(self): class TestMethods(base.BaseMethodsTests): unhashable = pytest.mark.skip(reason="Unhashable") + unstable = pytest.mark.skipif(sys.version_info <= (3, 5), + reason="Dictionary order unstable") @unhashable def test_value_counts(self, all_data, dropna): @@ -91,6 +93,26 @@ def test_sort_values_frame(self): # TODO (EA.factorize): see if _values_for_factorize allows this. pass + @unstable + def test_argsort(self, data_for_sorting): + super(TestMethods, self).test_argsort(data_for_sorting) + + @unstable + def test_argsort_missing(self, data_missing_for_sorting): + super(TestMethods, self).test_argsort_missing( + data_missing_for_sorting) + + @unstable + @pytest.mark.parametrize('ascending', [True, False]) + def test_sort_values(self, data_for_sorting, ascending): + super(TestMethods, self).test_sort_values( + data_for_sorting, ascending) + + @pytest.mark.parametrize('ascending', [True, False]) + def test_sort_values_missing(self, data_missing_for_sorting, ascending): + super(TestMethods, self).test_sort_values_missing( + data_missing_for_sorting, ascending) + class TestCasting(base.BaseCastingTests): pass