From 50d2c6979dfd933b9bfc6de2e6c646a5f655730f Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Fri, 21 Apr 2017 18:57:45 -0400 Subject: [PATCH 1/2] TST: separate out groupby/test_nth --- pandas/tests/groupby/test_groupby.py | 225 +------------------------- pandas/tests/groupby/test_nth.py | 234 +++++++++++++++++++++++++++ 2 files changed, 235 insertions(+), 224 deletions(-) create mode 100644 pandas/tests/groupby/test_nth.py diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index 25ebfef327476..752c0689b0660 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -9,7 +9,7 @@ from numpy import nan from pandas import (date_range, bdate_range, Timestamp, - isnull, Index, MultiIndex, DataFrame, Series, + Index, MultiIndex, DataFrame, Series, concat, Panel, DatetimeIndex) from pandas.errors import UnsupportedFunctionCall, PerformanceWarning from pandas.util.testing import (assert_panel_equal, assert_frame_equal, @@ -87,229 +87,6 @@ def test_select_bad_cols(self): # will have to rethink regex if you change message! g[['A', 'C']] - def test_first_last_nth(self): - # tests for first / last / nth - grouped = self.df.groupby('A') - first = grouped.first() - expected = self.df.loc[[1, 0], ['B', 'C', 'D']] - expected.index = Index(['bar', 'foo'], name='A') - expected = expected.sort_index() - assert_frame_equal(first, expected) - - nth = grouped.nth(0) - assert_frame_equal(nth, expected) - - last = grouped.last() - expected = self.df.loc[[5, 7], ['B', 'C', 'D']] - expected.index = Index(['bar', 'foo'], name='A') - assert_frame_equal(last, expected) - - nth = grouped.nth(-1) - assert_frame_equal(nth, expected) - - nth = grouped.nth(1) - expected = self.df.loc[[2, 3], ['B', 'C', 'D']].copy() - expected.index = Index(['foo', 'bar'], name='A') - expected = expected.sort_index() - assert_frame_equal(nth, expected) - - # it works! 
- grouped['B'].first() - grouped['B'].last() - grouped['B'].nth(0) - - self.df.loc[self.df['A'] == 'foo', 'B'] = np.nan - self.assertTrue(isnull(grouped['B'].first()['foo'])) - self.assertTrue(isnull(grouped['B'].last()['foo'])) - self.assertTrue(isnull(grouped['B'].nth(0)['foo'])) - - # v0.14.0 whatsnew - df = DataFrame([[1, np.nan], [1, 4], [5, 6]], columns=['A', 'B']) - g = df.groupby('A') - result = g.first() - expected = df.iloc[[1, 2]].set_index('A') - assert_frame_equal(result, expected) - - expected = df.iloc[[1, 2]].set_index('A') - result = g.nth(0, dropna='any') - assert_frame_equal(result, expected) - - def test_first_last_nth_dtypes(self): - - df = self.df_mixed_floats.copy() - df['E'] = True - df['F'] = 1 - - # tests for first / last / nth - grouped = df.groupby('A') - first = grouped.first() - expected = df.loc[[1, 0], ['B', 'C', 'D', 'E', 'F']] - expected.index = Index(['bar', 'foo'], name='A') - expected = expected.sort_index() - assert_frame_equal(first, expected) - - last = grouped.last() - expected = df.loc[[5, 7], ['B', 'C', 'D', 'E', 'F']] - expected.index = Index(['bar', 'foo'], name='A') - expected = expected.sort_index() - assert_frame_equal(last, expected) - - nth = grouped.nth(1) - expected = df.loc[[3, 2], ['B', 'C', 'D', 'E', 'F']] - expected.index = Index(['bar', 'foo'], name='A') - expected = expected.sort_index() - assert_frame_equal(nth, expected) - - # GH 2763, first/last shifting dtypes - idx = lrange(10) - idx.append(9) - s = Series(data=lrange(11), index=idx, name='IntCol') - self.assertEqual(s.dtype, 'int64') - f = s.groupby(level=0).first() - self.assertEqual(f.dtype, 'int64') - - def test_nth(self): - df = DataFrame([[1, np.nan], [1, 4], [5, 6]], columns=['A', 'B']) - g = df.groupby('A') - - assert_frame_equal(g.nth(0), df.iloc[[0, 2]].set_index('A')) - assert_frame_equal(g.nth(1), df.iloc[[1]].set_index('A')) - assert_frame_equal(g.nth(2), df.loc[[]].set_index('A')) - assert_frame_equal(g.nth(-1), df.iloc[[1, 
2]].set_index('A')) - assert_frame_equal(g.nth(-2), df.iloc[[0]].set_index('A')) - assert_frame_equal(g.nth(-3), df.loc[[]].set_index('A')) - assert_series_equal(g.B.nth(0), df.set_index('A').B.iloc[[0, 2]]) - assert_series_equal(g.B.nth(1), df.set_index('A').B.iloc[[1]]) - assert_frame_equal(g[['B']].nth(0), - df.loc[[0, 2], ['A', 'B']].set_index('A')) - - exp = df.set_index('A') - assert_frame_equal(g.nth(0, dropna='any'), exp.iloc[[1, 2]]) - assert_frame_equal(g.nth(-1, dropna='any'), exp.iloc[[1, 2]]) - - exp['B'] = np.nan - assert_frame_equal(g.nth(7, dropna='any'), exp.iloc[[1, 2]]) - assert_frame_equal(g.nth(2, dropna='any'), exp.iloc[[1, 2]]) - - # out of bounds, regression from 0.13.1 - # GH 6621 - df = DataFrame({'color': {0: 'green', - 1: 'green', - 2: 'red', - 3: 'red', - 4: 'red'}, - 'food': {0: 'ham', - 1: 'eggs', - 2: 'eggs', - 3: 'ham', - 4: 'pork'}, - 'two': {0: 1.5456590000000001, - 1: -0.070345000000000005, - 2: -2.4004539999999999, - 3: 0.46206000000000003, - 4: 0.52350799999999997}, - 'one': {0: 0.56573799999999996, - 1: -0.9742360000000001, - 2: 1.033801, - 3: -0.78543499999999999, - 4: 0.70422799999999997}}).set_index(['color', - 'food']) - - result = df.groupby(level=0, as_index=False).nth(2) - expected = df.iloc[[-1]] - assert_frame_equal(result, expected) - - result = df.groupby(level=0, as_index=False).nth(3) - expected = df.loc[[]] - assert_frame_equal(result, expected) - - # GH 7559 - # from the vbench - df = DataFrame(np.random.randint(1, 10, (100, 2)), dtype='int64') - s = df[1] - g = df[0] - expected = s.groupby(g).first() - expected2 = s.groupby(g).apply(lambda x: x.iloc[0]) - assert_series_equal(expected2, expected, check_names=False) - self.assertTrue(expected.name, 0) - self.assertEqual(expected.name, 1) - - # validate first - v = s[g == 1].iloc[0] - self.assertEqual(expected.iloc[0], v) - self.assertEqual(expected2.iloc[0], v) - - # this is NOT the same as .first (as sorted is default!) 
- # as it keeps the order in the series (and not the group order) - # related GH 7287 - expected = s.groupby(g, sort=False).first() - result = s.groupby(g, sort=False).nth(0, dropna='all') - assert_series_equal(result, expected) - - # doc example - df = DataFrame([[1, np.nan], [1, 4], [5, 6]], columns=['A', 'B']) - g = df.groupby('A') - result = g.B.nth(0, dropna=True) - expected = g.B.first() - assert_series_equal(result, expected) - - # test multiple nth values - df = DataFrame([[1, np.nan], [1, 3], [1, 4], [5, 6], [5, 7]], - columns=['A', 'B']) - g = df.groupby('A') - - assert_frame_equal(g.nth(0), df.iloc[[0, 3]].set_index('A')) - assert_frame_equal(g.nth([0]), df.iloc[[0, 3]].set_index('A')) - assert_frame_equal(g.nth([0, 1]), df.iloc[[0, 1, 3, 4]].set_index('A')) - assert_frame_equal( - g.nth([0, -1]), df.iloc[[0, 2, 3, 4]].set_index('A')) - assert_frame_equal( - g.nth([0, 1, 2]), df.iloc[[0, 1, 2, 3, 4]].set_index('A')) - assert_frame_equal( - g.nth([0, 1, -1]), df.iloc[[0, 1, 2, 3, 4]].set_index('A')) - assert_frame_equal(g.nth([2]), df.iloc[[2]].set_index('A')) - assert_frame_equal(g.nth([3, 4]), df.loc[[]].set_index('A')) - - business_dates = pd.date_range(start='4/1/2014', end='6/30/2014', - freq='B') - df = DataFrame(1, index=business_dates, columns=['a', 'b']) - # get the first, fourth and last two business days for each month - key = (df.index.year, df.index.month) - result = df.groupby(key, as_index=False).nth([0, 3, -2, -1]) - expected_dates = pd.to_datetime( - ['2014/4/1', '2014/4/4', '2014/4/29', '2014/4/30', '2014/5/1', - '2014/5/6', '2014/5/29', '2014/5/30', '2014/6/2', '2014/6/5', - '2014/6/27', '2014/6/30']) - expected = DataFrame(1, columns=['a', 'b'], index=expected_dates) - assert_frame_equal(result, expected) - - def test_nth_multi_index(self): - # PR 9090, related to issue 8979 - # test nth on MultiIndex, should match .first() - grouped = self.three_group.groupby(['A', 'B']) - result = grouped.nth(0) - expected = grouped.first() - 
assert_frame_equal(result, expected) - - def test_nth_multi_index_as_expected(self): - # PR 9090, related to issue 8979 - # test nth on MultiIndex - three_group = DataFrame( - {'A': ['foo', 'foo', 'foo', 'foo', 'bar', 'bar', 'bar', 'bar', - 'foo', 'foo', 'foo'], - 'B': ['one', 'one', 'one', 'two', 'one', 'one', 'one', 'two', - 'two', 'two', 'one'], - 'C': ['dull', 'dull', 'shiny', 'dull', 'dull', 'shiny', 'shiny', - 'dull', 'shiny', 'shiny', 'shiny']}) - grouped = three_group.groupby(['A', 'B']) - result = grouped.nth(0) - expected = DataFrame( - {'C': ['dull', 'dull', 'dull', 'dull']}, - index=MultiIndex.from_arrays([['bar', 'bar', 'foo', 'foo'], - ['one', 'two', 'one', 'two']], - names=['A', 'B'])) - assert_frame_equal(result, expected) - def test_group_selection_cache(self): # GH 12839 nth, head, and tail should return same result consistently df = DataFrame([[1, 2], [1, 4], [5, 6]], columns=['A', 'B']) diff --git a/pandas/tests/groupby/test_nth.py b/pandas/tests/groupby/test_nth.py new file mode 100644 index 0000000000000..90708f5c4522d --- /dev/null +++ b/pandas/tests/groupby/test_nth.py @@ -0,0 +1,234 @@ +import numpy as np +import pandas as pd +from pandas import DataFrame, MultiIndex, Index, Series, isnull +from pandas.compat import lrange +from pandas.util import testing as tm +from pandas.util.testing import assert_frame_equal, assert_series_equal + +from .common import MixIn + + +class TestNth(MixIn, tm.TestCase): + + def test_first_last_nth(self): + # tests for first / last / nth + grouped = self.df.groupby('A') + first = grouped.first() + expected = self.df.loc[[1, 0], ['B', 'C', 'D']] + expected.index = Index(['bar', 'foo'], name='A') + expected = expected.sort_index() + assert_frame_equal(first, expected) + + nth = grouped.nth(0) + assert_frame_equal(nth, expected) + + last = grouped.last() + expected = self.df.loc[[5, 7], ['B', 'C', 'D']] + expected.index = Index(['bar', 'foo'], name='A') + assert_frame_equal(last, expected) + + nth = 
grouped.nth(-1) + assert_frame_equal(nth, expected) + + nth = grouped.nth(1) + expected = self.df.loc[[2, 3], ['B', 'C', 'D']].copy() + expected.index = Index(['foo', 'bar'], name='A') + expected = expected.sort_index() + assert_frame_equal(nth, expected) + + # it works! + grouped['B'].first() + grouped['B'].last() + grouped['B'].nth(0) + + self.df.loc[self.df['A'] == 'foo', 'B'] = np.nan + self.assertTrue(isnull(grouped['B'].first()['foo'])) + self.assertTrue(isnull(grouped['B'].last()['foo'])) + self.assertTrue(isnull(grouped['B'].nth(0)['foo'])) + + # v0.14.0 whatsnew + df = DataFrame([[1, np.nan], [1, 4], [5, 6]], columns=['A', 'B']) + g = df.groupby('A') + result = g.first() + expected = df.iloc[[1, 2]].set_index('A') + assert_frame_equal(result, expected) + + expected = df.iloc[[1, 2]].set_index('A') + result = g.nth(0, dropna='any') + assert_frame_equal(result, expected) + + def test_first_last_nth_dtypes(self): + + df = self.df_mixed_floats.copy() + df['E'] = True + df['F'] = 1 + + # tests for first / last / nth + grouped = df.groupby('A') + first = grouped.first() + expected = df.loc[[1, 0], ['B', 'C', 'D', 'E', 'F']] + expected.index = Index(['bar', 'foo'], name='A') + expected = expected.sort_index() + assert_frame_equal(first, expected) + + last = grouped.last() + expected = df.loc[[5, 7], ['B', 'C', 'D', 'E', 'F']] + expected.index = Index(['bar', 'foo'], name='A') + expected = expected.sort_index() + assert_frame_equal(last, expected) + + nth = grouped.nth(1) + expected = df.loc[[3, 2], ['B', 'C', 'D', 'E', 'F']] + expected.index = Index(['bar', 'foo'], name='A') + expected = expected.sort_index() + assert_frame_equal(nth, expected) + + # GH 2763, first/last shifting dtypes + idx = lrange(10) + idx.append(9) + s = Series(data=lrange(11), index=idx, name='IntCol') + self.assertEqual(s.dtype, 'int64') + f = s.groupby(level=0).first() + self.assertEqual(f.dtype, 'int64') + + def test_nth(self): + df = DataFrame([[1, np.nan], [1, 4], [5, 6]], 
columns=['A', 'B']) + g = df.groupby('A') + + assert_frame_equal(g.nth(0), df.iloc[[0, 2]].set_index('A')) + assert_frame_equal(g.nth(1), df.iloc[[1]].set_index('A')) + assert_frame_equal(g.nth(2), df.loc[[]].set_index('A')) + assert_frame_equal(g.nth(-1), df.iloc[[1, 2]].set_index('A')) + assert_frame_equal(g.nth(-2), df.iloc[[0]].set_index('A')) + assert_frame_equal(g.nth(-3), df.loc[[]].set_index('A')) + assert_series_equal(g.B.nth(0), df.set_index('A').B.iloc[[0, 2]]) + assert_series_equal(g.B.nth(1), df.set_index('A').B.iloc[[1]]) + assert_frame_equal(g[['B']].nth(0), + df.loc[[0, 2], ['A', 'B']].set_index('A')) + + exp = df.set_index('A') + assert_frame_equal(g.nth(0, dropna='any'), exp.iloc[[1, 2]]) + assert_frame_equal(g.nth(-1, dropna='any'), exp.iloc[[1, 2]]) + + exp['B'] = np.nan + assert_frame_equal(g.nth(7, dropna='any'), exp.iloc[[1, 2]]) + assert_frame_equal(g.nth(2, dropna='any'), exp.iloc[[1, 2]]) + + # out of bounds, regression from 0.13.1 + # GH 6621 + df = DataFrame({'color': {0: 'green', + 1: 'green', + 2: 'red', + 3: 'red', + 4: 'red'}, + 'food': {0: 'ham', + 1: 'eggs', + 2: 'eggs', + 3: 'ham', + 4: 'pork'}, + 'two': {0: 1.5456590000000001, + 1: -0.070345000000000005, + 2: -2.4004539999999999, + 3: 0.46206000000000003, + 4: 0.52350799999999997}, + 'one': {0: 0.56573799999999996, + 1: -0.9742360000000001, + 2: 1.033801, + 3: -0.78543499999999999, + 4: 0.70422799999999997}}).set_index(['color', + 'food']) + + result = df.groupby(level=0, as_index=False).nth(2) + expected = df.iloc[[-1]] + assert_frame_equal(result, expected) + + result = df.groupby(level=0, as_index=False).nth(3) + expected = df.loc[[]] + assert_frame_equal(result, expected) + + # GH 7559 + # from the vbench + df = DataFrame(np.random.randint(1, 10, (100, 2)), dtype='int64') + s = df[1] + g = df[0] + expected = s.groupby(g).first() + expected2 = s.groupby(g).apply(lambda x: x.iloc[0]) + assert_series_equal(expected2, expected, check_names=False) + self.assertTrue(expected.name, 
0) + self.assertEqual(expected.name, 1) + + # validate first + v = s[g == 1].iloc[0] + self.assertEqual(expected.iloc[0], v) + self.assertEqual(expected2.iloc[0], v) + + # this is NOT the same as .first (as sorted is default!) + # as it keeps the order in the series (and not the group order) + # related GH 7287 + expected = s.groupby(g, sort=False).first() + result = s.groupby(g, sort=False).nth(0, dropna='all') + assert_series_equal(result, expected) + + # doc example + df = DataFrame([[1, np.nan], [1, 4], [5, 6]], columns=['A', 'B']) + g = df.groupby('A') + result = g.B.nth(0, dropna=True) + expected = g.B.first() + assert_series_equal(result, expected) + + # test multiple nth values + df = DataFrame([[1, np.nan], [1, 3], [1, 4], [5, 6], [5, 7]], + columns=['A', 'B']) + g = df.groupby('A') + + assert_frame_equal(g.nth(0), df.iloc[[0, 3]].set_index('A')) + assert_frame_equal(g.nth([0]), df.iloc[[0, 3]].set_index('A')) + assert_frame_equal(g.nth([0, 1]), df.iloc[[0, 1, 3, 4]].set_index('A')) + assert_frame_equal( + g.nth([0, -1]), df.iloc[[0, 2, 3, 4]].set_index('A')) + assert_frame_equal( + g.nth([0, 1, 2]), df.iloc[[0, 1, 2, 3, 4]].set_index('A')) + assert_frame_equal( + g.nth([0, 1, -1]), df.iloc[[0, 1, 2, 3, 4]].set_index('A')) + assert_frame_equal(g.nth([2]), df.iloc[[2]].set_index('A')) + assert_frame_equal(g.nth([3, 4]), df.loc[[]].set_index('A')) + + business_dates = pd.date_range(start='4/1/2014', end='6/30/2014', + freq='B') + df = DataFrame(1, index=business_dates, columns=['a', 'b']) + # get the first, fourth and last two business days for each month + key = (df.index.year, df.index.month) + result = df.groupby(key, as_index=False).nth([0, 3, -2, -1]) + expected_dates = pd.to_datetime( + ['2014/4/1', '2014/4/4', '2014/4/29', '2014/4/30', '2014/5/1', + '2014/5/6', '2014/5/29', '2014/5/30', '2014/6/2', '2014/6/5', + '2014/6/27', '2014/6/30']) + expected = DataFrame(1, columns=['a', 'b'], index=expected_dates) + assert_frame_equal(result, expected) + + def 
test_nth_multi_index(self): + # PR 9090, related to issue 8979 + # test nth on MultiIndex, should match .first() + grouped = self.three_group.groupby(['A', 'B']) + result = grouped.nth(0) + expected = grouped.first() + assert_frame_equal(result, expected) + + def test_nth_multi_index_as_expected(self): + # PR 9090, related to issue 8979 + # test nth on MultiIndex + three_group = DataFrame( + {'A': ['foo', 'foo', 'foo', 'foo', 'bar', 'bar', 'bar', 'bar', + 'foo', 'foo', 'foo'], + 'B': ['one', 'one', 'one', 'two', 'one', 'one', 'one', 'two', + 'two', 'two', 'one'], + 'C': ['dull', 'dull', 'shiny', 'dull', 'dull', 'shiny', 'shiny', + 'dull', 'shiny', 'shiny', 'shiny']}) + grouped = three_group.groupby(['A', 'B']) + result = grouped.nth(0) + expected = DataFrame( + {'C': ['dull', 'dull', 'dull', 'dull']}, + index=MultiIndex.from_arrays([['bar', 'bar', 'foo', 'foo'], + ['one', 'two', 'one', 'two']], + names=['A', 'B'])) + assert_frame_equal(result, expected) From dc7d0227b0f5d54fb8e41eee0fe8658201dfa87a Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Fri, 21 Apr 2017 19:08:53 -0400 Subject: [PATCH 2/2] BUG: bug in groupby on empty frame with multi groupers xref #14784 --- doc/source/whatsnew/v0.20.0.txt | 2 +- pandas/core/indexes/multi.py | 9 +++++---- pandas/tests/groupby/test_nth.py | 14 ++++++++++++++ 3 files changed, 20 insertions(+), 5 deletions(-) diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt index 86d9bef636e17..74c915e39b2b6 100644 --- a/doc/source/whatsnew/v0.20.0.txt +++ b/doc/source/whatsnew/v0.20.0.txt @@ -1600,7 +1600,7 @@ Indexing - Bug in the HTML display with with a ``MultiIndex`` and truncation (:issue:`14882`) - Bug in the display of ``.info()`` where a qualifier (+) would always be displayed with a ``MultiIndex`` that contains only non-strings (:issue:`15245`) - Bug in ``pd.concat()`` where the names of ``MultiIndex`` of resulting ``DataFrame`` are not handled correctly when ``None`` is presented in the names of 
``MultiIndex`` of input ``DataFrame`` (:issue:`15787`) -- Bug in ``DataFrame.sort_index()`` and ``Series.sort_index()`` where ``na_position`` doesn't work with a ``MultiIndex`` (:issue:`14784`) +- Bug in ``DataFrame.sort_index()`` and ``Series.sort_index()`` where ``na_position`` doesn't work with a ``MultiIndex`` (:issue:`14784`, :issue:`16064`) I/O ^^^ diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index d46d2c78fbdb0..c760d2943b823 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -1645,10 +1645,11 @@ def _get_labels_for_sorting(self): """ from pandas.core.categorical import Categorical - return [Categorical.from_codes(label, - np.arange(np.array(label).max() + 1, - dtype=label.dtype), - ordered=True) + def cats(label): + return np.arange(np.array(label).max() + 1 if len(label) else 0, + dtype=label.dtype) + + return [Categorical.from_codes(label, cats(label), ordered=True) for label in self.labels] def sortlevel(self, level=0, ascending=True, sort_remaining=True): diff --git a/pandas/tests/groupby/test_nth.py b/pandas/tests/groupby/test_nth.py index 90708f5c4522d..bf2f1f1f9cbc5 100644 --- a/pandas/tests/groupby/test_nth.py +++ b/pandas/tests/groupby/test_nth.py @@ -232,3 +232,17 @@ def test_nth_multi_index_as_expected(self): ['one', 'two', 'one', 'two']], names=['A', 'B'])) assert_frame_equal(result, expected) + + +def test_nth_empty(): + # GH 16064 + df = DataFrame(index=[0], columns=['a', 'b', 'c']) + result = df.groupby('a').nth(10) + expected = DataFrame(index=Index([], name='a'), columns=['b', 'c']) + assert_frame_equal(result, expected) + + result = df.groupby(['a', 'b']).nth(10) + expected = DataFrame(index=MultiIndex([[], []], [[], []], + names=['a', 'b']), + columns=['c']) + assert_frame_equal(result, expected)