Skip to content

BUG: bug in groupby on empty frame with multi groupers #16090

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 2 commits into from
Apr 22, 2017
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion doc/source/whatsnew/v0.20.0.txt
Original file line number Diff line number Diff line change
Expand Up @@ -1600,7 +1600,7 @@ Indexing
- Bug in the HTML display with with a ``MultiIndex`` and truncation (:issue:`14882`)
- Bug in the display of ``.info()`` where a qualifier (+) would always be displayed with a ``MultiIndex`` that contains only non-strings (:issue:`15245`)
- Bug in ``pd.concat()`` where the names of ``MultiIndex`` of resulting ``DataFrame`` are not handled correctly when ``None`` is presented in the names of ``MultiIndex`` of input ``DataFrame`` (:issue:`15787`)
- Bug in ``DataFrame.sort_index()`` and ``Series.sort_index()`` where ``na_position`` doesn't work with a ``MultiIndex`` (:issue:`14784`)
- Bug in ``DataFrame.sort_index()`` and ``Series.sort_index()`` where ``na_position`` doesn't work with a ``MultiIndex`` (:issue:`14784`, :issue:`16604`)

I/O
^^^
Expand Down
9 changes: 5 additions & 4 deletions pandas/core/indexes/multi.py
Original file line number Diff line number Diff line change
Expand Up @@ -1645,10 +1645,11 @@ def _get_labels_for_sorting(self):
"""
from pandas.core.categorical import Categorical

return [Categorical.from_codes(label,
np.arange(np.array(label).max() + 1,
dtype=label.dtype),
ordered=True)
def cats(label):
return np.arange(np.array(label).max() + 1 if len(label) else 0,
dtype=label.dtype)

return [Categorical.from_codes(label, cats(label), ordered=True)
for label in self.labels]

def sortlevel(self, level=0, ascending=True, sort_remaining=True):
Expand Down
225 changes: 1 addition & 224 deletions pandas/tests/groupby/test_groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
from numpy import nan

from pandas import (date_range, bdate_range, Timestamp,
isnull, Index, MultiIndex, DataFrame, Series,
Index, MultiIndex, DataFrame, Series,
concat, Panel, DatetimeIndex)
from pandas.errors import UnsupportedFunctionCall, PerformanceWarning
from pandas.util.testing import (assert_panel_equal, assert_frame_equal,
Expand Down Expand Up @@ -87,229 +87,6 @@ def test_select_bad_cols(self):
# will have to rethink regex if you change message!
g[['A', 'C']]

def test_first_last_nth(self):
# tests for first / last / nth
grouped = self.df.groupby('A')
first = grouped.first()
expected = self.df.loc[[1, 0], ['B', 'C', 'D']]
expected.index = Index(['bar', 'foo'], name='A')
expected = expected.sort_index()
assert_frame_equal(first, expected)

nth = grouped.nth(0)
assert_frame_equal(nth, expected)

last = grouped.last()
expected = self.df.loc[[5, 7], ['B', 'C', 'D']]
expected.index = Index(['bar', 'foo'], name='A')
assert_frame_equal(last, expected)

nth = grouped.nth(-1)
assert_frame_equal(nth, expected)

nth = grouped.nth(1)
expected = self.df.loc[[2, 3], ['B', 'C', 'D']].copy()
expected.index = Index(['foo', 'bar'], name='A')
expected = expected.sort_index()
assert_frame_equal(nth, expected)

# it works!
grouped['B'].first()
grouped['B'].last()
grouped['B'].nth(0)

self.df.loc[self.df['A'] == 'foo', 'B'] = np.nan
self.assertTrue(isnull(grouped['B'].first()['foo']))
self.assertTrue(isnull(grouped['B'].last()['foo']))
self.assertTrue(isnull(grouped['B'].nth(0)['foo']))

# v0.14.0 whatsnew
df = DataFrame([[1, np.nan], [1, 4], [5, 6]], columns=['A', 'B'])
g = df.groupby('A')
result = g.first()
expected = df.iloc[[1, 2]].set_index('A')
assert_frame_equal(result, expected)

expected = df.iloc[[1, 2]].set_index('A')
result = g.nth(0, dropna='any')
assert_frame_equal(result, expected)

def test_first_last_nth_dtypes(self):

df = self.df_mixed_floats.copy()
df['E'] = True
df['F'] = 1

# tests for first / last / nth
grouped = df.groupby('A')
first = grouped.first()
expected = df.loc[[1, 0], ['B', 'C', 'D', 'E', 'F']]
expected.index = Index(['bar', 'foo'], name='A')
expected = expected.sort_index()
assert_frame_equal(first, expected)

last = grouped.last()
expected = df.loc[[5, 7], ['B', 'C', 'D', 'E', 'F']]
expected.index = Index(['bar', 'foo'], name='A')
expected = expected.sort_index()
assert_frame_equal(last, expected)

nth = grouped.nth(1)
expected = df.loc[[3, 2], ['B', 'C', 'D', 'E', 'F']]
expected.index = Index(['bar', 'foo'], name='A')
expected = expected.sort_index()
assert_frame_equal(nth, expected)

# GH 2763, first/last shifting dtypes
idx = lrange(10)
idx.append(9)
s = Series(data=lrange(11), index=idx, name='IntCol')
self.assertEqual(s.dtype, 'int64')
f = s.groupby(level=0).first()
self.assertEqual(f.dtype, 'int64')

def test_nth(self):
df = DataFrame([[1, np.nan], [1, 4], [5, 6]], columns=['A', 'B'])
g = df.groupby('A')

assert_frame_equal(g.nth(0), df.iloc[[0, 2]].set_index('A'))
assert_frame_equal(g.nth(1), df.iloc[[1]].set_index('A'))
assert_frame_equal(g.nth(2), df.loc[[]].set_index('A'))
assert_frame_equal(g.nth(-1), df.iloc[[1, 2]].set_index('A'))
assert_frame_equal(g.nth(-2), df.iloc[[0]].set_index('A'))
assert_frame_equal(g.nth(-3), df.loc[[]].set_index('A'))
assert_series_equal(g.B.nth(0), df.set_index('A').B.iloc[[0, 2]])
assert_series_equal(g.B.nth(1), df.set_index('A').B.iloc[[1]])
assert_frame_equal(g[['B']].nth(0),
df.loc[[0, 2], ['A', 'B']].set_index('A'))

exp = df.set_index('A')
assert_frame_equal(g.nth(0, dropna='any'), exp.iloc[[1, 2]])
assert_frame_equal(g.nth(-1, dropna='any'), exp.iloc[[1, 2]])

exp['B'] = np.nan
assert_frame_equal(g.nth(7, dropna='any'), exp.iloc[[1, 2]])
assert_frame_equal(g.nth(2, dropna='any'), exp.iloc[[1, 2]])

# out of bounds, regression from 0.13.1
# GH 6621
df = DataFrame({'color': {0: 'green',
1: 'green',
2: 'red',
3: 'red',
4: 'red'},
'food': {0: 'ham',
1: 'eggs',
2: 'eggs',
3: 'ham',
4: 'pork'},
'two': {0: 1.5456590000000001,
1: -0.070345000000000005,
2: -2.4004539999999999,
3: 0.46206000000000003,
4: 0.52350799999999997},
'one': {0: 0.56573799999999996,
1: -0.9742360000000001,
2: 1.033801,
3: -0.78543499999999999,
4: 0.70422799999999997}}).set_index(['color',
'food'])

result = df.groupby(level=0, as_index=False).nth(2)
expected = df.iloc[[-1]]
assert_frame_equal(result, expected)

result = df.groupby(level=0, as_index=False).nth(3)
expected = df.loc[[]]
assert_frame_equal(result, expected)

# GH 7559
# from the vbench
df = DataFrame(np.random.randint(1, 10, (100, 2)), dtype='int64')
s = df[1]
g = df[0]
expected = s.groupby(g).first()
expected2 = s.groupby(g).apply(lambda x: x.iloc[0])
assert_series_equal(expected2, expected, check_names=False)
self.assertTrue(expected.name, 0)
self.assertEqual(expected.name, 1)

# validate first
v = s[g == 1].iloc[0]
self.assertEqual(expected.iloc[0], v)
self.assertEqual(expected2.iloc[0], v)

# this is NOT the same as .first (as sorted is default!)
# as it keeps the order in the series (and not the group order)
# related GH 7287
expected = s.groupby(g, sort=False).first()
result = s.groupby(g, sort=False).nth(0, dropna='all')
assert_series_equal(result, expected)

# doc example
df = DataFrame([[1, np.nan], [1, 4], [5, 6]], columns=['A', 'B'])
g = df.groupby('A')
result = g.B.nth(0, dropna=True)
expected = g.B.first()
assert_series_equal(result, expected)

# test multiple nth values
df = DataFrame([[1, np.nan], [1, 3], [1, 4], [5, 6], [5, 7]],
columns=['A', 'B'])
g = df.groupby('A')

assert_frame_equal(g.nth(0), df.iloc[[0, 3]].set_index('A'))
assert_frame_equal(g.nth([0]), df.iloc[[0, 3]].set_index('A'))
assert_frame_equal(g.nth([0, 1]), df.iloc[[0, 1, 3, 4]].set_index('A'))
assert_frame_equal(
g.nth([0, -1]), df.iloc[[0, 2, 3, 4]].set_index('A'))
assert_frame_equal(
g.nth([0, 1, 2]), df.iloc[[0, 1, 2, 3, 4]].set_index('A'))
assert_frame_equal(
g.nth([0, 1, -1]), df.iloc[[0, 1, 2, 3, 4]].set_index('A'))
assert_frame_equal(g.nth([2]), df.iloc[[2]].set_index('A'))
assert_frame_equal(g.nth([3, 4]), df.loc[[]].set_index('A'))

business_dates = pd.date_range(start='4/1/2014', end='6/30/2014',
freq='B')
df = DataFrame(1, index=business_dates, columns=['a', 'b'])
# get the first, fourth and last two business days for each month
key = (df.index.year, df.index.month)
result = df.groupby(key, as_index=False).nth([0, 3, -2, -1])
expected_dates = pd.to_datetime(
['2014/4/1', '2014/4/4', '2014/4/29', '2014/4/30', '2014/5/1',
'2014/5/6', '2014/5/29', '2014/5/30', '2014/6/2', '2014/6/5',
'2014/6/27', '2014/6/30'])
expected = DataFrame(1, columns=['a', 'b'], index=expected_dates)
assert_frame_equal(result, expected)

def test_nth_multi_index(self):
# PR 9090, related to issue 8979
# test nth on MultiIndex, should match .first()
grouped = self.three_group.groupby(['A', 'B'])
result = grouped.nth(0)
expected = grouped.first()
assert_frame_equal(result, expected)

def test_nth_multi_index_as_expected(self):
# PR 9090, related to issue 8979
# test nth on MultiIndex
three_group = DataFrame(
{'A': ['foo', 'foo', 'foo', 'foo', 'bar', 'bar', 'bar', 'bar',
'foo', 'foo', 'foo'],
'B': ['one', 'one', 'one', 'two', 'one', 'one', 'one', 'two',
'two', 'two', 'one'],
'C': ['dull', 'dull', 'shiny', 'dull', 'dull', 'shiny', 'shiny',
'dull', 'shiny', 'shiny', 'shiny']})
grouped = three_group.groupby(['A', 'B'])
result = grouped.nth(0)
expected = DataFrame(
{'C': ['dull', 'dull', 'dull', 'dull']},
index=MultiIndex.from_arrays([['bar', 'bar', 'foo', 'foo'],
['one', 'two', 'one', 'two']],
names=['A', 'B']))
assert_frame_equal(result, expected)

def test_group_selection_cache(self):
# GH 12839 nth, head, and tail should return same result consistently
df = DataFrame([[1, 2], [1, 4], [5, 6]], columns=['A', 'B'])
Expand Down
Loading