Skip to content

BUG/TST: fix tests for groupby nth on Series (GH7559) #7580

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Jun 30, 2014
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions doc/source/v0.14.1.txt
Original file line number Diff line number Diff line change
Expand Up @@ -173,6 +173,7 @@ Bug Fixes
- Bug in setitem with list-of-lists and single vs mixed types (:issue:`7551`)
- Bug in timeops with non-aligned Series (:issue:`7500`)
- Bug in timedelta inference when assigning an incomplete Series (:issue:`7592`)
- Bug in groupby ``.nth`` with a Series and integer-like column name (:issue:`7559`)

- Bug in ``value_counts`` where ``NaT`` did not qualify as missing (``NaN``) (:issue:`7423`)

Expand Down
70 changes: 50 additions & 20 deletions pandas/core/groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -467,7 +467,7 @@ def _selected_obj(self):
def _set_selection_from_grouper(self):
""" we may need create a selection if we have non-level groupers """
grp = self.grouper
if self.as_index and getattr(grp,'groupings',None) is not None:
if self.as_index and getattr(grp,'groupings',None) is not None and self.obj.ndim > 1:
ax = self.obj._info_axis
groupers = [ g.name for g in grp.groupings if g.level is None and g.name is not None and g.name in ax ]
if len(groupers):
Expand Down Expand Up @@ -759,7 +759,7 @@ def nth(self, n, dropna=None):

Examples
--------
>>> DataFrame([[1, np.nan], [1, 4], [5, 6]], columns=['A', 'B'])
>>> df = DataFrame([[1, np.nan], [1, 4], [5, 6]], columns=['A', 'B'])
>>> g = df.groupby('A')
>>> g.nth(0)
A B
Expand Down Expand Up @@ -804,7 +804,10 @@ def nth(self, n, dropna=None):
if self.as_index:
ax = self.obj._info_axis
names = self.grouper.names
if all([ n in ax for n in names ]):
if self.obj.ndim == 1:
# this is a pass-thru
pass
elif all([ n in ax for n in names ]):
result.index = Index(self.obj[names][is_nth].values.ravel()).set_names(names)
elif self._group_selection is not None:
result.index = self.obj._get_axis(self.axis)[is_nth]
Expand All @@ -821,17 +824,29 @@ def nth(self, n, dropna=None):
"(was passed %s)." % (dropna),)

# old behaviour, but with all and any support for DataFrames.

# modified in GH 7559 to have better perf
max_len = n if n >= 0 else - 1 - n
dropped = self.obj.dropna(how=dropna, axis=self.axis)

def picker(x):
x = x.dropna(how=dropna) # Note: how is ignored if Series
if len(x) <= max_len:
return np.nan
else:
return x.iloc[n]
# get a new grouper for our dropped obj
grouper, exclusions, obj = _get_grouper(dropped, key=self.keys, axis=self.axis,
level=self.level, sort=self.sort)

sizes = obj.groupby(grouper).size()
result = obj.groupby(grouper).nth(n)
mask = (sizes<max_len).values

# set the results which don't meet the criteria
if len(result) and mask.any():
result.loc[mask] = np.nan

return self.agg(picker)
# reset/reindex to the original groups
if len(self.obj) == len(dropped):
result.index = self.grouper.result_index
else:
result = result.reindex(self.grouper.result_index)

return result

def cumcount(self, **kwargs):
"""
Expand Down Expand Up @@ -942,21 +957,33 @@ def tail(self, n=5):
def _cumcount_array(self, arr=None, **kwargs):
"""
arr is where cumcount gets its values from

note: this is currently implementing sort=False (though the default is sort=True)
for groupby in general
"""
ascending = kwargs.pop('ascending', True)

if arr is None:
arr = np.arange(self.grouper._max_groupsize, dtype='int64')

len_index = len(self._selected_obj.index)
cumcounts = np.empty(len_index, dtype=arr.dtype)
cumcounts = np.zeros(len_index, dtype=arr.dtype)
if not len_index:
return cumcounts

indices, values = [], []
for v in self.indices.values():
indices.append(v)

if ascending:
values.append(arr[:len(v)])
else:
values.append(arr[len(v)-1::-1])

indices = np.concatenate(indices)
values = np.concatenate(values)
cumcounts[indices] = values

if ascending:
for v in self.indices.values():
cumcounts[v] = arr[:len(v)]
else:
for v in self.indices.values():
cumcounts[v] = arr[len(v)-1::-1]
return cumcounts

def _index_with_as_index(self, b):
Expand Down Expand Up @@ -1270,6 +1297,7 @@ def group_info(self):
comp_ids = com._ensure_int64(comp_ids)
return comp_ids, obs_group_ids, ngroups


def _get_compressed_labels(self):
all_labels = [ping.labels for ping in self.groupings]
if self._overflow_possible:
Expand Down Expand Up @@ -1892,7 +1920,6 @@ def groups(self):
self._groups = self.index.groupby(self.grouper)
return self._groups


def _get_grouper(obj, key=None, axis=0, level=None, sort=True):
"""
create and return a BaseGrouper, which is an internal
Expand Down Expand Up @@ -2141,7 +2168,10 @@ def _wrap_aggregated_output(self, output, names=None):
if names is not None:
return DataFrame(output, index=index, columns=names)
else:
return Series(output, index=index, name=self.name)
name = self.name
if name is None:
name = self._selected_obj.name
return Series(output, index=index, name=name)

def _wrap_applied_output(self, keys, values, not_indexed_same=False):
if len(keys) == 0:
Expand Down
22 changes: 22 additions & 0 deletions pandas/tests/test_groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -282,6 +282,28 @@ def test_nth(self):
expected = df.loc[[]]
assert_frame_equal(result,expected)

# GH 7559
# from the vbench
df = DataFrame(np.random.randint(1, 10, (100, 2)))
s = df[1]
g = df[0]
expected = s.groupby(g).first()
expected2 = s.groupby(g).apply(lambda x: x.iloc[0])
assert_series_equal(expected2,expected)

# validate first
v = s[g==1].iloc[0]
self.assertEqual(expected.iloc[0],v)
self.assertEqual(expected2.iloc[0],v)

# this is NOT the same as .first (as sorted is default!)
# as it keeps the order in the series (and not the group order)
# related GH 7287
expected = s.groupby(g,sort=False).first()
expected.index = range(1,10)
result = s.groupby(g).nth(0,dropna='all')
assert_series_equal(result,expected)

def test_grouper_index_types(self):
# related GH5375
# groupby misbehaving when using a Floatlike index
Expand Down
38 changes: 25 additions & 13 deletions vb_suite/groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -244,11 +244,14 @@ def f():
groupby_last_float32 = Benchmark('data2.groupby(labels).last()', setup,
start_date=datetime(2013, 1, 1))

groupby_nth_float64 = Benchmark('data.groupby(labels).nth(0)', setup,
start_date=datetime(2012, 5, 1))

groupby_nth_float32 = Benchmark('data2.groupby(labels).nth(0)', setup,
start_date=datetime(2013, 1, 1))
groupby_nth_float64_none = Benchmark('data.groupby(labels).nth(0)', setup,
start_date=datetime(2012, 5, 1))
groupby_nth_float32_none = Benchmark('data2.groupby(labels).nth(0)', setup,
start_date=datetime(2013, 1, 1))
groupby_nth_float64_any = Benchmark('data.groupby(labels).nth(0,dropna="all")', setup,
start_date=datetime(2012, 5, 1))
groupby_nth_float32_any = Benchmark('data2.groupby(labels).nth(0,dropna="all")', setup,
start_date=datetime(2013, 1, 1))

# with datetimes (GH7555)
setup = common_setup + """
Expand All @@ -259,8 +262,10 @@ def f():
start_date=datetime(2013, 5, 1))
groupby_last_datetimes = Benchmark('df.groupby("b").last()', setup,
start_date=datetime(2013, 5, 1))
groupby_nth_datetimes = Benchmark('df.groupby("b").nth(0)', setup,
start_date=datetime(2013, 5, 1))
groupby_nth_datetimes_none = Benchmark('df.groupby("b").nth(0)', setup,
start_date=datetime(2013, 5, 1))
groupby_nth_datetimes_any = Benchmark('df.groupby("b").nth(0,dropna="all")', setup,
start_date=datetime(2013, 5, 1))

# with object
setup = common_setup + """
Expand All @@ -271,8 +276,10 @@ def f():
start_date=datetime(2013, 5, 1))
groupby_last_object = Benchmark('df.groupby("b").last()', setup,
start_date=datetime(2013, 5, 1))
groupby_nth_object = Benchmark('df.groupby("b").nth(0)', setup,
start_date=datetime(2013, 5, 1))
groupby_nth_object_none = Benchmark('df.groupby("b").nth(0)', setup,
start_date=datetime(2013, 5, 1))
groupby_nth_object_any = Benchmark('df.groupby("b").nth(0,dropna="any")', setup,
start_date=datetime(2013, 5, 1))

#----------------------------------------------------------------------
# groupby_indices replacement, chop up Series
Expand Down Expand Up @@ -351,11 +358,16 @@ def f(g):
"""

# Not really a fair test as behaviour has changed!
groupby_frame_nth = Benchmark("df.groupby(0).nth(0)", setup,
start_date=datetime(2014, 3, 1))
groupby_frame_nth_none = Benchmark("df.groupby(0).nth(0)", setup,
start_date=datetime(2014, 3, 1))

groupby_series_nth_none = Benchmark("df[1].groupby(df[0]).nth(0)", setup,
start_date=datetime(2014, 3, 1))
groupby_frame_nth_any= Benchmark("df.groupby(0).nth(0,dropna='any')", setup,
start_date=datetime(2014, 3, 1))

groupby_series_nth = Benchmark("df[1].groupby(df[0]).nth(0)", setup,
start_date=datetime(2014, 3, 1))
groupby_series_nth_any = Benchmark("df[1].groupby(df[0]).nth(0,dropna='any')", setup,
start_date=datetime(2014, 3, 1))


#----------------------------------------------------------------------
Expand Down