Skip to content

Commit 7cd56cd

Browse files
committed
clean warnings
1 parent c61318d commit 7cd56cd

File tree

10 files changed

+77
-40
lines changed

10 files changed

+77
-40
lines changed

pandas/core/arrays/categorical.py

+20-2
Original file line numberDiff line numberDiff line change
@@ -635,7 +635,7 @@ def _set_categories(self, categories, fastpath=False):
635635

636636
self._dtype = new_dtype
637637

638-
def _codes_for_groupby(self, sort):
638+
def _codes_for_groupby(self, sort, observed):
639639
"""
640640
If sort=False, return a copy of self, coded with categories as
641641
returned by .unique(), followed by any categories not appearing in
@@ -649,6 +649,8 @@ def _codes_for_groupby(self, sort):
649649
----------
650650
sort : boolean
651651
The value of the sort parameter groupby was called with.
652+
observed : boolean
653+
Account only for the observed values
652654
653655
Returns
654656
-------
@@ -659,6 +661,22 @@ def _codes_for_groupby(self, sort):
659661
categories in the original order.
660662
"""
661663

664+
# we only care about observed values
665+
if observed:
666+
unique_codes = unique1d(self.codes)
667+
cat = self.copy()
668+
669+
take_codes = unique_codes[unique_codes != -1]
670+
if self.ordered:
671+
take_codes = np.sort(take_codes)
672+
673+
# we recode according to the uniques
674+
cat._categories = self.categories.take(take_codes)
675+
cat._codes = _recode_for_categories(self.codes,
676+
self.categories,
677+
cat._categories)
678+
return cat
679+
662680
# Already sorted according to self.categories; all is fine
663681
if sort:
664682
return self
@@ -2117,7 +2135,7 @@ def unique(self):
21172135
# exclude nan from indexer for categories
21182136
take_codes = unique_codes[unique_codes != -1]
21192137
if self.ordered:
2120-
take_codes = sorted(take_codes)
2138+
take_codes = np.sort(take_codes)
21212139
return cat.set_categories(cat.categories.take(take_codes))
21222140

21232141
def _values_for_factorize(self):

pandas/core/groupby/groupby.py

+9-9
Original file line numberDiff line numberDiff line change
@@ -2961,14 +2961,6 @@ def __init__(self, index, grouper=None, obj=None, name=None, level=None,
29612961
# a passed Categorical
29622962
elif is_categorical_dtype(self.grouper):
29632963

2964-
self.grouper = self.grouper._codes_for_groupby(self.sort)
2965-
codes = self.grouper.codes
2966-
categories = self.grouper.categories
2967-
2968-
# we make a CategoricalIndex out of the cat grouper
2969-
# preserving the categories / ordered attributes
2970-
self._labels = codes
2971-
29722964
# Use the observed values of the grouper if indicated
29732965
observed = self.observed
29742966
if observed is None:
@@ -2980,8 +2972,16 @@ def __init__(self, index, grouper=None, obj=None, name=None, level=None,
29802972
warnings.warn(msg, FutureWarning, stacklevel=5)
29812973
observed = False
29822974

2975+
grouper = self.grouper
2976+
self.grouper = self.grouper._codes_for_groupby(
2977+
self.sort, observed)
2978+
categories = self.grouper.categories
2979+
2980+
# we make a CategoricalIndex out of the cat grouper
2981+
# preserving the categories / ordered attributes
2982+
self._labels = self.grouper.codes
29832983
if observed:
2984-
codes = algorithms.unique1d(codes)
2984+
codes = algorithms.unique1d(grouper.codes)
29852985
else:
29862986
codes = np.arange(len(categories))
29872987

pandas/core/indexes/category.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -782,9 +782,9 @@ def _concat_same_dtype(self, to_concat, name):
782782
result.name = name
783783
return result
784784

785-
def _codes_for_groupby(self, sort):
785+
def _codes_for_groupby(self, sort, observed):
786786
""" Return a Categorical adjusted for groupby """
787-
return self.values._codes_for_groupby(sort)
787+
return self.values._codes_for_groupby(sort, observed)
788788

789789
@classmethod
790790
def _add_comparison_methods(cls):

pandas/tests/frame/test_sorting.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -573,7 +573,7 @@ def test_sort_index_intervalindex(self):
573573
bins=[-3, -0.5, 0, 0.5, 3])
574574
model = pd.concat([y, x1, x2], axis=1, keys=['Y', 'X1', 'X2'])
575575

576-
result = model.groupby(['X1', 'X2']).mean().unstack()
576+
result = model.groupby(['X1', 'X2'], observed=True).mean().unstack()
577577
expected = IntervalIndex.from_tuples(
578578
[(-3.0, -0.5), (-0.5, 0.0),
579579
(0.0, 0.5), (0.5, 3.0)],

pandas/tests/groupby/aggregate/test_cython.py

+17-6
Original file line numberDiff line numberDiff line change
@@ -158,35 +158,46 @@ def test__cython_agg_general(op, targop):
158158
('min', np.min),
159159
('max', np.max), ]
160160
)
161-
def test_cython_agg_empty_buckets(op, targop):
161+
def test_cython_agg_empty_buckets(op, targop, observed):
162162
df = pd.DataFrame([11, 12, 13])
163163
grps = range(0, 55, 5)
164164

165165
# calling _cython_agg_general directly, instead of via the user API
166166
# which sets different values for min_count, so do that here.
167-
result = df.groupby(pd.cut(df[0], grps))._cython_agg_general(op)
168-
expected = df.groupby(pd.cut(df[0], grps)).agg(lambda x: targop(x))
167+
g = df.groupby(pd.cut(df[0], grps), observed=observed)
168+
result = g._cython_agg_general(op)
169+
170+
g = df.groupby(pd.cut(df[0], grps), observed=observed)
171+
expected = g.agg(lambda x: targop(x))
169172
tm.assert_frame_equal(result, expected)
170173

171174

172-
def test_cython_agg_empty_buckets_nanops():
175+
def test_cython_agg_empty_buckets_nanops(observed):
173176
# GH-18869 can't call nanops on empty groups, so hardcode expected
174177
# for these
175178
df = pd.DataFrame([11, 12, 13], columns=['a'])
176179
grps = range(0, 25, 5)
177180
# add / sum
178-
result = df.groupby(pd.cut(df['a'], grps))._cython_agg_general('add')
181+
result = df.groupby(pd.cut(df['a'], grps),
182+
observed=observed)._cython_agg_general('add')
179183
intervals = pd.interval_range(0, 20, freq=5)
180184
expected = pd.DataFrame(
181185
{"a": [0, 0, 36, 0]},
182186
index=pd.CategoricalIndex(intervals, name='a', ordered=True))
187+
if observed:
188+
expected = expected[expected.a != 0]
189+
183190
tm.assert_frame_equal(result, expected)
184191

185192
# prod
186-
result = df.groupby(pd.cut(df['a'], grps))._cython_agg_general('prod')
193+
result = df.groupby(pd.cut(df['a'], grps),
194+
observed=observed)._cython_agg_general('prod')
187195
expected = pd.DataFrame(
188196
{"a": [1, 1, 1716, 1]},
189197
index=pd.CategoricalIndex(intervals, name='a', ordered=True))
198+
if observed:
199+
expected = expected[expected.a != 1]
200+
190201
tm.assert_frame_equal(result, expected)
191202

192203

pandas/tests/groupby/aggregate/test_other.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -488,12 +488,12 @@ def test_agg_structs_series(structure, expected):
488488

489489

490490
@pytest.mark.xfail(reason="GH-18869: agg func not called on empty groups.")
491-
def test_agg_category_nansum():
491+
def test_agg_category_nansum(observed):
492492
categories = ['a', 'b', 'c']
493493
df = pd.DataFrame({"A": pd.Categorical(['a', 'a', 'b'],
494494
categories=categories),
495495
'B': [1, 2, 3]})
496-
result = df.groupby("A").B.agg(np.nansum)
496+
result = df.groupby("A", observed=observed).B.agg(np.nansum)
497497
expected = pd.Series([3, 3, 0],
498498
index=pd.CategoricalIndex(['a', 'b', 'c'],
499499
categories=categories,

pandas/tests/groupby/conftest.py

+5
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,11 @@
44
from pandas.util import testing as tm
55

66

7+
@pytest.fixture(params=[True, False])
8+
def observed(request):
9+
return request.param
10+
11+
712
@pytest.fixture
813
def mframe():
914
index = MultiIndex(levels=[['foo', 'bar', 'baz', 'qux'], ['one', 'two',

pandas/tests/groupby/test_categorical.py

+1-7
Original file line numberDiff line numberDiff line change
@@ -12,11 +12,6 @@
1212
import pandas.util.testing as tm
1313

1414

15-
@pytest.fixture(params=[True, False])
16-
def observed(request):
17-
return request.param
18-
19-
2015
def cartesian_product_for_groupers(result, args, names):
2116
""" Reindex to a cartesian product for the groupers,
2217
preserving the nature (Categorical) of each grouper """
@@ -378,8 +373,7 @@ def test_observed(observed):
378373
tm.assert_frame_equal(result, expected)
379374

380375

381-
@pytest.mark.xfail(reason="failing with observed")
382-
def test_observed_failing(observed):
376+
def test_observed_codes_remap(observed):
383377
d = {'C1': [3, 3, 4, 5], 'C2': [1, 2, 3, 4], 'C3': [10, 100, 200, 34]}
384378
df = pd.DataFrame(d)
385379
values = pd.cut(df['C1'], [1, 2, 3, 6])

pandas/tests/groupby/test_function.py

+3-3
Original file line numberDiff line numberDiff line change
@@ -313,14 +313,14 @@ def test_cython_median():
313313
tm.assert_frame_equal(rs, xp)
314314

315315

316-
def test_median_empty_bins():
316+
def test_median_empty_bins(observed):
317317
df = pd.DataFrame(np.random.randint(0, 44, 500))
318318

319319
grps = range(0, 55, 5)
320320
bins = pd.cut(df[0], grps)
321321

322-
result = df.groupby(bins).median()
323-
expected = df.groupby(bins).agg(lambda x: x.median())
322+
result = df.groupby(bins, observed=observed).median()
323+
expected = df.groupby(bins, observed=observed).agg(lambda x: x.median())
324324
tm.assert_frame_equal(result, expected)
325325

326326

pandas/tests/groupby/test_grouping.py

+17-8
Original file line numberDiff line numberDiff line change
@@ -251,7 +251,7 @@ def test_groupby_levels_and_columns(self):
251251
by_columns.columns = pd.Index(by_columns.columns, dtype=np.int64)
252252
tm.assert_frame_equal(by_levels, by_columns)
253253

254-
def test_groupby_categorical_index_and_columns(self):
254+
def test_groupby_categorical_index_and_columns(self, observed):
255255
# GH18432
256256
columns = ['A', 'B', 'A', 'B']
257257
categories = ['B', 'A']
@@ -260,17 +260,26 @@ def test_groupby_categorical_index_and_columns(self):
260260
categories=categories,
261261
ordered=True)
262262
df = DataFrame(data=data, columns=cat_columns)
263-
result = df.groupby(axis=1, level=0).sum()
263+
result = df.groupby(axis=1, level=0, observed=observed).sum()
264264
expected_data = 2 * np.ones((5, 2), int)
265-
expected_columns = CategoricalIndex(categories,
266-
categories=categories,
267-
ordered=True)
265+
266+
if observed:
267+
# if we are not-observed we undergo a reindex
268+
# so need to adjust the output as our expected sets us up
269+
# to be non-observed
270+
expected_columns = CategoricalIndex(['A', 'B'],
271+
categories=categories,
272+
ordered=True)
273+
else:
274+
expected_columns = CategoricalIndex(categories,
275+
categories=categories,
276+
ordered=True)
268277
expected = DataFrame(data=expected_data, columns=expected_columns)
269278
assert_frame_equal(result, expected)
270279

271280
# test transposed version
272281
df = DataFrame(data.T, index=cat_columns)
273-
result = df.groupby(axis=0, level=0).sum()
282+
result = df.groupby(axis=0, level=0, observed=observed).sum()
274283
expected = DataFrame(data=expected_data.T, index=expected_columns)
275284
assert_frame_equal(result, expected)
276285

@@ -572,11 +581,11 @@ def test_get_group(self):
572581
pytest.raises(ValueError,
573582
lambda: g.get_group(('foo', 'bar', 'baz')))
574583

575-
def test_get_group_empty_bins(self):
584+
def test_get_group_empty_bins(self, observed):
576585

577586
d = pd.DataFrame([3, 1, 7, 6])
578587
bins = [0, 5, 10, 15]
579-
g = d.groupby(pd.cut(d[0], bins))
588+
g = d.groupby(pd.cut(d[0], bins), observed=observed)
580589

581590
# TODO: should probably allow a str of an Interval to work as well
582591
# IOW '(0, 5]'

0 commit comments

Comments
 (0)