Skip to content

Commit 73e045d

Browse files
RainFung authored and HyukjinKwon committed
Add diff in Groupby (#622)
This PR adds diff to GroupBy (both SeriesGroupBy and DataFrameGroupBy) by reusing the existing diff logic in Series.
1 parent c8bc4e1 commit 73e045d

File tree

5 files changed

+93
-7
lines changed

5 files changed

+93
-7
lines changed

databricks/koalas/groupby.py

Lines changed: 74 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -425,6 +425,63 @@ def size(self):
425425
for i, s in enumerate(groupkeys)])
426426
return _col(DataFrame(internal))
427427

428+
def diff(self, periods=1):
429+
"""
430+
First discrete difference of element.
431+
432+
Calculates the difference of a DataFrame element compared with another element in the
433+
DataFrame group (default is the element in the same column of the previous row).
434+
435+
Parameters
436+
----------
437+
periods : int, default 1
438+
Periods to shift for calculating difference, accepts negative values.
439+
440+
Returns
441+
-------
442+
diffed : DataFrame or Series
443+
444+
See Also
445+
--------
446+
databricks.koalas.Series.groupby
447+
databricks.koalas.DataFrame.groupby
448+
449+
Examples
450+
--------
451+
>>> df = ks.DataFrame({'a': [1, 2, 3, 4, 5, 6],
452+
... 'b': [1, 1, 2, 3, 5, 8],
453+
... 'c': [1, 4, 9, 16, 25, 36]}, columns=['a', 'b', 'c'])
454+
>>> df
455+
a b c
456+
0 1 1 1
457+
1 2 1 4
458+
2 3 2 9
459+
3 4 3 16
460+
4 5 5 25
461+
5 6 8 36
462+
463+
>>> df.groupby(['b']).diff().sort_index()
464+
a c
465+
0 NaN NaN
466+
1 1.0 3.0
467+
2 NaN NaN
468+
3 NaN NaN
469+
4 NaN NaN
470+
5 NaN NaN
471+
472+
Difference with the previous row within each group, for a single selected column.
473+
474+
>>> df.groupby(['b'])['a'].diff().sort_index()
475+
0 NaN
476+
1 1.0
477+
2 NaN
478+
3 NaN
479+
4 NaN
480+
5 NaN
481+
Name: a, dtype: float64
482+
"""
483+
return self._diff(periods)
484+
428485
def cummax(self):
429486
"""
430487
Cumulative max for each group.
@@ -1164,6 +1221,19 @@ def __getitem__(self, item):
11641221
return DataFrameGroupBy(self._kdf, self._groupkeys, as_index=self._as_index,
11651222
agg_columns=item)
11661223

1224+
def _diff(self, *args, **kwargs):
1225+
applied = []
1226+
kdf = self._kdf
1227+
groupkey_columns = [s.name for s in self._groupkeys]
1228+
1229+
for column in kdf._internal.data_columns:
1230+
if column not in groupkey_columns:
1231+
applied.append(kdf[column].groupby(self._groupkeys)._diff(*args, **kwargs))
1232+
1233+
sdf = kdf._sdf.select(kdf._internal.index_scols + [c._scol for c in applied])
1234+
internal = kdf._internal.copy(sdf=sdf, data_columns=[c.name for c in applied])
1235+
return DataFrame(internal)
1236+
11671237
def _rank(self, *args, **kwargs):
11681238
applied = []
11691239
kdf = self._kdf
@@ -1226,6 +1296,10 @@ def __getattr__(self, item: str) -> Any:
12261296
return partial(property_or_func, self)
12271297
raise AttributeError(item)
12281298

1299+
def _diff(self, *args, **kwargs):
1300+
groupkey_scols = [s._scol for s in self._groupkeys]
1301+
return Series._diff(self._ks, *args, **kwargs, part_cols=groupkey_scols)
1302+
12291303
def _cum(self, func):
12301304
groupkey_scols = [s._scol for s in self._groupkeys]
12311305
return Series._cum(self._ks, func, True, part_cols=groupkey_scols)

databricks/koalas/missing/groupby.py

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,6 @@ class _MissingPandasLikeDataFrameGroupBy(object):
3333
corr = unsupported_property('corr')
3434
corrwith = unsupported_property('corrwith')
3535
cov = unsupported_property('cov')
36-
diff = unsupported_property('diff')
3736
dtypes = unsupported_property('dtypes')
3837
fillna = unsupported_property('fillna')
3938
groups = unsupported_property('groups')
@@ -82,7 +81,6 @@ class _MissingPandasLikeSeriesGroupBy(object):
8281
# Properties
8382
corr = unsupported_property('corr')
8483
cov = unsupported_property('cov')
85-
diff = unsupported_property('diff')
8684
dtype = unsupported_property('dtype')
8785
fillna = unsupported_property('fillna')
8886
groups = unsupported_property('groups')

databricks/koalas/series.py

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -2638,16 +2638,17 @@ def diff(self, periods=1):
26382638
5 NaN
26392639
Name: c, dtype: float64
26402640
"""
2641+
return self._diff(periods)
26412642

2643+
def _diff(self, periods, part_cols=()):
26422644
if len(self._internal.index_columns) == 0:
26432645
raise ValueError("Index must be set.")
2644-
26452646
if not isinstance(periods, int):
26462647
raise ValueError('periods should be an int; however, got [%s]' % type(periods))
2647-
2648-
col = self._scol
2649-
window = Window.orderBy(self._internal.index_scols).rowsBetween(-periods, -periods)
2650-
return self._with_new_scol(col - F.lag(col, periods).over(window)).alias(self.name)
2648+
window = Window.partitionBy(*part_cols).orderBy(self._internal.index_scols)\
2649+
.rowsBetween(-periods, -periods)
2650+
scol = self._scol - F.lag(self._scol, periods).over(window)
2651+
return Series(self._kdf._internal.copy(scol=scol), anchor=self._kdf).rename(self.name)
26512652

26522653
def idxmax(self, skipna=True):
26532654
"""

databricks/koalas/tests/test_groupby.py

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -170,6 +170,18 @@ def test_size(self):
170170
self.assert_eq(kdf.groupby(['A', 'B']).size().sort_index(),
171171
pdf.groupby(['A', 'B']).size().sort_index())
172172

173+
def test_diff(self):
174+
pdf = pd.DataFrame({'a': [1, 2, 3, 4, 5, 6],
175+
'b': [1, 1, 2, 3, 5, 8],
176+
'c': [1, 4, 9, 16, 25, 36]}, columns=['a', 'b', 'c'])
177+
kdf = koalas.DataFrame(pdf)
178+
self.assert_eq(kdf.groupby("b").diff().sort_index(),
179+
pdf.groupby("b").diff().sort_index())
180+
self.assert_eq(kdf.groupby(['a', 'b']).diff().sort_index(),
181+
pdf.groupby(['a', 'b']).diff().sort_index())
182+
self.assert_eq(repr(kdf.groupby(['b'])['a'].diff().sort_index()),
183+
repr(pdf.groupby(['b'])['a'].diff().sort_index()))
184+
173185
def test_missing(self):
174186
kdf = koalas.DataFrame({'a': [1, 2, 3, 4, 5, 6, 7, 8, 9]})
175187

docs/source/reference/groupby.rst

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -43,3 +43,4 @@ Computations / Descriptive Stats
4343
GroupBy.sum
4444
GroupBy.var
4545
GroupBy.size
46+
GroupBy.diff

0 commit comments

Comments
 (0)