@@ -425,6 +425,63 @@ def size(self):
425425 for i , s in enumerate (groupkeys )])
426426 return _col (DataFrame (internal ))
427427
def diff(self, periods=1):
    """
    Compute the first discrete difference of elements within each group.

    Every element is compared with another element of the same column that belongs to
    the same group; by default that is the element of the previous row of the group.

    Parameters
    ----------
    periods : int, default 1
        Number of periods to shift when computing the difference. Negative values
        are accepted.

    Returns
    -------
    diffed : DataFrame or Series

    See Also
    --------
    databricks.koalas.Series.groupby
    databricks.koalas.DataFrame.groupby

    Examples
    --------
    >>> df = ks.DataFrame({'a': [1, 2, 3, 4, 5, 6],
    ...                    'b': [1, 1, 2, 3, 5, 8],
    ...                    'c': [1, 4, 9, 16, 25, 36]}, columns=['a', 'b', 'c'])
    >>> df
       a  b   c
    0  1  1   1
    1  2  1   4
    2  3  2   9
    3  4  3  16
    4  5  5  25
    5  6  8  36

    >>> df.groupby(['b']).diff().sort_index()
         a    c
    0  NaN  NaN
    1  1.0  3.0
    2  NaN  NaN
    3  NaN  NaN
    4  NaN  NaN
    5  NaN  NaN

    Difference with the previous row of each group, for a single column.

    >>> df.groupby(['b'])['a'].diff().sort_index()
    0    NaN
    1    1.0
    2    NaN
    3    NaN
    4    NaN
    5    NaN
    Name: a, dtype: float64
    """
    # Public entry point only: the actual computation lives in this object's
    # ``_diff`` hook, which receives ``periods`` unchanged.
    return self._diff(periods)
484+
428485 def cummax (self ):
429486 """
430487 Cumulative max for each group.
@@ -1164,6 +1221,19 @@ def __getitem__(self, item):
11641221 return DataFrameGroupBy (self ._kdf , self ._groupkeys , as_index = self ._as_index ,
11651222 agg_columns = item )
11661223
def _diff(self, *args, **kwargs):
    """
    Apply the per-group ``_diff`` to every data column that is not a group key.

    Each remaining data column of ``self._kdf`` is grouped by ``self._groupkeys`` and
    its own ``_diff`` is called with ``*args`` / ``**kwargs`` forwarded untouched.
    The results are then selected, together with the index columns, into a new
    ``DataFrame``.

    Returns
    -------
    DataFrame
        Frame built from the index columns plus one diffed column per non-key
        data column; the group key columns themselves are left out.
    """
    frame = self._kdf
    key_names = [key.name for key in self._groupkeys]

    # Group key columns are skipped: only the other data columns get diffed.
    diffed_columns = [
        frame[name].groupby(self._groupkeys)._diff(*args, **kwargs)
        for name in frame._internal.data_columns
        if name not in key_names
    ]

    # Keep the original index columns and replace the data columns with the results.
    selected = frame._sdf.select(
        frame._internal.index_scols + [col._scol for col in diffed_columns])
    new_internal = frame._internal.copy(
        sdf=selected, data_columns=[col.name for col in diffed_columns])
    return DataFrame(new_internal)
1236+
11671237 def _rank (self , * args , ** kwargs ):
11681238 applied = []
11691239 kdf = self ._kdf
@@ -1226,6 +1296,10 @@ def __getattr__(self, item: str) -> Any:
12261296 return partial (property_or_func , self )
12271297 raise AttributeError (item )
12281298
def _diff(self, *args, **kwargs):
    """
    Delegate to ``Series._diff`` on ``self._ks``, partitioned by the group keys.

    ``*args`` and ``**kwargs`` are forwarded unchanged; the ``_scol`` of every
    group key is additionally passed as ``part_cols``.
    """
    partition_scols = [key._scol for key in self._groupkeys]
    return Series._diff(self._ks, *args, part_cols=partition_scols, **kwargs)
1302+
def _cum(self, func):
    """
    Delegate to ``Series._cum`` on ``self._ks``, partitioned by the group keys.

    ``func`` is forwarded as given, followed by a positional ``True`` flag
    (NOTE(review): its meaning is defined by ``Series._cum``, not visible here),
    and the ``_scol`` of every group key is passed as ``part_cols``.
    """
    partition_scols = [key._scol for key in self._groupkeys]
    return Series._cum(self._ks, func, True, part_cols=partition_scols)
0 commit comments