From b66a1c8754eec71c3b8dba1dd8e1ac620e923f9d Mon Sep 17 00:00:00 2001 From: Chris Date: Sun, 15 May 2016 17:28:36 -0500 Subject: [PATCH 1/5] PERF: DataFrame transform --- asv_bench/benchmarks/groupby.py | 15 +++++++++++++++ doc/source/whatsnew/v0.18.2.txt | 4 ++-- pandas/core/groupby.py | 31 ++++++++---------------------- pandas/tests/test_groupby.py | 34 +++++++++++++++++++++++++++++++-- 4 files changed, 57 insertions(+), 27 deletions(-) diff --git a/asv_bench/benchmarks/groupby.py b/asv_bench/benchmarks/groupby.py index 7279d73eb0d97..586bd00b091fe 100644 --- a/asv_bench/benchmarks/groupby.py +++ b/asv_bench/benchmarks/groupby.py @@ -773,6 +773,21 @@ def setup(self): def time_groupby_transform_series2(self): self.df.groupby('id')['val'].transform(np.mean) + +class groupby_transform_dataframe(object): + # GH 12737 + goal_time = 0.2 + + def setup(self): + self.df = pd.DataFrame({'group': np.repeat(np.arange(1000), 10), + 'B': np.nan, + 'C': np.nan}) + self.df.ix[4::10, 'B':'C'] = 5 + + def time_groupby_transform_dataframe(self): + self.df.groupby('group').transform('first') + + class groupby_transform_cythonized(object): goal_time = 0.2 diff --git a/doc/source/whatsnew/v0.18.2.txt b/doc/source/whatsnew/v0.18.2.txt index 3ac466158276f..6dcc637a00a74 100644 --- a/doc/source/whatsnew/v0.18.2.txt +++ b/doc/source/whatsnew/v0.18.2.txt @@ -104,7 +104,7 @@ Performance Improvements - increased performance of ``DataFrame.quantile()`` as it now operates per-block (:issue:`11623`) - +- Improved performance of ``DataFrameGroupBy.transform`` (:issue:`12737`) .. 
_whatsnew_0182.bug_fixes: @@ -123,7 +123,7 @@ Bug Fixes - Regression in ``Series.quantile`` with nans (also shows up in ``.median()`` and ``.describe()``); furthermore now names the ``Series`` with the quantile (:issue:`13098`, :issue:`13146`) - +- Bug in ``SeriesGroupBy.transform`` with datetime values and missing groups (:issue:`13191`) - Bug in ``Series.str.extractall()`` with ``str`` index raises ``ValueError`` (:issue:`13156`) diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py index 7a4791189726e..f2ca0ca8f8de9 100644 --- a/pandas/core/groupby.py +++ b/pandas/core/groupby.py @@ -2776,18 +2776,9 @@ def _transform_fast(self, func): func = getattr(self, func) ids, _, ngroup = self.grouper.group_info - mask = ids != -1 - - out = func().values[ids] - if not mask.all(): - out = np.where(mask, out, np.nan) - - obs = np.zeros(ngroup, dtype='bool') - obs[ids[mask]] = True - if not obs.all(): - out = self._try_cast(out, self._selected_obj) - return Series(out, index=self.obj.index) + out = algos.take_1d(func().values, ids) + return Series(out, index=self.obj.index, name=self.obj.name) def filter(self, func, dropna=True, *args, **kwargs): # noqa """ @@ -3465,19 +3456,13 @@ def transform(self, func, *args, **kwargs): if not result.columns.equals(obj.columns): return self._transform_general(func, *args, **kwargs) - results = np.empty_like(obj.values, result.values.dtype) - for (name, group), (i, row) in zip(self, result.iterrows()): - indexer = self._get_index(name) - if len(indexer) > 0: - results[indexer] = np.tile(row.values, len( - indexer)).reshape(len(indexer), -1) - - counts = self.size().fillna(0).values - if any(counts == 0): - results = self._try_cast(results, obj[result.columns]) + # Fast transform + ids, _, ngroup = self.grouper.group_info + out = {} + for col in result: + out[col] = algos.take_nd(result[col].values, ids) - return (DataFrame(results, columns=result.columns, index=obj.index) - ._convert(datetime=True)) + return DataFrame(out, 
columns=result.columns, index=obj.index) def _define_paths(self, func, *args, **kwargs): if isinstance(func, compat.string_types): diff --git a/pandas/tests/test_groupby.py b/pandas/tests/test_groupby.py index 583b1c7aea270..accb445ea6248 100644 --- a/pandas/tests/test_groupby.py +++ b/pandas/tests/test_groupby.py @@ -1051,13 +1051,33 @@ def test_transform_fast(self): values = np.repeat(grp.mean().values, com._ensure_platform_int(grp.count().values)) - expected = pd.Series(values, index=df.index) + expected = pd.Series(values, index=df.index, name='val') result = grp.transform(np.mean) assert_series_equal(result, expected) result = grp.transform('mean') assert_series_equal(result, expected) + # GH 12737 + df = pd.DataFrame({'grouping': [0, 1, 1, 3], 'f': [1.1, 2.1, 3.1, 4.5], + 'd': pd.date_range('2014-1-1', '2014-1-4'), + 'i': [1, 2, 3, 4]}, + columns=['grouping', 'f', 'i', 'd']) + result = df.groupby('grouping').transform('first') + + dates = [pd.Timestamp('2014-1-1'), pd.Timestamp('2014-1-2'), + pd.Timestamp('2014-1-2'), pd.Timestamp('2014-1-4')] + expected = pd.DataFrame({'f': [1.1, 2.1, 2.1, 4.5], + 'd': dates, + 'i': [1, 2, 2, 4]}, + columns=['f', 'i', 'd']) + assert_frame_equal(result, expected) + + # selection + result = df.groupby('grouping')[['f', 'i']].transform('first') + expected = expected[['f', 'i']] + assert_frame_equal(result, expected) + def test_transform_broadcast(self): grouped = self.ts.groupby(lambda x: x.month) result = grouped.transform(np.mean) @@ -1191,6 +1211,16 @@ def test_transform_function_aliases(self): expected = self.df.groupby('A')['C'].transform(np.mean) assert_series_equal(result, expected) + def test_series_fast_transform_date(self): + # GH 13191 + df = pd.DataFrame({'grouping': [np.nan, 1, 1, 3], + 'd': pd.date_range('2014-1-1', '2014-1-4')}) + result = df.groupby('grouping')['d'].transform('first') + dates = [pd.NaT, pd.Timestamp('2014-1-2'), pd.Timestamp('2014-1-2'), + pd.Timestamp('2014-1-4')] + expected = pd.Series(dates, 
name='d') + assert_series_equal(result, expected) + def test_transform_length(self): # GH 9697 df = pd.DataFrame({'col1': [1, 1, 2, 2], 'col2': [1, 2, 3, np.nan]}) @@ -4406,7 +4436,7 @@ def test_groupby_datetime64_32_bit(self): df = DataFrame({"A": range(2), "B": [pd.Timestamp('2000-01-1')] * 2}) result = df.groupby("A")["B"].transform(min) - expected = Series([pd.Timestamp('2000-01-1')] * 2) + expected = Series([pd.Timestamp('2000-01-1')] * 2, name='B') assert_series_equal(result, expected) def test_groupby_categorical_unequal_len(self): From 045d0c73a8be28d52fb86611270ac79c5239e625 Mon Sep 17 00:00:00 2001 From: Chris Date: Sun, 15 May 2016 19:48:47 -0500 Subject: [PATCH 2/5] add back some casting --- pandas/core/groupby.py | 17 +++++++++++++++-- pandas/tests/test_categorical.py | 3 +-- 2 files changed, 16 insertions(+), 4 deletions(-) diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py index f2ca0ca8f8de9..aedadc5e8018f 100644 --- a/pandas/core/groupby.py +++ b/pandas/core/groupby.py @@ -2776,8 +2776,11 @@ def _transform_fast(self, func): func = getattr(self, func) ids, _, ngroup = self.grouper.group_info - + counts = self.size().fillna(0).values + cast = (counts == 0).any() out = algos.take_1d(func().values, ids) + if cast: + out = self._try_cast(out, self.obj) return Series(out, index=self.obj.index, name=self.obj.name) def filter(self, func, dropna=True, *args, **kwargs): # noqa @@ -3456,11 +3459,21 @@ def transform(self, func, *args, **kwargs): if not result.columns.equals(obj.columns): return self._transform_general(func, *args, **kwargs) - # Fast transform + # Fast transform path for aggregations + + # if there were groups with no observations (Categorical only?) + # try casting data to original dtype + counts = self.size().fillna(0).values + cast = (counts == 0).any() + + # by column (could be by block?) 
reshape aggregated data to + # size of original frame by repeating obvservations with take ids, _, ngroup = self.grouper.group_info out = {} for col in result: out[col] = algos.take_nd(result[col].values, ids) + if cast: + out[col] = self._try_cast(out[col], obj[col]) return DataFrame(out, columns=result.columns, index=obj.index) diff --git a/pandas/tests/test_categorical.py b/pandas/tests/test_categorical.py index 55df64264d6f9..f11a83fcef46e 100644 --- a/pandas/tests/test_categorical.py +++ b/pandas/tests/test_categorical.py @@ -3043,8 +3043,7 @@ def f(x): c = pd.cut(df.a, bins=[-10, 0, 10, 20, 30, 40]) result = df.a.groupby(c).transform(sum) - tm.assert_series_equal(result, df['a'], check_names=False) - self.assertTrue(result.name is None) + tm.assert_series_equal(result, df['a']) tm.assert_series_equal( df.a.groupby(c).transform(lambda xs: np.sum(xs)), df['a']) From 9d78f659f55128e55d0b062f30fb04ae68994d3e Mon Sep 17 00:00:00 2001 From: Chris Date: Sun, 15 May 2016 20:06:54 -0500 Subject: [PATCH 3/5] other categorical test name fix --- pandas/tests/test_categorical.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/pandas/tests/test_categorical.py b/pandas/tests/test_categorical.py index f11a83fcef46e..5a6667e57ce9d 100644 --- a/pandas/tests/test_categorical.py +++ b/pandas/tests/test_categorical.py @@ -3025,8 +3025,7 @@ def f(x): c = pd.cut(df.a, bins=[0, 10, 20, 30, 40]) result = df.a.groupby(c).transform(sum) - tm.assert_series_equal(result, df['a'], check_names=False) - self.assertTrue(result.name is None) + tm.assert_series_equal(result, df['a']) tm.assert_series_equal( df.a.groupby(c).transform(lambda xs: np.sum(xs)), df['a']) From d61d4e0bff498e9b78427d33525279c29d7b0287 Mon Sep 17 00:00:00 2001 From: Chris Date: Tue, 17 May 2016 18:39:55 -0500 Subject: [PATCH 4/5] handle duplicate column case --- pandas/core/groupby.py | 28 ++++++++++++++++------------ pandas/tests/test_groupby.py | 6 ++++++ 2 files changed, 22 insertions(+), 12 
deletions(-) diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py index aedadc5e8018f..41303ca195fe1 100644 --- a/pandas/core/groupby.py +++ b/pandas/core/groupby.py @@ -2776,8 +2776,7 @@ def _transform_fast(self, func): func = getattr(self, func) ids, _, ngroup = self.grouper.group_info - counts = self.size().fillna(0).values - cast = (counts == 0).any() + cast = self.size().isnull().any() out = algos.take_1d(func().values, ids) if cast: out = self._try_cast(out, self.obj) @@ -3459,23 +3458,28 @@ def transform(self, func, *args, **kwargs): if not result.columns.equals(obj.columns): return self._transform_general(func, *args, **kwargs) - # Fast transform path for aggregations + return self._transform_fast(result, obj) + def _transform_fast(self, result, obj): + """ + Fast transform path for aggregations + """ # if there were groups with no observations (Categorical only?) # try casting data to original dtype - counts = self.size().fillna(0).values - cast = (counts == 0).any() + cast = self.size().isnull().any() - # by column (could be by block?) 
reshape aggregated data to - # size of original frame by repeating obvservations with take + # for each col, reshape to size of original frame + # by take operation ids, _, ngroup = self.grouper.group_info - out = {} - for col in result: - out[col] = algos.take_nd(result[col].values, ids) + output = [] + for i, _ in enumerate(result.columns): + res = algos.take_1d(result.iloc[:, i].values, ids) if cast: - out[col] = self._try_cast(out[col], obj[col]) + res = self._try_cast(res, obj.iloc[:, i]) + output.append(res) - return DataFrame(out, columns=result.columns, index=obj.index) + return DataFrame._from_arrays(output, columns=result.columns, + index=obj.index) def _define_paths(self, func, *args, **kwargs): if isinstance(func, compat.string_types): diff --git a/pandas/tests/test_groupby.py b/pandas/tests/test_groupby.py index accb445ea6248..d15bab708b61f 100644 --- a/pandas/tests/test_groupby.py +++ b/pandas/tests/test_groupby.py @@ -1078,6 +1078,12 @@ def test_transform_fast(self): expected = expected[['f', 'i']] assert_frame_equal(result, expected) + # dup columns + df = pd.DataFrame([[1, 2, 3], [4, 5, 6]], columns=['g', 'a', 'a']) + result = df.groupby('g').transform('first') + expected = df.drop('g', axis=1) + assert_frame_equal(result, expected) + def test_transform_broadcast(self): grouped = self.ts.groupby(lambda x: x.month) result = grouped.transform(np.mean) From 0af1e5582d4cc299137e925285108eaee20bcc8f Mon Sep 17 00:00:00 2001 From: Chris Date: Wed, 18 May 2016 05:26:51 -0500 Subject: [PATCH 5/5] revert casting logic --- pandas/core/groupby.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py index 41303ca195fe1..424859da82877 100644 --- a/pandas/core/groupby.py +++ b/pandas/core/groupby.py @@ -2776,7 +2776,7 @@ def _transform_fast(self, func): func = getattr(self, func) ids, _, ngroup = self.grouper.group_info - cast = self.size().isnull().any() + cast = (self.size().fillna(0) > 
0).any() out = algos.take_1d(func().values, ids) if cast: out = self._try_cast(out, self.obj) @@ -3466,7 +3466,7 @@ def _transform_fast(self, result, obj): """ # if there were groups with no observations (Categorical only?) # try casting data to original dtype - cast = self.size().isnull().any() + cast = (self.size().fillna(0) > 0).any() # for each col, reshape to size of original frame # by take operation