From a9a0a19b98ec0dd0bc36f6c67a42002d0435a928 Mon Sep 17 00:00:00 2001
From: smithto1
Date: Sun, 9 Aug 2020 00:33:56 +0100
Subject: [PATCH 01/13] first pass at a post function

---
 pandas/core/groupby/groupby.py | 16 +++++++++++++---
 1 file changed, 13 insertions(+), 3 deletions(-)

diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py
index 4597afeeaddbf..862acbaa6408a 100644
--- a/pandas/core/groupby/groupby.py
+++ b/pandas/core/groupby/groupby.py
@@ -733,7 +733,7 @@ def pipe(self, func, *args, **kwargs):

     plot = property(GroupByPlot)

-    def _make_wrapper(self, name):
+    def _make_wrapper(self, name):
         assert name in self._apply_allowlist

         with _group_selection_context(self):
@@ -1789,7 +1789,14 @@ def _fill(self, direction, limit=None):
         if limit is None:
             limit = -1

-        return self._get_cythonized_result(
+        def _post(values, inferences):
+            if not self.dropna:
+                return values
+            mask = DataFrame(self.grouper.codes).eq(-1).any()
+            values[mask] = np.nan
+            return values
+
+        res = self._get_cythonized_result(
             "group_fillna_indexer",
             numeric_only=False,
             needs_mask=True,
@@ -1797,7 +1804,10 @@ def _fill(self, direction, limit=None):
             result_is_index=True,
             direction=direction,
             limit=limit,
+            post_processing=_post,
         )
+
+        return res

     @Substitution(name="groupby")
     def pad(self, limit=None):
         """
@@ -2532,7 +2542,7 @@ def _get_cythonized_result(
         # error_msg is "" on an frame/series with no rows or columns
         if len(output) == 0 and error_msg != "":
             raise TypeError(error_msg)
-
+
         if aggregate:
             return self._wrap_aggregated_output(output)
         else:

From e6f30220ad789270ec45f3da0b74de62ccbe1a6c Mon Sep 17 00:00:00 2001
From: smithto1
Date: Sun, 23 Aug 2020 22:03:59 +0100
Subject: [PATCH 02/13] mask.any()

---
 pandas/core/groupby/groupby.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py
index 862acbaa6408a..573e3dab98f0c 100644
--- a/pandas/core/groupby/groupby.py
+++ b/pandas/core/groupby/groupby.py
@@ -1793,7 +1793,8 @@ def _post(values, inferences):
             if not self.dropna:
                 return values
             mask = DataFrame(self.grouper.codes).eq(-1).any()
-            values[mask] = np.nan
+            if mask.any():
+                values[mask] = np.nan
             return values

         res = self._get_cythonized_result(
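The behaviour this series targets (GH 34725) is easiest to see on a small frame. The snippet below is an illustration written for this write-up, not part of the patches: with dropna=True (the default) rows whose group key is NaN belong to no group, so a groupby ffill should hand back NaN for them, while dropna=False treats the NaN keys as a group of their own and fills within it.

    import numpy as np
    import pandas as pd

    df = pd.DataFrame(
        {
            "key": ["a", "a", "a", np.nan, np.nan, np.nan],
            "val": [1.0, np.nan, np.nan, 2.0, np.nan, np.nan],
        }
    )

    # dropna=True: the NaN-keyed rows are not assigned to any group, so the
    # transform should return NaN for that whole block instead of filling it.
    print(df.groupby("key").ffill())

    # dropna=False: the NaN keys form their own group, so 2.0 propagates
    # forward within rows 3-5.
    print(df.groupby("key", dropna=False).ffill())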
From 61c5915334e2140c4a8f1d6d588dae17e97d1b65 Mon Sep 17 00:00:00 2001
From: smithto1
Date: Mon, 24 Aug 2020 01:12:43 +0100
Subject: [PATCH 03/13] wrote test

---
 pandas/core/groupby/groupby.py                |  6 ++-
 .../tests/groupby/transform/test_transform.py | 45 +++++++++++++++++++
 2 files changed, 50 insertions(+), 1 deletion(-)

diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py
index 1ee70b2c8560f..3aa2ab9cb111b 100644
--- a/pandas/core/groupby/groupby.py
+++ b/pandas/core/groupby/groupby.py
@@ -1818,7 +1818,11 @@ def _post(values, inferences):
                 return values
             mask = DataFrame(self.grouper.codes).eq(-1).any()
             if mask.any():
-                values[mask] = np.nan
+                try:
+                    values[mask] = np.nan
+                except ValueError:
+                    values[mask] = np.datetime64('NaT')
+
             return values

         res = self._get_cythonized_result(
diff --git a/pandas/tests/groupby/transform/test_transform.py b/pandas/tests/groupby/transform/test_transform.py
index c09f35526a6bf..6af9622288fd4 100644
--- a/pandas/tests/groupby/transform/test_transform.py
+++ b/pandas/tests/groupby/transform/test_transform.py
@@ -976,6 +976,51 @@ def test_ffill_bfill_non_unique_multilevel(func, expected_status):
     tm.assert_series_equal(result, expected)


+@pytest.mark.parametrize('dropna', [True, False])
+@pytest.mark.parametrize('limit', [None, 1])
+@pytest.mark.parametrize('method', ['ffill', 'bfill', 'pad'])
+@pytest.mark.parametrize('by', ['grp1', ['grp1'], ['grp1', 'grp2']])
+@pytest.mark.parametrize('has_nan', [[], ['grp1'], ['grp1', 'grp2']])
+def test_pad_handles_nan_groups(dropna, limit, method, by, has_nan):
+
+    rows = pd.DataFrame({
+        'int' : pd.array([1,2], dtype='Int64'),
+        'float' : [.1, .2],
+        'bool' : pd.array([True, False], dtype='bool'),
+        'date' : [pd.Timestamp(2010,1,1), pd.Timestamp(2020,2,2)],
+        'period' : pd.array([pd.Period('2010-01'), pd.Period('2020-2')], dtype='period[M]'),
+        'obj' : ['hello', 'world'],
+        'cat' : pd.Categorical(['a', 'b'], categories=['a', 'b', 'c']),
+        })
+
+    ridx = pd.Series([None]*10)
+    ridx[2] = 0
+    ridx[7] = 1
+    df = rows.reindex(ridx).reset_index(drop=True)
+
+    grps = pd.Series(['good']*5 + ['bad']*5)
+    if type(by) is list:
+        grps = pd.concat([grps]*len(by), axis=1)
+    df[by] = grps
+
+    by = [by] if type(by) is not list else by
+    has_nan = list(set(has_nan).intersection(set(by)))
+    df[has_nan] = df[has_nan].replace('bad', np.nan)
+
+    grouped = df.groupby(by=by, dropna=dropna)
+    result = getattr(grouped, method)(limit=limit)
+
+    if dropna and (len(has_nan) > 0):
+        ridx[7] = None
+    lim = 2 if limit is None else limit
+
+    ridx = getattr(ridx, method)(limit=lim)
+    expected = rows.reindex(ridx).reset_index(drop=True)
+
+    tm.assert_frame_equal(result, expected)
+
+
+
 @pytest.mark.parametrize("func", [np.any, np.all])
 def test_any_all_np_func(func):
     # GH 20653

From edd10e540006762c9ea852b4820049725eacdb16 Mon Sep 17 00:00:00 2001
From: smithto1
Date: Thu, 1 Oct 2020 22:25:16 +0100
Subject: [PATCH 04/13] clean up func. comments on test

---
 pandas/core/groupby/groupby.py                | 23 +++---
 .../tests/groupby/transform/test_transform.py | 77 ++++++++++++-------
 2 files changed, 62 insertions(+), 38 deletions(-)

diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py
index 6cd7785b66d7f..3edc66792be7b 100644
--- a/pandas/core/groupby/groupby.py
+++ b/pandas/core/groupby/groupby.py
@@ -1847,18 +1847,17 @@ def _fill(self, direction, limit=None):
         if limit is None:
             limit = -1

-        def _post(values, inferences):
+        def _nan_group_gets_nan_values(values, *args):
             if not self.dropna:
                 return values
-            mask = DataFrame(self.grouper.codes).eq(-1).any()
-            if mask.any():
-                try:
-                    values[mask] = np.nan
-                except ValueError:
-                    values[mask] = np.datetime64('NaT')
-
+            in_nan_group = DataFrame(self.grouper.codes).eq(-1).any()
+            if in_nan_group.any():
+                filler = {np.datetime64: np.datetime64("NaT")}.get(
+                    values.dtype.type, np.nan
+                )
+                values[in_nan_group] = filler
             return values
-
+
         res = self._get_cythonized_result(
             "group_fillna_indexer",
             numeric_only=False,
@@ -1867,9 +1866,9 @@ def _post(values, inferences):
             result_is_index=True,
             direction=direction,
             limit=limit,
-            post_processing=_post,
+            post_processing=_nan_group_gets_nan_values,
         )
-
+
         return res

     @Substitution(name="groupby")
@@ -2605,7 +2604,7 @@ def _get_cythonized_result(
         # error_msg is "" on an frame/series with no rows or columns
         if len(output) == 0 and error_msg != "":
             raise TypeError(error_msg)
-
+
         if aggregate:
             return self._wrap_aggregated_output(output, index=self.grouper.result_index)
         else:
diff --git a/pandas/tests/groupby/transform/test_transform.py b/pandas/tests/groupby/transform/test_transform.py
index c8314f5d6d14d..77e9f9e1efe3f 100644
--- a/pandas/tests/groupby/transform/test_transform.py
+++ b/pandas/tests/groupby/transform/test_transform.py
@@ -977,51 +977,76 @@ def test_ffill_bfill_non_unique_multilevel(func, expected_status):
     tm.assert_series_equal(result, expected)


-@pytest.mark.parametrize('dropna', [True, False])
-@pytest.mark.parametrize('limit', [None, 1])
-@pytest.mark.parametrize('method', ['ffill', 'bfill', 'pad'])
-@pytest.mark.parametrize('by', ['grp1', ['grp1'], ['grp1', 'grp2']])
-@pytest.mark.parametrize('has_nan', [[], ['grp1'], ['grp1', 'grp2']])
+@pytest.mark.parametrize("dropna", [True, False])
+@pytest.mark.parametrize("limit", [None, 1])
+@pytest.mark.parametrize("method", ["ffill", "bfill", "pad", "backfill"])
+@pytest.mark.parametrize("by", ["grp1", ["grp1"], ["grp1", "grp2"]])
+@pytest.mark.parametrize("has_nan", [[], ["grp1"], ["grp1", "grp2"]])
 def test_pad_handles_nan_groups(dropna, limit, method, by, has_nan):
-
-    rows = pd.DataFrame({
-        'int' : pd.array([1,2], dtype='Int64'),
-        'float' : [.1, .2],
-        'bool' : pd.array([True, False], dtype='bool'),
-        'date' : [pd.Timestamp(2010,1,1), pd.Timestamp(2020,2,2)],
-        'period' : pd.array([pd.Period('2010-01'), pd.Period('2020-2')], dtype='period[M]'),
-        'obj' : ['hello', 'world'],
-        'cat' : pd.Categorical(['a', 'b'], categories=['a', 'b', 'c']),
-        })
-
-    ridx = pd.Series([None]*10)
+    # GH 34725
+
+    # Create two rows with many different dytypes. The first row will be in
+    # the 'good' group which never has a nan in the grouping column(s). The
+    # second row will be in the 'bad' grouping which sometimes has a nan in
+    # the group column(s).
+    rows = pd.DataFrame(
+        {
+            "int": pd.array([1, 2], dtype="Int64"),
+            "float": [0.1, 0.2],
+            "bool": pd.array([True, False], dtype="bool"),
+            "date": [pd.Timestamp(2010, 1, 1), pd.Timestamp(2020, 2, 2)],
+            "period": pd.array(
+                [pd.Period("2010-01"), pd.Period("2020-2")], dtype="period[M]"
+            ),
+            "obj": ["hello", "world"],
+            "cat": pd.Categorical(["a", "b"], categories=["a", "b", "c"]),
+        }
+    )
+
+    # Put those rows into a 10-row dataframe at rows 2 and 7. This will
+    # allows us to ffill and bfill the rows and confirm that our method is
+    # behaving as expected
+    ridx = pd.Series([None] * 10)
     ridx[2] = 0
     ridx[7] = 1
     df = rows.reindex(ridx).reset_index(drop=True)
-
-    grps = pd.Series(['good']*5 + ['bad']*5)
+
+    # Add the grouping column(s).
+    grps = pd.Series(["good"] * 5 + ["bad"] * 5)
     if type(by) is list:
-        grps = pd.concat([grps]*len(by), axis=1)
+        grps = pd.concat([grps] * len(by), axis=1)
     df[by] = grps
-
+
+    # Our 'has_nan' arg sometimes lists more columns than we are actually
+    # grouping by (our 'by' arg), i.e. has_nan=['grp1', 'grp2'] when
+    # by=['grp1']. We can just reduce 'has_nan' to its intersection with 'by'.
     by = [by] if type(by) is not list else by
     has_nan = list(set(has_nan).intersection(set(by)))
-    df[has_nan] = df[has_nan].replace('bad', np.nan)
-
+
+    # For the colunms that are in 'has_nan' replace 'bad' with 'nan'
+    df[has_nan] = df[has_nan].replace("bad", np.nan)
+
     grouped = df.groupby(by=by, dropna=dropna)
     result = getattr(grouped, method)(limit=limit)
-
+
+    # If dropna=True and 'bad' has been replaced by 'nan', then the second
+    # 5 rows will all be nan, which is what we want: the nan group contains
+    # only nan values
     if dropna and (len(has_nan) > 0):
         ridx[7] = None
+
+    # To get our expected/benchmark output, we ffill/bfill the rows directly
+    # (not via a groupby), so we don't want limit=None for this part. With 5
+    # rows per group and the value rows in positions 2&7, we ffill/bfill
+    # with limit=2. If we use limit=None rows 2&7 will ffill/bfill into the
+    # other group
     lim = 2 if limit is None else limit
-
     ridx = getattr(ridx, method)(limit=lim)
     expected = rows.reindex(ridx).reset_index(drop=True)

     tm.assert_frame_equal(result, expected)


-
 @pytest.mark.parametrize("func", [np.any, np.all])
 def test_any_all_np_func(func):
     # GH 20653
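The post-processing helper in the patch above leans on the convention that missing group keys are encoded with the sentinel -1 in the grouper's codes, the same sentinel the later cython change tests against. A small factorize call makes the convention visible; it is an illustration written for this note, not taken from the patches:

    import numpy as np
    import pandas as pd

    codes, uniques = pd.factorize(np.array(["good", np.nan, "bad"], dtype=object))
    # codes is array([ 0, -1,  1]): the NaN key gets code -1, which is what
    # DataFrame(self.grouper.codes).eq(-1) picks up in the helper above.
    print(codes, uniques)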
From 8447d264b40e9d383803fdeb71dedef97dd4fb81 Mon Sep 17 00:00:00 2001
From: smithto1
Date: Thu, 1 Oct 2020 22:31:57 +0100
Subject: [PATCH 05/13] whatsnew

---
 doc/source/whatsnew/v1.1.3.rst | 1 +
 1 file changed, 1 insertion(+)

diff --git a/doc/source/whatsnew/v1.1.3.rst b/doc/source/whatsnew/v1.1.3.rst
index 15777abcb8084..02c6534328d86 100644
--- a/doc/source/whatsnew/v1.1.3.rst
+++ b/doc/source/whatsnew/v1.1.3.rst
@@ -58,6 +58,7 @@ Bug fixes
 - Bug in :meth:`Series.astype` showing too much precision when casting from ``np.float32`` to string dtype (:issue:`36451`)
 - Bug in :meth:`Series.isin` and :meth:`DataFrame.isin` when using ``NaN`` and a row length above 1,000,000 (:issue:`22205`)
 - Bug in :func:`cut` raising a ``ValueError`` when passed a :class:`Series` of labels with ``ordered=False`` (:issue:`36603`)
+- Bug in :meth:`DataFrameGroupBy.ffill` where a ``NaN`` group would return foward-filled values instead of ``NaN`` when ``dropna=True`` (:issue:`34725`)

 .. ---------------------------------------------------------------------------


From 09a64da645402b26202f2afec1514d58eb26a38b Mon Sep 17 00:00:00 2001
From: smithto1
Date: Tue, 6 Oct 2020 06:57:57 +0100
Subject: [PATCH 06/13] implemented cython fix in group_fillna_indexer rather than post-processing

---
 pandas/_libs/groupby.pyx       |  5 ++++-
 pandas/core/groupby/groupby.py | 13 +------------
 2 files changed, 5 insertions(+), 13 deletions(-)

diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx
index a83634aad3ce2..09d876c55c594 100644
--- a/pandas/_libs/groupby.pyx
+++ b/pandas/_libs/groupby.pyx
@@ -344,7 +344,7 @@ def group_shift_indexer(int64_t[:] out, const int64_t[:] labels,
 @cython.boundscheck(False)
 def group_fillna_indexer(ndarray[int64_t] out, ndarray[int64_t] labels,
                          ndarray[uint8_t] mask, object direction,
-                         int64_t limit):
+                         int64_t limit, bint dropna):
     """
     Indexes how to fill values forwards or backwards within a group.

@@ -389,6 +389,9 @@ def group_fillna_indexer(ndarray[int64_t] out, ndarray[int64_t] labels,
             else:  # reset items when not missing
                 filled_vals = 0
                 curr_fill_idx = idx
+
+            if dropna and labels[idx] == -1:
+                curr_fill_idx = -1

             out[idx] = curr_fill_idx

diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py
index 3edc66792be7b..5f1824d552ef8 100644
--- a/pandas/core/groupby/groupby.py
+++ b/pandas/core/groupby/groupby.py
@@ -1847,17 +1847,6 @@ def _fill(self, direction, limit=None):
         if limit is None:
             limit = -1

-        def _nan_group_gets_nan_values(values, *args):
-            if not self.dropna:
-                return values
-            in_nan_group = DataFrame(self.grouper.codes).eq(-1).any()
-            if in_nan_group.any():
-                filler = {np.datetime64: np.datetime64("NaT")}.get(
-                    values.dtype.type, np.nan
-                )
-                values[in_nan_group] = filler
-            return values
-
         res = self._get_cythonized_result(
             "group_fillna_indexer",
             numeric_only=False,
@@ -1866,7 +1855,7 @@ def _nan_group_gets_nan_values(values, *args):
             result_is_index=True,
             direction=direction,
             limit=limit,
-            post_processing=_nan_group_gets_nan_values,
+            dropna=self.dropna
         )

         return res
From 694e63cacdcc6b2ec64fe97f91566c7ee2d922b2 Mon Sep 17 00:00:00 2001
From: smithto1
Date: Tue, 6 Oct 2020 07:56:50 +0100
Subject: [PATCH 07/13] simplified test

---
 pandas/core/groupby/groupby.py                |  2 +-
 .../tests/groupby/transform/test_transform.py | 88 ++++++-------------
 2 files changed, 27 insertions(+), 63 deletions(-)

diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py
index 5f1824d552ef8..75c3763461312 100644
--- a/pandas/core/groupby/groupby.py
+++ b/pandas/core/groupby/groupby.py
@@ -1855,7 +1855,7 @@ def _fill(self, direction, limit=None):
             result_is_index=True,
             direction=direction,
             limit=limit,
-            dropna=self.dropna
+            dropna=self.dropna,
         )

         return res
diff --git a/pandas/tests/groupby/transform/test_transform.py b/pandas/tests/groupby/transform/test_transform.py
index 77e9f9e1efe3f..45149da3d72c8 100644
--- a/pandas/tests/groupby/transform/test_transform.py
+++ b/pandas/tests/groupby/transform/test_transform.py
@@ -977,76 +977,40 @@ def test_ffill_bfill_non_unique_multilevel(func, expected_status):
     tm.assert_series_equal(result, expected)


+@pytest.mark.parametrize("method", ["ffill", "bfill"])
 @pytest.mark.parametrize("dropna", [True, False])
-@pytest.mark.parametrize("limit", [None, 1])
-@pytest.mark.parametrize("method", ["ffill", "bfill", "pad", "backfill"])
-@pytest.mark.parametrize("by", ["grp1", ["grp1"], ["grp1", "grp2"]])
-@pytest.mark.parametrize("has_nan", [[], ["grp1"], ["grp1", "grp2"]])
-def test_pad_handles_nan_groups(dropna, limit, method, by, has_nan):
+@pytest.mark.parametrize("has_nan_group", [True, False])
+def test_ffill_handles_nan_groups(dropna, method, has_nan_group):
     # GH 34725

-    # Create two rows with many different dytypes. The first row will be in
-    # the 'good' group which never has a nan in the grouping column(s). The
-    # second row will be in the 'bad' grouping which sometimes has a nan in
-    # the group column(s).
-    rows = pd.DataFrame(
-        {
-            "int": pd.array([1, 2], dtype="Int64"),
-            "float": [0.1, 0.2],
-            "bool": pd.array([True, False], dtype="bool"),
-            "date": [pd.Timestamp(2010, 1, 1), pd.Timestamp(2020, 2, 2)],
-            "period": pd.array(
-                [pd.Period("2010-01"), pd.Period("2020-2")], dtype="period[M]"
-            ),
-            "obj": ["hello", "world"],
-            "cat": pd.Categorical(["a", "b"], categories=["a", "b", "c"]),
-        }
-    )
+    df_without_nan_rows = pd.DataFrame([(1, 0.1), (2, 0.2)])
+
+    ridx = [-1, 0, -1, -1, 1, -1]
+    df = df_without_nan_rows.reindex(ridx).reset_index(drop=True)
+
+    group_b = np.nan if has_nan_group else "b"
+    df["group_col"] = pd.Series(["a"] * 3 + [group_b] * 3)
+
+    grouped = df.groupby(by="group_col", dropna=dropna)
+    result = getattr(grouped, method)(limit=None)
+
+    expected_rows = {
+        ("ffill", True, True): [-1, 0, 0, -1, -1, -1],
+        ("ffill", True, False): [-1, 0, 0, -1, 1, 1],
+        ("ffill", False, True): [-1, 0, 0, -1, 1, 1],
+        ("ffill", False, False): [-1, 0, 0, -1, 1, 1],
+        ("bfill", True, True): [0, 0, -1, -1, -1, -1],
+        ("bfill", True, False): [0, 0, -1, 1, 1, -1],
+        ("bfill", False, True): [0, 0, -1, 1, 1, -1],
+        ("bfill", False, False): [0, 0, -1, 1, 1, -1],
+    }

-    # Put those rows into a 10-row dataframe at rows 2 and 7. This will
-    # allows us to ffill and bfill the rows and confirm that our method is
-    # behaving as expected
-    ridx = pd.Series([None] * 10)
-    ridx[2] = 0
-    ridx[7] = 1
-    df = rows.reindex(ridx).reset_index(drop=True)
-
-    # Add the grouping column(s).
-    grps = pd.Series(["good"] * 5 + ["bad"] * 5)
-    if type(by) is list:
-        grps = pd.concat([grps] * len(by), axis=1)
-    df[by] = grps
-
-    # Our 'has_nan' arg sometimes lists more columns than we are actually
-    # grouping by (our 'by' arg), i.e. has_nan=['grp1', 'grp2'] when
-    # by=['grp1']. We can just reduce 'has_nan' to its intersection with 'by'.
-    by = [by] if type(by) is not list else by
-    has_nan = list(set(has_nan).intersection(set(by)))
-
-    # For the colunms that are in 'has_nan' replace 'bad' with 'nan'
-    df[has_nan] = df[has_nan].replace("bad", np.nan)
-
-    grouped = df.groupby(by=by, dropna=dropna)
-    result = getattr(grouped, method)(limit=limit)
-
-    # If dropna=True and 'bad' has been replaced by 'nan', then the second
-    # 5 rows will all be nan, which is what we want: the nan group contains
-    # only nan values
-    if dropna and (len(has_nan) > 0):
-        ridx[7] = None
-
-    # To get our expected/benchmark output, we ffill/bfill the rows directly
-    # (not via a groupby), so we don't want limit=None for this part. With 5
-    # rows per group and the value rows in positions 2&7, we ffill/bfill
-    # with limit=2. If we use limit=None rows 2&7 will ffill/bfill into the
-    # other group
-    lim = 2 if limit is None else limit
-    ridx = getattr(ridx, method)(limit=lim)
-    expected = rows.reindex(ridx).reset_index(drop=True)
+    ridx = expected_rows.get((method, dropna, has_nan_group))
+    expected = df_without_nan_rows.reindex(ridx).reset_index(drop=True)

     tm.assert_frame_equal(result, expected)


 @pytest.mark.parametrize("func", [np.any, np.all])
 def test_any_all_np_func(func):
     # GH 20653

From bb886099edb3c5e850c1474fbcb642d881b9fd36 Mon Sep 17 00:00:00 2001
From: smithto1
Date: Tue, 6 Oct 2020 08:01:11 +0100
Subject: [PATCH 08/13] whatsnew and docstring

---
 doc/source/whatsnew/v1.1.3.rst | 1 -
 doc/source/whatsnew/v1.2.0.rst | 1 +
 pandas/_libs/groupby.pyx       | 1 +
 3 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/doc/source/whatsnew/v1.1.3.rst b/doc/source/whatsnew/v1.1.3.rst
index 02c6534328d86..15777abcb8084 100644
--- a/doc/source/whatsnew/v1.1.3.rst
+++ b/doc/source/whatsnew/v1.1.3.rst
@@ -58,7 +58,6 @@ Bug fixes
 - Bug in :meth:`Series.astype` showing too much precision when casting from ``np.float32`` to string dtype (:issue:`36451`)
 - Bug in :meth:`Series.isin` and :meth:`DataFrame.isin` when using ``NaN`` and a row length above 1,000,000 (:issue:`22205`)
 - Bug in :func:`cut` raising a ``ValueError`` when passed a :class:`Series` of labels with ``ordered=False`` (:issue:`36603`)
-- Bug in :meth:`DataFrameGroupBy.ffill` where a ``NaN`` group would return foward-filled values instead of ``NaN`` when ``dropna=True`` (:issue:`34725`)

 .. ---------------------------------------------------------------------------

diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst
index 016e8d90e7d21..6d338e6717035 100644
--- a/doc/source/whatsnew/v1.2.0.rst
+++ b/doc/source/whatsnew/v1.2.0.rst
@@ -404,6 +404,7 @@ Groupby/resample/rolling
 - Bug in :meth:`DataFrame.groupby` does not always maintain column index name for ``any``, ``all``, ``bfill``, ``ffill``, ``shift`` (:issue:`29764`)
 - Bug in :meth:`DataFrameGroupBy.apply` raising error with ``np.nan`` group(s) when ``dropna=False`` (:issue:`35889`)
 - Bug in :meth:`Rolling.sum()` returned wrong values when dtypes where mixed between float and integer and axis was equal to one (:issue:`20649`, :issue:`35596`)
+- Bug in :meth:`DataFrameGroupBy.ffill` where a ``NaN`` group would return foward-filled values instead of ``NaN`` when ``dropna=True`` (:issue:`34725`)

 Reshaping
 ^^^^^^^^^
diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx
index 09d876c55c594..17a557818ee00 100644
--- a/pandas/_libs/groupby.pyx
+++ b/pandas/_libs/groupby.pyx
@@ -358,6 +358,7 @@ def group_fillna_indexer(ndarray[int64_t] out, ndarray[int64_t] labels,
     direction : {'ffill', 'bfill'}
         Direction for fill to be applied (forwards or backwards, respectively)
     limit : Consecutive values to fill before stopping, or -1 for no limit
+    dropna : Flag to indicate if NaN groups should return all NaN values

     Notes
     -----
From 11c4c9a12ed2a5a08c551a33e5adb17b2f73e846 Mon Sep 17 00:00:00 2001
From: smithto1
Date: Tue, 6 Oct 2020 08:03:36 +0100
Subject: [PATCH 09/13] cleaning up

---
 pandas/core/groupby/groupby.py | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py
index 75c3763461312..b8e7af18e0f2b 100644
--- a/pandas/core/groupby/groupby.py
+++ b/pandas/core/groupby/groupby.py
@@ -1847,7 +1847,7 @@ def _fill(self, direction, limit=None):
         if limit is None:
             limit = -1

-        res = self._get_cythonized_result(
+        return self._get_cythonized_result(
             "group_fillna_indexer",
             numeric_only=False,
             needs_mask=True,
@@ -1858,8 +1858,6 @@ def _fill(self, direction, limit=None):
             dropna=self.dropna,
         )
-
-        return res

     @Substitution(name="groupby")
     def pad(self, limit=None):
         """

From 6f4fa32aedbfc3e9942b4ad326ff1f61bceed3f3 Mon Sep 17 00:00:00 2001
From: smithto1
Date: Tue, 6 Oct 2020 09:29:04 +0100
Subject: [PATCH 10/13] whitespace

---
 pandas/_libs/groupby.pyx | 1 -
 1 file changed, 1 deletion(-)

diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx
index 17a557818ee00..386267422fd0b 100644
--- a/pandas/_libs/groupby.pyx
+++ b/pandas/_libs/groupby.pyx
@@ -390,7 +390,6 @@ def group_fillna_indexer(ndarray[int64_t] out, ndarray[int64_t] labels,
             else:  # reset items when not missing
                 filled_vals = 0
                 curr_fill_idx = idx
-
             if dropna and labels[idx] == -1:
                 curr_fill_idx = -1
             out[idx] = curr_fill_idx

From 3fcce3bb5a16827d1fd184d12b3c077c861ed6ef Mon Sep 17 00:00:00 2001
From: smithto1
Date: Wed, 7 Oct 2020 14:45:01 +0100
Subject: [PATCH 11/13] addressing comments

---
 doc/source/whatsnew/v1.2.0.rst | 2 +-
 pandas/_libs/groupby.pyx       | 6 +++---
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst
index 03fd019447925..673d656c730ab 100644
--- a/doc/source/whatsnew/v1.2.0.rst
+++ b/doc/source/whatsnew/v1.2.0.rst
@@ -439,7 +439,7 @@ Groupby/resample/rolling
 - Bug in :meth:`Rolling.sum()` returned wrong values when dtypes where mixed between float and integer and axis was equal to one (:issue:`20649`, :issue:`35596`)
 - Bug in :meth:`Rolling.count` returned ``np.nan`` with :class:`pandas.api.indexers.FixedForwardWindowIndexer` as window, ``min_periods=0`` and only missing values in window (:issue:`35579`)
 - Bug where :class:`pandas.core.window.Rolling` produces incorrect window sizes when using a ``PeriodIndex`` (:issue:`34225`)
-- Bug in :meth:`DataFrameGroupBy.ffill` where a ``NaN`` group would return foward-filled values instead of ``NaN`` when ``dropna=True`` (:issue:`34725`)
+- Bug in :meth:`DataFrameGroupBy.ffill` and :meth:`DataFrameGroupBy.bfill` where a ``NaN`` group would return filled values instead of ``NaN`` when ``dropna=True`` (:issue:`34725`)

 Reshaping
 ^^^^^^^^^
diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx
index 386267422fd0b..1c887135ced0e 100644
--- a/pandas/_libs/groupby.pyx
+++ b/pandas/_libs/groupby.pyx
@@ -382,7 +382,9 @@ def group_fillna_indexer(ndarray[int64_t] out, ndarray[int64_t] labels,
     with nogil:
         for i in range(N):
             idx = sorted_labels[i]
-            if mask[idx] == 1:  # is missing
+            if dropna and labels[idx] == -1: # nan-group gets nan-values
+                curr_fill_idx = -1
+            elif mask[idx] == 1:  # is missing
                 # Stop filling once we've hit the limit
                 if filled_vals >= limit and limit != -1:
                     curr_fill_idx = -1
@@ -390,8 +392,6 @@ def group_fillna_indexer(ndarray[int64_t] out, ndarray[int64_t] labels,
             else:  # reset items when not missing
                 filled_vals = 0
                 curr_fill_idx = idx
-            if dropna and labels[idx] == -1:
-                curr_fill_idx = -1

             out[idx] = curr_fill_idx


From 434cef59db0ac8a355ce45508ec9dcec3573c739 Mon Sep 17 00:00:00 2001
From: smithto1
Date: Wed, 7 Oct 2020 15:10:01 +0100
Subject: [PATCH 12/13] fixing pre-commit check

---
 pandas/_libs/groupby.pyx | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx
index 1c887135ced0e..5a958d5e0bd3c 100644
--- a/pandas/_libs/groupby.pyx
+++ b/pandas/_libs/groupby.pyx
@@ -382,7 +382,7 @@ def group_fillna_indexer(ndarray[int64_t] out, ndarray[int64_t] labels,
     with nogil:
         for i in range(N):
             idx = sorted_labels[i]
-            if dropna and labels[idx] == -1: # nan-group gets nan-values
+            if dropna and labels[idx] == -1:  # nan-group gets nan-values
                 curr_fill_idx = -1
             elif mask[idx] == 1:  # is missing
                 # Stop filling once we've hit the limit
                 if filled_vals >= limit and limit != -1:
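At this point the cython loop has reached its finished shape. Below is a rough pure-Python sketch of the same fill logic for illustration; the function name, the construction of sorted_labels and the per-group reset at the end are filled in from the surrounding library function rather than from the hunks above, so treat it as an approximation, not the exact library code.

    import numpy as np

    def fillna_indexer(labels, mask, direction, limit, dropna):
        # labels: group codes per row (-1 marks a NaN key); mask: 1 where the
        # value is missing. Returns, for each row, the positional index to
        # take the filled value from, or -1 to leave the row as NaN.
        # limit == -1 means "no limit", matching how _fill maps limit=None.
        N = len(labels)
        sorted_labels = np.argsort(labels, kind="mergesort")
        if direction == "bfill":
            sorted_labels = sorted_labels[::-1]
        out = np.empty(N, dtype=np.int64)
        curr_fill_idx = -1
        filled_vals = 0
        for i in range(N):
            idx = sorted_labels[i]
            if dropna and labels[idx] == -1:  # NaN group: never fill
                curr_fill_idx = -1
            elif mask[idx] == 1:  # missing value: reuse the last seen index
                if filled_vals >= limit and limit != -1:
                    curr_fill_idx = -1
                filled_vals += 1
            else:  # non-missing value: remember it and reset the counter
                filled_vals = 0
                curr_fill_idx = idx
            out[idx] = curr_fill_idx
            # reset state when the next sorted row belongs to a different group
            if i == N - 1 or labels[idx] != labels[sorted_labels[i + 1]]:
                curr_fill_idx = -1
                filled_vals = 0
        return out

    # Example: labels [0, 0, -1], mask [0, 1, 1], ffill, no limit, dropna=True
    # gives [0, 0, -1]: row 1 fills from row 0, the NaN-keyed row stays unfilled.
    print(fillna_indexer([0, 0, -1], [0, 1, 1], "ffill", -1, True))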
From 84abae0c43b8ebea3c1584a71a2e6a7ae9fd8e79 Mon Sep 17 00:00:00 2001
From: smithto1
Date: Thu, 8 Oct 2020 10:41:13 +0100
Subject: [PATCH 13/13] move test to test_missing,py

---
 pandas/tests/groupby/test_missing.py          | 34 +++++++++++++++++++
 .../tests/groupby/transform/test_transform.py | 34 -------------------
 2 files changed, 34 insertions(+), 34 deletions(-)

diff --git a/pandas/tests/groupby/test_missing.py b/pandas/tests/groupby/test_missing.py
index 116aed9935694..70d8dfc20822a 100644
--- a/pandas/tests/groupby/test_missing.py
+++ b/pandas/tests/groupby/test_missing.py
@@ -82,3 +82,37 @@ def test_fill_consistency():
     expected = df.groupby(level=0, axis=0).fillna(method="ffill")
     result = df.T.groupby(level=0, axis=1).fillna(method="ffill").T
     tm.assert_frame_equal(result, expected)
+
+
+@pytest.mark.parametrize("method", ["ffill", "bfill"])
+@pytest.mark.parametrize("dropna", [True, False])
+@pytest.mark.parametrize("has_nan_group", [True, False])
+def test_ffill_handles_nan_groups(dropna, method, has_nan_group):
+    # GH 34725
+
+    df_without_nan_rows = pd.DataFrame([(1, 0.1), (2, 0.2)])
+
+    ridx = [-1, 0, -1, -1, 1, -1]
+    df = df_without_nan_rows.reindex(ridx).reset_index(drop=True)
+
+    group_b = np.nan if has_nan_group else "b"
+    df["group_col"] = pd.Series(["a"] * 3 + [group_b] * 3)
+
+    grouped = df.groupby(by="group_col", dropna=dropna)
+    result = getattr(grouped, method)(limit=None)
+
+    expected_rows = {
+        ("ffill", True, True): [-1, 0, 0, -1, -1, -1],
+        ("ffill", True, False): [-1, 0, 0, -1, 1, 1],
+        ("ffill", False, True): [-1, 0, 0, -1, 1, 1],
+        ("ffill", False, False): [-1, 0, 0, -1, 1, 1],
+        ("bfill", True, True): [0, 0, -1, -1, -1, -1],
+        ("bfill", True, False): [0, 0, -1, 1, 1, -1],
+        ("bfill", False, True): [0, 0, -1, 1, 1, -1],
+        ("bfill", False, False): [0, 0, -1, 1, 1, -1],
+    }
+
+    ridx = expected_rows.get((method, dropna, has_nan_group))
+    expected = df_without_nan_rows.reindex(ridx).reset_index(drop=True)
+
+    tm.assert_frame_equal(result, expected)
diff --git a/pandas/tests/groupby/transform/test_transform.py b/pandas/tests/groupby/transform/test_transform.py
index 45149da3d72c8..97be039e16ebb 100644
--- a/pandas/tests/groupby/transform/test_transform.py
+++ b/pandas/tests/groupby/transform/test_transform.py
@@ -977,40 +977,6 @@ def test_ffill_bfill_non_unique_multilevel(func, expected_status):
     tm.assert_series_equal(result, expected)


-@pytest.mark.parametrize("method", ["ffill", "bfill"])
-@pytest.mark.parametrize("dropna", [True, False])
-@pytest.mark.parametrize("has_nan_group", [True, False])
-def test_ffill_handles_nan_groups(dropna, method, has_nan_group):
-    # GH 34725
-
-    df_without_nan_rows = pd.DataFrame([(1, 0.1), (2, 0.2)])
-
-    ridx = [-1, 0, -1, -1, 1, -1]
-    df = df_without_nan_rows.reindex(ridx).reset_index(drop=True)
-
-    group_b = np.nan if has_nan_group else "b"
-    df["group_col"] = pd.Series(["a"] * 3 + [group_b] * 3)
-
-    grouped = df.groupby(by="group_col", dropna=dropna)
-    result = getattr(grouped, method)(limit=None)
-
-    expected_rows = {
-        ("ffill", True, True): [-1, 0, 0, -1, -1, -1],
-        ("ffill", True, False): [-1, 0, 0, -1, 1, 1],
-        ("ffill", False, True): [-1, 0, 0, -1, 1, 1],
-        ("ffill", False, False): [-1, 0, 0, -1, 1, 1],
-        ("bfill", True, True): [0, 0, -1, -1, -1, -1],
-        ("bfill", True, False): [0, 0, -1, 1, 1, -1],
-        ("bfill", False, True): [0, 0, -1, 1, 1, -1],
-        ("bfill", False, False): [0, 0, -1, 1, 1, -1],
-    }
-
-    ridx = expected_rows.get((method, dropna, has_nan_group))
-    expected = df_without_nan_rows.reindex(ridx).reset_index(drop=True)
-
-    tm.assert_frame_equal(result, expected)
-
-
 @pytest.mark.parametrize("func", [np.any, np.all])
 def test_any_all_np_func(func):
     # GH 20653
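As a closing illustration of what the relocated test asserts, the sketch below mirrors its fixture for one parameter combination; it is written for this note, not an excerpt from the patch. With method="ffill", dropna=True and a NaN group key, the whole NaN-keyed block comes back as NaN.

    import numpy as np
    import pandas as pd

    df_without_nan_rows = pd.DataFrame([(1, 0.1), (2, 0.2)])
    df = df_without_nan_rows.reindex([-1, 0, -1, -1, 1, -1]).reset_index(drop=True)
    df["group_col"] = pd.Series(["a"] * 3 + [np.nan] * 3)

    result = df.groupby(by="group_col", dropna=True).ffill()
    # Matches expected_rows[("ffill", True, True)] == [-1, 0, 0, -1, -1, -1]:
    # row 2 forward-fills from row 1, and every NaN-keyed row comes back as NaN.
    print(result)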