From 5e4b2ee4debadfbce3ed8ecbfb8f11f6ca05ee75 Mon Sep 17 00:00:00 2001 From: Christian Chwala Date: Mon, 4 Feb 2019 14:44:21 +0100 Subject: [PATCH 01/31] Added maxgap keyword for series.interpolate Added numpy-based implementation that searches for NaN-gaps wider than `maxgap`. In line with the current implementations for NaN handling in `series.interpolate`, a set of NaN-indices that has to be preserved is generated. Tests and documentation were also added. --- pandas/core/generic.py | 10 ++++-- pandas/core/missing.py | 50 +++++++++++++++++++++++++---- pandas/tests/series/test_missing.py | 35 ++++++++++++++++++++ 3 files changed, 86 insertions(+), 9 deletions(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 0312ed6ecf3bf..33bc886a74761 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -6648,7 +6648,12 @@ def replace(self, to_replace=None, value=None, inplace=False, limit=None, * 'outside': Only fill NaNs outside valid values (extrapolate). .. versionadded:: 0.21.0 - + maxgap : int, optional + Maximum number of consecutive NaN values up to which a NaN-gap + will be interpolated. For all NaN-gaps wider than that no + interpolation is carried out. Must be greater than 0. + + .. versionadded:: 0.25.0 downcast : optional, 'infer' or None, defaults to None Downcast dtypes if possible. **kwargs @@ -6783,7 +6788,7 @@ def replace(self, to_replace=None, value=None, inplace=False, limit=None, @Appender(_shared_docs['interpolate'] % _shared_doc_kwargs) def interpolate(self, method='linear', axis=0, limit=None, inplace=False, - limit_direction='forward', limit_area=None, + maxgap=None, limit_direction='forward', limit_area=None, downcast=None, **kwargs): """ Interpolate values according to different methods. 
@@ -6836,6 +6841,7 @@ def interpolate(self, method='linear', axis=0, limit=None, inplace=False, values=_maybe_transposed_self, limit=limit, limit_direction=limit_direction, limit_area=limit_area, + maxgap=maxgap, inplace=inplace, downcast=downcast, **kwargs) diff --git a/pandas/core/missing.py b/pandas/core/missing.py index cc7bdf95200d1..4ca90be310e62 100644 --- a/pandas/core/missing.py +++ b/pandas/core/missing.py @@ -108,7 +108,7 @@ def clean_interp_method(method, **kwargs): return method -def interpolate_1d(xvalues, yvalues, method='linear', limit=None, +def interpolate_1d(xvalues, yvalues, method='linear', limit=None, maxgap=None, limit_direction='forward', limit_area=None, fill_value=None, bounds_error=False, order=None, **kwargs): """ @@ -165,6 +165,16 @@ def interpolate_1d(xvalues, yvalues, method='linear', limit=None, elif limit < 1: raise ValueError('Limit must be greater than 0') + if (maxgap is not None) and (limit is not None): + raise ValueError('maxgap cannot be used together with limit') + + if maxgap is None: + pass + elif not is_integer(maxgap): + raise ValueError('maxgap must be an integer') + elif maxgap < 1: + raise ValueError('maxgap must be greater than 0') + from pandas import Series ys = Series(yvalues) @@ -182,14 +192,40 @@ def interpolate_1d(xvalues, yvalues, method='linear', limit=None, # contain indices of NaNs at the beginning of the series, and NaNs that # are more than'limit' away from the prior non-NaN. + # In case that maxgap is provided, preserve_nans is derived so that + # gaps with continuous NaN values of width > maxgap will be preserved. 
+ # set preserve_nans based on direction using _interp_limit - if limit_direction == 'forward': - preserve_nans = start_nans | set(_interp_limit(invalid, limit, 0)) - elif limit_direction == 'backward': - preserve_nans = end_nans | set(_interp_limit(invalid, 0, limit)) + if maxgap is None: + if limit_direction == 'forward': + preserve_nans = start_nans | set(_interp_limit(invalid, limit, 0)) + elif limit_direction == 'backward': + preserve_nans = end_nans | set(_interp_limit(invalid, 0, limit)) + else: + # both directions... just use _interp_limit + preserve_nans = set(_interp_limit(invalid, limit, limit)) else: - # both directions... just use _interp_limit - preserve_nans = set(_interp_limit(invalid, limit, limit)) + def bfill_nan(arr): + """ Backward-fill NaNs """ + mask = np.isnan(arr) + idx = np.where(~mask, np.arange(mask.shape[0]), mask.shape[0] - 1) + idx = np.minimum.accumulate(idx[::-1], axis=0)[::-1] + out = arr[idx] + return out + + # Generate array where the NaN-gap-width is filled in as value + # at each NaN location. + cumsum = np.cumsum(invalid).astype('float') + diff = np.zeros_like(yvalues) + diff[~invalid] = np.pad(np.diff(cumsum[~invalid]), + (1, 0), mode='constant') + diff[invalid] = np.nan + diff = bfill_nan(diff) + # hack to avoid having trailing NaNs in `diff`. Fill these + # with `maxgap`. Everthing smaller than `maxgap` won't matter + # in the following. 
+ diff[np.isnan(diff)] = maxgap + preserve_nans = set(np.flatnonzero((diff > maxgap) & invalid)) # if limit_area is set, add either mid or outside indices # to preserve_nans GH #16284 diff --git a/pandas/tests/series/test_missing.py b/pandas/tests/series/test_missing.py index 985288c439917..7dae1c35a2055 100644 --- a/pandas/tests/series/test_missing.py +++ b/pandas/tests/series/test_missing.py @@ -1204,6 +1204,41 @@ def test_interp_limit_to_ends(self): limit_direction='both') assert_series_equal(result, expected) + def test_interp_maxgap(self): + s = Series([ + np.nan, + 1., np.nan, + 2., np.nan, np.nan, + 5., np.nan, np.nan, np.nan, + -1., np.nan, np.nan + ]) + + excpected = Series([ + 1., + 1., 1.5, + 2., 3., 4., + 5., np.nan, np.nan, np.nan, + -1., -1, -1 + ]) + + result = s.interpolate(method='linear', maxgap=2) + assert_series_equal(result, excpected) + + excpected = Series([ + np.nan, + 1., 1.5, + 2., 3., 4., + 5., np.nan, np.nan, np.nan, + -1., np.nan, np.nan + ]) + + result = s.interpolate(method='linear', maxgap=2, limit_area='inside') + assert_series_equal(result, excpected) + + with pytest.raises(ValueError, + match='maxgap cannot be used together with limit'): + s.interpolate(method='linear', maxgap=2, limit=3) + def test_interp_limit_before_ends(self): # These test are for issue #11115 -- limit ends properly. s = Series([np.nan, np.nan, 5, 7, np.nan, np.nan]) From b7526020d79e84d1d6db1442b3142e66b76ba9e1 Mon Sep 17 00:00:00 2001 From: Christian Chwala Date: Mon, 4 Feb 2019 15:46:33 +0100 Subject: [PATCH 02/31] minor pep8 fixes --- pandas/core/generic.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 33bc886a74761..7f6f2b76862b0 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -6649,10 +6649,10 @@ def replace(self, to_replace=None, value=None, inplace=False, limit=None, .. 
versionadded:: 0.21.0 maxgap : int, optional - Maximum number of consecutive NaN values up to which a NaN-gap - will be interpolated. For all NaN-gaps wider than that no + Maximum number of consecutive NaN values up to which a NaN-gap + will be interpolated. For all NaN-gaps wider than that no interpolation is carried out. Must be greater than 0. - + .. versionadded:: 0.25.0 downcast : optional, 'infer' or None, defaults to None Downcast dtypes if possible. From 839b11a4c616448bbd1f66f88a632d2cafc3f933 Mon Sep 17 00:00:00 2001 From: Christian Chwala Date: Mon, 4 Feb 2019 21:03:44 +0100 Subject: [PATCH 03/31] fixed parameter order --- pandas/core/generic.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 7f6f2b76862b0..957e9293650cc 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -6788,7 +6788,7 @@ def replace(self, to_replace=None, value=None, inplace=False, limit=None, @Appender(_shared_docs['interpolate'] % _shared_doc_kwargs) def interpolate(self, method='linear', axis=0, limit=None, inplace=False, - maxgap=None, limit_direction='forward', limit_area=None, + limit_direction='forward', limit_area=None, maxgap=None, downcast=None, **kwargs): """ Interpolate values according to different methods. 
From 3cb371e1470d957bf251d5366c22ba29f773cb2e Mon Sep 17 00:00:00 2001 From: Christian Chwala Date: Tue, 11 Jun 2019 17:16:21 +0200 Subject: [PATCH 04/31] Changed parameter name from `maxgap` to `max_gap` --- pandas/core/generic.py | 6 +++--- pandas/core/missing.py | 28 ++++++++++++++-------------- pandas/tests/series/test_missing.py | 10 +++++----- 3 files changed, 22 insertions(+), 22 deletions(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 033461fc5ff91..0cf6fb5f46d96 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -6590,7 +6590,7 @@ def replace(self, to_replace=None, value=None, inplace=False, limit=None, * 'outside': Only fill NaNs outside valid values (extrapolate). .. versionadded:: 0.23.0 - maxgap : int, optional + max_gap : int, optional Maximum number of consecutive NaN values up to which a NaN-gap will be interpolated. For all NaN-gaps wider than that no interpolation is carried out. Must be greater than 0. @@ -6730,7 +6730,7 @@ def replace(self, to_replace=None, value=None, inplace=False, limit=None, @Appender(_shared_docs['interpolate'] % _shared_doc_kwargs) def interpolate(self, method='linear', axis=0, limit=None, inplace=False, - limit_direction='forward', limit_area=None, maxgap=None, + limit_direction='forward', limit_area=None, max_gap=None, downcast=None, **kwargs): """ Interpolate values according to different methods. 
@@ -6791,7 +6791,7 @@ def interpolate(self, method='linear', axis=0, limit=None, inplace=False, values=_maybe_transposed_self, limit=limit, limit_direction=limit_direction, limit_area=limit_area, - maxgap=maxgap, + max_gap=max_gap, inplace=inplace, downcast=downcast, **kwargs) diff --git a/pandas/core/missing.py b/pandas/core/missing.py index 2648ae1ec1bfc..fa6928e2ca274 100644 --- a/pandas/core/missing.py +++ b/pandas/core/missing.py @@ -106,7 +106,7 @@ def clean_interp_method(method, **kwargs): return method -def interpolate_1d(xvalues, yvalues, method='linear', limit=None, maxgap=None, +def interpolate_1d(xvalues, yvalues, method='linear', limit=None, max_gap=None, limit_direction='forward', limit_area=None, fill_value=None, bounds_error=False, order=None, **kwargs): """ @@ -163,15 +163,15 @@ def interpolate_1d(xvalues, yvalues, method='linear', limit=None, maxgap=None, elif limit < 1: raise ValueError('Limit must be greater than 0') - if (maxgap is not None) and (limit is not None): - raise ValueError('maxgap cannot be used together with limit') + if (max_gap is not None) and (limit is not None): + raise ValueError('max_gap cannot be used together with limit') - if maxgap is None: + if max_gap is None: pass - elif not is_integer(maxgap): - raise ValueError('maxgap must be an integer') - elif maxgap < 1: - raise ValueError('maxgap must be greater than 0') + elif not is_integer(max_gap): + raise ValueError('max_gap must be an integer') + elif max_gap < 1: + raise ValueError('max_gap must be greater than 0') from pandas import Series ys = Series(yvalues) @@ -190,11 +190,11 @@ def interpolate_1d(xvalues, yvalues, method='linear', limit=None, maxgap=None, # contain indices of NaNs at the beginning of the series, and NaNs that # are more than'limit' away from the prior non-NaN. - # In case that maxgap is provided, preserve_nans is derived so that - # gaps with continuous NaN values of width > maxgap will be preserved. 
+ # In case that max_gap is provided, preserve_nans is derived so that + # gaps with continuous NaN values of width > max_gap will be preserved. # set preserve_nans based on direction using _interp_limit - if maxgap is None: + if max_gap is None: if limit_direction == 'forward': preserve_nans = start_nans | set(_interp_limit(invalid, limit, 0)) elif limit_direction == 'backward': @@ -220,10 +220,10 @@ def bfill_nan(arr): diff[invalid] = np.nan diff = bfill_nan(diff) # hack to avoid having trailing NaNs in `diff`. Fill these - # with `maxgap`. Everthing smaller than `maxgap` won't matter + # with `max_gap`. Everthing smaller than `max_gap` won't matter # in the following. - diff[np.isnan(diff)] = maxgap - preserve_nans = set(np.flatnonzero((diff > maxgap) & invalid)) + diff[np.isnan(diff)] = max_gap + preserve_nans = set(np.flatnonzero((diff > max_gap) & invalid)) # if limit_area is set, add either mid or outside indices # to preserve_nans GH #16284 diff --git a/pandas/tests/series/test_missing.py b/pandas/tests/series/test_missing.py index f04976f5e277b..905edb50e2813 100644 --- a/pandas/tests/series/test_missing.py +++ b/pandas/tests/series/test_missing.py @@ -1246,7 +1246,7 @@ def test_interp_limit_to_ends(self): limit_direction='both') assert_series_equal(result, expected) - def test_interp_maxgap(self): + def test_interp_max_gap(self): s = Series([ np.nan, 1., np.nan, @@ -1263,7 +1263,7 @@ def test_interp_maxgap(self): -1., -1, -1 ]) - result = s.interpolate(method='linear', maxgap=2) + result = s.interpolate(method='linear', max_gap=2) assert_series_equal(result, excpected) excpected = Series([ @@ -1274,12 +1274,12 @@ def test_interp_maxgap(self): -1., np.nan, np.nan ]) - result = s.interpolate(method='linear', maxgap=2, limit_area='inside') + result = s.interpolate(method='linear', max_gap=2, limit_area='inside') assert_series_equal(result, excpected) with pytest.raises(ValueError, - match='maxgap cannot be used together with limit'): - 
s.interpolate(method='linear', maxgap=2, limit=3) + match='max_gap cannot be used together with limit'): + s.interpolate(method='linear', max_gap=2, limit=3) def test_interp_limit_before_ends(self): # These test are for issue #11115 -- limit ends properly. From 8c6ff7a0a796898d0288d9017c234b7e98702fd9 Mon Sep 17 00:00:00 2001 From: Christian Chwala Date: Tue, 11 Jun 2019 21:03:04 +0200 Subject: [PATCH 05/31] Moved code to derive indices of "NaNs to preserve" in separate function --- pandas/core/missing.py | 94 +++++++++++++++++++++++++----------------- 1 file changed, 57 insertions(+), 37 deletions(-) diff --git a/pandas/core/missing.py b/pandas/core/missing.py index fa6928e2ca274..c97056e5e6cb5 100644 --- a/pandas/core/missing.py +++ b/pandas/core/missing.py @@ -173,6 +173,61 @@ def interpolate_1d(xvalues, yvalues, method='linear', limit=None, max_gap=None, elif max_gap < 1: raise ValueError('max_gap must be greater than 0') + preserve_nans = _derive_indices_of_nans_to_preserve( + yvalues=yvalues, valid=valid, invalid=invalid, + limit=limit, limit_area=limit_area, limit_direction=limit_direction, + max_gap=max_gap) + + xvalues = getattr(xvalues, 'values', xvalues) + yvalues = getattr(yvalues, 'values', yvalues) + result = yvalues.copy() + + if method in ['linear', 'time', 'index', 'values']: + if method in ('values', 'index'): + inds = np.asarray(xvalues) + # hack for DatetimeIndex, #1646 + if needs_i8_conversion(inds.dtype.type): + inds = inds.view(np.int64) + if inds.dtype == np.object_: + inds = lib.maybe_convert_objects(inds) + else: + inds = xvalues + result[invalid] = np.interp(inds[invalid], inds[valid], yvalues[valid]) + result[preserve_nans] = np.nan + return result + + sp_methods = ['nearest', 'zero', 'slinear', 'quadratic', 'cubic', + 'barycentric', 'krogh', 'spline', 'polynomial', + 'from_derivatives', 'piecewise_polynomial', 'pchip', 'akima'] + + if method in sp_methods: + inds = np.asarray(xvalues) + # hack for DatetimeIndex, #1646 + if 
issubclass(inds.dtype.type, np.datetime64): + inds = inds.view(np.int64) + result[invalid] = _interpolate_scipy_wrapper(inds[valid], + yvalues[valid], + inds[invalid], + method=method, + fill_value=fill_value, + bounds_error=bounds_error, + order=order, **kwargs) + result[preserve_nans] = np.nan + return result + + +def _derive_indices_of_nans_to_preserve(yvalues, invalid, valid, + limit, limit_area, limit_direction, + max_gap): + """ Derive the indices of NaNs that shall be preserved after interpolation + + This function is called by `interpolate_1d` and takes the arguments with + the same name from there. In `interpolate_1d`, after performing the + interpolation the list of indices of NaNs to preserve is used to put + NaNs in the desired locations. + + """ + from pandas import Series ys = Series(yvalues) @@ -220,7 +275,7 @@ def bfill_nan(arr): diff[invalid] = np.nan diff = bfill_nan(diff) # hack to avoid having trailing NaNs in `diff`. Fill these - # with `max_gap`. Everthing smaller than `max_gap` won't matter + # with `max_gap`. Everything smaller than `max_gap` won't matter # in the following. 
diff[np.isnan(diff)] = max_gap preserve_nans = set(np.flatnonzero((diff > max_gap) & invalid)) @@ -237,42 +292,7 @@ def bfill_nan(arr): # sort preserve_nans and covert to list preserve_nans = sorted(preserve_nans) - xvalues = getattr(xvalues, 'values', xvalues) - yvalues = getattr(yvalues, 'values', yvalues) - result = yvalues.copy() - - if method in ['linear', 'time', 'index', 'values']: - if method in ('values', 'index'): - inds = np.asarray(xvalues) - # hack for DatetimeIndex, #1646 - if needs_i8_conversion(inds.dtype.type): - inds = inds.view(np.int64) - if inds.dtype == np.object_: - inds = lib.maybe_convert_objects(inds) - else: - inds = xvalues - result[invalid] = np.interp(inds[invalid], inds[valid], yvalues[valid]) - result[preserve_nans] = np.nan - return result - - sp_methods = ['nearest', 'zero', 'slinear', 'quadratic', 'cubic', - 'barycentric', 'krogh', 'spline', 'polynomial', - 'from_derivatives', 'piecewise_polynomial', 'pchip', 'akima'] - - if method in sp_methods: - inds = np.asarray(xvalues) - # hack for DatetimeIndex, #1646 - if issubclass(inds.dtype.type, np.datetime64): - inds = inds.view(np.int64) - result[invalid] = _interpolate_scipy_wrapper(inds[valid], - yvalues[valid], - inds[invalid], - method=method, - fill_value=fill_value, - bounds_error=bounds_error, - order=order, **kwargs) - result[preserve_nans] = np.nan - return result + return preserve_nans def _interpolate_scipy_wrapper(x, y, new_x, method, fill_value=None, From 4aaf8dcffa9d3e6182014e816b3f1f5ddf767f38 Mon Sep 17 00:00:00 2001 From: Christian Chwala Date: Tue, 11 Jun 2019 21:53:14 +0200 Subject: [PATCH 06/31] Tests for errors extended and moved to own function --- pandas/tests/series/test_missing.py | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/pandas/tests/series/test_missing.py b/pandas/tests/series/test_missing.py index 905edb50e2813..e22606f78df2a 100644 --- a/pandas/tests/series/test_missing.py +++ b/pandas/tests/series/test_missing.py @@ -1277,10 
+1277,27 @@ def test_interp_max_gap(self): result = s.interpolate(method='linear', max_gap=2, limit_area='inside') assert_series_equal(result, excpected) + def test_interp_max_gap_errors(self): + s = Series([ + np.nan, + 1., np.nan, + 2., np.nan, np.nan, + 5., np.nan, np.nan, np.nan, + -1., np.nan, np.nan + ]) + with pytest.raises(ValueError, match='max_gap cannot be used together with limit'): s.interpolate(method='linear', max_gap=2, limit=3) + with pytest.raises(ValueError, + match='max_gap must be an integer'): + s.interpolate(method='linear', max_gap='foo') + + with pytest.raises(ValueError, + match='max_gap must be greater than 0'): + s.interpolate(method='linear', max_gap=0) + def test_interp_limit_before_ends(self): # These test are for issue #11115 -- limit ends properly. s = Series([np.nan, np.nan, 5, 7, np.nan, np.nan]) From 1f0406fb42ca1013992330b990fff911eac28e32 Mon Sep 17 00:00:00 2001 From: Christian Chwala Date: Tue, 11 Jun 2019 22:13:55 +0200 Subject: [PATCH 07/31] added blank lines in docstring as requested --- pandas/core/generic.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 0cf6fb5f46d96..f09f75c7798a6 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -6590,12 +6590,14 @@ def replace(self, to_replace=None, value=None, inplace=False, limit=None, * 'outside': Only fill NaNs outside valid values (extrapolate). .. versionadded:: 0.23.0 + max_gap : int, optional Maximum number of consecutive NaN values up to which a NaN-gap will be interpolated. For all NaN-gaps wider than that no interpolation is carried out. Must be greater than 0. .. versionadded:: 0.25.0 + downcast : optional, 'infer' or None, defaults to None Downcast dtypes if possible. 
**kwargs From eaacefd0957bfc6735c8782ac2ba571d7131d250 Mon Sep 17 00:00:00 2001 From: Christian Chwala Date: Tue, 11 Jun 2019 22:40:52 +0200 Subject: [PATCH 08/31] Added test which fails for method='pad' For method='pad' the `max_gap` keyword does not seem to have an effect. --- pandas/tests/series/test_missing.py | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/pandas/tests/series/test_missing.py b/pandas/tests/series/test_missing.py index e22606f78df2a..2ab13e6fada9c 100644 --- a/pandas/tests/series/test_missing.py +++ b/pandas/tests/series/test_missing.py @@ -1262,7 +1262,6 @@ def test_interp_max_gap(self): 5., np.nan, np.nan, np.nan, -1., -1, -1 ]) - result = s.interpolate(method='linear', max_gap=2) assert_series_equal(result, excpected) @@ -1273,10 +1272,19 @@ def test_interp_max_gap(self): 5., np.nan, np.nan, np.nan, -1., np.nan, np.nan ]) - result = s.interpolate(method='linear', max_gap=2, limit_area='inside') assert_series_equal(result, excpected) + excpected = Series([ + np.nan, + 1., 1, + 2., 2., 2., + 5., np.nan, np.nan, np.nan, + -1., np.nan, np.nan + ]) + result = s.interpolate(method='pad', max_gap=2, limit_area='inside') + assert_series_equal(result, excpected) + def test_interp_max_gap_errors(self): s = Series([ np.nan, From c72acdb904cc2875bd530a1b780a809082ab16bb Mon Sep 17 00:00:00 2001 From: Christian Chwala Date: Fri, 30 Aug 2019 16:40:56 +0200 Subject: [PATCH 09/31] manually add black code formatting --- pandas/core/missing.py | 6 +++--- pandas/tests/series/test_missing.py | 18 +++++++++--------- 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/pandas/core/missing.py b/pandas/core/missing.py index 626d80e63f118..861d355d506c5 100644 --- a/pandas/core/missing.py +++ b/pandas/core/missing.py @@ -200,14 +200,14 @@ def interpolate_1d( raise ValueError("Limit must be greater than 0") if (max_gap is not None) and (limit is not None): - raise ValueError('max_gap cannot be used together with limit') + 
raise ValueError("max_gap cannot be used together with limit") if max_gap is None: pass elif not is_integer(max_gap): - raise ValueError('max_gap must be an integer') + raise ValueError("max_gap must be an integer") elif max_gap < 1: - raise ValueError('max_gap must be greater than 0') + raise ValueError("max_gap must be greater than 0") preserve_nans = _derive_indices_of_nans_to_preserve( yvalues=yvalues, diff --git a/pandas/tests/series/test_missing.py b/pandas/tests/series/test_missing.py index 08dd0f421e547..31b8cb4e9ed73 100644 --- a/pandas/tests/series/test_missing.py +++ b/pandas/tests/series/test_missing.py @@ -1508,7 +1508,7 @@ def test_interp_max_gap(self): 5., np.nan, np.nan, np.nan, -1., -1, -1 ]) - result = s.interpolate(method='linear', max_gap=2) + result = s.interpolate(method="linear", max_gap=2) assert_series_equal(result, excpected) excpected = Series([ @@ -1518,7 +1518,7 @@ def test_interp_max_gap(self): 5., np.nan, np.nan, np.nan, -1., np.nan, np.nan ]) - result = s.interpolate(method='linear', max_gap=2, limit_area='inside') + result = s.interpolate(method="linear", max_gap=2, limit_area="inside") assert_series_equal(result, excpected) excpected = Series([ @@ -1528,7 +1528,7 @@ def test_interp_max_gap(self): 5., np.nan, np.nan, np.nan, -1., np.nan, np.nan ]) - result = s.interpolate(method='pad', max_gap=2, limit_area='inside') + result = s.interpolate(method="pad", max_gap=2, limit_area="inside") assert_series_equal(result, excpected) def test_interp_max_gap_errors(self): @@ -1541,16 +1541,16 @@ def test_interp_max_gap_errors(self): ]) with pytest.raises(ValueError, - match='max_gap cannot be used together with limit'): - s.interpolate(method='linear', max_gap=2, limit=3) + match="max_gap cannot be used together with limit"): + s.interpolate(method="linear", max_gap=2, limit=3) with pytest.raises(ValueError, - match='max_gap must be an integer'): - s.interpolate(method='linear', max_gap='foo') + match="max_gap must be an integer"): + 
s.interpolate(method="linear", max_gap="foo") with pytest.raises(ValueError, - match='max_gap must be greater than 0'): - s.interpolate(method='linear', max_gap=0) + match="max_gap must be greater than 0"): + s.interpolate(method="linear", max_gap=0) def test_interp_limit_before_ends(self): # These test are for issue #11115 -- limit ends properly. From e0aee3afd5dc30498d0f8294db4815465ffa6781 Mon Sep 17 00:00:00 2001 From: Christian Chwala Date: Thu, 5 Sep 2019 21:21:59 +0200 Subject: [PATCH 10/31] First WIP but working version to fix issue with `pad` and `limit_area` * Tests are green for test_missing.py but some red tests appear when running the whole test suite... * The current solution uses `missing._derive_indices_of_nans_to_preserve`, which was implemented some commits ago also when using `pad` and `backfill`. * The implementation is not very elegant (yet) since I introduced a second version of `missing.interpolate_2d` only for the 1D-case of using `pad` and `backfill`, called `missing.interpolate_1d_fill`. Maybe distinguishing 1D and 2D could also be done in the original `missing.interpolate_2d`... 
--- pandas/core/internals/blocks.py | 42 ++++++++++++++---- pandas/core/missing.py | 78 ++++++++++++++++++++++++++++++++- 2 files changed, 110 insertions(+), 10 deletions(-) diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 33698d245e9ff..df8ba7de9cf01 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -1108,6 +1108,7 @@ def interpolate( values=None, inplace=False, limit=None, + max_gap=None, limit_direction="forward", limit_area=None, fill_value=None, @@ -1142,6 +1143,8 @@ def check_int_bool(self, inplace): axis=axis, inplace=inplace, limit=limit, + max_gap=max_gap, + limit_area=limit_area, fill_value=fill_value, coerce=coerce, downcast=downcast, @@ -1158,6 +1161,7 @@ def check_int_bool(self, inplace): values=values, axis=axis, limit=limit, + max_gap=max_gap, limit_direction=limit_direction, limit_area=limit_area, fill_value=fill_value, @@ -1172,6 +1176,8 @@ def _interpolate_with_fill( axis=0, inplace=False, limit=None, + max_gap=None, + limit_area=None, fill_value=None, coerce=False, downcast=None, @@ -1191,16 +1197,32 @@ def _interpolate_with_fill( values = self.values if inplace else self.values.copy() fill_value = self._try_coerce_args(fill_value) - values = missing.interpolate_2d( - values, - method=method, - axis=axis, - limit=limit, - fill_value=fill_value, - dtype=self.dtype, - ) - blocks = [self.make_block_same_class(values, ndim=self.ndim)] + if values.ndim == 1: + def func(x): + return missing.interpolate_1d_fill( + x, + method=method, + axis=axis, + limit=limit, + max_gap=max_gap, + limit_area=limit_area, + fill_value=fill_value, + dtype=self.dtype, + ) + interp_values = np.apply_along_axis(func, axis, values) + + else: + interp_values = missing.interpolate_2d( + values, + method=method, + axis=axis, + limit=limit, + fill_value=fill_value, + dtype=self.dtype + ) + + blocks = [self.make_block_same_class(interp_values, ndim=self.ndim)] return self._maybe_downcast(blocks, downcast) def 
_interpolate( @@ -1211,6 +1233,7 @@ def _interpolate( fill_value=None, axis=0, limit=None, + max_gap=None, limit_direction="forward", limit_area=None, inplace=False, @@ -1249,6 +1272,7 @@ def func(x): x, method=method, limit=limit, + max_gap=max_gap, limit_direction=limit_direction, limit_area=limit_area, fill_value=fill_value, diff --git a/pandas/core/missing.py b/pandas/core/missing.py index 861d355d506c5..8083dda189da8 100644 --- a/pandas/core/missing.py +++ b/pandas/core/missing.py @@ -133,7 +133,8 @@ def interpolate_1d( xvalues, yvalues, method="linear", - limit=None, max_gap=None, + limit=None, + max_gap=None, limit_direction="forward", limit_area=None, fill_value=None, @@ -516,6 +517,81 @@ def _akima_interpolate(xi, yi, x, der=0, axis=0): return [P(x, nu) for nu in der] +def interpolate_1d_fill( + values, + method="pad", + axis=0, + limit=None, + max_gap=None, + limit_area=None, + fill_value=None, + dtype=None +): + """ + Perform an actual interpolation of values, values will be make 2-d if + needed fills inplace, returns the result. 
+ """ + if method == "pad": + limit_direction = "forward" + elif method == "backfill": + limit_direction = "backward" + else: + raise ValueError("`method` must be either 'pad' or 'backfill'.") + + orig_values = values + + yvalues = values + invalid = isna(yvalues) + valid = ~invalid + + transf = (lambda x: x) if axis == 0 else (lambda x: x.T) + + # reshape a 1 dim if needed + ndim = values.ndim + if values.ndim == 1: + if axis != 0: # pragma: no cover + raise AssertionError("cannot interpolate on a ndim == 1 with axis != 0") + values = values.reshape(tuple((1,) + values.shape)) + + if fill_value is None: + mask = None + else: # todo create faster fill func without masking + mask = mask_missing(transf(values), fill_value) + + preserve_nans = _derive_indices_of_nans_to_preserve( + yvalues=yvalues, + valid=valid, + invalid=invalid, + limit=limit, + limit_area=limit_area, + limit_direction=limit_direction, + max_gap=max_gap) + + method = clean_fill_method(method) + if method == "pad": + values = transf(pad_2d(transf(values), limit=limit, mask=mask, dtype=dtype)) + else: + values = transf( + backfill_2d(transf(values), limit=limit, mask=mask, dtype=dtype) + ) + + # reshape back + if ndim == 1: + values = values[0] + + if orig_values.dtype.kind == "M": + # convert float back to datetime64 + values = values.astype(orig_values.dtype) + + # if np.issubdtype(values.dtype, np.datetime64): + # values[preserve_nans] = np.datetime64('NaT') + # else: + # values[preserve_nans] = np.nan + values[preserve_nans] = fill_value + + return values + + def interpolate_2d( values, method="pad", axis=0, limit=None, fill_value=None, dtype=None ): From af15eaf275fb1ffa1dd461a4339ce8c2303811fd Mon Sep 17 00:00:00 2001 From: Christian Chwala Date: Thu, 5 Sep 2019 22:25:31 +0200 Subject: [PATCH 11/31] fix: do not decide based on dimension but on crucial kwargs which interpolation function to use --- pandas/core/internals/blocks.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index df8ba7de9cf01..b1cf61caedc65 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -1198,7 +1198,7 @@ def _interpolate_with_fill( values = self.values if inplace else self.values.copy() fill_value = self._try_coerce_args(fill_value) - if values.ndim == 1: + if (max_gap is not None) or (limit_area is not None): def func(x): return missing.interpolate_1d_fill( x, From 12d2e5b1408f058fa4eb09082e1472a58ecf4818 Mon Sep 17 00:00:00 2001 From: Christian Chwala Date: Thu, 5 Sep 2019 22:37:59 +0200 Subject: [PATCH 12/31] some clean up * added explanatory docstring to missing.interpolate_1d_fill * ran black code formatting (and shortened test data code to fit onto one line) --- pandas/core/internals/blocks.py | 4 +- pandas/core/missing.py | 26 +++++++----- pandas/tests/series/test_missing.py | 61 +++++++++-------------------- 3 files changed, 37 insertions(+), 54 deletions(-) diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index b1cf61caedc65..18dbbde34cccd 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -1199,6 +1199,7 @@ def _interpolate_with_fill( fill_value = self._try_coerce_args(fill_value) if (max_gap is not None) or (limit_area is not None): + def func(x): return missing.interpolate_1d_fill( x, @@ -1210,6 +1211,7 @@ def func(x): fill_value=fill_value, dtype=self.dtype, ) + interp_values = np.apply_along_axis(func, axis, values) else: @@ -1219,7 +1221,7 @@ def func(x): axis=axis, limit=limit, fill_value=fill_value, - dtype=self.dtype + dtype=self.dtype, ) blocks = [self.make_block_same_class(interp_values, ndim=self.ndim)] diff --git a/pandas/core/missing.py b/pandas/core/missing.py index 8083dda189da8..b549d838b4137 100644 --- a/pandas/core/missing.py +++ b/pandas/core/missing.py @@ -217,7 +217,7 @@ def interpolate_1d( limit=limit, limit_area=limit_area, limit_direction=limit_direction, - 
max_gap=max_gap + max_gap=max_gap, ) xvalues = getattr(xvalues, "values", xvalues) @@ -273,9 +273,9 @@ def interpolate_1d( return result -def _derive_indices_of_nans_to_preserve(yvalues, invalid, valid, - limit, limit_area, limit_direction, - max_gap): +def _derive_indices_of_nans_to_preserve( + yvalues, invalid, valid, limit, limit_area, limit_direction, max_gap +): """ Derive the indices of NaNs that shall be preserved after interpolation This function is called by `interpolate_1d` and takes the arguments with @@ -286,6 +286,7 @@ def _derive_indices_of_nans_to_preserve(yvalues, invalid, valid, """ from pandas import Series + ys = Series(yvalues) # These are sets of index pointers to invalid values... i.e. {0, 1, etc... @@ -315,6 +316,7 @@ def _derive_indices_of_nans_to_preserve(yvalues, invalid, valid, # both directions... just use _interp_limit preserve_nans = set(_interp_limit(invalid, limit, limit)) else: + def bfill_nan(arr): """ Backward-fill NaNs """ mask = np.isnan(arr) @@ -327,8 +329,7 @@ def bfill_nan(arr): # at each NaN location. cumsum = np.cumsum(invalid).astype("float") diff = np.zeros_like(yvalues) - diff[~invalid] = np.pad(np.diff(cumsum[~invalid]), - (1, 0), mode="constant") + diff[~invalid] = np.pad(np.diff(cumsum[~invalid]), (1, 0), mode="constant") diff[invalid] = np.nan diff = bfill_nan(diff) # hack to avoid having trailing NaNs in `diff`. Fill these @@ -525,11 +526,15 @@ def interpolate_1d_fill( max_gap=None, limit_area=None, fill_value=None, - dtype=None + dtype=None, ): """ - Perform an actual interpolation of values, values will be make 2-d if - needed fills inplace, returns the result. + This a modification of `interpolate_2d`, which is used for methods `pad` + and `backfill` when interpolating. This 1D-version is necessary to be + able to handle kwargs `max_gap` and `limit_area` via the function + ` _derive_indices_of_nans_to_preserve. 
It is used the same way as the + 1D-interpolation functions which are based on scipy-interpolation, i.e. + via np.apply_along_axis. """ if method == "pad": limit_direction = "forward" @@ -565,7 +570,8 @@ def interpolate_1d_fill( limit=limit, limit_area=limit_area, limit_direction=limit_direction, - max_gap=max_gap) + max_gap=max_gap, + ) method = clean_fill_method(method) if method == "pad": diff --git a/pandas/tests/series/test_missing.py b/pandas/tests/series/test_missing.py index 31b8cb4e9ed73..f2437afd87836 100644 --- a/pandas/tests/series/test_missing.py +++ b/pandas/tests/series/test_missing.py @@ -1493,63 +1493,38 @@ def test_interp_limit_to_ends(self): assert_series_equal(result, expected) def test_interp_max_gap(self): - s = Series([ - np.nan, - 1., np.nan, - 2., np.nan, np.nan, - 5., np.nan, np.nan, np.nan, - -1., np.nan, np.nan - ]) - - excpected = Series([ - 1., - 1., 1.5, - 2., 3., 4., - 5., np.nan, np.nan, np.nan, - -1., -1, -1 - ]) + s = Series([nan, 1.0, nan, 2.0, nan, nan, 5.0, nan, nan, nan, -1.0, nan, nan]) + + excpected = Series( + [1.0, 1.0, 1.5, 2.0, 3.0, 4.0, 5.0, nan, nan, nan, -1.0, -1, -1] + ) result = s.interpolate(method="linear", max_gap=2) assert_series_equal(result, excpected) - excpected = Series([ - np.nan, - 1., 1.5, - 2., 3., 4., - 5., np.nan, np.nan, np.nan, - -1., np.nan, np.nan - ]) + excpected = Series( + [nan, 1.0, 1.5, 2.0, 3.0, 4.0, 5.0, nan, nan, nan, -1.0, nan, nan] + ) result = s.interpolate(method="linear", max_gap=2, limit_area="inside") assert_series_equal(result, excpected) - excpected = Series([ - np.nan, - 1., 1, - 2., 2., 2., - 5., np.nan, np.nan, np.nan, - -1., np.nan, np.nan - ]) + excpected = Series( + [nan, 1.0, 1, 2.0, 2.0, 2.0, 5.0, nan, nan, nan, -1.0, nan, nan] + ) result = s.interpolate(method="pad", max_gap=2, limit_area="inside") assert_series_equal(result, excpected) def test_interp_max_gap_errors(self): - s = Series([ - np.nan, - 1., np.nan, - 2., np.nan, np.nan, - 5., np.nan, np.nan, np.nan, - -1., 
np.nan, np.nan - ]) - - with pytest.raises(ValueError, - match="max_gap cannot be used together with limit"): + s = Series([nan, 1.0, nan, 2.0, nan, nan, 5.0, nan, nan, nan, -1.0, nan, nan]) + + with pytest.raises( + ValueError, match="max_gap cannot be used together with limit" + ): s.interpolate(method="linear", max_gap=2, limit=3) - with pytest.raises(ValueError, - match="max_gap must be an integer"): + with pytest.raises(ValueError, match="max_gap must be an integer"): s.interpolate(method="linear", max_gap="foo") - with pytest.raises(ValueError, - match="max_gap must be greater than 0"): + with pytest.raises(ValueError, match="max_gap must be greater than 0"): s.interpolate(method="linear", max_gap=0) def test_interp_limit_before_ends(self): From c25d1f8fb67af258dd4d831587c65dfcdd34e0fa Mon Sep 17 00:00:00 2001 From: Christian Chwala Date: Tue, 17 Sep 2019 21:20:18 +0200 Subject: [PATCH 13/31] Make it work with NaT and test for that --- pandas/core/missing.py | 2 +- pandas/tests/series/test_missing.py | 8 ++++++++ 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/pandas/core/missing.py b/pandas/core/missing.py index b549d838b4137..4138dde128f1a 100644 --- a/pandas/core/missing.py +++ b/pandas/core/missing.py @@ -328,7 +328,7 @@ def bfill_nan(arr): # Generate array where the NaN-gap-width is filled in as value # at each NaN location. 
cumsum = np.cumsum(invalid).astype("float") - diff = np.zeros_like(yvalues) + diff = np.zeros_like(yvalues, dtype='float') diff[~invalid] = np.pad(np.diff(cumsum[~invalid]), (1, 0), mode="constant") diff[invalid] = np.nan diff = bfill_nan(diff) diff --git a/pandas/tests/series/test_missing.py b/pandas/tests/series/test_missing.py index f2437afd87836..017e250f85287 100644 --- a/pandas/tests/series/test_missing.py +++ b/pandas/tests/series/test_missing.py @@ -1513,6 +1513,14 @@ def test_interp_max_gap(self): result = s.interpolate(method="pad", max_gap=2, limit_area="inside") assert_series_equal(result, excpected) + def test_interp_max_gap_nat(self): + series = Series([0, 1, 2, iNaT], dtype="M8[ns]") + + result = series.interpolate(method="pad", max_gap=2) + expected = Series([0, 1, 2, 2], dtype="M8[ns]") + + assert_series_equal(result, expected) + def test_interp_max_gap_errors(self): s = Series([nan, 1.0, nan, 2.0, nan, nan, 5.0, nan, nan, nan, -1.0, nan, nan]) From 4d4072270a5e9c861e87d06a53a114c2c3285d96 Mon Sep 17 00:00:00 2001 From: Christian Chwala Date: Tue, 17 Sep 2019 21:21:11 +0200 Subject: [PATCH 14/31] Added comment on why two interpolate fill functions are needed --- pandas/core/internals/blocks.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 18dbbde34cccd..5a9709ff63817 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -1198,6 +1198,11 @@ def _interpolate_with_fill( values = self.values if inplace else self.values.copy() fill_value = self._try_coerce_args(fill_value) + # We have to distinguish two cases: + # 1. When kwargs `max_gap` or `limit_area` are used: They are not + # supported by `missing.interpolate_2d()`. Using these kwargs only + # works by applying the fill along a certain axis. + # 2. All other cases: Then, `missing.interpolate_2d()` can be used. 
if (max_gap is not None) or (limit_area is not None): def func(x): From 255518e17594895f48f937f305be38a037f607c8 Mon Sep 17 00:00:00 2001 From: Christian Chwala Date: Tue, 17 Sep 2019 21:25:59 +0200 Subject: [PATCH 15/31] fix typo --- pandas/tests/series/test_missing.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/pandas/tests/series/test_missing.py b/pandas/tests/series/test_missing.py index 017e250f85287..8a41cab8e84a2 100644 --- a/pandas/tests/series/test_missing.py +++ b/pandas/tests/series/test_missing.py @@ -1495,23 +1495,23 @@ def test_interp_limit_to_ends(self): def test_interp_max_gap(self): s = Series([nan, 1.0, nan, 2.0, nan, nan, 5.0, nan, nan, nan, -1.0, nan, nan]) - excpected = Series( + expected = Series( [1.0, 1.0, 1.5, 2.0, 3.0, 4.0, 5.0, nan, nan, nan, -1.0, -1, -1] ) result = s.interpolate(method="linear", max_gap=2) - assert_series_equal(result, excpected) + assert_series_equal(result, expected) - excpected = Series( + expected = Series( [nan, 1.0, 1.5, 2.0, 3.0, 4.0, 5.0, nan, nan, nan, -1.0, nan, nan] ) result = s.interpolate(method="linear", max_gap=2, limit_area="inside") - assert_series_equal(result, excpected) + assert_series_equal(result, expected) - excpected = Series( + expected = Series( [nan, 1.0, 1, 2.0, 2.0, 2.0, 5.0, nan, nan, nan, -1.0, nan, nan] ) result = s.interpolate(method="pad", max_gap=2, limit_area="inside") - assert_series_equal(result, excpected) + assert_series_equal(result, expected) def test_interp_max_gap_nat(self): series = Series([0, 1, 2, iNaT], dtype="M8[ns]") From 2015e846585dc20a91b259641cb839b2efb7ed6a Mon Sep 17 00:00:00 2001 From: Christian Chwala Date: Tue, 17 Sep 2019 21:35:37 +0200 Subject: [PATCH 16/31] Added tests for DataFrames The one with `method=pad` currently fails. The others pass. 
--- pandas/tests/frame/test_missing.py | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/pandas/tests/frame/test_missing.py b/pandas/tests/frame/test_missing.py index 94667ecfa837d..8aacd19513637 100644 --- a/pandas/tests/frame/test_missing.py +++ b/pandas/tests/frame/test_missing.py @@ -549,6 +549,34 @@ def test_frame_fillna_limit(self): expected.values[:3] = np.nan tm.assert_frame_equal(result, expected) + def test_frame_interp_max_gap(self): + nan = np.nan + s = Series( + [nan, 1.0, nan, 2.0, nan, nan, 5.0, nan, nan, nan, -1.0, nan, nan]) + df = pd.concat([s, s], axis=1) + + expected_s = Series( + [1.0, 1.0, 1.5, 2.0, 3.0, 4.0, 5.0, nan, nan, nan, -1.0, -1, -1] + ) + expected_df = pd.concat([expected_s, expected_s], axis=1) + + result = df.interpolate(method="linear", max_gap=2) + assert_frame_equal(result, expected_df) + + expected_s = Series( + [nan, 1.0, 1.5, 2.0, 3.0, 4.0, 5.0, nan, nan, nan, -1.0, nan, nan] + ) + expected_df = pd.concat([expected_s, expected_s], axis=1) + result = df.interpolate(method="linear", max_gap=2, limit_area="inside") + assert_frame_equal(result, expected_df) + + expected_s = Series( + [nan, 1.0, 1, 2.0, 2.0, 2.0, 5.0, nan, nan, nan, -1.0, nan, nan] + ) + expected_df = pd.concat([expected_s, expected_s], axis=1) + result = df.interpolate(method="pad", max_gap=2, limit_area="inside") + assert_frame_equal(result, expected_df) + def test_fillna_skip_certain_blocks(self): # don't try to fill boolean, int blocks From 4d7b0f18f82487df0ea4e01cb8a81f6a0c95b9f0 Mon Sep 17 00:00:00 2001 From: Christian Chwala Date: Tue, 17 Sep 2019 22:37:59 +0200 Subject: [PATCH 17/31] Added failing test for https://github.com/pandas-dev/pandas/issues/12918 --- pandas/tests/frame/test_missing.py | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/pandas/tests/frame/test_missing.py b/pandas/tests/frame/test_missing.py index 8aacd19513637..7083fe00b59ff 100644 --- 
a/pandas/tests/frame/test_missing.py +++ b/pandas/tests/frame/test_missing.py @@ -570,13 +570,6 @@ def test_frame_interp_max_gap(self): result = df.interpolate(method="linear", max_gap=2, limit_area="inside") assert_frame_equal(result, expected_df) - expected_s = Series( - [nan, 1.0, 1, 2.0, 2.0, 2.0, 5.0, nan, nan, nan, -1.0, nan, nan] - ) - expected_df = pd.concat([expected_s, expected_s], axis=1) - result = df.interpolate(method="pad", max_gap=2, limit_area="inside") - assert_frame_equal(result, expected_df) - def test_fillna_skip_certain_blocks(self): # don't try to fill boolean, int blocks @@ -813,6 +806,19 @@ def test_interp_nan_idx(self): with pytest.raises(NotImplementedError): df.interpolate(method="values") + def test_interp_pad(self): + # Test for GH 12918 + df = DataFrame( + {"A": [1, 2, np.nan, 4, 5, np.nan, 7], "C": [1, 2, 3, 5, 8, 13, 21]} + ) + df = df.set_index("C") + expected = df.copy() + + result = df.interpolate(method="pad") + expected.A.loc[3] = 2 + expected.A.loc[13] = 5 + assert_frame_equal(result, expected) + @td.skip_if_no_scipy def test_interp_various(self): df = DataFrame( From cbf7388071cfc65cac2431d7c3f2c66c7428ba7a Mon Sep 17 00:00:00 2001 From: Christian Chwala Date: Wed, 18 Sep 2019 00:28:11 +0200 Subject: [PATCH 18/31] Now using 1D pad and backfill functions in `interpolate_1d_fill()` Also added test that now passes with using the 1D pad functions --- pandas/core/missing.py | 26 +++++++------------------- pandas/tests/frame/test_missing.py | 7 +++++++ 2 files changed, 14 insertions(+), 19 deletions(-) diff --git a/pandas/core/missing.py b/pandas/core/missing.py index 4138dde128f1a..8daf6c7dc1391 100644 --- a/pandas/core/missing.py +++ b/pandas/core/missing.py @@ -529,10 +529,10 @@ def interpolate_1d_fill( dtype=None, ): """ - This a modification of `interpolate_2d`, which is used for methods `pad` + This is a 1D-versoin of `interpolate_2d`, which is used for methods `pad` and `backfill` when interpolating. 
This 1D-version is necessary to be able to handle kwargs `max_gap` and `limit_area` via the function - ` _derive_indices_of_nans_to_preserve. It is used the same way as the + ` _derive_indices_of_nans_to_preserve`. It is used the same way as the 1D-interpolation functions which are based on scipy-interpolation, i.e. via np.apply_along_axis. """ @@ -549,19 +549,13 @@ def interpolate_1d_fill( invalid = isna(yvalues) valid = ~invalid - transf = (lambda x: x) if axis == 0 else (lambda x: x.T) - - # reshape a 1 dim if needed - ndim = values.ndim - if values.ndim == 1: - if axis != 0: # pragma: no cover - raise AssertionError("cannot interpolate on a ndim == 1 with axis != 0") - values = values.reshape(tuple((1,) + values.shape)) + if values.ndim > 1: + raise AssertionError('This only works with 1D data.') if fill_value is None: mask = None else: # todo create faster fill func without masking - mask = mask_missing(transf(values), fill_value) + mask = mask_missing(values, fill_value) preserve_nans = _derive_indices_of_nans_to_preserve( yvalues=yvalues, @@ -575,15 +569,9 @@ def interpolate_1d_fill( method = clean_fill_method(method) if method == "pad": - values = transf(pad_2d(transf(values), limit=limit, mask=mask, dtype=dtype)) + values = pad_1d(values, limit=limit, mask=mask, dtype=dtype) else: - values = transf( - backfill_2d(transf(values), limit=limit, mask=mask, dtype=dtype) - ) - - # reshape back - if ndim == 1: - values = values[0] + values = backfill_1d(values, limit=limit, mask=mask, dtype=dtype) if orig_values.dtype.kind == "M": # convert float back to datetime64 diff --git a/pandas/tests/frame/test_missing.py b/pandas/tests/frame/test_missing.py index 7083fe00b59ff..8d4689327c0f5 100644 --- a/pandas/tests/frame/test_missing.py +++ b/pandas/tests/frame/test_missing.py @@ -570,6 +570,13 @@ def test_frame_interp_max_gap(self): result = df.interpolate(method="linear", max_gap=2, limit_area="inside") assert_frame_equal(result, expected_df) + expected_s = Series( + 
[nan, 1.0, 1.0, 2.0, 2.0, 2.0, 5.0, nan, nan, nan, -1.0, nan, nan] + ) + expected_df = pd.concat([expected_s, expected_s], axis=1) + result = df.interpolate(method="pad", max_gap=2, limit_area="inside") + assert_frame_equal(result, expected_df) + def test_fillna_skip_certain_blocks(self): # don't try to fill boolean, int blocks From 3c55e1e221c4a849a9b5592bcdb7170891aeb51a Mon Sep 17 00:00:00 2001 From: Christian Chwala Date: Tue, 19 Nov 2019 20:23:43 +0100 Subject: [PATCH 19/31] Additional required adjustments after merge with upstream/master --- pandas/core/missing.py | 1 + pandas/tests/frame/test_missing.py | 8 ++++---- pandas/tests/series/test_missing.py | 10 ++++++---- 3 files changed, 11 insertions(+), 8 deletions(-) diff --git a/pandas/core/missing.py b/pandas/core/missing.py index 8c6430750e19b..06509d94090a6 100644 --- a/pandas/core/missing.py +++ b/pandas/core/missing.py @@ -11,6 +11,7 @@ ensure_float64, is_datetime64_dtype, is_datetime64tz_dtype, + is_integer, is_integer_dtype, is_scalar, is_timedelta64_dtype, diff --git a/pandas/tests/frame/test_missing.py b/pandas/tests/frame/test_missing.py index 7ad3bac768370..b3425c5da037a 100644 --- a/pandas/tests/frame/test_missing.py +++ b/pandas/tests/frame/test_missing.py @@ -562,21 +562,21 @@ def test_frame_interp_max_gap(self): expected_df = pd.concat([expected_s, expected_s], axis=1) result = df.interpolate(method="linear", max_gap=2) - assert_frame_equal(result, expected_df) + tm.assert_frame_equal(result, expected_df) expected_s = Series( [nan, 1.0, 1.5, 2.0, 3.0, 4.0, 5.0, nan, nan, nan, -1.0, nan, nan] ) expected_df = pd.concat([expected_s, expected_s], axis=1) result = df.interpolate(method="linear", max_gap=2, limit_area="inside") - assert_frame_equal(result, expected_df) + tm.assert_frame_equal(result, expected_df) expected_s = Series( [nan, 1.0, 1.0, 2.0, 2.0, 2.0, 5.0, nan, nan, nan, -1.0, nan, nan] ) expected_df = pd.concat([expected_s, expected_s], axis=1) result = df.interpolate(method="pad", 
max_gap=2, limit_area="inside") - assert_frame_equal(result, expected_df) + tm.assert_frame_equal(result, expected_df) def test_fillna_skip_certain_blocks(self): # don't try to fill boolean, int blocks @@ -825,7 +825,7 @@ def test_interp_pad(self): result = df.interpolate(method="pad") expected.A.loc[3] = 2 expected.A.loc[13] = 5 - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) @td.skip_if_no_scipy def test_interp_various(self): diff --git a/pandas/tests/series/test_missing.py b/pandas/tests/series/test_missing.py index a38aaf3ef8b35..5b9a64b54006a 100644 --- a/pandas/tests/series/test_missing.py +++ b/pandas/tests/series/test_missing.py @@ -1435,25 +1435,26 @@ def test_interp_limit_to_ends(self): tm.assert_series_equal(result, expected) def test_interp_max_gap(self): + nan = np.nan s = Series([nan, 1.0, nan, 2.0, nan, nan, 5.0, nan, nan, nan, -1.0, nan, nan]) expected = Series( [1.0, 1.0, 1.5, 2.0, 3.0, 4.0, 5.0, nan, nan, nan, -1.0, -1, -1] ) result = s.interpolate(method="linear", max_gap=2) - assert_series_equal(result, expected) + tm.assert_series_equal(result, expected) expected = Series( [nan, 1.0, 1.5, 2.0, 3.0, 4.0, 5.0, nan, nan, nan, -1.0, nan, nan] ) result = s.interpolate(method="linear", max_gap=2, limit_area="inside") - assert_series_equal(result, expected) + tm.assert_series_equal(result, expected) expected = Series( [nan, 1.0, 1, 2.0, 2.0, 2.0, 5.0, nan, nan, nan, -1.0, nan, nan] ) result = s.interpolate(method="pad", max_gap=2, limit_area="inside") - assert_series_equal(result, expected) + tm.assert_series_equal(result, expected) def test_interp_max_gap_nat(self): series = Series([0, 1, 2, iNaT], dtype="M8[ns]") @@ -1461,9 +1462,10 @@ def test_interp_max_gap_nat(self): result = series.interpolate(method="pad", max_gap=2) expected = Series([0, 1, 2, 2], dtype="M8[ns]") - assert_series_equal(result, expected) + tm.assert_series_equal(result, expected) def test_interp_max_gap_errors(self): + nan = np.nan s = 
Series([nan, 1.0, nan, 2.0, nan, nan, 5.0, nan, nan, nan, -1.0, nan, nan]) with pytest.raises( From d1bbcd6aa3cd3960a89b670e62154a05c4fcfa8b Mon Sep 17 00:00:00 2001 From: Christian Chwala Date: Tue, 19 Nov 2019 21:57:42 +0100 Subject: [PATCH 20/31] Removed test for bug with pad which should be solved in a separate PR --- pandas/tests/frame/test_missing.py | 13 ------------- 1 file changed, 13 deletions(-) diff --git a/pandas/tests/frame/test_missing.py b/pandas/tests/frame/test_missing.py index b3425c5da037a..bb66c3b647056 100644 --- a/pandas/tests/frame/test_missing.py +++ b/pandas/tests/frame/test_missing.py @@ -814,19 +814,6 @@ def test_interp_nan_idx(self): with pytest.raises(NotImplementedError): df.interpolate(method="values") - def test_interp_pad(self): - # Test for GH 12918 - df = DataFrame( - {"A": [1, 2, np.nan, 4, 5, np.nan, 7], "C": [1, 2, 3, 5, 8, 13, 21]} - ) - df = df.set_index("C") - expected = df.copy() - - result = df.interpolate(method="pad") - expected.A.loc[3] = 2 - expected.A.loc[13] = 5 - tm.assert_frame_equal(result, expected) - @td.skip_if_no_scipy def test_interp_various(self): df = DataFrame( From 21b3091b2a88724aee19b63c560e3d816c151ac4 Mon Sep 17 00:00:00 2001 From: Christian Chwala Date: Tue, 19 Nov 2019 22:19:47 +0100 Subject: [PATCH 21/31] removed trailing whitespaces --- pandas/core/generic.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 461f06b6a284f..f7deebcbc8005 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -6905,14 +6905,14 @@ def replace( * 'outside': Only fill NaNs outside valid values (extrapolate). .. versionadded:: 0.23.0 - + max_gap : int, optional Maximum number of consecutive NaN values up to which a NaN-gap will be interpolated. For all NaN-gaps wider than that no interpolation is carried out. Must be greater than 0. .. 
versionadded:: 0.25.0 - + downcast : optional, 'infer' or None, defaults to None Downcast dtypes if possible. **kwargs From c96c604db6836d8bbeffc55a7c4bea4622902544 Mon Sep 17 00:00:00 2001 From: Christian Chwala Date: Tue, 19 Nov 2019 23:00:23 +0100 Subject: [PATCH 22/31] fixed formatting for black and flake8 --- pandas/core/internals/blocks.py | 1 - pandas/core/missing.py | 4 ++-- pandas/tests/frame/test_missing.py | 3 +-- 3 files changed, 3 insertions(+), 5 deletions(-) diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 8924c290bdbcc..a12040f590653 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -1175,7 +1175,6 @@ def _interpolate_with_fill( # We only get here for non-ExtensionBlock fill_value = convert_scalar(self.values, fill_value) - # We have to distinguish two cases: # 1. When kwargs `max_gap` or `limit_area` are used: They are not # supported by `missing.interpolate_2d()`. Using these kwargs only diff --git a/pandas/core/missing.py b/pandas/core/missing.py index d00141ddfd9c8..a91db6e744a47 100644 --- a/pandas/core/missing.py +++ b/pandas/core/missing.py @@ -344,7 +344,7 @@ def bfill_nan(arr): # Generate array where the NaN-gap-width is filled in as value # at each NaN location.
cumsum = np.cumsum(invalid).astype("float") - diff = np.zeros_like(yvalues, dtype='float') + diff = np.zeros_like(yvalues, dtype="float") diff[~invalid] = np.pad(np.diff(cumsum[~invalid]), (1, 0), mode="constant") diff[invalid] = np.nan diff = bfill_nan(diff) @@ -566,7 +566,7 @@ def interpolate_1d_fill( valid = ~invalid if values.ndim > 1: - raise AssertionError('This only works with 1D data.') + raise AssertionError("This only works with 1D data.") if fill_value is None: mask = None diff --git a/pandas/tests/frame/test_missing.py b/pandas/tests/frame/test_missing.py index bb66c3b647056..eae69ded5a9cd 100644 --- a/pandas/tests/frame/test_missing.py +++ b/pandas/tests/frame/test_missing.py @@ -552,8 +552,7 @@ def test_frame_fillna_limit(self): def test_frame_interp_max_gap(self): nan = np.nan - s = Series( - [nan, 1.0, nan, 2.0, nan, nan, 5.0, nan, nan, nan, -1.0, nan, nan]) + s = Series([nan, 1.0, nan, 2.0, nan, nan, 5.0, nan, nan, nan, -1.0, nan, nan]) df = pd.concat([s, s], axis=1) expected_s = Series( From bd84fc9ea1ed1cab2ad8429d64e6c8fc71363dde Mon Sep 17 00:00:00 2001 From: Christian Chwala Date: Tue, 19 Nov 2019 23:45:32 +0100 Subject: [PATCH 23/31] updated docstring for interpolate with max_gap * added example * optimized existing text --- pandas/core/generic.py | 36 +++++++++++++++++++++++++++++++++--- 1 file changed, 33 insertions(+), 3 deletions(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index f7deebcbc8005..01f736a768544 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -6908,10 +6908,10 @@ def replace( max_gap : int, optional Maximum number of consecutive NaN values up to which a NaN-gap - will be interpolated. For all NaN-gaps wider than that no - interpolation is carried out. Must be greater than 0. + will be interpolated. All longer NaN-gaps will be left unchanged. + Must be greater than 0. - .. versionadded:: 0.25.0 + ..
versionadded:: 1.0.0 downcast : optional, 'infer' or None, defaults to None Downcast dtypes if possible. @@ -6996,6 +6996,36 @@ def replace( 7 4.71 8 4.71 dtype: object + + Similar to the examples above. Filling in ``NaN`` in a Series + by padding, but here filling only NaN-gaps smaller than a specific + gap width using the kwarg `max_gap`. + + >>> s = pd.Series([np.nan, "single_one", np.nan, + ... "fill_two_more", np.nan, np.nan, np.nan, + ... 4.71, np.nan]) + >>> s + 0 NaN + 1 single_one + 2 NaN + 3 fill_two_more + 4 NaN + 5 NaN + 6 NaN + 7 4.71 + 8 NaN + dtype: object + >>> s.interpolate(method='pad', max_gap=2) + 0 NaN + 1 single_one + 2 single_one + 3 fill_two_more + 4 NaN + 5 NaN + 6 NaN + 7 4.71 + 8 4.71 + dtype: object Filling in ``NaN`` in a Series via polynomial interpolation or splines: Both 'polynomial' and 'spline' methods require that you also specify From 908ffe56d995c234548e002cd5ecc7bd274c03ae Mon Sep 17 00:00:00 2001 From: Christian Chwala Date: Wed, 20 Nov 2019 10:28:29 +0100 Subject: [PATCH 24/31] added max_gap info and example to documentation --- doc/source/user_guide/missing_data.rst | 21 +++++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) diff --git a/doc/source/user_guide/missing_data.rst b/doc/source/user_guide/missing_data.rst index 6c36a6470f841..735a9e440bb3d 100644 --- a/doc/source/user_guide/missing_data.rst +++ b/doc/source/user_guide/missing_data.rst @@ -339,6 +339,10 @@ Interpolation The ``limit_area`` keyword argument was added. +.. versionadded:: 1.0.0 + + The ``max_gap`` keyword argument was added. + Both Series and DataFrame objects have :meth:`~DataFrame.interpolate` that, by default, performs linear interpolation at missing data points. @@ -481,8 +485,9 @@ filled since the last valid observation: .. 
ipython:: python - ser = pd.Series([np.nan, np.nan, 5, np.nan, np.nan, - np.nan, 13, np.nan, np.nan]) + ser = pd.Series([np.nan, np.nan, 2, np.nan, np.nan, + 3, np.nan, np.nan, np.nan, + 13, np.nan, np.nan]) ser # fill all consecutive values in a forward direction @@ -491,6 +496,18 @@ filled since the last valid observation: # fill one consecutive value in a forward direction ser.interpolate(limit=1) +If an interpolation should only be carried out for consecutive ``NaN`` values +of a certain maximum length, the ``max_gap`` keyword, introduced in v1.0.0 +can be used. Any ``NaN`` gap longer than ``max_gap`` will not be modified. +This can be useful, e.g. if an interpolation using the ``scipy`` methods +should be restricted to short NaN-gaps because the expected variation over +longer NaN-gaps forbids using interpolated values. + +.. ipython:: python + + # forward fill only NaN-gaps with a maximum 2 consecutive NaN values + ser.interpolate(max_gap=2) + By default, ``NaN`` values are filled in a ``forward`` direction. Use ``limit_direction`` parameter to fill ``backward`` or from ``both`` directions. From 380ef7c211f759a1763348e910959d7917e49a36 Mon Sep 17 00:00:00 2001 From: Christian Chwala Date: Wed, 20 Nov 2019 10:41:02 +0100 Subject: [PATCH 25/31] added info to whatsnew file --- doc/source/whatsnew/v1.0.0.rst | 2 ++ 1 file changed, 2 insertions(+) diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst index 98d861d999ea9..ad89327de2a4d 100644 --- a/doc/source/whatsnew/v1.0.0.rst +++ b/doc/source/whatsnew/v1.0.0.rst @@ -125,6 +125,7 @@ Other enhancements - Roundtripping DataFrames with nullable integer or string data types to parquet (:meth:`~DataFrame.to_parquet` / :func:`read_parquet`) using the `'pyarrow'` engine now preserve those data types with pyarrow >= 1.0.0 (:issue:`20612`). 
+- :meth:`Series.interpolate` added the ``max_gap`` keyword to limit interpolation to NaN-gaps of a certain length (:issue:`25141`) Build Changes ^^^^^^^^^^^^^ @@ -300,6 +301,7 @@ Performance improvements Bug fixes ~~~~~~~~~ +- ``limit_area`` and ``limit_direction`` now work in :meth:`Series.interpolate` if ``method`` is ``pad`` (:issue:`25141`) Categorical ^^^^^^^^^^^ From 5a1718ae5b412580b4387de549d15e2bd501bd34 Mon Sep 17 00:00:00 2001 From: Christian Chwala Date: Wed, 20 Nov 2019 10:43:45 +0100 Subject: [PATCH 26/31] flake8 --- pandas/core/generic.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 01f736a768544..94b975fcfe6d0 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -6908,7 +6908,7 @@ def replace( max_gap : int, optional Maximum number of consecutive NaN values up to which a NaN-gap - will be interpolated. All longer NaN-gaps will be left unchanged. + will be interpolated. All longer NaN-gaps will be left unchanged. Must be greater than 0. .. versionadded:: 1.0.0 @@ -6996,11 +6996,11 @@ def replace( 7 4.71 8 4.71 dtype: object - - Similar to the examples above. Filling in ``NaN`` in a Series + + Similar to the examples above. Filling in ``NaN`` in a Series by padding, but here filling only NaN-gaps smaller than a specific gap width using the kwarg `max_gap`. - + >>> s = pd.Series([np.nan, "single_one", np.nan, ... "fill_two_more", np.nan, np.nan, np.nan, ... 
4.71, np.nan]) From 16755bd3ce5641f1ba83f086ee78f52dc17caa2b Mon Sep 17 00:00:00 2001 From: Christian Chwala Date: Wed, 20 Nov 2019 14:11:27 +0100 Subject: [PATCH 27/31] update docs with info on limit_direction and method pad --- doc/source/user_guide/missing_data.rst | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/doc/source/user_guide/missing_data.rst b/doc/source/user_guide/missing_data.rst index 735a9e440bb3d..86e9e355eb645 100644 --- a/doc/source/user_guide/missing_data.rst +++ b/doc/source/user_guide/missing_data.rst @@ -497,7 +497,7 @@ filled since the last valid observation: ser.interpolate(limit=1) If an interpolation should only be carried out for consecutive ``NaN`` values -of a certain maximum length, the ``max_gap`` keyword, introduced in v1.0.0 +of a certain maximum length, the ``max_gap`` keyword, introduced in v1.0.0, can be used. Any ``NaN`` gap longer than ``max_gap`` will not be modified. This can be useful, e.g. if an interpolation using the ``scipy`` methods should be restricted to short NaN-gaps because the expected variation over @@ -510,6 +510,9 @@ longer NaN-gaps forbids using interpolated values. By default, ``NaN`` values are filled in a ``forward`` direction. Use ``limit_direction`` parameter to fill ``backward`` or from ``both`` directions. +Note that for methods `pad`, `ffill`, `backfill` and `bfill` ``limit_directions`` +must not be set as these fill methods implicitly are meant to work only in one +direction. .. 
ipython:: python From b58d721b05f61dc357e8c6b462fa345c14bc1c44 Mon Sep 17 00:00:00 2001 From: Christian Chwala Date: Wed, 20 Nov 2019 14:12:32 +0100 Subject: [PATCH 28/31] better test for https://github.com/pandas-dev/pandas/issues/26796 --- pandas/core/generic.py | 26 +++++++++++++++-- pandas/tests/series/test_missing.py | 43 +++++++++++++++++++++++++++++ 2 files changed, 67 insertions(+), 2 deletions(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 94b975fcfe6d0..afc11f6986702 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -6894,7 +6894,9 @@ def replace( Update the data in place if possible. limit_direction : {'forward', 'backward', 'both'}, default 'forward' If limit is specified, consecutive NaNs will be filled in this - direction. + direction. If the methods 'pad' or 'ffill' are used it must be + None or 'forward'. If 'backfill' or 'bfill' are use it must be + None or 'backwards'. limit_area : {`None`, 'inside', 'outside'}, default None If limit is specified, consecutive NaNs will be filled with this restriction. @@ -7082,7 +7084,7 @@ def interpolate( axis=0, limit=None, inplace=False, - limit_direction="forward", + limit_direction=None, limit_area=None, max_gap=None, downcast=None, @@ -7123,6 +7125,26 @@ def interpolate( "column to a numeric dtype." 
) + # Set `limit_direction` depending on `method` + if (method == 'pad') or (method == 'ffill'): + if (limit_direction == 'backward') or (limit_direction == 'both'): + raise ValueError( + "`limit_direction` must not be `%s` for method `%s`" % (limit_direction, method) + ) + else: + limit_direction = 'forward' + elif (method == 'backfill') or (method == 'bfill'): + if (limit_direction == 'forward') or (limit_direction == 'both'): + raise ValueError( + "`limit_direction` must not be `%s` for method `%s`" % (limit_direction, method) + ) + else: + limit_direction = 'backward' + else: + # Set default + if limit_direction is None: + limit_direction = 'forward' + # create/use the index if method == "linear": # prior default diff --git a/pandas/tests/series/test_missing.py b/pandas/tests/series/test_missing.py index 5b9a64b54006a..07295e101110e 100644 --- a/pandas/tests/series/test_missing.py +++ b/pandas/tests/series/test_missing.py @@ -1397,6 +1397,17 @@ def test_interp_limit_area(self): with pytest.raises(ValueError, match=msg): s.interpolate(method="linear", limit_area="abc") + def test_interp_limit_area_with_pad(self): + # Test for issue #26796 -- using `limit_area` with `method=pad` + s = Series([np.nan, np.nan, 3, np.nan, np.nan, np.nan, 7, np.nan, np.nan]) + expected = Series([np.nan, np.nan, 3, 3, 3, 3, 7, np.nan, np.nan]) + result = s.interpolate(method="pad", limit_area="inside") + tm.assert_series_equal(result, expected) + + expected = Series([np.nan, np.nan, 3, np.nan, np.nan, np.nan, 7, 7, 7]) + result = s.interpolate(method="pad", limit_area="outside") + tm.assert_series_equal(result, expected) + def test_interp_limit_direction(self): # These tests are for issue #9218 -- fill NaNs in both directions. 
s = Series([1, 3, np.nan, np.nan, np.nan, 11]) @@ -1422,6 +1433,38 @@ def test_interp_limit_direction(self): result = s.interpolate(method="linear", limit=1, limit_direction="both") tm.assert_series_equal(result, expected) + def test_interp_limit_direction_with_pad_error(self): + # Since `pad` forces a forward fill and `bfill` forces a backward fill + # they should not be used together with `limit_direction` + s = Series([1, 3, np.nan, np.nan, np.nan, 11]) + + with pytest.raises( + ValueError, match="`limit_direction` must not be `backward` for method `pad`" + ): + s.interpolate(method="pad", limit=1, limit_direction="backward") + + with pytest.raises( + ValueError, match="`limit_direction` must not be `backward` for method `ffill`" + ): + s.interpolate(method="ffill", limit=1, limit_direction="backward") + + with pytest.raises( + ValueError, match="`limit_direction` must not be `both` for method `ffill`" + ): + s.interpolate(method="ffill", limit=1, limit_direction="both") + + with pytest.raises( + ValueError, + match="`limit_direction` must not be `forward` for method `backfill`" + ): + s.interpolate(method="backfill", limit=1, limit_direction="forward") + + with pytest.raises( + ValueError, + match="`limit_direction` must not be `forward` for method `bfill`" + ): + s.interpolate(method="bfill", limit=1, limit_direction="forward") + def test_interp_limit_to_ends(self): # These test are for issue #10420 -- flow back to beginning. 
s = Series([np.nan, np.nan, 5, 7, 9, np.nan]) From aa58ffa0b2a6da65a7b8bd6537bcc93850906be6 Mon Sep 17 00:00:00 2001 From: Christian Chwala Date: Wed, 20 Nov 2019 14:33:12 +0100 Subject: [PATCH 29/31] typo, black, flake8 --- doc/source/user_guide/missing_data.rst | 2 +- pandas/core/generic.py | 22 ++++++++++++---------- pandas/tests/series/test_missing.py | 16 +++++++++------- 3 files changed, 22 insertions(+), 18 deletions(-) diff --git a/doc/source/user_guide/missing_data.rst b/doc/source/user_guide/missing_data.rst index 86e9e355eb645..d8c85b2c5245d 100644 --- a/doc/source/user_guide/missing_data.rst +++ b/doc/source/user_guide/missing_data.rst @@ -510,7 +510,7 @@ longer NaN-gaps forbids using interpolated values. By default, ``NaN`` values are filled in a ``forward`` direction. Use ``limit_direction`` parameter to fill ``backward`` or from ``both`` directions. -Note that for methods `pad`, `ffill`, `backfill` and `bfill` ``limit_directions`` +Note that for methods `pad`, `ffill`, `backfill` and `bfill` ``limit_direction`` must not be set as these fill methods implicitly are meant to work only in one direction. diff --git a/pandas/core/generic.py b/pandas/core/generic.py index afc11f6986702..50e3659b0858b 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -6896,7 +6896,7 @@ def replace( If limit is specified, consecutive NaNs will be filled in this direction. If the methods 'pad' or 'ffill' are used it must be None or 'forward'. If 'backfill' or 'bfill' are use it must be - None or 'backwards'. + None or 'backwards'. limit_area : {`None`, 'inside', 'outside'}, default None If limit is specified, consecutive NaNs will be filled with this restriction. 
@@ -7126,24 +7126,26 @@ def interpolate( ) # Set `limit_direction` depending on `method` - if (method == 'pad') or (method == 'ffill'): - if (limit_direction == 'backward') or (limit_direction == 'both'): + if (method == "pad") or (method == "ffill"): + if (limit_direction == "backward") or (limit_direction == "both"): raise ValueError( - "`limit_direction` must not be `%s` for method `%s`" % (limit_direction, method) + "`limit_direction` must not be `%s` for method `%s`" + % (limit_direction, method) ) else: - limit_direction = 'forward' - elif (method == 'backfill') or (method == 'bfill'): - if (limit_direction == 'forward') or (limit_direction == 'both'): + limit_direction = "forward" + elif (method == "backfill") or (method == "bfill"): + if (limit_direction == "forward") or (limit_direction == "both"): raise ValueError( - "`limit_direction` must not be `%s` for method `%s`" % (limit_direction, method) + "`limit_direction` must not be `%s` for method `%s`" + % (limit_direction, method) ) else: - limit_direction = 'backward' + limit_direction = "backward" else: # Set default if limit_direction is None: - limit_direction = 'forward' + limit_direction = "forward" # create/use the index if method == "linear": diff --git a/pandas/tests/series/test_missing.py b/pandas/tests/series/test_missing.py index 07295e101110e..7c9a47f02fb8c 100644 --- a/pandas/tests/series/test_missing.py +++ b/pandas/tests/series/test_missing.py @@ -1439,29 +1439,31 @@ def test_interp_limit_direction_with_pad_error(self): s = Series([1, 3, np.nan, np.nan, np.nan, 11]) with pytest.raises( - ValueError, match="`limit_direction` must not be `backward` for method `pad`" + ValueError, + match="`limit_direction` must not be `backward` for method `pad`", ): s.interpolate(method="pad", limit=1, limit_direction="backward") with pytest.raises( - ValueError, match="`limit_direction` must not be `backward` for method `ffill`" + ValueError, + match="`limit_direction` must not be `backward` for method 
`ffill`", ): s.interpolate(method="ffill", limit=1, limit_direction="backward") with pytest.raises( - ValueError, match="`limit_direction` must not be `both` for method `ffill`" + ValueError, match="`limit_direction` must not be `both` for method `ffill`" ): s.interpolate(method="ffill", limit=1, limit_direction="both") with pytest.raises( - ValueError, - match="`limit_direction` must not be `forward` for method `backfill`" + ValueError, + match="`limit_direction` must not be `forward` for method `backfill`", ): s.interpolate(method="backfill", limit=1, limit_direction="forward") with pytest.raises( - ValueError, - match="`limit_direction` must not be `forward` for method `bfill`" + ValueError, + match="`limit_direction` must not be `forward` for method `bfill`", ): s.interpolate(method="bfill", limit=1, limit_direction="forward") From ae16124b94590b667703f4be91ed7e45cb18bb6f Mon Sep 17 00:00:00 2001 From: Christian Chwala Date: Wed, 20 Nov 2019 14:51:52 +0100 Subject: [PATCH 30/31] update to doc --- doc/source/user_guide/missing_data.rst | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/doc/source/user_guide/missing_data.rst b/doc/source/user_guide/missing_data.rst index d8c85b2c5245d..37da46c1ddc17 100644 --- a/doc/source/user_guide/missing_data.rst +++ b/doc/source/user_guide/missing_data.rst @@ -505,7 +505,8 @@ longer NaN-gaps forbids using interpolated values. .. ipython:: python - # forward fill only NaN-gaps with a maximum 2 consecutive NaN values + ser + # interpolate in forward direction but only NaN-gaps with a maximum 2 consecutive NaN values ser.interpolate(max_gap=2) By default, ``NaN`` values are filled in a ``forward`` direction. 
Use From 28b442c502d2131a51368960798c473b6c8f8b27 Mon Sep 17 00:00:00 2001 From: Christian Chwala Date: Wed, 20 Nov 2019 14:58:25 +0100 Subject: [PATCH 31/31] fix wrong behavior when combining max_gap and limit_direction * limit_direction was not considered before when max_gap was provided * tests have been adjusted for the new correct behavior and additional ones have been added --- pandas/core/missing.py | 18 +++++++++--------- pandas/tests/frame/test_missing.py | 10 +++++++++- pandas/tests/series/test_missing.py | 8 +++++++- 3 files changed, 25 insertions(+), 11 deletions(-) diff --git a/pandas/core/missing.py b/pandas/core/missing.py index a91db6e744a47..7f886b2891905 100644 --- a/pandas/core/missing.py +++ b/pandas/core/missing.py @@ -323,15 +323,15 @@ def _derive_indices_of_nans_to_preserve( # gaps with continuous NaN values of width > max_gap will be preserved. # set preserve_nans based on direction using _interp_limit - if max_gap is None: - if limit_direction == "forward": - preserve_nans = start_nans | set(_interp_limit(invalid, limit, 0)) - elif limit_direction == "backward": - preserve_nans = end_nans | set(_interp_limit(invalid, 0, limit)) - else: - # both directions... just use _interp_limit - preserve_nans = set(_interp_limit(invalid, limit, limit)) + if limit_direction == "forward": + preserve_nans = start_nans | set(_interp_limit(invalid, limit, 0)) + elif limit_direction == "backward": + preserve_nans = end_nans | set(_interp_limit(invalid, 0, limit)) else: + # both directions... just use _interp_limit + preserve_nans = set(_interp_limit(invalid, limit, limit)) + + if max_gap is not None: def bfill_nan(arr): """ Backward-fill NaNs """ @@ -352,7 +352,7 @@ def bfill_nan(arr): # with `max_gap`. Everything smaller than `max_gap` won't matter # in the following. 
diff[np.isnan(diff)] = max_gap - preserve_nans = set(np.flatnonzero((diff > max_gap) & invalid)) + preserve_nans |= set(np.flatnonzero((diff > max_gap) & invalid)) # if limit_area is set, add either mid or outside indices # to preserve_nans GH #16284 diff --git a/pandas/tests/frame/test_missing.py b/pandas/tests/frame/test_missing.py index eae69ded5a9cd..e8fdd51e09833 100644 --- a/pandas/tests/frame/test_missing.py +++ b/pandas/tests/frame/test_missing.py @@ -556,13 +556,21 @@ def test_frame_interp_max_gap(self): df = pd.concat([s, s], axis=1) expected_s = Series( - [1.0, 1.0, 1.5, 2.0, 3.0, 4.0, 5.0, nan, nan, nan, -1.0, -1, -1] + [nan, 1.0, 1.5, 2.0, 3.0, 4.0, 5.0, nan, nan, nan, -1.0, -1, -1] ) expected_df = pd.concat([expected_s, expected_s], axis=1) result = df.interpolate(method="linear", max_gap=2) tm.assert_frame_equal(result, expected_df) + expected_s = Series( + [1.0, 1.0, 1.5, 2.0, 3.0, 4.0, 5.0, nan, nan, nan, -1.0, nan, nan] + ) + expected_df = pd.concat([expected_s, expected_s], axis=1) + + result = df.interpolate(method="linear", max_gap=2, limit_direction="backward") + tm.assert_frame_equal(result, expected_df) + expected_s = Series( [nan, 1.0, 1.5, 2.0, 3.0, 4.0, 5.0, nan, nan, nan, -1.0, nan, nan] ) diff --git a/pandas/tests/series/test_missing.py b/pandas/tests/series/test_missing.py index 7c9a47f02fb8c..3b5c3b9dbc555 100644 --- a/pandas/tests/series/test_missing.py +++ b/pandas/tests/series/test_missing.py @@ -1484,11 +1484,17 @@ def test_interp_max_gap(self): s = Series([nan, 1.0, nan, 2.0, nan, nan, 5.0, nan, nan, nan, -1.0, nan, nan]) expected = Series( - [1.0, 1.0, 1.5, 2.0, 3.0, 4.0, 5.0, nan, nan, nan, -1.0, -1, -1] + [nan, 1.0, 1.5, 2.0, 3.0, 4.0, 5.0, nan, nan, nan, -1.0, -1, -1] ) result = s.interpolate(method="linear", max_gap=2) tm.assert_series_equal(result, expected) + expected = Series( + [1.0, 1.0, 1.5, 2.0, 3.0, 4.0, 5.0, nan, nan, nan, -1.0, nan, nan] + ) + result = s.interpolate(method="linear", max_gap=2, 
limit_direction="backward") + tm.assert_series_equal(result, expected) + expected = Series( [nan, 1.0, 1.5, 2.0, 3.0, 4.0, 5.0, nan, nan, nan, -1.0, nan, nan] )