From ab342345391bfbc2c1de43e7a09e22ae3e51e159 Mon Sep 17 00:00:00 2001 From: Brock Mendel Date: Sat, 14 Jul 2018 12:56:45 -0700 Subject: [PATCH 1/4] fix interpolation for datetimelike dtypes --- doc/source/whatsnew/v0.24.0.txt | 1 + pandas/core/generic.py | 13 ++++-- pandas/core/internals.py | 60 ++++++++++++++++++------- pandas/tests/frame/test_missing.py | 70 ++++++++++++++++++++++++++++- pandas/tests/series/test_missing.py | 33 ++++++++++++++ 5 files changed, 157 insertions(+), 20 deletions(-) diff --git a/doc/source/whatsnew/v0.24.0.txt b/doc/source/whatsnew/v0.24.0.txt index a17bf7c8bd6e9..1a91c26ca9209 100644 --- a/doc/source/whatsnew/v0.24.0.txt +++ b/doc/source/whatsnew/v0.24.0.txt @@ -371,6 +371,7 @@ Datetimelike - Fixed bug where two :class:`DateOffset` objects with different ``normalize`` attributes could evaluate as equal (:issue:`21404`) - Fixed bug where :meth:`Timestamp.resolution` incorrectly returned 1-microsecond ``timedelta`` instead of 1-nanosecond :class:`Timedelta` (:issue:`21336`,:issue:`21365`) +- Fixed bug in :meth:`DataFrame.interpolate` and :meth:`Series.interpolate` where null values were not filled for dtypes of ``datetime64[ns]``, ``datetime64[ns, tz]``, ``timedelta64[ns]`` (:issue:`????`) Timedelta ^^^^^^^^^ diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 8da678e0adec0..893f102014dfd 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -683,7 +683,11 @@ def transpose(self, *args, **kwargs): new_axes = self._construct_axes_dict_from(self, [self._get_axis(x) for x in axes_names]) - new_values = self.values.transpose(axes_numbers) + values = self.values + if isinstance(values, DatetimeIndex): + # kludge for tz-aware case. 
See GH#19198 + values = values.astype('O').values.reshape(self.shape) + new_values = values.transpose(axes_numbers) if kwargs.pop('copy', None) or (len(args) and args[-1]): new_values = new_values.copy() @@ -6097,8 +6101,11 @@ def interpolate(self, method='linear', axis=0, limit=None, inplace=False, raise ValueError("Only `method=linear` interpolation is supported " "on MultiIndexes.") - if _maybe_transposed_self._data.get_dtype_counts().get( - 'object') == len(_maybe_transposed_self.T): + dtype_counts = _maybe_transposed_self._data.get_dtype_counts() + if ('object' in dtype_counts and + dtype_counts.get('object') == len(_maybe_transposed_self.T)): + # Try to short-circuit transposing to avoid superfluous dimension + # errors GH#13287, GH#17539, GH#19197 raise TypeError("Cannot interpolate with all NaNs.") # create/use the index diff --git a/pandas/core/internals.py b/pandas/core/internals.py index 208d7b8bcf8a7..159e31ab55425 100644 --- a/pandas/core/internals.py +++ b/pandas/core/internals.py @@ -15,6 +15,7 @@ from pandas.core.base import PandasObject +import pandas.core.dtypes.common as ct from pandas.core.dtypes.dtypes import ( ExtensionDtype, DatetimeTZDtype, PandasExtensionDtype, @@ -1158,20 +1159,19 @@ def check_int_bool(self, inplace): try: m = missing.clean_interp_method(method, **kwargs) except: - m = None + raise ValueError("invalid method '{0}' to interpolate."
+ .format(method)) - if m is not None: - r = check_int_bool(self, inplace) - if r is not None: - return r - return self._interpolate(method=m, index=index, values=values, - axis=axis, limit=limit, - limit_direction=limit_direction, - limit_area=limit_area, - fill_value=fill_value, inplace=inplace, - downcast=downcast, mgr=mgr, **kwargs) + r = check_int_bool(self, inplace) + if r is not None: + return r + return self._interpolate(method=m, index=index, values=values, + axis=axis, limit=limit, + limit_direction=limit_direction, + limit_area=limit_area, + fill_value=fill_value, inplace=inplace, + downcast=downcast, mgr=mgr, **kwargs) - raise ValueError("invalid method '{0}' to interpolate.".format(method)) def _interpolate_with_fill(self, method='pad', axis=0, inplace=False, limit=None, fill_value=None, coerce=False, @@ -1199,6 +1199,7 @@ def _interpolate_with_fill(self, method='pad', axis=0, inplace=False, blocks = [self.make_block_same_class(values, ndim=self.ndim)] return self._maybe_downcast(blocks, downcast) + # TODO: ignoring `values`? def _interpolate(self, method=None, index=None, values=None, fill_value=None, axis=0, limit=None, limit_direction='forward', limit_area=None, @@ -1206,13 +1207,27 @@ def _interpolate(self, method=None, index=None, values=None, """ interpolate using scipy wrappers """ inplace = validate_bool_kwarg(inplace, 'inplace') - data = self.values if inplace else self.values.copy() # only deal with floats - if not self.is_float: + if ct.needs_i8_conversion(self.dtype): + if ct.is_period_dtype(self.dtype): + raise NotImplementedError("PeriodDtype columns/Series don't " + "exist yet, but will soon. 
" + "When they do, test them!") + mask = isna(self.values) + values = self.values + + # DatetimeTZBlock.values is DatetimeIndex, need to cast/shape + values = getattr(values, 'values', values).reshape(self.shape) + data = values.astype(np.float64) + data[mask.reshape(self.shape)] = np.nan + elif not self.is_float: if not self.is_integer: return self - data = data.astype(np.float64) + data = self.values.astype(np.float64) + else: + # avoid making a copy if possible + data = self.values if inplace else self.values.copy() if fill_value is None: fill_value = self.fill_value @@ -1224,7 +1239,6 @@ def _interpolate(self, method=None, index=None, values=None, # process 1-d slices in the axis direction def func(x): - # process a 1-d slice, returning it # should the axis argument be handled below in apply_along_axis? # i.e. not an arg to missing.interpolate_1d @@ -1236,6 +1250,20 @@ def func(x): # interp each column independently interp_values = np.apply_along_axis(func, axis, data) + if ct.needs_i8_conversion(self.dtype): + # convert remaining NaNs back to NaT and cast back to own dtype + mask = isna(interp_values) + interp_values[mask] = fill_value # TODO: or self.fill_value? 
+ + # Note: we need to get to a numpy dtype (M8[ns] or m8[ns]) and + # not a pandas tz-aware dtype (for now) + dtype = self.dtype.base + assert isinstance(dtype, np.dtype) + interp_values = interp_values.astype(dtype) + if is_datetimetz(self): + # squeeze() since we expanded dimension above + held = self._holder(interp_values.squeeze(), tz='UTC') + interp_values = held.tz_convert(self.dtype.tz) blocks = [self.make_block_same_class(interp_values)] return self._maybe_downcast(blocks, downcast) diff --git a/pandas/tests/frame/test_missing.py b/pandas/tests/frame/test_missing.py index 9567c08781856..3296e60a0edab 100644 --- a/pandas/tests/frame/test_missing.py +++ b/pandas/tests/frame/test_missing.py @@ -360,7 +360,10 @@ def test_fillna_categorical_nan(self): cat = Categorical([np.nan, 2, np.nan]) val = Categorical([np.nan, np.nan, np.nan]) df = DataFrame({"cats": cat, "vals": val}) - res = df.fillna(df.median()) + with tm.assert_produces_warning(RuntimeWarning): + # RuntimeWarning: All-NaN slice encountered + res = df.fillna(df.median()) + v_exp = [np.nan, np.nan, np.nan] df_exp = DataFrame({"cats": [2, 2, 2], "vals": v_exp}, dtype='category') @@ -855,3 +858,68 @@ def test_interp_ignore_all_good(self): # all good result = df[['B', 'D']].interpolate(downcast=None) assert_frame_equal(result, df[['B', 'D']]) + + @pytest.mark.parametrize('use_idx', [True, False]) + @pytest.mark.parametrize('tz', [None, 'US/Central']) + def test_interpolate_dt64_values(self, tz, use_idx): + dti = pd.date_range('2016-01-01', periods=10, tz=tz) + index = dti if use_idx else None + + # Copy to avoid corrupting dti, see GH#21907 + ser = pd.Series(dti, index=index).copy() + ser[::3] = pd.NaT + + expected = pd.Series(dti, index=index) + expected.iloc[0] = pd.NaT + expected.iloc[-1] = expected.iloc[-2] + + df = ser.to_frame() + expected = expected.to_frame() + + result = df.interpolate(method='linear') + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize('use_idx', [True, 
False]) + def test_interpolate_td64_values(self, use_idx): + tdi = pd.timedelta_range('1D', periods=10) + index = tdi if use_idx else None + + ser = pd.Series(tdi, index=index) + ser[::3] = pd.NaT + + expected = pd.Series(tdi, index=index) + expected.iloc[0] = pd.NaT + expected.iloc[-1] = expected.iloc[-2] + + df = ser.to_frame() + expected = expected.to_frame() + + result = df.interpolate(method='linear') + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize('use_idx', [True, False]) + def test_interpolate_datetimelike_and_object(self, use_idx): + # Check that dt64/td64 with more than one column doesn't get + # screwed up by .transpose() with an object column present. + dti_tz = pd.date_range('2016-01-01', periods=10, tz='US/Central') + dti_naive = pd.date_range('2016-01-01', periods=10, tz=None) + tdi = pd.timedelta_range('1D', periods=10) + objcol = list('ABCDEFGHIJ') + + index = tdi if use_idx else None + + df = pd.DataFrame({'aware': dti_tz, + 'naive': dti_naive, + 'tdi': tdi, + 'obj': objcol}, + columns=['naive', 'aware', 'tdi', 'obj'], + index=index) + + expected = df.copy() + expected.iloc[0, :-1] = pd.NaT + expected.iloc[-1, :-1] = df.iloc[-2, :-1] + + df.iloc[::3, :-1] = pd.NaT + + result = df.interpolate(method='linear') + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/series/test_missing.py b/pandas/tests/series/test_missing.py index 2bc44cb1c683f..debcf987dac27 100644 --- a/pandas/tests/series/test_missing.py +++ b/pandas/tests/series/test_missing.py @@ -1317,3 +1317,36 @@ def test_series_interpolate_intraday(self): result = ts.reindex(new_index).interpolate(method='time') tm.assert_numpy_array_equal(result.values, exp.values) + + # TODO: De-duplicate with similar tests in test.frame.test_missing? 
+ @pytest.mark.parametrize('use_idx', [True, False]) + @pytest.mark.parametrize('tz', [None, 'US/Central']) + def test_interpolate_dt64_values(self, tz, use_idx): + dti = pd.date_range('2016-01-01', periods=10, tz=tz) + index = dti if use_idx else None + + # Copy to avoid corrupting dti, see GH#21907 + ser = pd.Series(dti, index=index).copy() + ser[::3] = pd.NaT + + expected = pd.Series(dti, index=index) + expected.iloc[0] = pd.NaT + expected.iloc[-1] = expected.iloc[-2] + + result = ser.interpolate(method='linear') + tm.assert_series_equal(result, expected) + + @pytest.mark.parametrize('use_idx', [True, False]) + def test_interpolate_td64_values(self, use_idx): + tdi = pd.timedelta_range('1D', periods=10) + index = tdi if use_idx else None + + ser = pd.Series(tdi, index=index) + ser[::3] = pd.NaT + + expected = pd.Series(tdi, index=index) + expected.iloc[0] = pd.NaT + expected.iloc[-1] = expected.iloc[-2] + + result = ser.interpolate(method='linear') + tm.assert_series_equal(result, expected) From e43bc85d2c465540d6fad8c9813f19758e15a422 Mon Sep 17 00:00:00 2001 From: Brock Mendel Date: Sat, 14 Jul 2018 13:01:04 -0700 Subject: [PATCH 2/4] revert unrelated change --- pandas/core/generic.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 893f102014dfd..e7d9258a9d832 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -683,10 +683,8 @@ def transpose(self, *args, **kwargs): new_axes = self._construct_axes_dict_from(self, [self._get_axis(x) for x in axes_names]) - values = self.values - if isinstance(values, DatetimeIndex): - # kludge for tz-aware case. 
See GH#19198 - values = values.astype('O').values.reshape(self.shape) + new_values = self.values.transpose(axes_numbers) + new_values = values.transpose(axes_numbers) if kwargs.pop('copy', None) or (len(args) and args[-1]): new_values = new_values.copy() From 2dffb4c76ccb49241ba4259487c6503114d4eb2c Mon Sep 17 00:00:00 2001 From: Brock Mendel Date: Sat, 14 Jul 2018 13:14:05 -0700 Subject: [PATCH 3/4] add GH references --- doc/source/whatsnew/v0.24.0.txt | 2 +- pandas/tests/frame/test_missing.py | 3 +++ pandas/tests/series/test_missing.py | 2 ++ 3 files changed, 6 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v0.24.0.txt b/doc/source/whatsnew/v0.24.0.txt index 1a91c26ca9209..9c24e31de4fd5 100644 --- a/doc/source/whatsnew/v0.24.0.txt +++ b/doc/source/whatsnew/v0.24.0.txt @@ -371,7 +371,7 @@ Datetimelike - Fixed bug where two :class:`DateOffset` objects with different ``normalize`` attributes could evaluate as equal (:issue:`21404`) - Fixed bug where :meth:`Timestamp.resolution` incorrectly returned 1-microsecond ``timedelta`` instead of 1-nanosecond :class:`Timedelta` (:issue:`21336`,:issue:`21365`) -- Fixed bug in :meth:`DataFrame.interpolate` and :meth:`Series.interpolate` where null values were not filled for dtypes of ``datetime64[ns]``, ``datetime64[ns, tz]``, ``timedelta64[ns]`` (:issue:`????`) +- Fixed bug in :meth:`DataFrame.interpolate` and :meth:`Series.interpolate` where null values were not filled for dtypes of ``datetime64[ns]``, ``datetime64[ns, tz]``, ``timedelta64[ns]`` (:issue:`21915`) Timedelta ^^^^^^^^^ diff --git a/pandas/tests/frame/test_missing.py b/pandas/tests/frame/test_missing.py index 3296e60a0edab..b4416432764cf 100644 --- a/pandas/tests/frame/test_missing.py +++ b/pandas/tests/frame/test_missing.py @@ -862,6 +862,7 @@ def test_interp_ignore_all_good(self): @pytest.mark.parametrize('use_idx', [True, False]) @pytest.mark.parametrize('tz', [None, 'US/Central']) def test_interpolate_dt64_values(self, tz, use_idx): + # GH#21915 
dti = pd.date_range('2016-01-01', periods=10, tz=tz) index = dti if use_idx else None @@ -881,6 +882,7 @@ def test_interpolate_dt64_values(self, tz, use_idx): @pytest.mark.parametrize('use_idx', [True, False]) def test_interpolate_td64_values(self, use_idx): + # GH#21915 tdi = pd.timedelta_range('1D', periods=10) index = tdi if use_idx else None @@ -899,6 +901,7 @@ def test_interpolate_td64_values(self, use_idx): @pytest.mark.parametrize('use_idx', [True, False]) def test_interpolate_datetimelike_and_object(self, use_idx): + # GH#21915 # Check that dt64/td64 with more than one column doesn't get # screwed up by .transpose() with an object column present. dti_tz = pd.date_range('2016-01-01', periods=10, tz='US/Central') diff --git a/pandas/tests/series/test_missing.py b/pandas/tests/series/test_missing.py index debcf987dac27..96f4a60a8c53f 100644 --- a/pandas/tests/series/test_missing.py +++ b/pandas/tests/series/test_missing.py @@ -1322,6 +1322,7 @@ def test_series_interpolate_intraday(self): @pytest.mark.parametrize('use_idx', [True, False]) @pytest.mark.parametrize('tz', [None, 'US/Central']) def test_interpolate_dt64_values(self, tz, use_idx): + # GH#21915 dti = pd.date_range('2016-01-01', periods=10, tz=tz) index = dti if use_idx else None @@ -1338,6 +1339,7 @@ def test_interpolate_dt64_values(self, tz, use_idx): @pytest.mark.parametrize('use_idx', [True, False]) def test_interpolate_td64_values(self, use_idx): + # GH#21915 tdi = pd.timedelta_range('1D', periods=10) index = tdi if use_idx else None From 9092fc1a4cd77caf7eba3af657be14d33b4240a9 Mon Sep 17 00:00:00 2001 From: Brock Mendel Date: Sat, 14 Jul 2018 13:38:35 -0700 Subject: [PATCH 4/4] fix reversion left-behind --- pandas/core/generic.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index e7d9258a9d832..8488584d21f65 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -684,8 +684,6 @@ def transpose(self, *args, **kwargs): new_axes = 
self._construct_axes_dict_from(self, [self._get_axis(x) for x in axes_names]) new_values = self.values.transpose(axes_numbers) - - new_values = values.transpose(axes_numbers) if kwargs.pop('copy', None) or (len(args) and args[-1]): new_values = new_values.copy()