diff --git a/doc/source/whatsnew/v0.24.0.rst b/doc/source/whatsnew/v0.24.0.rst index 1fb43de5f4c5a..2693c98e56582 100644 --- a/doc/source/whatsnew/v0.24.0.rst +++ b/doc/source/whatsnew/v0.24.0.rst @@ -31,6 +31,7 @@ New features - :func:`read_feather` now accepts ``columns`` as an argument, allowing the user to specify which columns should be read. (:issue:`24025`) - :func:`DataFrame.to_html` now accepts ``render_links`` as an argument, allowing the user to generate HTML with links to any URLs that appear in the DataFrame. See the :ref:`section on writing HTML <io.html>` in the IO docs for example usage. (:issue:`2679`) +- :meth:`DataFrame.shift`, :meth:`Series.shift`, :meth:`ExtensionArray.shift`, :meth:`SparseArray.shift`, :meth:`Period.shift`, :meth:`GroupBy.shift`, :meth:`Categorical.shift`, :meth:`NDFrame.shift` and :meth:`Block.shift` now accept ``fill_value`` as an argument, allowing the user to specify a value which will be used instead of NA/NaT in the empty periods. (:issue:`15486`) .. _whatsnew_0240.values_api: diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index cf145064fd7b1..262f043e1f0a1 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -15,6 +15,7 @@ from pandas.core.dtypes.common import is_list_like from pandas.core.dtypes.generic import ABCIndexClass, ABCSeries +from pandas.core.dtypes.missing import isna from pandas.core import ops @@ -446,8 +447,8 @@ def dropna(self): """ return self[~self.isna()] - def shift(self, periods=1): - # type: (int) -> ExtensionArray + def shift(self, periods=1, fill_value=None): + # type: (int, object) -> ExtensionArray """ Shift values by desired number. @@ -462,6 +463,12 @@ def shift(self, periods=1): The number of periods to shift. Negative values are allowed for shifting backwards. + fill_value : object, optional + The scalar value to use for newly introduced missing values. + The default is ``self.dtype.na_value``. + + ..
versionadded:: 0.24.0 + Returns ------- shifted : ExtensionArray @@ -480,8 +487,11 @@ def shift(self, periods=1): if not len(self) or periods == 0: return self.copy() + if isna(fill_value): + fill_value = self.dtype.na_value + empty = self._from_sequence( - [self.dtype.na_value] * min(abs(periods), len(self)), + [fill_value] * min(abs(periods), len(self)), dtype=self.dtype ) if periods > 0: diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 6ccb8dc5d2725..86fbb6f966089 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -1258,7 +1258,7 @@ def shape(self): return tuple([len(self._codes)]) - def shift(self, periods): + def shift(self, periods, fill_value=None): """ Shift Categorical by desired number of periods. @@ -1266,6 +1266,10 @@ def shift(self, periods): ---------- periods : int Number of periods to move, can be positive or negative + fill_value : object, optional + The scalar value to use for newly introduced missing values. + + .. 
versionadded:: 0.24.0 Returns ------- @@ -1278,10 +1282,18 @@ def shift(self, periods): raise NotImplementedError("Categorical with ndim > 1.") if np.prod(codes.shape) and (periods != 0): codes = np.roll(codes, ensure_platform_int(periods), axis=0) + if isna(fill_value): + fill_value = -1 + elif fill_value in self.categories: + fill_value = self.categories.get_loc(fill_value) + else: + raise ValueError("'fill_value={}' is not present " + "in this Categorical's " + "categories".format(fill_value)) if periods > 0: - codes[:periods] = -1 + codes[:periods] = fill_value else: - codes[periods:] = -1 + codes[periods:] = fill_value return self.from_codes(codes, categories=self.categories, ordered=self.ordered) diff --git a/pandas/core/arrays/period.py b/pandas/core/arrays/period.py index 60febc5f5636d..a29454d2563e4 100644 --- a/pandas/core/arrays/period.py +++ b/pandas/core/arrays/period.py @@ -453,7 +453,7 @@ def value_counts(self, dropna=False): # -------------------------------------------------------------------- - def shift(self, periods=1): + def shift(self, periods=1, fill_value=None): """ Shift values by desired number. @@ -467,6 +467,9 @@ def shift(self, periods=1): periods : int, default 1 The number of periods to shift. Negative values are allowed for shifting backwards. + fill_value : optional, default NaT + + .. versionadded:: 0.24.0 Returns ------- @@ -475,7 +478,7 @@ def shift(self, periods=1): # TODO(DatetimeArray): remove # The semantics for Index.shift differ from EA.shift # then just call super. 
- return ExtensionArray.shift(self, periods) + return ExtensionArray.shift(self, periods, fill_value=fill_value) def _time_shift(self, n, freq=None): """ diff --git a/pandas/core/arrays/sparse.py b/pandas/core/arrays/sparse.py index 9e1d2efc21b81..e4a8c21bbb839 100644 --- a/pandas/core/arrays/sparse.py +++ b/pandas/core/arrays/sparse.py @@ -889,12 +889,15 @@ def fillna(self, value=None, method=None, limit=None): return self._simple_new(new_values, self._sparse_index, new_dtype) - def shift(self, periods=1): + def shift(self, periods=1, fill_value=None): if not len(self) or periods == 0: return self.copy() - subtype = np.result_type(np.nan, self.dtype.subtype) + if isna(fill_value): + fill_value = self.dtype.na_value + + subtype = np.result_type(fill_value, self.dtype.subtype) if subtype != self.dtype.subtype: # just coerce up front @@ -903,7 +906,7 @@ def shift(self, periods=1): arr = self empty = self._from_sequence( - [self.dtype.na_value] * min(abs(periods), len(self)), + [fill_value] * min(abs(periods), len(self)), dtype=arr.dtype ) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index c4537db254132..e99bb88fe80b5 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -3939,9 +3939,9 @@ def replace(self, to_replace=None, value=None, inplace=False, limit=None, method=method) @Appender(_shared_docs['shift'] % _shared_doc_kwargs) - def shift(self, periods=1, freq=None, axis=0): + def shift(self, periods=1, freq=None, axis=0, fill_value=None): return super(DataFrame, self).shift(periods=periods, freq=freq, - axis=axis) + axis=axis, fill_value=fill_value) def set_index(self, keys, drop=True, append=False, inplace=False, verify_integrity=False): diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 6eb6bc124c80a..d893ebded330d 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -8849,6 +8849,14 @@ def mask(self, cond, other=np.nan, inplace=False, axis=None, level=None, extend the index when shifting and preserve the 
original data. axis : {0 or 'index', 1 or 'columns', None}, default None Shift direction. + fill_value : object, optional + The scalar value to use for newly introduced missing values. + The default depends on the dtype of `self`. + For numeric data, ``np.nan`` is used. + For datetime, timedelta, period data, etc., :attr:`NaT` is used. + For extension dtypes, ``self.dtype.na_value`` is used. + + .. versionadded:: 0.24.0 Returns ------- @@ -8884,16 +8892,25 @@ def mask(self, cond, other=np.nan, inplace=False, axis=None, level=None, 2 NaN 15.0 18.0 3 NaN 30.0 33.0 4 NaN 45.0 48.0 + + >>> df.shift(periods=3, fill_value=0) + Col1 Col2 Col3 + 0 0 0 0 + 1 0 0 0 + 2 0 0 0 + 3 10 13 17 + 4 20 23 27 """) @Appender(_shared_docs['shift'] % _shared_doc_kwargs) - def shift(self, periods=1, freq=None, axis=0): + def shift(self, periods=1, freq=None, axis=0, fill_value=None): if periods == 0: return self.copy() block_axis = self._get_block_manager_axis(axis) if freq is None: - new_data = self._data.shift(periods=periods, axis=block_axis) + new_data = self._data.shift(periods=periods, axis=block_axis, + fill_value=fill_value) else: return self.tshift(periods, freq) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index f7c6ccdc25395..60b6c843492c7 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -1994,7 +1994,7 @@ def _get_cythonized_result(self, how, grouper, aggregate=False, @Substitution(name='groupby') @Appender(_common_see_also) - def shift(self, periods=1, freq=None, axis=0): + def shift(self, periods=1, freq=None, axis=0, fill_value=None): """ Shift each group by periods observations. @@ -2004,10 +2004,14 @@ def shift(self, periods=1, freq=None, axis=0): number of periods to shift freq : frequency string axis : axis to shift, default 0 + fill_value : optional + + ..
versionadded:: 0.24.0 """ - if freq is not None or axis != 0: - return self.apply(lambda x: x.shift(periods, freq, axis)) + if freq is not None or axis != 0 or not isna(fill_value): + return self.apply(lambda x: x.shift(periods, freq, + axis, fill_value)) return self._get_cythonized_result('group_shift_indexer', self.grouper, cython_dtype=np.int64, diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 1383ce09bc2d0..0ccfab93c2830 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -1261,12 +1261,12 @@ def diff(self, n, axis=1): new_values = algos.diff(self.values, n, axis=axis) return [self.make_block(values=new_values)] - def shift(self, periods, axis=0): + def shift(self, periods, axis=0, fill_value=None): """ shift the block by periods, possibly upcast """ # convert integer to float if necessary. need to do a lot more than # that, handle boolean etc also - new_values, fill_value = maybe_upcast(self.values) + new_values, fill_value = maybe_upcast(self.values, fill_value) # make sure array sent to np.roll is c_contiguous f_ordered = new_values.flags.f_contiguous @@ -1955,7 +1955,7 @@ def interpolate(self, method='pad', axis=0, inplace=False, limit=None, limit=limit), placement=self.mgr_locs) - def shift(self, periods, axis=0): + def shift(self, periods, axis=0, fill_value=None): """ Shift the block by `periods`. @@ -1963,9 +1963,11 @@ def shift(self, periods, axis=0): ExtensionBlock. 
""" # type: (int, Optional[BlockPlacement]) -> List[ExtensionBlock] - return [self.make_block_same_class(self.values.shift(periods=periods), - placement=self.mgr_locs, - ndim=self.ndim)] + return [ + self.make_block_same_class( + self.values.shift(periods=periods, fill_value=fill_value), + placement=self.mgr_locs, ndim=self.ndim) + ] def where(self, other, cond, align=True, errors='raise', try_cast=False, axis=0, transpose=False): @@ -3023,7 +3025,7 @@ def _try_coerce_result(self, result): def _box_func(self): return lambda x: tslibs.Timestamp(x, tz=self.dtype.tz) - def shift(self, periods, axis=0): + def shift(self, periods, axis=0, fill_value=None): """ shift the block by periods """ # think about moving this to the DatetimeIndex. This is a non-freq @@ -3038,10 +3040,12 @@ def shift(self, periods, axis=0): new_values = self.values.asi8.take(indexer) + if isna(fill_value): + fill_value = tslibs.iNaT if periods > 0: - new_values[:periods] = tslibs.iNaT + new_values[:periods] = fill_value else: - new_values[periods:] = tslibs.iNaT + new_values[periods:] = fill_value new_values = self.values._shallow_copy(new_values) return [self.make_block_same_class(new_values, diff --git a/pandas/core/series.py b/pandas/core/series.py index 773f2d17cf0fc..5ef316b000a7f 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -3716,8 +3716,9 @@ def replace(self, to_replace=None, value=None, inplace=False, limit=None, regex=regex, method=method) @Appender(generic._shared_docs['shift'] % _shared_doc_kwargs) - def shift(self, periods=1, freq=None, axis=0): - return super(Series, self).shift(periods=periods, freq=freq, axis=axis) + def shift(self, periods=1, freq=None, axis=0, fill_value=None): + return super(Series, self).shift(periods=periods, freq=freq, axis=axis, + fill_value=fill_value) def reindex_axis(self, labels, axis=0, **kwargs): """ diff --git a/pandas/tests/arrays/sparse/test_array.py b/pandas/tests/arrays/sparse/test_array.py index 7d8cc34ae1462..9c13a20726553 
100644 --- a/pandas/tests/arrays/sparse/test_array.py +++ b/pandas/tests/arrays/sparse/test_array.py @@ -10,6 +10,7 @@ import pandas.util._test_decorators as td import pandas as pd +from pandas import isna from pandas.core.sparse.api import SparseArray, SparseDtype, SparseSeries import pandas.util.testing as tm from pandas.util.testing import assert_almost_equal @@ -262,6 +263,18 @@ def test_take_negative(self): exp = SparseArray(np.take(self.arr_data, [-4, -3, -2])) tm.assert_sp_array_equal(self.arr.take([-4, -3, -2]), exp) + @pytest.mark.parametrize('fill_value', [0, None, np.nan]) + def test_shift_fill_value(self, fill_value): + # GH #24128 + sparse = SparseArray(np.array([1, 0, 0, 3, 0]), + fill_value=8.0) + res = sparse.shift(1, fill_value=fill_value) + if isna(fill_value): + fill_value = res.dtype.na_value + exp = SparseArray(np.array([fill_value, 1, 0, 0, 3]), + fill_value=8.0) + tm.assert_sp_array_equal(res, exp) + def test_bad_take(self): with pytest.raises(IndexError, match="bounds"): self.arr.take([11]) diff --git a/pandas/tests/extension/base/methods.py b/pandas/tests/extension/base/methods.py index 4a409a84f3db4..2bcdb8aa7488c 100644 --- a/pandas/tests/extension/base/methods.py +++ b/pandas/tests/extension/base/methods.py @@ -221,6 +221,17 @@ def test_shift_empty_array(self, data, periods): expected = empty self.assert_extension_array_equal(result, expected) + def test_shift_fill_value(self, data): + arr = data[:4] + fill_value = data[0] + result = arr.shift(1, fill_value=fill_value) + expected = data.take([0, 0, 1, 2]) + self.assert_extension_array_equal(result, expected) + + result = arr.shift(-2, fill_value=fill_value) + expected = data.take([2, 3, 0, 0]) + self.assert_extension_array_equal(result, expected) + @pytest.mark.parametrize("as_frame", [True, False]) def test_hash_pandas_object_works(self, data, as_frame): # https://github.com/pandas-dev/pandas/issues/23066 diff --git a/pandas/tests/frame/test_timeseries.py 
b/pandas/tests/frame/test_timeseries.py index 52f0b30bf0f0c..02b83aaf5c131 100644 --- a/pandas/tests/frame/test_timeseries.py +++ b/pandas/tests/frame/test_timeseries.py @@ -320,6 +320,20 @@ def test_shift_categorical(self): xp = DataFrame({'one': s1.shift(1), 'two': s2.shift(1)}) assert_frame_equal(rs, xp) + def test_shift_fill_value(self): + # GH #24128 + df = DataFrame([1, 2, 3, 4, 5], + index=date_range('1/1/2000', periods=5, freq='H')) + exp = DataFrame([0, 1, 2, 3, 4], + index=date_range('1/1/2000', periods=5, freq='H')) + result = df.shift(1, fill_value=0) + assert_frame_equal(result, exp) + + exp = DataFrame([0, 0, 1, 2, 3], + index=date_range('1/1/2000', periods=5, freq='H')) + result = df.shift(2, fill_value=0) + assert_frame_equal(result, exp) + def test_shift_empty(self): # Regression test for #8019 df = DataFrame({'foo': []}) diff --git a/pandas/tests/groupby/test_categorical.py b/pandas/tests/groupby/test_categorical.py index 264f2567e45c1..7eda113be0e36 100644 --- a/pandas/tests/groupby/test_categorical.py +++ b/pandas/tests/groupby/test_categorical.py @@ -9,7 +9,8 @@ from pandas.compat import PY37 from pandas import (Index, MultiIndex, CategoricalIndex, DataFrame, Categorical, Series, qcut) -from pandas.util.testing import assert_frame_equal, assert_series_equal +from pandas.util.testing import (assert_equal, + assert_frame_equal, assert_series_equal) import pandas.util.testing as tm @@ -860,3 +861,13 @@ def test_groupby_multiindex_categorical_datetime(): expected = pd.DataFrame( {'values': [0, 4, 8, 3, 4, 5, 6, np.nan, 2]}, index=idx) assert_frame_equal(result, expected) + + +@pytest.mark.parametrize('fill_value', [None, np.nan, pd.NaT]) +def test_shift(fill_value): + ct = pd.Categorical(['a', 'b', 'c', 'd'], + categories=['a', 'b', 'c', 'd'], ordered=False) + expected = pd.Categorical([None, 'a', 'b', 'c'], + categories=['a', 'b', 'c', 'd'], ordered=False) + res = ct.shift(1, fill_value=fill_value) + assert_equal(res, expected) diff --git 
a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index 6d9f60df45ec8..e9de46bba03f1 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -1617,6 +1617,23 @@ def test_group_shift_with_null_key(): assert_frame_equal(result, expected) +def test_group_shift_with_fill_value(): + # GH #24128 + n_rows = 24 + df = DataFrame([(i % 12, i % 3, i) + for i in range(n_rows)], dtype=float, + columns=["A", "B", "Z"], index=None) + g = df.groupby(["A", "B"]) + + expected = DataFrame([(i + 12 if i < n_rows - 12 + else 0) + for i in range(n_rows)], dtype=float, + columns=["Z"], index=None) + result = g.shift(-1, fill_value=0)[["Z"]] + + assert_frame_equal(result, expected) + + def test_pivot_table_values_key_error(): # This test is designed to replicate the error in issue #14938 df = pd.DataFrame({'eventDate': diff --git a/pandas/tests/series/test_timeseries.py b/pandas/tests/series/test_timeseries.py index ce464184cd8d6..2425437591dfc 100644 --- a/pandas/tests/series/test_timeseries.py +++ b/pandas/tests/series/test_timeseries.py @@ -129,6 +129,38 @@ def test_shift2(self): idx = DatetimeIndex(['2000-01-01', '2000-01-02', '2000-01-04']) pytest.raises(NullFrequencyError, idx.shift, 1) + def test_shift_fill_value(self): + # GH #24128 + ts = Series([1.0, 2.0, 3.0, 4.0, 5.0], + index=date_range('1/1/2000', periods=5, freq='H')) + + exp = Series([0.0, 1.0, 2.0, 3.0, 4.0], + index=date_range('1/1/2000', periods=5, freq='H')) + # check that fill value works + result = ts.shift(1, fill_value=0.0) + tm.assert_series_equal(result, exp) + + exp = Series([0.0, 0.0, 1.0, 2.0, 3.0], + index=date_range('1/1/2000', periods=5, freq='H')) + result = ts.shift(2, fill_value=0.0) + tm.assert_series_equal(result, exp) + + ts = pd.Series([1, 2, 3]) + res = ts.shift(2, fill_value=0) + assert res.dtype == ts.dtype + + def test_categorical_shift_fill_value(self): + ts = pd.Series(['a', 'b', 'c', 'd'], dtype="category") + res = ts.shift(1, 
fill_value='a') + expected = pd.Series(pd.Categorical(['a', 'a', 'b', 'c'], + categories=['a', 'b', 'c', 'd'], + ordered=False)) + tm.assert_equal(res, expected) + + # check for incorrect fill_value + with pytest.raises(ValueError): + ts.shift(1, fill_value='f') + def test_shift_dst(self): # GH 13926 dates = date_range('2016-11-06', freq='H', periods=10, tz='US/Eastern')