From 7b839e52984ccd8ddc715bde8c29707910dd81c1 Mon Sep 17 00:00:00 2001 From: Chang She Date: Mon, 9 Jul 2018 13:25:01 -0700 Subject: [PATCH] BUG: datetime rolling min/max segfaults when closed=left (#21704) User reported that `df.rolling(to_offset('3D'), closed='left').max()` segfaults when df has a datetime index. The bug was in PR #19549. In that PR, in https://github.com/pandas-dev/pandas/blame/master/pandas/_libs/window.pyx#L1268 `i` is initialized to `endi[0]`, which is 0 when `closed=left`. So in the next line when it tries to set `output[i-1]` it goes out of bounds. In addition, there are 2 more bugs in the `roll_min_max` code. The second bug is that for variable size windows, the `nobs` is never updated when elements leave the window. The third bug is at the end of the fixed window where all output elements up to `minp` are initialized to 0 if the input is not float. This PR fixes these three bugs, at the cost of casting the output array to floating point even if the input is integer. This is less than ideal if the output has no NaNs, but is consistent with roll_sum behavior. --- doc/source/whatsnew/v0.24.0.txt | 3 + pandas/_libs/window.pyx | 241 +++++++++++++++++++------------- pandas/tests/test_window.py | 54 +++++++ 3 files changed, 201 insertions(+), 97 deletions(-) diff --git a/doc/source/whatsnew/v0.24.0.txt b/doc/source/whatsnew/v0.24.0.txt index 034a56b2ac0cb..798e414b3e60c 100644 --- a/doc/source/whatsnew/v0.24.0.txt +++ b/doc/source/whatsnew/v0.24.0.txt @@ -398,6 +398,9 @@ Groupby/Resample/Rolling - - +- Multiple bugs in :func:`pandas.core.Rolling.min` with ``closed='left'` and a + datetime-like index leading to incorrect results and also segfault. (:issue:`21704`) + Sparse ^^^^^^ diff --git a/pandas/_libs/window.pyx b/pandas/_libs/window.pyx index 9e704a9bd8d3f..bd6cd476595f3 100644 --- a/pandas/_libs/window.pyx +++ b/pandas/_libs/window.pyx @@ -1218,141 +1218,188 @@ cdef _roll_min_max(ndarray[numeric] input, int64_t win, int64_t minp, Moving min/max of 1d array of any numeric type along axis=0 ignoring NaNs. """ - cdef: - numeric ai - bint is_variable, should_replace - int64_t N, i, removed, window_i - Py_ssize_t nobs = 0 - deque Q[int64_t] ndarray[int64_t] starti, endi - ndarray[numeric, ndim=1] output - cdef: - int64_t* death - numeric* ring - numeric* minvalue - numeric* end - numeric* last - - cdef: - cdef numeric r + int64_t N + bint is_variable starti, endi, N, win, minp, is_variable = get_window_indexer( input, win, minp, index, closed) - output = np.empty(N, dtype=input.dtype) + if is_variable: + return _roll_min_max_variable(input, starti, endi, N, win, minp, + is_max) + else: + return _roll_min_max_fixed(input, starti, endi, N, win, minp, is_max) + +cdef _roll_min_max_variable(ndarray[numeric] input, + ndarray[int64_t] starti, + ndarray[int64_t] endi, + int64_t N, + int64_t win, + int64_t minp, + bint is_max): + cdef: + numeric ai + int64_t i, close_offset, curr_win_size + Py_ssize_t nobs = 0 + deque Q[int64_t] # min/max always the front + deque W[int64_t] # track the whole window for nobs compute + ndarray[double_t, ndim=1] output + + output = np.empty(N, dtype=float) Q = deque[int64_t]() + W = deque[int64_t]() - if is_variable: + with nogil: - with nogil: + # This is using a modified version of the C++ code in this + # SO post: http://bit.ly/2nOoHlY + # The original impl didn't deal with variable window sizes + # So the code was optimized for that - # This is using a modified version of the C++ code in this - # SO post: http://bit.ly/2nOoHlY - # The original impl didn't deal with variable window sizes - # So the code was optimized for that + for i from starti[0] <= i < endi[0]: + ai = init_mm(input[i], &nobs, is_max) - for i from starti[0] <= i < endi[0]: - ai = init_mm(input[i], &nobs, is_max) + # Discard previous entries if we find new min or max + if is_max: + while not Q.empty() and ((ai >= input[Q.back()]) or + (input[Q.back()] != input[Q.back()])): + Q.pop_back() + else: + while not Q.empty() and ((ai <= input[Q.back()]) or + (input[Q.back()] != input[Q.back()])): + Q.pop_back() + Q.push_back(i) + W.push_back(i) + + # if right is open then the first window is empty + close_offset = 0 if endi[0] > starti[0] else 1 + + for i in range(endi[0], endi[N-1]): + if not Q.empty(): + output[i-1+close_offset] = calc_mm( + minp, nobs, input[Q.front()]) + else: + output[i-1+close_offset] = NaN - if is_max: - while not Q.empty() and ai >= input[Q.back()]: - Q.pop_back() - else: - while not Q.empty() and ai <= input[Q.back()]: - Q.pop_back() - Q.push_back(i) + ai = init_mm(input[i], &nobs, is_max) + + # Discard previous entries if we find new min or max + if is_max: + while not Q.empty() and ((ai >= input[Q.back()]) or + (input[Q.back()] != input[Q.back()])): + Q.pop_back() + else: + while not Q.empty() and ((ai <= input[Q.back()]) or + (input[Q.back()] != input[Q.back()])): + Q.pop_back() - for i from endi[0] <= i < N: - output[i-1] = calc_mm(minp, nobs, input[Q.front()]) + # Maintain window/nobs retention + curr_win_size = endi[i + close_offset] - starti[i + close_offset] + while not Q.empty() and Q.front() <= i - curr_win_size: + Q.pop_front() + while not W.empty() and W.front() <= i - curr_win_size: + remove_mm(input[W.front()], &nobs) + W.pop_front() - ai = init_mm(input[i], &nobs, is_max) + Q.push_back(i) + W.push_back(i) - if is_max: - while not Q.empty() and ai >= input[Q.back()]: - Q.pop_back() - else: - while not Q.empty() and ai <= input[Q.back()]: - Q.pop_back() + output[N-1] = calc_mm(minp, nobs, input[Q.front()]) - while not Q.empty() and Q.front() <= i - (endi[i] - starti[i]): - Q.pop_front() + return output - Q.push_back(i) - output[N-1] = calc_mm(minp, nobs, input[Q.front()]) +cdef _roll_min_max_fixed(ndarray[numeric] input, + ndarray[int64_t] starti, + ndarray[int64_t] endi, + int64_t N, + int64_t win, + int64_t minp, + bint is_max): + cdef: + numeric ai + bint should_replace + int64_t i, removed, window_i, + Py_ssize_t nobs = 0 + int64_t* death + numeric* ring + numeric* minvalue + numeric* end + numeric* last + ndarray[double_t, ndim=1] output - else: - # setup the rings of death! - ring = malloc(win * sizeof(numeric)) - death = malloc(win * sizeof(int64_t)) - - end = ring + win - last = ring - minvalue = ring - ai = input[0] - minvalue[0] = init_mm(input[0], &nobs, is_max) - death[0] = win - nobs = 0 + output = np.empty(N, dtype=float) + # setup the rings of death! + ring = malloc(win * sizeof(numeric)) + death = malloc(win * sizeof(int64_t)) + + end = ring + win + last = ring + minvalue = ring + ai = input[0] + minvalue[0] = init_mm(input[0], &nobs, is_max) + death[0] = win + nobs = 0 - with nogil: + with nogil: - for i in range(N): - ai = init_mm(input[i], &nobs, is_max) + for i in range(N): + ai = init_mm(input[i], &nobs, is_max) - if i >= win: - remove_mm(input[i - win], &nobs) + if i >= win: + remove_mm(input[i - win], &nobs) - if death[minvalue - ring] == i: - minvalue = minvalue + 1 - if minvalue >= end: - minvalue = ring + if death[minvalue - ring] == i: + minvalue = minvalue + 1 + if minvalue >= end: + minvalue = ring - if is_max: - should_replace = ai >= minvalue[0] - else: - should_replace = ai <= minvalue[0] - if should_replace: + if is_max: + should_replace = ai >= minvalue[0] + else: + should_replace = ai <= minvalue[0] + if should_replace: - minvalue[0] = ai - death[minvalue - ring] = i + win - last = minvalue + minvalue[0] = ai + death[minvalue - ring] = i + win + last = minvalue - else: + else: + if is_max: + should_replace = last[0] <= ai + else: + should_replace = last[0] >= ai + while should_replace: + if last == ring: + last = end + last -= 1 if is_max: should_replace = last[0] <= ai else: should_replace = last[0] >= ai - while should_replace: - if last == ring: - last = end - last -= 1 - if is_max: - should_replace = last[0] <= ai - else: - should_replace = last[0] >= ai - last += 1 - if last == end: - last = ring - last[0] = ai - death[last - ring] = i + win + last += 1 + if last == end: + last = ring + last[0] = ai + death[last - ring] = i + win - output[i] = calc_mm(minp, nobs, minvalue[0]) + output[i] = calc_mm(minp, nobs, minvalue[0]) - for i in range(minp - 1): - if numeric in cython.floating: - output[i] = NaN - else: - output[i] = 0 + for i in range(minp - 1): + if numeric in cython.floating: + output[i] = NaN + else: + output[i] = 0 - free(ring) - free(death) + free(ring) + free(death) - # print("output: {0}".format(output)) return output diff --git a/pandas/tests/test_window.py b/pandas/tests/test_window.py index 78d1fa84cc5db..14966177978f4 100644 --- a/pandas/tests/test_window.py +++ b/pandas/tests/test_window.py @@ -464,6 +464,60 @@ def test_closed(self): with pytest.raises(ValueError): df.rolling(window=3, closed='neither') + @pytest.mark.parametrize("input_dtype", ['int', 'float']) + @pytest.mark.parametrize("func,closed,expected", [ + ('min', 'right', [0.0, 0, 0, 1, 2, 3, 4, 5, 6, 7]), + ('min', 'both', [0.0, 0, 0, 0, 1, 2, 3, 4, 5, 6]), + ('min', 'neither', [np.nan, 0, 0, 1, 2, 3, 4, 5, 6, 7]), + ('min', 'left', [np.nan, 0, 0, 0, 1, 2, 3, 4, 5, 6]), + ('max', 'right', [0.0, 1, 2, 3, 4, 5, 6, 7, 8, 9]), + ('max', 'both', [0.0, 1, 2, 3, 4, 5, 6, 7, 8, 9]), + ('max', 'neither', [np.nan, 0, 1, 2, 3, 4, 5, 6, 7, 8]), + ('max', 'left', [np.nan, 0, 1, 2, 3, 4, 5, 6, 7, 8]) + ]) + def test_closed_min_max_datetime(self, input_dtype, + func, closed, + expected): + # see gh-21704 + ser = pd.Series(data=np.arange(10).astype(input_dtype), + index=pd.date_range('2000', periods=10)) + + result = getattr(ser.rolling('3D', closed=closed), func)() + expected = pd.Series(expected, index=ser.index) + tm.assert_series_equal(result, expected) + + def test_closed_uneven(self): + # see gh-21704 + ser = pd.Series(data=np.arange(10), + index=pd.date_range('2000', periods=10)) + + # uneven + ser = ser.drop(index=ser.index[[1, 5]]) + result = ser.rolling('3D', closed='left').min() + expected = pd.Series([np.nan, 0, 0, 2, 3, 4, 6, 6], + index=ser.index) + tm.assert_series_equal(result, expected) + + @pytest.mark.parametrize("func,closed,expected", [ + ('min', 'right', [np.nan, 0, 0, 1, 2, 3, 4, 5, np.nan, np.nan]), + ('min', 'both', [np.nan, 0, 0, 0, 1, 2, 3, 4, 5, np.nan]), + ('min', 'neither', [np.nan, np.nan, 0, 1, 2, 3, 4, 5, np.nan, np.nan]), + ('min', 'left', [np.nan, np.nan, 0, 0, 1, 2, 3, 4, 5, np.nan]), + ('max', 'right', [np.nan, 1, 2, 3, 4, 5, 6, 6, np.nan, np.nan]), + ('max', 'both', [np.nan, 1, 2, 3, 4, 5, 6, 6, 6, np.nan]), + ('max', 'neither', [np.nan, np.nan, 1, 2, 3, 4, 5, 6, np.nan, np.nan]), + ('max', 'left', [np.nan, np.nan, 1, 2, 3, 4, 5, 6, 6, np.nan]) + ]) + def test_closed_min_max_minp(self, func, closed, expected): + # see gh-21704 + ser = pd.Series(data=np.arange(10), + index=pd.date_range('2000', periods=10)) + ser[ser.index[-3:]] = np.nan + result = getattr(ser.rolling('3D', min_periods=2, closed=closed), + func)() + expected = pd.Series(expected, index=ser.index) + tm.assert_series_equal(result, expected) + @pytest.mark.parametrize('roller', ['1s', 1]) def tests_empty_df_rolling(self, roller): # GH 15819 Verifies that datetime and integer rolling windows can be