Skip to content

Commit 16b6167

Browse files
Merge pull request #52 from dataiku/fix/resampling-last-timestamp-not-round-unit-sc-113991
fix resampling extra date
2 parents bdb662c + e1820aa commit 16b6167

File tree

5 files changed

+37
-16
lines changed

5 files changed

+37
-16
lines changed

CHANGELOG.md

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,8 @@
11
# Changelog
22

3+
## Version 2.0.2 - Bugfix release - 2023-01
4+
- 🪲 Fix a bug that added an extra date at the end of the resampled series when the last input timestamp fell exactly on the end of a period (week, month, half-year, year)
5+
36
## Version 2.0.1 - Bugfix release - 2021-06
47
- :bug: Keep the empty values rather than filtering them with the extrapolation method "Don't extrapolate (impute nulls)"
58
- :scissors: Add the extrapolation method "Don't extrapolate (no imputation)" to filter missing values

plugin.json

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
{
22
"id": "timeseries-preparation",
3-
"version": "2.0.1",
3+
"version": "2.0.2",
44
"meta": {
55
"supportLevel": "SUPPORTED",
66
"label": "Time Series Preparation",
@@ -15,4 +15,4 @@
1515
"Time Series"
1616
]
1717
}
18-
}
18+
}

python-lib/dku_timeseries/timeseries_helpers.py

Lines changed: 27 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
import pandas as pd
66
from pandas.tseries.frequencies import to_offset
77
from pandas.tseries.offsets import BDay
8+
from pandas.tseries.offsets import Day
89

910
logger = logging.getLogger(__name__)
1011

@@ -77,17 +78,35 @@ def get_date_offset(time_unit, offset_value):
7778

7879

7980
def generate_date_range(start_time, end_time, clip_start, clip_end, shift, frequency, time_step, time_unit):
80-
rounding_freq_string = FREQUENCY_STRINGS.get(time_unit)
8181
clip_start_value = get_date_offset(time_unit, clip_start)
8282
clip_end_value = get_date_offset(time_unit, clip_end)
8383
shift_value = get_date_offset(time_unit, shift)
84-
if time_unit in ROUND_COMPATIBLE_TIME_UNIT:
85-
start_index = start_time.round(rounding_freq_string) + clip_start_value + shift_value
86-
end_index = end_time.round(rounding_freq_string) - clip_end_value + shift_value
87-
else: # for week, month, year we round up to closest day
88-
start_index = start_time.round("D") + clip_start_value + shift_value
89-
# for some reason date_range omit the last entry when dealing with months, years
90-
end_index = end_time.round("D") - clip_end_value + get_date_offset(time_unit, time_step) + shift_value
84+
85+
# for business days, weeks, months and years we round to the nearest day
86+
rounding_freq_string = FREQUENCY_STRINGS.get(time_unit) if time_unit in ROUND_COMPATIBLE_TIME_UNIT else "D"
87+
start_index = start_time.round(rounding_freq_string)
88+
end_index = end_time.round(rounding_freq_string)
89+
90+
if time_unit not in ROUND_COMPATIBLE_TIME_UNIT:
91+
# pd.date_range omits the end index when frequency is business day, week, month or year,
92+
# unless the end index is exactly at the end of the period.
93+
# so we need to offset the end index to make sure it falls between the last time step and the following one
94+
if time_unit == "business_days":
95+
# if start index is not a business day, then we want to start the range on the next Monday
96+
# adding BDay(0) does nothing if the timestamp is already a business day; otherwise it rolls the timestamp forward to the next business day
97+
start_index = start_index + BDay(0)
98+
99+
# if end index is not a business day, then we want to end the range on the previous Friday
100+
# adding Day(1) then subtracting BDay(1) does nothing if the timestamp is already a business day; otherwise it rolls the timestamp back to the previous business day
101+
end_index = (end_index + Day(1)) - BDay(1)
102+
else:
103+
# we add one time step minus one day to the end index so that the following timestamp is not included
104+
# if the end index is exactly at the end of the period
105+
end_index = end_index + get_date_offset(time_unit, time_step) - Day(1)
106+
107+
start_index = start_index + clip_start_value + shift_value
108+
end_index = end_index - clip_end_value + shift_value
109+
91110
return pd.date_range(start=start_index, end=end_index, freq=frequency)
92111

93112

tests/python/unit/dku_timeseries/resampling/test_resampler_helpers.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -91,7 +91,7 @@ def test_generate_date_range_week(self, config):
9191

9292
end_time = pd.Timestamp('2021-01-24 00:00:00')
9393
date_range = generate_date_range(start_time, end_time, 0, 0, 0, frequency, time_step, time_unit)
94-
np.testing.assert_array_equal(date_range, pd.DatetimeIndex(['2020-12-27', '2021-01-10', '2021-01-24', '2021-02-07']))
94+
np.testing.assert_array_equal(date_range, pd.DatetimeIndex(['2020-12-27', '2021-01-10', '2021-01-24']))
9595

9696
date_range = generate_date_range(start_time, end_time, 1, 0, 1, frequency, time_step, time_unit)
9797
np.testing.assert_array_equal(date_range, pd.DatetimeIndex(['2021-01-10', '2021-01-24', '2021-02-07']))
@@ -145,19 +145,19 @@ def test_generate_date_range_b_days(self, config):
145145
time_step = params.time_step
146146

147147
date_range = generate_date_range(start_time, end_time, 0, 0, 0, frequency, time_step, time_unit)
148-
np.testing.assert_array_equal(date_range, pd.DatetimeIndex(['2021-01-04', '2021-01-05', '2021-01-06', '2021-01-07', '2021-01-08', '2021-01-11']))
148+
np.testing.assert_array_equal(date_range, pd.DatetimeIndex(['2021-01-04', '2021-01-05', '2021-01-06', '2021-01-07', '2021-01-08']))
149149

150150
clip_start = 1
151151
clip_end = 1
152152
shift = 0
153153
date_range = generate_date_range(start_time, end_time, clip_start, clip_end, shift, frequency, time_step, time_unit)
154-
np.testing.assert_array_equal(date_range, pd.DatetimeIndex(['2021-01-04', '2021-01-05', '2021-01-06', '2021-01-07', '2021-01-08', '2021-01-11']))
154+
np.testing.assert_array_equal(date_range, pd.DatetimeIndex(['2021-01-05', '2021-01-06', '2021-01-07']))
155155

156156
clip_start = 2
157157
clip_end = 2
158158
shift = 0
159159
date_range = generate_date_range(start_time, end_time, clip_start, clip_end, shift, frequency, time_step, time_unit)
160-
np.testing.assert_array_equal(date_range, pd.DatetimeIndex(['2021-01-05', '2021-01-06', '2021-01-07', '2021-01-08']))
160+
np.testing.assert_array_equal(date_range, pd.DatetimeIndex(['2021-01-06']))
161161

162162
def test_generate_date_range_days(self, config):
163163
config["time_unit"] = "days"

tests/python/unit/dku_timeseries/resampling/test_resampling_frequencies.py

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -44,7 +44,6 @@ def test_month(self, config, columns):
4444
df = get_df("Y", columns)
4545
output_df = resampler.transform(df, columns.date)
4646

47-
assert np.mean(output_df[columns.data]) == 316.32550000000003
4847
expected_dates = pd.DatetimeIndex(['1959-12-31T00:00:00.000000000', '1960-02-29T00:00:00.000000000',
4948
'1960-04-30T00:00:00.000000000', '1960-06-30T00:00:00.000000000',
5049
'1960-08-31T00:00:00.000000000', '1960-10-31T00:00:00.000000000',
@@ -54,7 +53,7 @@ def test_month(self, config, columns):
5453
'1961-12-31T00:00:00.000000000', '1962-02-28T00:00:00.000000000',
5554
'1962-04-30T00:00:00.000000000', '1962-06-30T00:00:00.000000000',
5655
'1962-08-31T00:00:00.000000000', '1962-10-31T00:00:00.000000000',
57-
'1962-12-31T00:00:00.000000000', '1963-02-28T00:00:00.000000000'])
56+
'1962-12-31T00:00:00.000000000'])
5857
np.testing.assert_array_equal(output_df[columns.date].values, expected_dates)
5958

6059
def test_weeks_sunday_end(self, config, columns):

0 commit comments

Comments
 (0)