
Commit 0dda388

Merge pull request #61 from dataiku/fix-type-handling
Allow integer columns with null values to be resampled
2 parents: 4a3ef72 + 4ab1ed7

4 files changed: 44 additions & 8 deletions

CHANGELOG.md
Lines changed: 6 additions & 0 deletions

```diff
@@ -1,5 +1,11 @@
 # Changelog
 
+
+## Version 2.1.1 - Bugfix release - 2025-04
+### Resampling recipe
+- :bug: support integer columns containing null values
+- :bug: support float values in resampling
+
 ## Version 2.1.0 - New feature release - 2024-12
 ### Resampling recipe
 - :date: Support selecting custom dates for the start and end of extrapolation
```
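
For context on the first bugfix entry, here is a minimal pandas sketch (illustration only, not part of the commit) of why integer columns containing nulls need special handling: a plain int64 column cannot represent a missing value, while the nullable "Int64" extension dtype can.

```python
import numpy as np
import pandas as pd

# A plain int64 column cannot hold missing values: mixing ints with NaN
# silently falls back to float64.
print(pd.Series([1, np.nan, 3]).dtype)  # float64

# The nullable "Int64" extension dtype keeps integer semantics and stores
# the missing entry as pd.NA instead.
s = pd.Series([1, None, 3], dtype="Int64")
print(s.dtype)     # Int64
print(s.tolist())  # [1, <NA>, 3]
```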

custom-recipes/timeseries-preparation-resampling/recipe.py
Lines changed: 25 additions & 4 deletions

```diff
@@ -6,21 +6,42 @@
 from io_utils import get_input_output
 from recipe_config_loading import check_and_get_groupby_columns, check_time_column_parameter, check_python_version, get_resampling_params
 
+import inspect
+
 check_python_version()
 
 # --- Setup
 (input_dataset, output_dataset) = get_input_output()
 recipe_config = get_recipe_config()
-input_dataset_columns = [column["name"] for column in input_dataset.read_schema()]
+schema = input_dataset.read_schema()
+input_dataset_columns = [column["name"] for column in schema]
 check_time_column_parameter(recipe_config, input_dataset_columns)
 groupby_columns = check_and_get_groupby_columns(recipe_config, input_dataset_columns)
 datetime_column = recipe_config.get('datetime_column')
 params = get_resampling_params(recipe_config)
 
-# --- Run
-df = input_dataset.get_dataframe(infer_with_pandas=False)
+# use_nullable_integers is only available in DSS >= 13.1
+# Prior to this, the plugin does not support integer columns with NaN values
+signature = inspect.signature(input_dataset.get_dataframe)
+can_use_nullable_integers = "use_nullable_integers" in signature.parameters
+
+if can_use_nullable_integers:
+    df = input_dataset.get_dataframe(infer_with_pandas=False, use_nullable_integers=True, bool_as_str=True)
+else:
+    df = input_dataset.get_dataframe(infer_with_pandas=False)
+
 resampler = Resampler(params)
 output_df = resampler.transform(df, datetime_column, groupby_columns=groupby_columns)
 
+# int columns must be resampled into int values
+columns_to_round = [
+    column["name"]
+    for column in schema
+    if column["type"] in ["tinyint", "smallint", "int", "bigint"]
+]
+output_df[columns_to_round] = output_df[columns_to_round].round()
+
+
 # --- Write output
-output_dataset.write_with_schema(output_df)
+output_dataset.write_schema(schema)
+output_dataset.write_dataframe(output_df)
```
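
The capability check in this diff uses only the standard library: instead of comparing DSS version strings, it looks for the `use_nullable_integers` keyword in the signature of `get_dataframe`. Below is a standalone sketch of the same pattern; the `get_dataframe` function here is a hypothetical stand-in, not the real Dataiku API.

```python
import inspect

# Hypothetical stand-in for an older input_dataset.get_dataframe that does not
# yet accept the use_nullable_integers keyword.
def get_dataframe(infer_with_pandas=True, bool_as_str=False):
    ...

# Same feature detection as in the recipe: check the callable's signature.
can_use_nullable_integers = "use_nullable_integers" in inspect.signature(get_dataframe).parameters
print(can_use_nullable_integers)  # False -> fall back to the old call
```

The later rounding step follows the diff's own comment: interpolation can leave fractional values in columns declared as integers, so they are rounded before the dataframe is written against the original schema.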

plugin.json
Lines changed: 1 addition & 1 deletion

```diff
@@ -1,6 +1,6 @@
 {
     "id": "timeseries-preparation",
-    "version": "2.1.0",
+    "version": "2.1.1",
     "meta": {
         "supportLevel": "SUPPORTED",
         "label": "Time Series Preparation",
```

python-lib/dku_timeseries/resampling.py
Lines changed: 12 additions & 3 deletions

```diff
@@ -2,6 +2,7 @@
 import logging
 
 import pandas as pd
+import numpy as np
 from scipy import interpolate
 
 from dku_timeseries.dataframe_helpers import has_duplicates, nothing_to_do, filter_empty_columns, generic_check_compute_arguments
@@ -69,6 +70,7 @@ def check(self):
 
 
 class Resampler:
+    RESAMPLEABLE_TYPES = [int, float, np.float32, np.int32]
 
     def __init__(self, params=None):
 
@@ -77,7 +79,7 @@ def __init__(self, params=None):
         self.params = params
         self.params.check()
 
-    def transform(self, df, datetime_column, groupby_columns=None):
+    def transform(self, df, datetime_column, groupby_columns=None):
         if groupby_columns is None:
             groupby_columns = []
 
@@ -94,8 +96,8 @@ def transform(self, df, datetime_column, groupby_columns=None):
         # when having multiple timeseries, their time range is not necessarily the same
         # we thus compute a unified time index for all partitions
         reference_time_index = self._compute_full_time_index(df_copy, datetime_column)
-        columns_to_resample = [col for col in df_copy.select_dtypes([int, float]).columns.tolist() if col != datetime_column and col not in groupby_columns]
-        category_columns = [col for col in df.select_dtypes([object, bool]).columns.tolist() if col != datetime_column and col not in columns_to_resample and
+        columns_to_resample = [col for col in df_copy.select_dtypes(Resampler.RESAMPLEABLE_TYPES).columns.tolist() if col != datetime_column and col not in groupby_columns]
+        category_columns = [col for col in df.select_dtypes(exclude=Resampler.RESAMPLEABLE_TYPES).columns.tolist() if col != datetime_column and col not in columns_to_resample and
                             col not in groupby_columns]
         if groupby_columns:
             grouped = df_copy.groupby(groupby_columns)
@@ -232,6 +234,13 @@ def _fill_in_category_values(self, df, category_columns):
         elif self.params.category_imputation_method == "clip":
             category_filled_df.loc[:, category_columns] = category_filled_df.loc[:, category_columns].ffill().bfill()
         elif self.params.category_imputation_method == "mode":
+            # .mode() loses the timezone info for any datetimetz column
             most_frequent_categoricals = category_filled_df.loc[:, category_columns].mode().iloc[0]
+
+            for col in category_columns:
+                # only perform conversion if the column has a timezone
+                if pd.api.types.is_datetime64_any_dtype(category_filled_df[col]) and category_filled_df[col].dt.tz is not None:
+                    most_frequent_categoricals[col] = most_frequent_categoricals[col].tz_localize("UTC")
+
             category_filled_df.loc[:, category_columns] = category_filled_df.loc[:, category_columns].fillna(most_frequent_categoricals)
         return category_filled_df
```
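
The new `RESAMPLEABLE_TYPES` constant drives a dtype-based split in `transform`: columns matched by `select_dtypes(...)` are resampled numerically, and the remaining columns (minus the datetime and group-by columns) are treated as category columns. A small self-contained illustration of that include/exclude split, using made-up example data rather than anything from the plugin:

```python
import numpy as np
import pandas as pd

RESAMPLEABLE_TYPES = [int, float, np.float32, np.int32]

df = pd.DataFrame({
    "date": pd.date_range("2025-01-01", periods=3),
    "sensor": ["a", "b", "a"],                     # object column
    "active": [True, False, True],                 # bool column
    "count": np.array([1, 2, 3], dtype=np.int32),  # int32 column
    "value": [0.5, 1.5, 2.5],                      # float64 column
})

numeric_columns = df.select_dtypes(RESAMPLEABLE_TYPES).columns.tolist()
other_columns = df.select_dtypes(exclude=RESAMPLEABLE_TYPES).columns.tolist()
print(numeric_columns)  # ['count', 'value']
print(other_columns)    # ['date', 'sensor', 'active']
```

The other addition in this file works around the behaviour noted in its comment: `DataFrame.mode()` can drop timezone information from tz-aware columns, so the most frequent value is re-localized to UTC before it is used to fill missing category values.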
