
Commit 0dda388

Merge pull request #61 from dataiku/fix-type-handling
Allow integer columns with null values to be resampled
2 parents: 4a3ef72 + 4ab1ed7

4 files changed: 44 additions & 8 deletions

CHANGELOG.md
Lines changed: 6 additions & 0 deletions

```diff
@@ -1,5 +1,11 @@
 # Changelog
 
+
+## Version 2.1.1 - Bugfix release - 2025-04
+### Resampling recipe
+- :bug: support integer columns containing null values
+- :bug: support float values in resampling
+
 ## Version 2.1.0 - New feature release - 2024-12
 ### Resampling recipe
 - :date: Support selecting custom dates for the start and end of extrapolation
```
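
For context on the first bugfix entry, here is a minimal pandas sketch (illustration only, not part of the commit) of why integer columns containing nulls need special handling: a plain int64 column cannot represent a missing value, while the nullable "Int64" extension dtype can.

```python
import numpy as np
import pandas as pd

# A plain int64 column cannot hold missing values: mixing ints with NaN
# silently falls back to float64.
print(pd.Series([1, np.nan, 3]).dtype)  # float64

# The nullable "Int64" extension dtype keeps integer semantics and stores
# the missing entry as pd.NA instead.
s = pd.Series([1, None, 3], dtype="Int64")
print(s.dtype)     # Int64
print(s.tolist())  # [1, <NA>, 3]
```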

custom-recipes/timeseries-preparation-resampling/recipe.py
Lines changed: 25 additions & 4 deletions

```diff
@@ -6,21 +6,42 @@
 from io_utils import get_input_output
 from recipe_config_loading import check_and_get_groupby_columns, check_time_column_parameter, check_python_version, get_resampling_params
 
+import inspect
+
 check_python_version()
 
 # --- Setup
 (input_dataset, output_dataset) = get_input_output()
 recipe_config = get_recipe_config()
-input_dataset_columns = [column["name"] for column in input_dataset.read_schema()]
+schema = input_dataset.read_schema()
+input_dataset_columns = [column["name"] for column in schema]
 check_time_column_parameter(recipe_config, input_dataset_columns)
 groupby_columns = check_and_get_groupby_columns(recipe_config, input_dataset_columns)
 datetime_column = recipe_config.get('datetime_column')
 params = get_resampling_params(recipe_config)
 
-# --- Run
-df = input_dataset.get_dataframe(infer_with_pandas=False)
+# use_nullable_integers is only available in DSS >= 13.1
+# Prior to this, the plugin does not support integer columns with NaN values
+signature = inspect.signature(input_dataset.get_dataframe)
+can_use_nullable_integers = "use_nullable_integers" in signature.parameters
+
+if can_use_nullable_integers:
+    df = input_dataset.get_dataframe(infer_with_pandas=False, use_nullable_integers=True, bool_as_str=True)
+else:
+    df = input_dataset.get_dataframe(infer_with_pandas=False)
+
 resampler = Resampler(params)
 output_df = resampler.transform(df, datetime_column, groupby_columns=groupby_columns)
 
+# int columns must be resampled into int values
+columns_to_round = [
+    column["name"]
+    for column in schema
+    if column["type"] in ["tinyint", "smallint", "int", "bigint"]
+]
+output_df[columns_to_round] = output_df[columns_to_round].round()
+
+
 # --- Write output
-output_dataset.write_with_schema(output_df)
+output_dataset.write_schema(schema)
+output_dataset.write_dataframe(output_df)
```
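
The capability check in this diff uses only the standard library: instead of comparing DSS version strings, it looks for the `use_nullable_integers` keyword in the signature of `get_dataframe`. Below is a standalone sketch of the same pattern; the `get_dataframe` function here is a hypothetical stand-in, not the real Dataiku API.

```python
import inspect

# Hypothetical stand-in for an older input_dataset.get_dataframe that does not
# yet accept the use_nullable_integers keyword.
def get_dataframe(infer_with_pandas=True, bool_as_str=False):
    ...

# Same feature detection as in the recipe: check the callable's signature.
can_use_nullable_integers = "use_nullable_integers" in inspect.signature(get_dataframe).parameters
print(can_use_nullable_integers)  # False -> fall back to the old call
```

The later rounding step follows the diff's own comment: interpolation can leave fractional values in columns declared as integers, so they are rounded before the dataframe is written against the original schema.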

plugin.json
Lines changed: 1 addition & 1 deletion

```diff
@@ -1,6 +1,6 @@
 {
     "id": "timeseries-preparation",
-    "version": "2.1.0",
+    "version": "2.1.1",
     "meta": {
         "supportLevel": "SUPPORTED",
         "label": "Time Series Preparation",
```

python-lib/dku_timeseries/resampling.py
Lines changed: 12 additions & 3 deletions

```diff
@@ -2,6 +2,7 @@
 import logging
 
 import pandas as pd
+import numpy as np
 from scipy import interpolate
 
 from dku_timeseries.dataframe_helpers import has_duplicates, nothing_to_do, filter_empty_columns, generic_check_compute_arguments
@@ -69,6 +70,7 @@ def check(self):
 
 
 class Resampler:
+    RESAMPLEABLE_TYPES = [int, float, np.float32, np.int32]
 
     def __init__(self, params=None):
 
@@ -77,7 +79,7 @@ def __init__(self, params=None):
         self.params = params
         self.params.check()
 
-    def transform(self, df, datetime_column, groupby_columns=None):
+    def transform(self, df, datetime_column, groupby_columns=None):
         if groupby_columns is None:
             groupby_columns = []
 
@@ -94,8 +96,8 @@ def transform(self, df, datetime_column, groupby_columns=None):
         # when having multiple timeseries, their time range is not necessarily the same
         # we thus compute a unified time index for all partitions
         reference_time_index = self._compute_full_time_index(df_copy, datetime_column)
-        columns_to_resample = [col for col in df_copy.select_dtypes([int, float]).columns.tolist() if col != datetime_column and col not in groupby_columns]
-        category_columns = [col for col in df.select_dtypes([object, bool]).columns.tolist() if col != datetime_column and col not in columns_to_resample and
+        columns_to_resample = [col for col in df_copy.select_dtypes(Resampler.RESAMPLEABLE_TYPES).columns.tolist() if col != datetime_column and col not in groupby_columns]
+        category_columns = [col for col in df.select_dtypes(exclude=Resampler.RESAMPLEABLE_TYPES).columns.tolist() if col != datetime_column and col not in columns_to_resample and
                             col not in groupby_columns]
         if groupby_columns:
             grouped = df_copy.groupby(groupby_columns)
@@ -232,6 +234,13 @@ def _fill_in_category_values(self, df, category_columns):
         elif self.params.category_imputation_method == "clip":
             category_filled_df.loc[:, category_columns] = category_filled_df.loc[:, category_columns].ffill().bfill()
         elif self.params.category_imputation_method == "mode":
+            # .mode() loses the timezone info for any datetimetz column
             most_frequent_categoricals = category_filled_df.loc[:, category_columns].mode().iloc[0]
+
+            for col in category_columns:
+                # only perform conversion if the column has a timezone
+                if pd.api.types.is_datetime64_any_dtype(category_filled_df[col]) and category_filled_df[col].dt.tz is not None:
+                    most_frequent_categoricals[col] = most_frequent_categoricals[col].tz_localize("UTC")
+
             category_filled_df.loc[:, category_columns] = category_filled_df.loc[:, category_columns].fillna(most_frequent_categoricals)
         return category_filled_df
```
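
The new `RESAMPLEABLE_TYPES` constant drives a dtype-based split in `transform`: columns matched by `select_dtypes(...)` are resampled numerically, and the remaining columns (minus the datetime and group-by columns) are treated as category columns. A small self-contained illustration of that include/exclude split, using made-up example data rather than anything from the plugin:

```python
import numpy as np
import pandas as pd

RESAMPLEABLE_TYPES = [int, float, np.float32, np.int32]

df = pd.DataFrame({
    "date": pd.date_range("2025-01-01", periods=3),
    "sensor": ["a", "b", "a"],                     # object column
    "active": [True, False, True],                 # bool column
    "count": np.array([1, 2, 3], dtype=np.int32),  # int32 column
    "value": [0.5, 1.5, 2.5],                      # float64 column
})

numeric_columns = df.select_dtypes(RESAMPLEABLE_TYPES).columns.tolist()
other_columns = df.select_dtypes(exclude=RESAMPLEABLE_TYPES).columns.tolist()
print(numeric_columns)  # ['count', 'value']
print(other_columns)    # ['date', 'sensor', 'active']
```

The other addition in this file works around the behaviour noted in its comment: `DataFrame.mode()` can drop timezone information from tz-aware columns, so the most frequent value is re-localized to UTC before it is used to fill missing category values.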
