2
2
import logging
3
3
4
4
import pandas as pd
5
+ import numpy as np
5
6
from scipy import interpolate
6
7
7
8
from dku_timeseries .dataframe_helpers import has_duplicates , nothing_to_do , filter_empty_columns , generic_check_compute_arguments
@@ -69,6 +70,7 @@ def check(self):
69
70
70
71
71
72
class Resampler :
73
+ RESAMPLEABLE_TYPES = [int , float , np .float32 , np .int32 ]
72
74
73
75
def __init__ (self , params = None ):
74
76
@@ -77,7 +79,7 @@ def __init__(self, params=None):
77
79
self .params = params
78
80
self .params .check ()
79
81
80
- def transform (self , df , datetime_column , groupby_columns = None ):
82
+ def transform (self , df , datetime_column , groupby_columns = None ):
81
83
if groupby_columns is None :
82
84
groupby_columns = []
83
85
@@ -94,8 +96,8 @@ def transform(self, df, datetime_column, groupby_columns=None):
94
96
# when there are multiple time series, their time ranges are not necessarily the same
95
97
# we thus compute a unified time index for all partitions
96
98
reference_time_index = self ._compute_full_time_index (df_copy , datetime_column )
97
- columns_to_resample = [col for col in df_copy .select_dtypes ([ int , float ] ).columns .tolist () if col != datetime_column and col not in groupby_columns ]
98
- category_columns = [col for col in df .select_dtypes ([ object , bool ] ).columns .tolist () if col != datetime_column and col not in columns_to_resample and
99
+ columns_to_resample = [col for col in df_copy .select_dtypes (Resampler . RESAMPLEABLE_TYPES ).columns .tolist () if col != datetime_column and col not in groupby_columns ]
100
+ category_columns = [col for col in df .select_dtypes (exclude = Resampler . RESAMPLEABLE_TYPES ).columns .tolist () if col != datetime_column and col not in columns_to_resample and
99
101
col not in groupby_columns ]
100
102
if groupby_columns :
101
103
grouped = df_copy .groupby (groupby_columns )
@@ -232,6 +234,13 @@ def _fill_in_category_values(self, df, category_columns):
232
234
elif self .params .category_imputation_method == "clip" :
233
235
category_filled_df .loc [:, category_columns ] = category_filled_df .loc [:, category_columns ].ffill ().bfill ()
234
236
elif self .params .category_imputation_method == "mode" :
237
+ # .mode() loses the timezone info of timezone-aware datetime columns, so the mode values are re-localized to UTC below
235
238
most_frequent_categoricals = category_filled_df .loc [:, category_columns ].mode ().iloc [0 ]
239
+
240
+ for col in category_columns :
241
+ # only re-localize when the column is a datetime column that carries a timezone
242
+ if pd .api .types .is_datetime64_any_dtype (category_filled_df [col ]) and category_filled_df [col ].dt .tz is not None :
243
+ most_frequent_categoricals [col ] = most_frequent_categoricals [col ].tz_localize ("UTC" )
244
+
236
245
category_filled_df .loc [:, category_columns ] = category_filled_df .loc [:, category_columns ].fillna (most_frequent_categoricals )
237
246
return category_filled_df
0 commit comments