Fix freq string issues in datasets (awslabs#3232)

lostella · lostella · commit b5a70f79c65f · 2024-11-07T09:22:34.000+01:00
*Issue #, if available:* Fixes awslabs#3229: changes to frequency strings in pandas broke some of our logic.

*Description of changes:* Add the missing frequency strings in `_tsf_datasets.py`, and get rid of the frequency-related warnings raised by the other datasets.

I tested the change by running the following script:

```python
from gluonts.dataset.repository import get_dataset, dataset_names

skip = [
    "m3_monthly",
    "m3_yearly",
    "m3_quarterly",
    "m3_other",
    "m5",
]

for dataset_name in dataset_names:
    if dataset_name in skip:
        continue
    print(dataset_name)
    dataset = get_dataset(dataset_name, regenerate=True)
```

By submitting this pull request, I confirm that you can use, modify, copy, and redistribute this contribution, under the terms of your choice.

**Please tag this pr with at least one of these labels to make our release process faster:** BREAKING, new feature, bug fix, other change, dev setup
diff --git a/src/gluonts/dataset/repository/_ercot.py b/src/gluonts/dataset/repository/_ercot.py
@@ -26,7 +26,7 @@ def generate_ercot_dataset(dataset_path: Path, dataset_writer: DatasetWriter):
     df.ffill(inplace=True)
     regions = [col for col in df.columns if col not in ["ds", "y"]]
 
-    freq = "1H"
+    freq = "1h"
     prediction_length = 24
 
     start = pd.Period(df["ds"][0], freq=freq)
diff --git a/src/gluonts/dataset/repository/_gp_copula_2019.py b/src/gluonts/dataset/repository/_gp_copula_2019.py
@@ -63,7 +63,7 @@ class GPCopulaDataset(NamedTuple):
         # original dataset can be found at https://archive.ics.uci.edu/ml/datasets/ElectricityLoadDiagrams20112014#
         num_series=370,
         prediction_length=24,
-        freq="H",
+        freq="h",
         rolling_evaluations=7,
         max_target_dim=None,
     ),
@@ -73,7 +73,7 @@ class GPCopulaDataset(NamedTuple):
         # note there are 963 in the original dataset from https://archive.ics.uci.edu/ml/datasets/PEMS-SF
         num_series=963,
         prediction_length=24,
-        freq="H",
+        freq="h",
         rolling_evaluations=7,
         max_target_dim=None,
     ),
@@ -82,7 +82,7 @@ class GPCopulaDataset(NamedTuple):
         url=root + "solar_nips.tar.gz",
         num_series=137,
         prediction_length=24,
-        freq="H",
+        freq="h",
         rolling_evaluations=7,
         max_target_dim=None,
     ),
diff --git a/src/gluonts/dataset/repository/_lstnet.py b/src/gluonts/dataset/repository/_lstnet.py
@@ -91,7 +91,7 @@ class LstnetDataset(NamedTuple):
         prediction_length=24,
         rolling_evaluations=7,
         start_date="2012-01-01",
-        freq="1H",
+        freq="1h",
         agg_freq=None,
     ),
     "traffic": LstnetDataset(
@@ -105,7 +105,7 @@ class LstnetDataset(NamedTuple):
         prediction_length=24,
         rolling_evaluations=7,
         start_date="2015-01-01",
-        freq="H",
+        freq="h",
         agg_freq=None,
     ),
     "solar-energy": LstnetDataset(
@@ -117,7 +117,7 @@ class LstnetDataset(NamedTuple):
         rolling_evaluations=7,
         start_date="2006-01-01",
         freq="10min",
-        agg_freq="1H",
+        agg_freq="1h",
     ),
 }
 
diff --git a/src/gluonts/dataset/repository/_tsf_datasets.py b/src/gluonts/dataset/repository/_tsf_datasets.py
@@ -278,11 +278,15 @@ def generate_forecasting_dataset(
 def default_prediction_length_from_frequency(freq: str) -> int:
     prediction_length_map = {
         "T": 60,
+        "min": 60,
         "H": 48,
+        "h": 48,
         "D": 30,
         "W-SUN": 8,
         "M": 12,
+        "ME": 12,
         "Y": 4,
+        "YE": 4,
     }
     try:
         freq = to_offset(freq).name
diff --git a/src/gluonts/dataset/repository/_tsf_reader.py b/src/gluonts/dataset/repository/_tsf_reader.py
@@ -49,10 +49,10 @@ def frequency_converter(freq: str):
 
 BASE_FREQ_TO_PANDAS_OFFSET: Dict[str, str] = {
     "seconds": "S",
-    "minutely": "T",
-    "minutes": "T",
-    "hourly": "H",
-    "hours": "H",
+    "minutely": "min",
+    "minutes": "min",
+    "hourly": "h",
+    "hours": "h",
     "daily": "D",
     "days": "D",
     "weekly": "W",
diff --git a/src/gluonts/dataset/repository/_uber_tlc.py b/src/gluonts/dataset/repository/_uber_tlc.py
@@ -28,7 +28,7 @@ def generate_uber_dataset(
     prediction_length: int,
     dataset_writer: DatasetWriter,
 ):
-    subsets = {"daily": "1D", "hourly": "1H"}
+    subsets = {"daily": "1D", "hourly": "1h"}
     assert (
         uber_freq.lower() in subsets
     ), f"invalid uber_freq='{uber_freq}'. Allowed values: {subsets.keys()}"
diff --git a/test/dataset/test_tsf_reader.py b/test/dataset/test_tsf_reader.py
@@ -20,10 +20,10 @@
     "input_freq_str, output_freq_str",
     [
         ("30_seconds", "30S"),
-        ("minutely", "T"),
-        ("10_minutes", "10T"),
-        ("hourly", "H"),
-        ("half_hourly", "0.5H"),
+        ("minutely", "min"),
+        ("10_minutes", "10min"),
+        ("hourly", "h"),
+        ("half_hourly", "0.5h"),
         ("daily", "D"),
         ("7_days", "7D"),
         ("weekly", "W"),