Skip to content

Commit ecfabdb

Browse files
abdulfatirAbdul Fatir Ansari
andauthored
Add KernelSynth script (#64)
*Description of changes:* This PR adds the script to generate synthetic data from KernelSynth. By submitting this pull request, I confirm that you can use, modify, copy, and redistribute this contribution, under the terms of your choice. --------- Co-authored-by: Abdul Fatir Ansari <[email protected]>
1 parent b4e8085 commit ecfabdb

File tree

3 files changed

+199
-1
lines changed

3 files changed

+199
-1
lines changed

README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@
1010

1111
## 🚀 News
1212

13-
- **10 May 2024**: 🚀 We added the code for pretraining and fine-tuning Chronos models. You can find it in [this folder](./scripts/training).
13+
- **10 May 2024**: 🚀 We added the code for pretraining and fine-tuning Chronos models. You can find it in [this folder](./scripts/training). We also added [a script](./scripts/kernel-synth.py) for generating synthetic time series data from Gaussian processes (KernelSynth; see Section 4.2 in the paper for details).
1414
- **19 Apr 2024**: 🚀 Chronos is now supported on [AutoGluon-TimeSeries](https://auto.gluon.ai/stable/tutorials/timeseries/index.html), the powerful AutoML package for time series forecasting which enables model ensembles, cloud deployments, and much more. Get started with the [tutorial](https://auto.gluon.ai/stable/tutorials/timeseries/forecasting-chronos.html).
1515
- **08 Apr 2024**: 🧪 Experimental [MLX inference support](https://github.com/amazon-science/chronos-forecasting/tree/mlx) added. If you have an Apple Silicon Mac, you can now obtain significantly faster forecasts from Chronos compared to CPU inference. This provides an alternative way to exploit the GPU on your Apple Silicon Macs together with the "mps" support in PyTorch.
1616
- **25 Mar 2024**: [v1.1.0 released](https://github.com/amazon-science/chronos-forecasting/releases/tag/v1.1.0) with inference optimizations and `pipeline.embed` to extract encoder embeddings from Chronos.

pyproject.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@ dependencies = [
1313
test = ["pytest~=8.0", "numpy~=1.21"]
1414
typecheck = ["mypy~=1.9"]
1515
training = ["gluonts[pro]", "numpy", "tensorboard", "typer", "typer-config"]
16+
kernel-synth = ["gluonts[pro]", "joblib", "scikit-learn"]
1617

1718
[tool.mypy]
1819
ignore_missing_imports = true

scripts/kernel-synth.py

Lines changed: 197 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,197 @@
1+
import argparse
2+
import functools
3+
from pathlib import Path
4+
from typing import Optional
5+
6+
import numpy as np
7+
from gluonts.dataset.arrow import ArrowWriter
8+
from joblib import Parallel, delayed
9+
from sklearn.gaussian_process import GaussianProcessRegressor
10+
from sklearn.gaussian_process.kernels import (
11+
RBF,
12+
ConstantKernel,
13+
DotProduct,
14+
ExpSineSquared,
15+
Kernel,
16+
RationalQuadratic,
17+
WhiteKernel,
18+
)
19+
from tqdm.auto import tqdm
20+
21+
LENGTH = 1024
22+
KERNEL_BANK = [
23+
ExpSineSquared(periodicity=24 / LENGTH), # H
24+
ExpSineSquared(periodicity=48 / LENGTH), # 0.5H
25+
ExpSineSquared(periodicity=96 / LENGTH), # 0.25H
26+
ExpSineSquared(periodicity=24 * 7 / LENGTH), # H
27+
ExpSineSquared(periodicity=48 * 7 / LENGTH), # 0.5H
28+
ExpSineSquared(periodicity=96 * 7 / LENGTH), # 0.25H
29+
ExpSineSquared(periodicity=7 / LENGTH), # D
30+
ExpSineSquared(periodicity=14 / LENGTH), # 0.5D
31+
ExpSineSquared(periodicity=30 / LENGTH), # D
32+
ExpSineSquared(periodicity=60 / LENGTH), # 0.5D
33+
ExpSineSquared(periodicity=365 / LENGTH), # D
34+
ExpSineSquared(periodicity=365 * 2 / LENGTH), # 0.5D
35+
ExpSineSquared(periodicity=4 / LENGTH), # W
36+
ExpSineSquared(periodicity=26 / LENGTH), # W
37+
ExpSineSquared(periodicity=52 / LENGTH), # W
38+
ExpSineSquared(periodicity=4 / LENGTH), # M
39+
ExpSineSquared(periodicity=6 / LENGTH), # M
40+
ExpSineSquared(periodicity=12 / LENGTH), # M
41+
ExpSineSquared(periodicity=4 / LENGTH), # Q
42+
ExpSineSquared(periodicity=4 * 10 / LENGTH), # Q
43+
ExpSineSquared(periodicity=10 / LENGTH), # Y
44+
DotProduct(sigma_0=0.0),
45+
DotProduct(sigma_0=1.0),
46+
DotProduct(sigma_0=10.0),
47+
RBF(length_scale=0.1),
48+
RBF(length_scale=1.0),
49+
RBF(length_scale=10.0),
50+
RationalQuadratic(alpha=0.1),
51+
RationalQuadratic(alpha=1.0),
52+
RationalQuadratic(alpha=10.0),
53+
WhiteKernel(noise_level=0.1),
54+
WhiteKernel(noise_level=1.0),
55+
ConstantKernel(),
56+
]
57+
58+
59+
def random_binary_map(a: Kernel, b: Kernel):
60+
"""
61+
Applies a random binary operator (+ or *) with equal probability
62+
on kernels ``a`` and ``b``.
63+
64+
Parameters
65+
----------
66+
a
67+
A GP kernel.
68+
b
69+
A GP kernel.
70+
71+
Returns
72+
-------
73+
The composite kernel `a + b` or `a * b`.
74+
"""
75+
binary_maps = [lambda x, y: x + y, lambda x, y: x * y]
76+
return np.random.choice(binary_maps)(a, b)
77+
78+
79+
def sample_from_gp_prior(
80+
kernel: Kernel, X: np.ndarray, random_seed: Optional[int] = None
81+
):
82+
"""
83+
Draw a sample from a GP prior.
84+
85+
Parameters
86+
----------
87+
kernel
88+
The GP covaraince kernel.
89+
X
90+
The input "time" points.
91+
random_seed, optional
92+
The random seed for sampling, by default None.
93+
94+
Returns
95+
-------
96+
A time series sampled from the GP prior.
97+
"""
98+
if X.ndim == 1:
99+
X = X[:, None]
100+
101+
assert X.ndim == 2
102+
gpr = GaussianProcessRegressor(kernel=kernel)
103+
ts = gpr.sample_y(X, n_samples=1, random_state=random_seed)
104+
105+
return ts
106+
107+
108+
def sample_from_gp_prior_efficient(
109+
kernel: Kernel,
110+
X: np.ndarray,
111+
random_seed: Optional[int] = None,
112+
method: str = "eigh",
113+
):
114+
"""
115+
Draw a sample from a GP prior. An efficient version that allows specification
116+
of the sampling method. The default sampling method used in GaussianProcessRegressor
117+
is based on SVD which is significantly slower that alternatives such as `eigh` and
118+
`cholesky`.
119+
120+
Parameters
121+
----------
122+
kernel
123+
The GP covaraince kernel.
124+
X
125+
The input "time" points.
126+
random_seed, optional
127+
The random seed for sampling, by default None.
128+
method, optional
129+
The sampling method for multivariate_normal, by default `eigh`.
130+
131+
Returns
132+
-------
133+
A time series sampled from the GP prior.
134+
"""
135+
if X.ndim == 1:
136+
X = X[:, None]
137+
138+
assert X.ndim == 2
139+
140+
cov = kernel(X)
141+
ts = np.random.default_rng(seed=random_seed).multivariate_normal(
142+
mean=np.zeros(X.shape[0]), cov=cov, method=method
143+
)
144+
145+
return ts
146+
147+
148+
def generate_time_series(max_kernels: int = 5):
149+
"""Generate a synthetic time series from KernelSynth.
150+
151+
Parameters
152+
----------
153+
max_kernels, optional
154+
The maximum number of base kernels to use for each time series, by default 5
155+
156+
Returns
157+
-------
158+
A time series generated by KernelSynth.
159+
"""
160+
while True:
161+
X = np.linspace(0, 1, LENGTH)
162+
163+
# Randomly select upto max_kernels kernels from the KERNEL_BANK
164+
selected_kernels = np.random.choice(
165+
KERNEL_BANK, np.random.randint(1, max_kernels + 1), replace=True
166+
)
167+
168+
# Combine the sampled kernels using random binary operators
169+
kernel = functools.reduce(random_binary_map, selected_kernels)
170+
171+
# Sample a time series from the GP prior
172+
try:
173+
ts = sample_from_gp_prior(kernel=kernel, X=X)
174+
except np.linalg.LinAlgError as err:
175+
print("Error caught:", err)
176+
continue
177+
178+
# The timestamp is arbitrary
179+
return {"start": np.datetime64("2000-01-01 00:00", "s"), "target": ts.squeeze()}
180+
181+
182+
if __name__ == "__main__":
183+
parser = argparse.ArgumentParser()
184+
parser.add_argument("-N", "--num-series", type=int, default=1000_000)
185+
parser.add_argument("-J", "--max-kernels", type=int, default=5)
186+
args = parser.parse_args()
187+
path = Path(__file__).parent / "kernelsynth-data.arrow"
188+
189+
generated_dataset = Parallel(n_jobs=-1)(
190+
delayed(generate_time_series)(max_kernels=args.max_kernels)
191+
for _ in tqdm(range(args.num_series))
192+
)
193+
194+
ArrowWriter(compression="lz4").write_to_file(
195+
generated_dataset,
196+
path=path,
197+
)

0 commit comments

Comments
 (0)