Skip to content

Commit 4a09ae8

Browse files
committed
add explainability tests
1 parent 71efc26 commit 4a09ae8

File tree

2 files changed

+601
-0
lines changed

2 files changed

+601
-0
lines changed
Lines changed: 255 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,255 @@
1+
import pandas as pd
2+
import numpy as np
3+
from datetime import datetime, timedelta
4+
5+
6+
class TimeSeriesGenerator:
    """
    Generate synthetic multi-series time series data with features and a target.

    Every series covers ``num_points`` historical rows followed by ``horizon``
    future rows. Features are produced for all rows; the target is only
    computed for the historical rows.

    Attributes:
    - num_series: Number of different time series to generate.
    - num_points: Number of historical data points per time series.
    - start_date: Start date for the time series (parsed from "%Y-%m-%d").
    - non_linear_func: Function applied to the scaled feature_3 (np.sin by default).
    - coeffs: Dictionary of coefficients for the features. When omitted (or
      empty) feature_1..3 use 1 and static_1..3 use 0.1. Keys missing from the
      dictionary fall back, inside ``calculate_target``, to 10 for feature_*,
      0.1 for static_* and 5 for fourier_*.
    - freq: Frequency of the datetime column. Options: 'D' (daily), 'W' (weekly),
      'M' (30 days), '2W' (bi-weekly), 'Y' (365 days), 'H' (hourly), 'T' (minutely).
    - freq_map: Mapping of frequency options to timedelta values.
    - static_1, static_2, static_3: Integer arrays (one value per series) that
      remain constant within each series.
    - seasonality: Dictionary of seasonal periods for feature_1..3. Defaults to
      30, 30 and 15 if not provided.
    - trend_type: Type of trend ('linear', 'quadratic', 'exponential',
      'logarithmic'). Any other value is treated as 'linear'.
    - trend_direction: Direction of trend ('increasing', 'decreasing').
    - trend_feature: Normalised trend of length ``num_points``.
    - horizon: Number of future rows appended to every series (at least 1).
    - seed: Base seed for all random number generators.
    """

    def __init__(
        self,
        num_series=10,
        num_points=100,
        start_date="2023-01-01",
        non_linear_func=None,
        coeffs=None,
        freq="D",
        seasonality=None,
        trend_type="linear",
        trend_direction="increasing",
        horizon=1,
        seed=42,
    ):
        """
        Initialize the TimeSeriesGenerator with the given parameters.
        """
        self.num_series = num_series
        self.num_points = num_points
        self.start_date = datetime.strptime(start_date, "%Y-%m-%d")
        self.non_linear_func = (
            non_linear_func if non_linear_func else lambda x: np.sin(x)
        )
        self.coeffs = (
            coeffs
            if coeffs
            else {
                "feature_1": 1,
                "feature_2": 1,
                "feature_3": 1,
                "static_1": 0.1,
                "static_2": 0.1,
                "static_3": 0.1,
            }
        )
        self.freq = freq
        self.freq_map = {
            "D": timedelta(days=1),
            "W": timedelta(weeks=1),
            "2W": timedelta(weeks=2),
            "M": timedelta(days=30),
            "Y": timedelta(days=365),
            "H": timedelta(hours=1),
            "T": timedelta(minutes=1),
        }
        # Each static feature gets its own generator so the three columns differ.
        self.static_1 = np.random.RandomState(seed).randint(0, 100, self.num_series)
        self.static_2 = np.random.RandomState(seed + 1).randint(0, 100, self.num_series)
        self.static_3 = np.random.RandomState(seed + 2).randint(0, 100, self.num_series)
        self.seasonality = (
            seasonality
            if seasonality
            else {"feature_1": 30, "feature_2": 30, "feature_3": 15}
        )
        self.trend_type = trend_type
        self.trend_direction = trend_direction
        self.trend_feature = self.generate_trend()
        self.horizon = max(1, horizon)  # Ensure horizon is at least 1
        self.seed = seed

    @property
    def _total_points(self):
        """Number of rows per series: historical points plus forecast horizon."""
        return self.num_points + self.horizon

    def generate_trend(self):
        """
        Generate a trend based on the specified type and direction.

        The trend is scaled by its largest absolute value. When that value is
        zero (e.g. a single-point linear trend) or the trend is empty, a flat
        float trend is returned instead of dividing by zero.

        Returns:
        - Numpy float array of length ``num_points`` representing the trend.
        """
        t = np.arange(self.num_points)
        if self.trend_type == "quadratic":
            trend = t**2
        elif self.trend_type == "exponential":
            trend = np.exp(t / self.num_points)
        elif self.trend_type == "logarithmic":
            trend = np.log(t + 1)
        else:
            # "linear" and any unrecognised trend type
            trend = t

        if self.trend_direction == "decreasing":
            trend = -trend

        if trend.size == 0:
            return trend.astype(float)

        peak = np.max(np.abs(trend))
        if peak == 0:
            # Nothing to normalise; avoid 0 / 0 producing NaN targets.
            return np.zeros(trend.shape, dtype=float)
        return trend / peak

    def generate_dates(self):
        """
        Generate a list of dates based on the start date and frequency.

        Returns:
        - List of ``num_points + horizon`` datetime objects.

        Raises:
        - KeyError: If ``freq`` is not a key of ``freq_map``.
        """
        step = self.freq_map[self.freq]
        return [self.start_date + i * step for i in range(self._total_points)]

    def generate_features(self):
        """
        Generate non-negative seasonal features with noise, plus Fourier terms.

        The random generator is seeded with ``seed`` only, so every call (and
        therefore every series) yields the same feature values.

        Returns:
        - Tuple of five numpy arrays of length ``num_points + horizon``:
          feature_1, feature_2, feature_3, fourier_1, fourier_2.
        """
        total = self._total_points
        t = np.arange(total)
        rng = np.random.RandomState(self.seed)
        # The draw order (feature_1, feature_2, feature_3) determines the values.
        feature_1 = np.abs(
            np.sin(2 * np.pi * t / self.seasonality["feature_1"])
            + rng.randn(total) * 0.1
        )
        feature_2 = np.abs(
            np.cos(2 * np.pi * t / self.seasonality["feature_2"])
            + rng.randn(total) * 0.1
        )
        feature_3 = np.abs(
            np.sin(2 * np.pi * t / self.seasonality["feature_3"])
            + rng.randn(total) * 0.1
        )
        fourier_1 = np.sin(2 * np.pi * t / 365.25)
        fourier_2 = np.cos(2 * np.pi * t / 365.25)
        return feature_1, feature_2, feature_3, fourier_1, fourier_2

    def calculate_target(
        self, feature_1, feature_2, feature_3, fourier_1, fourier_2, series_id
    ):
        """
        Calculate the target value based on the features and static values.

        Parameters:
        - feature_1, feature_2, feature_3, fourier_1, fourier_2: Numpy arrays
          of length ``num_points`` (they are added to the trend and the noise,
          which both have that length).
        - series_id: Integer representing the series ID; selects the static
          values and offsets the noise seed.

        Returns:
        - Numpy array of length ``num_points`` representing the target values.
        """
        rng = np.random.RandomState(self.seed + series_id)
        noise = (
            rng.randn(self.num_points) * 5
        )  # Adding noise for more realistic variations
        return (
            self.coeffs.get("feature_1", 10) * feature_1
            + self.coeffs.get("feature_2", 10) * feature_2
            + self.non_linear_func(self.coeffs.get("feature_3", 10) * feature_3)
            + self.coeffs.get("static_1", 0.1) * self.static_1[series_id]
            + self.coeffs.get("static_2", 0.1) * self.static_2[series_id]
            + self.coeffs.get("static_3", 0.1) * self.static_3[series_id]
            + self.coeffs.get("fourier_1", 5) * fourier_1
            + self.coeffs.get("fourier_2", 5) * fourier_2
            + self.trend_feature
            + noise
        )

    def generate_series(self, series_id):
        """
        Generate a single time series with the given series ID.

        The ``trend_feature`` and ``target`` columns are padded with zeros for
        the ``horizon`` future rows.

        Parameters:
        - series_id: Integer representing the series ID.

        Returns:
        - DataFrame with ``num_points + horizon`` rows containing the
          generated time series data.
        """
        total = self._total_points
        history = slice(None, self.num_points)
        dates = self.generate_dates()
        feature_1, feature_2, feature_3, fourier_1, fourier_2 = self.generate_features()
        target = self.calculate_target(
            feature_1[history],
            feature_2[history],
            feature_3[history],
            fourier_1[history],
            fourier_2[history],
            series_id,
        )
        future_padding = np.zeros(self.horizon)

        data = {
            "series_id": [series_id] * total,
            "ds": dates,
            "feature_1": feature_1,
            "feature_2": feature_2,
            "feature_3": feature_3,
            "fourier_1": fourier_1,
            "fourier_2": fourier_2,
            "static_1": [self.static_1[series_id]] * total,
            "static_2": [self.static_2[series_id]] * total,
            "static_3": [self.static_3[series_id]] * total,
            "trend_feature": np.concatenate([self.trend_feature, future_padding]),
            "target": np.concatenate([target, future_padding]),
        }

        return pd.DataFrame(data)

    def generate_timeseries_data(self):
        """
        Generate the complete timeseries data for all series.

        Returns:
        - Tuple of two DataFrames: ``primary`` holds series_id, ds and target
          for the first ``num_points`` rows of every series; ``additional``
          holds every column except target for all rows, including the
          ``horizon`` future rows.
        """
        series_list = [
            self.generate_series(series_id) for series_id in range(self.num_series)
        ]
        full_data = pd.concat(series_list, ignore_index=True)

        # head() keeps the first rows of each group in their original order and
        # keeps the grouping column, unlike groupby.apply whose handling of
        # grouping columns is deprecated in pandas.
        primary = (
            full_data.groupby("series_id")
            .head(self.num_points)
            .reset_index(drop=True)[["series_id", "ds", "target"]]
        )
        additional = full_data.drop(columns=["target"])

        return primary, additional
234+
235+
236+
if __name__ == "__main__":
    # Demo run: minutely data with an increasing exponential trend and a
    # cosine non-linearity applied to feature_3.
    demo_coeffs = dict(
        feature_1=2,
        feature_2=3,
        feature_3=0.5,
        static_1=0.1,
        static_2=0.1,
        static_3=0.1,
        fourier_1=0.3,
        fourier_2=0.3,
    )
    generator = TimeSeriesGenerator(
        non_linear_func=np.cos,
        coeffs=demo_coeffs,
        freq="T",
        trend_type="exponential",
        trend_direction="increasing",
    )
    primary, additional = generator.generate_timeseries_data()
    # Show the last rows and the overall shape of both frames.
    for frame in (primary, additional):
        print(frame.tail(20), frame.shape)

0 commit comments

Comments
 (0)