Commit 48505ca

Merge pull request #102 from zillow/batch-model-bug-fixes: Luminaire Package Upgrade
2 parents 8c48e15 + cce13b7, commit 48505ca

14 files changed: +156 −37 lines

.github/workflows/github-pages.yml

Lines changed: 2 additions & 2 deletions
@@ -11,10 +11,10 @@ jobs:
     steps:
     - uses: actions/checkout@v2

-    - name: Setup Python
+    - name: Set up Python 3.10
       uses: actions/setup-python@v2
       with:
-        python-version: '3.7'
+        python-version: '3.10'

     - name: Install dependencies
       run: |

.github/workflows/python-app.yml

Lines changed: 2 additions & 2 deletions
@@ -13,10 +13,10 @@ jobs:
     steps:
     - uses: actions/checkout@v2

-    - name: Set up Python 3.7
+    - name: Set up Python 3.10
       uses: actions/setup-python@v2
       with:
-        python-version: '3.7'
+        python-version: '3.10'

     - name: Cache Dependencies
       id: cache

.github/workflows/python-publish.yml

Lines changed: 2 additions & 2 deletions
@@ -15,10 +15,10 @@ jobs:
     steps:
     - uses: actions/checkout@v2

-    - name: Set up Python 3.7
+    - name: Set up Python 3.10
       uses: actions/setup-python@v2
       with:
-        python-version: '3.7'
+        python-version: '3.10'

     - name: Install dependencies
       run: |

docs/tutorial/dataprofiling.rst

Lines changed: 5 additions & 0 deletions
@@ -8,6 +8,11 @@ Luminaire *DataExploration* implements different exploratory data analysis to de

 Luminaire data exploration and profiling runs two different workflows. The impute only option in profiling performs imputation for any missing data in the input time series and does not run any profiling to generate insights from the input time series.

+.. NOTE::
+    If duplicate dates are present in the input time series, data profiling averages the duplicates and merges them
+    into a single data date.
+
+
 >>> from luminaire.exploration.data_exploration import DataExploration
 >>> data
                 raw

docs/tutorial/outlier_batch.rst

Lines changed: 7 additions & 0 deletions
@@ -34,6 +34,13 @@ These *preprocessing_parameters* are used for training the structural model.
 >>> print(success, model_date, model)
 (True, '2020-06-07 00:00:00', <luminaire_models.model.lad_structural.LADStructuralModel object at 0x7f97e127d320>)

+.. NOTE::
+    Luminaire can run model validation during the training process. This validation checks for any underfit
+    that might have occurred due to misspecification of the model hyperparameters or due to nonstationarity.
+    The validation flag is False by default but can be turned on by setting *validate = True* during
+    training. This option is only available for the *LADStructuralModel* class. Refer to the API reference for more
+    information.
+
 The trained model works as a data-driven source of truth to evaluate any future time series values to be monitored. The *score* method is used to check whether new data points are anomalous.

 >>> model.score(2000, '2020-06-08')
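A hedged example of enabling the new option (illustrative only; *model_object*, *training_data*, and *preprocessing_parameters* are placeholders for the objects set up earlier in this tutorial):

>>> success, model_date, model = model_object.train(data=training_data, validate=True, **preprocessing_parameters)

When validation fails, *success* comes back False and the result carries an error message, per the change to *train* in *lad_structural.py* below.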

luminaire/exploration/data_exploration.py

Lines changed: 28 additions & 4 deletions
@@ -130,9 +130,24 @@ def __init__(self,
         self.tc_max_window_length = 24

     def add_missing_index(self, df=None, freq=None):
+        """
+        This function reindexes a pandas dataframe with missing dates for a given time series frequency.
+
+        Note: If duplicate dates are present in the dataframe, this function takes the average of the duplicate
+        data dates and merges them as a single data date.
+
+        :param pandas.DataFrame df: Input pandas dataframe containing the time series
+        :param str freq: The frequency of the time-series. A `Pandas offset`_ such as 'D', 'H', or 'M'
+        :return: pandas dataframe after reindexing missing data dates
+
+        :rtype: pandas.DataFrame
+        """

         import pandas as pd

+        # Adding a group-by step to average duplicate index entries
+        df = df.groupby(df.index).mean()
+
         # Create a new Pandas data frame based on the first valid index and
         # current date using the frequency defined by the use
         idx = pd.date_range(start=df.first_valid_index(), end=df.last_valid_index(), freq=freq)
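For orientation, a minimal standalone pandas sketch (not part of the diff; the toy dataframe is invented) of what the new group-by step does before reindexing:

import pandas as pd

# Two rows share the 2020-06-01 timestamp and 2020-06-02 is missing entirely
df = pd.DataFrame({'raw': [10.0, 14.0, 7.0]},
                  index=pd.to_datetime(['2020-06-01', '2020-06-01', '2020-06-03']))

# Duplicate index entries collapse to their mean (12.0 for 2020-06-01)
df = df.groupby(df.index).mean()

# Reindexing against the full daily range surfaces 2020-06-02 as NaN for later imputation
idx = pd.date_range(start=df.first_valid_index(), end=df.last_valid_index(), freq='D')
df = df.reindex(idx)
print(df)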
@@ -798,14 +813,20 @@ def kf_naive_outlier_detection(self, input_series, idx_position):
         from pykalman import KalmanFilter

         kf = KalmanFilter()
-        filtered_state_means, filtered_state_covariance = kf.em(input_series).filter(input_series)

-        residuals = input_series - filtered_state_means[:,0]
+        truncated_series = input_series[-(self.min_ts_length * 3):] \
+            if idx_position == len(input_series) - 1 else input_series
+        idx_position = len(truncated_series) - 1
+
+        filtered_state_means, filtered_state_covariance = kf.em(truncated_series).filter(truncated_series)

+        residuals = truncated_series - filtered_state_means[:, 0]
+
+        # Catching marginal anomalies to avoid during training
         is_anomaly = residuals[idx_position] < np.mean(residuals) \
-                     - (2.5 * np.sqrt(filtered_state_covariance)[idx_position][0][0]) \
+                     - (1 * np.sqrt(filtered_state_covariance)[idx_position][0][0]) \
                      or residuals[idx_position] > np.mean(residuals) \
-                     + (2.5 * np.sqrt(filtered_state_covariance)[idx_position][0][0])
+                     + (1 * np.sqrt(filtered_state_covariance)[idx_position][0][0])

         return is_anomaly

@@ -882,6 +903,9 @@ def _prepare(self, df, impute_only, streaming=False, **kwargs):
             df = df.iloc[-min(max_ts_length, len(df)):]
             df = self._truncate_by_data_gaps(df=df, target_metric=target_metric)

+            if len(df) < min_ts_length:
+                raise ValueError("Due to a recent data gap, training is waiting for more data to populate")
+
             if not streaming and len(df) < min_ts_length:
                 raise ValueError("The training data observed continuous missing data near the end. Require more stable "
                                  "data to train")

luminaire/model/lad_structural.py

Lines changed: 80 additions & 10 deletions
@@ -237,7 +237,7 @@ def _seasonal_arima(self, endog=None, exog=None, p=None, d=None, q=None, imodels
         """

         import numpy as np
-        import statsmodels.tsa.arima_model as arima
+        import statsmodels.tsa.arima.model as arima

         # Extract the exogenous variable generated based on (imodels * 2) number of most significant
         # frequencies
@@ -257,13 +257,7 @@ def _seasonal_arima(self, endog=None, exog=None, p=None, d=None, q=None, imodels

         try:
             stepwise_fit.append(arima.ARIMA(endog=endog, exog=exog,
-                                            order=(p, d, q)).fit(seasonal=False, trace=False,
-                                                                 method='css',
-                                                                 solver='bfgs',
-                                                                 error_action='ignore',
-                                                                 stepwise_fit=True,
-                                                                 warn_convergence=False,
-                                                                 disp=False))
+                                            order=(p, d, q)).fit(method='statespace'))
         except Exception as e:
             raise LADStructuralError(message=str(e))
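For context, a small self-contained sketch of the statsmodels interface this hunk migrates to; the toy series and order are invented, but `statsmodels.tsa.arima.model.ARIMA` with `fit(method='statespace')` is the documented replacement for the removed `arima_model` CSS/BFGS options (statsmodels >= 0.13):

import numpy as np
from statsmodels.tsa.arima.model import ARIMA  # new-style API, replaces statsmodels.tsa.arima_model

endog = np.cumsum(np.random.default_rng(0).normal(size=120))  # toy random-walk series
fitted = ARIMA(endog=endog, order=(2, 1, 1)).fit(method='statespace')
print(fitted.summary())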

@@ -442,12 +436,77 @@ def _training(self, data, ts_start, ts_end, min_ts_length=None, min_ts_mean=None

         return result, order

-    def train(self, data, optimize=False, **kwargs):
+    def _validate_model(self, data, hyper_params, result):
+        """
+        This function validates a newly trained model before publishing. This validation checks for underfit due to
+        misspecification of hyperparameters or nonstationarity.
+
+        :param pandas.DataFrame data: Input time series data
+        :param dict hyper_params: Hyperparameters for model training
+        :param dict result: Training outputs
+        :return: A validation flag for model to publish. The function returns True when the validation is successful,
+        False otherwise.
+
+        :rtype: bool
+        """
+
+        import numpy as np
+        import scipy.stats as st
+
+        levene_alpha = 0.05
+        grubb_alpha = 0.001
+        f_test_alpha = 0.05
+        mwu_alpha = 0.0005
+        passed = True
+
+        model = LADStructuralModel(hyper_params=self.hyper_params, **result)
+
+        freq = "1" + result['freq'] if not any(char.isdigit() for char in result['freq']) else result['freq']
+
+        baseline = data[self._imputed_metric][-self.max_scoring_length:].values
+        baseline = (np.exp(baseline) - 1).tolist() if hyper_params['is_log_transformed'] else baseline.tolist()
+
+        predictions = []
+        pred_date = pd.Timestamp(result['training_end_date'])
+        for i in range(len(baseline)):
+            pred_date = pred_date + pd.Timedelta(freq)
+            result = model.score(baseline[i], pred_date)
+            if result['Success']:
+                predictions.append(result['Prediction'])
+
+        # We perform Levene test, Mann-Whitney U test, and Grubbs test between the predictions and the baseline
+        # to understand if they are widely different. The two arrays need to be of the same length in order to
+        # perform these tests.
+        if len(predictions) == len(baseline):
+            N = len(predictions)
+
+            pvalue_levene = st.levene(predictions, baseline)[1]
+            pvalue_levene_diff = st.levene(np.diff(predictions), np.diff(baseline))[1]
+            pvalue_mwu = st.mannwhitneyu(predictions, baseline)[1] if predictions != baseline else 1.0
+
+            grubb_test_stat = max(abs(np.array(predictions) - np.mean(predictions))) / np.std(predictions, ddof=1)
+            t_cutoff = st.t.ppf((grubb_alpha / (2 * N)), df=N - 2)
+            grubb_test = grubb_test_stat > ((N - 1) / np.sqrt(N)) * np.sqrt((t_cutoff**2) / (N - 2 + (t_cutoff**2)))
+
+            if grubb_test:
+                passed = False
+            if np.std(baseline, ddof=1) > 0:
+                if (not np.isnan(pvalue_levene) and not np.isnan(pvalue_levene_diff) and not np.isnan(pvalue_mwu)) and \
+                        (pvalue_levene < levene_alpha or pvalue_levene_diff < levene_alpha):
+                    F = (np.std(predictions, ddof=1) ** 2) / (np.std(baseline, ddof=1) ** 2)
+                    f_test_pval = 1 - st.f.cdf(F, N - 1, len(baseline) - 1)
+                    if f_test_pval < f_test_alpha and pvalue_mwu < mwu_alpha:
+                        passed = False
+
+        return passed
+
+    def train(self, data, optimize=False, validate=False, **kwargs):
         """
         This function trains a structural LAD model for a given time series.

         :param pandas.DataFrame data: Input time series data
         :param bool optimize: Flag to identify whether called from hyperparameter optimization
+        :param bool validate: Flag to identify whether to run model validation after training
         :return: success flag, the model date and the trained lad structural model object
         :rtype: tuple[bool, str, LADStructuralModel object]
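As an aside, a standalone sketch (toy numbers, not from the diff) of the two-sided Grubbs check that the validation step applies to the model predictions:

import numpy as np
import scipy.stats as st

predictions = np.array([100.0, 102.0, 98.0, 101.0, 97.0, 160.0])  # toy values; 160 is the suspect point
N = len(predictions)
grubb_alpha = 0.001

# Grubbs statistic: largest absolute deviation from the mean, in sample standard deviations
grubb_test_stat = max(abs(predictions - np.mean(predictions))) / np.std(predictions, ddof=1)

# Critical value derived from the t distribution; exceeding it marks the predictions as suspect
t_cutoff = st.t.ppf(grubb_alpha / (2 * N), df=N - 2)
critical = ((N - 1) / np.sqrt(N)) * np.sqrt(t_cutoff**2 / (N - 2 + t_cutoff**2))
is_outlier = grubb_test_stat > critical
print(grubb_test_stat, critical, is_outlier)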
@@ -493,6 +552,13 @@ def train(self, data, optimize=False, **kwargs):

         success = False if 'ErrorMessage' in result else True

+        # Model validation step to check underfit due to misspecification of hyperparameters or other reasons.
+        if success and validate and not optimize:
+            passed = self._validate_model(data=data, hyper_params=self.hyper_params, result=result)
+            if not passed:
+                success = False
+                result['ErrorMessage'] = 'Model validation failed! Check any issues with the input data or the hyper-parameters.'
+
         return success, kwargs['ts_end'], LADStructuralModel(hyper_params=self.hyper_params, **result)

     @classmethod
@@ -569,7 +635,11 @@ def _predict(cls, model, is_log_transformed,
         else:
             pred_exog['fourier_feature'] = seasonal_feature_scoring[:forecast_ndays]

-        forecast = list(model.forecast(steps=forecast_ndays, alpha=alpha, exog=pred_exog))
+        forecast = []
+        forecast_obj = model.get_forecast(steps=forecast_ndays, alpha=alpha, exog=pred_exog)
+        forecast.append(np.real(forecast_obj.predicted_mean).tolist())  # adding prediction mean to forecast
+        forecast.append(np.real(forecast_obj.se_mean).tolist())  # adding prediction standard error to forecast
+        forecast.append(np.real(forecast_obj.conf_int()).tolist())  # adding predictions ci's to forecast
         interpolated_training_data = list(zip(*training_tail))[1]

         for order in list(reversed(range(order_of_diff))):
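A brief hedged sketch (toy model and data) of the `get_forecast` results object the replacement code reads from; `predicted_mean`, `se_mean`, and `conf_int()` are part of statsmodels' forecast results API:

import numpy as np
from statsmodels.tsa.arima.model import ARIMA

endog = np.cumsum(np.random.default_rng(1).normal(size=90))  # toy series
fitted = ARIMA(endog=endog, order=(1, 1, 1)).fit(method='statespace')

forecast_obj = fitted.get_forecast(steps=7)
means = forecast_obj.predicted_mean.tolist()            # point forecasts
std_errs = forecast_obj.se_mean.tolist()                # forecast standard errors
intervals = forecast_obj.conf_int(alpha=0.05).tolist()  # 95% confidence intervals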

luminaire/model/model_utils.py

Lines changed: 9 additions & 0 deletions
@@ -58,6 +58,8 @@ class LADHolidays(pandas.tseries.holiday.AbstractHolidayCalendar):
                                  offset=_DateOffset(weekday=_H.SU(1)))
     Easter = _H.Holiday('Easter', month=1, day=1, offset=[_H.Easter(), _H.Day(0)])

+    USThanksgivingWednesday = _H.Holiday('Thanksgiving Wednesday', month=11, day=1,
+                                         offset=[_DateOffset(weekday=_H.TH(4)), _DateOffset(days=-1)])
     USThanksgivingDay = _H.Holiday('Thanksgiving', month=11, day=1,
                                    offset=_DateOffset(weekday=_H.TH(4)))
     USThanksgivingFriday = _H.Holiday('Thanksgiving Friday', month=11, day=1,
@@ -67,6 +69,7 @@ class LADHolidays(pandas.tseries.holiday.AbstractHolidayCalendar):
     USThanksgivingSunday = _H.Holiday('Thanksgiving Sunday', month=11, day=1,
                                       offset=[_DateOffset(weekday=_H.TH(4)), _DateOffset(days=3)])

+    Dec23 = _H.Holiday('December 23', month=12, day=23)
     ChristmasEve = _H.Holiday('Christmas Eve', month=12, day=24)
     ChristmasDay = _H.Holiday('Christmas Day', month=12, day=25)
     ChristmasDayObserved = _H.Holiday('Christmas Day Observed', month=12, day=25, observance=_H.nearest_workday)
@@ -79,6 +82,8 @@ class LADHolidays(pandas.tseries.holiday.AbstractHolidayCalendar):

     NewYearsDay = _H.Holiday('New Years Day', month=1, day=1)
     NewYearsDayObserved = _H.Holiday('New Years Day Observed', month=1, day=1, observance=_H.nearest_workday)
+    Jan2 = _H.Holiday('January 2', month=1, day=2)
+    Jan3 = _H.Holiday('January 3', month=1, day=3)

     rules = [
         USMemorialDay,
@@ -105,10 +110,12 @@ class LADHolidays(pandas.tseries.holiday.AbstractHolidayCalendar):
         Superbowl,
         Easter,

+        USThanksgivingWednesday,
         USThanksgivingDay,
         USThanksgivingFriday,
         USThanksgivingSaturday,
         USThanksgivingSunday,
+        Dec23,
         ChristmasEve,
         ChristmasDay,
         ChristmasDayObserved,
@@ -120,6 +127,8 @@ class LADHolidays(pandas.tseries.holiday.AbstractHolidayCalendar):
         Dec31,
         NewYearsDay,
         NewYearsDayObserved,
+        Jan2,
+        Jan3,
     ]

     def __init__(self, name=None, holiday_rules=None):
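A small hedged usage sketch of the extended calendar; it relies only on the standard `AbstractHolidayCalendar.holidays()` interface that *LADHolidays* inherits from pandas, and the date window is arbitrary:

from luminaire.model.model_utils import LADHolidays

calendar = LADHolidays()
# Rule-generated holiday dates over the 2021-2022 holiday season; the window should now
# also pick up Thanksgiving Wednesday, December 23, and January 2/3 alongside the existing rules
dates = calendar.holidays(start='2021-11-20', end='2022-01-05', return_name=True)
print(dates)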

luminaire/model/window_density.py

Lines changed: 13 additions & 11 deletions
@@ -102,14 +102,16 @@ def _volume_shift_detection(self, mean_list=None, sd_list=None, probability_thre
         :rtype: int
         """
         import numpy as np
-        from bayesian_changepoint_detection import offline_changepoint_detection as offcd
+        from bayesian_changepoint_detection.priors import const_prior
+        from bayesian_changepoint_detection.bayesian_models import offline_changepoint_detection
+        import bayesian_changepoint_detection.offline_likelihoods as offline_ll
         from functools import partial

         # Volume shift detection over the means of the training window
-        q, p, pcp = offcd.offline_changepoint_detection(
+        q, p, pcp = offline_changepoint_detection(
             data=np.array(mean_list),
-            prior_func=partial(offcd.const_prior, l=(len(mean_list) + 1)),
-            observation_log_likelihood_function=offcd.gaussian_obs_log_likelihood,
+            prior_function=partial(const_prior, p=1/(len(mean_list) + 1)),
+            log_likelihood_class=offline_ll.StudentT(),
             truncate=-10)

         mask_mean = np.append(0, np.exp(pcp).sum(0)) > probability_threshold
@@ -118,10 +120,10 @@
         change_points = np.array(mask_mean).nonzero()
         last_mean_cp = change_points[0][-1] if len(change_points[0]) > 0 else []

-        q, p, pcp = offcd.offline_changepoint_detection(
+        q, p, pcp = offline_changepoint_detection(
             data=np.array(sd_list),
-            prior_func=partial(offcd.const_prior, l=(len(sd_list) + 1)),
-            observation_log_likelihood_function=offcd.gaussian_obs_log_likelihood,
+            prior_function=partial(const_prior, p=1/(len(sd_list) + 1)),
+            log_likelihood_class=offline_ll.StudentT(),
             truncate=-10)

         mask_sd = np.append(0, np.exp(pcp).sum(0)) > probability_threshold
@@ -155,8 +157,8 @@ def _distance_function(self, data=None, called_for=None, baseline=None):
         if called_for == "training":
             distance = []
             for i in range(0, len(data) - 1):
-                q = stats.kde.gaussian_kde(data[i])
-                p = stats.kde.gaussian_kde(data[i + 1])
+                q = stats.gaussian_kde(data[i])
+                p = stats.gaussian_kde(data[i + 1])

                 ts_min = min(np.min(data[i]), np.min(data[i + 1]))
                 ts_max = max(np.max(data[i]), np.max(data[i + 1]))
@@ -176,8 +178,8 @@ def _distance_function(self, data=None, called_for=None, baseline=None):

         # If called for scoring, Kl divergence is performed between the scoring window and the baseline
         elif called_for == "scoring":
-            q = stats.kde.gaussian_kde(baseline)
-            p = stats.kde.gaussian_kde(data)
+            q = stats.gaussian_kde(baseline)
+            p = stats.gaussian_kde(data)

             ts_min = min(np.min(baseline), np.min(data))
             ts_max = max(np.max(baseline), np.max(data))
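For orientation, a hedged standalone sketch (toy windows, arbitrary grid size) of the KDE-based KL divergence that `_distance_function` computes between consecutive or baseline/scoring windows:

import numpy as np
from scipy import stats

window_a = np.random.default_rng(2).normal(0.0, 1.0, 200)   # toy baseline window
window_b = np.random.default_rng(3).normal(0.5, 1.2, 200)   # toy scoring window

q = stats.gaussian_kde(window_a)
p = stats.gaussian_kde(window_b)

# Evaluate both densities on a shared grid spanning the pooled range of the two windows
ts_min = min(window_a.min(), window_b.min())
ts_max = max(window_a.max(), window_b.max())
grid = np.linspace(ts_min, ts_max, 1000)
p_vals, q_vals = p(grid), q(grid)

# Discrete approximation of KL(p || q) over the grid
kl_divergence = np.sum(p_vals * np.log(p_vals / q_vals)) * (grid[1] - grid[0])
print(kl_divergence)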
Binary files changed (3.5 MB and 3.03 MB); contents not shown.

requirements.txt

Lines changed: 3 additions & 3 deletions
@@ -1,10 +1,10 @@
-bayesian-changepoint-detection>=0.2.dev1
+bayesian-changepoint-detection @ git+https://github.com/hildensia/bayesian_changepoint_detection@2dd95f5c1d028116899a842ccb3baa173f9d5be9
 changepy>=0.3.1
 hyperopt>=0.1.2
 numpy>=1.17.5
 pandas>=0.25.3
 pykalman>=0.9.5
-scipy<=1.2.3
-statsmodels>=0.10.2,<=0.12.2
+scipy>=1.6.0
+statsmodels>=0.13.0
 scikit-learn>=0.24.2
 decorator>=5.1.0
