@@ -30,32 +30,62 @@ def _build_model(self) -> AnomalyOutput:
30
30
31
31
model_kwargs = self .spec .model_kwargs
32
32
# map the output as per anomaly dataset class, 1: outlier, 0: inlier
33
- self .outlier_map = {1 : 0 , - 1 : 1 }
33
+ # self.outlier_map = {1: 0, -1: 1}
34
34
35
35
anomaly_output = AnomalyOutput (date_column = "index" )
36
- #TODO: PDB
37
- import pdb
36
+ # TODO: PDB
38
37
39
- pdb .set_trace ()
38
+ # Set tree parameters
39
+ num_trees = model_kwargs .get ("num_trees" , 200 )
40
+ shingle_size = model_kwargs .get ("shingle_size" , 1 )
41
+ tree_size = model_kwargs .get ("tree_size" , 1000 )
40
42
41
43
for target , df in self .datasets .full_data_dict .items ():
42
- model = RCTree (** model_kwargs )
43
- model .fit (df )
44
- y_pred = model .predict (df )
45
- y_pred = np .vectorize (self .outlier_map .get )(y_pred )
44
+ df_values = df [self .spec .target_column ].astype (float ).values
45
+ points = np .vstack (list (rrcf .shingle (df_values , size = 4 )))
46
46
47
- scores = model .score_samples (df )
47
+ sample_size_range = (1 , 6 )
48
+ n = points .shape [0 ]
49
+ avg_codisp = pd .Series (0.0 , index = np .arange (n ))
50
+ index = np .zeros (n )
48
51
49
- index_col = df .columns [0 ]
52
+ forest = []
53
+ while len (forest ) < num_trees :
54
+ ixs = np .random .choice (n , size = sample_size_range , replace = False )
55
+ trees = [rrcf .RCTree (points [ix ], index_labels = ix ) for ix in ixs ]
56
+ forest .extend (trees )
57
+ print (len (forest ))
50
58
51
- anomaly = pd .DataFrame (
52
- {index_col : df [index_col ], OutputColumns .ANOMALY_COL : y_pred }
53
- ).reset_index (drop = True )
54
- score = pd .DataFrame (
55
- {"index" : df [index_col ], OutputColumns .SCORE_COL : scores }
56
- ).reset_index (drop = True )
59
+ for tree in forest :
60
+ codisp = pd .Series ({leaf : tree .codisp (leaf ) for leaf in tree .leaves })
61
+ avg_codisp [codisp .index ] += codisp
62
+ np .add .at (index , codisp .index .values , 1 )
57
63
58
- anomaly_output .add_output (target , anomaly , score )
64
+ avg_codisp /= index
65
+ avg_codisp .index = df .iloc [(4 - 1 ) :].index
66
+ avg_codisp = (avg_codisp - avg_codisp .min ()) / (
67
+ avg_codisp .max () - avg_codisp .min ()
68
+ )
69
+
70
+ y_pred = (avg_codisp > np .percentile (avg_codisp , 95 )).astype (int )
71
+
72
+ import pdb
73
+
74
+ pdb .set_trace ()
75
+ print ("Done" )
76
+
77
+ # scores = model.score_samples(df)
78
+
79
+ # index_col = df.columns[0]
80
+
81
+ # anomaly = pd.DataFrame(
82
+ # {index_col: df[index_col], OutputColumns.ANOMALY_COL: y_pred}
83
+ # ).reset_index(drop=True)
84
+ # score = pd.DataFrame(
85
+ # {"index": df[index_col], OutputColumns.SCORE_COL: scores}
86
+ # ).reset_index(drop=True)
87
+
88
+ # anomaly_output.add_output(target, anomaly, score)
59
89
60
90
return anomaly_output
61
91
0 commit comments