Commit b790e64

Browse files
Update phishing_email_detection_gpt2.py
1 parent c7e8b30 commit b790e64

1 file changed: phishing_email_detection_gpt2.py (+132 −91 lines)

@@ -29,6 +29,8 @@
 from cerebros.denseautomlstructuralcomponent.dense_automl_structural_component\
     import zero_7_exp_decay, zero_95_exp_decay, simple_sigmoid
 from ast import literal_eval
+import time
+
 
 #
 # Load the email data
@@ -141,10 +143,14 @@ def from_config(cls, config):
     metrics=['accuracy', tf.keras.metrics.AUC(name='auc')]
 )
 
+gpt_t0 = time.time()
+
+gpt_baseline_model.summary()  # Prints a layer-by-layer summary (returns None, so no outer print())
+
 history = gpt_baseline_model.fit(
     x=X_train,             # Input data
     y=y_train,             # Labels
-    epochs=20,             # Number of training iterations
+    epochs=4,              # Number of training iterations
     batch_size=16,         # Batch size small due to GPU memory constraints
     validation_split=0.2,  # Hold out 20% of training data for validation
     shuffle=True,          # Shuffle data at each epoch
@@ -164,111 +170,146 @@ def from_config(cls, config):
     ]
 )
 
+gpt_t1 = time.time()
+gpt_time_on_one_model_min = (gpt_t1 - gpt_t0) / 60
+
 hy = history.history
 hy_df = pd.DataFrame(hy)
 print(hy_df)
 
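The inline time.time() bookkeeping here could equally be wrapped in a small reusable helper; a minimal sketch, not part of the commit (the helper name is illustrative):

    from contextlib import contextmanager
    import time

    @contextmanager
    def wall_clock_minutes(label):
        # Print elapsed wall-clock time for the enclosed block, in minutes.
        t0 = time.time()
        yield
        print(f"{label}: {(time.time() - t0) / 60:.2f} min")

    # Usage: with wall_clock_minutes("GPT2 fine-tune"): history = gpt_baseline_model.fit(...)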
+### Cerebros model:
 
+# TokenizerLayer class to handle tokenization and return only token_ids
+class TokenizerLayer(tf.keras.layers.Layer):
+    def __init__(self, max_seq_length, **kwargs):
+        super().__init__(**kwargs)
+        self.tokenizer = GPT2Tokenizer.from_preset("gpt2_base_en")
+        self.preprocessor = GPT2Preprocessor(self.tokenizer, sequence_length=max_seq_length)
+        self.max_seq_length = max_seq_length
 
+    def call(self, inputs):
+        processed = self.preprocessor(inputs)  # Accepts a tensor of strings, returns {"token_ids": ...}
+        return processed["token_ids"]          # Output shape: (batch_size, max_seq_length)
 
+    def get_config(self):
+        base_config = super().get_config()
+        base_config.update({"max_seq_length": self.max_seq_length})
+        return base_config
 
 
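A minimal smoke test for the layer above, assuming the keras_nlp "gpt2_base_en" preset is available (the example strings are made up):

    layer = TokenizerLayer(max_seq_length=96)
    batch = tf.constant(["Click here to verify your account", "Quarterly report attached"])
    print(layer(batch).shape)  # (2, 96): shorter emails are padded out to max_seq_length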
+VOCAB_SIZE = GPT2Tokenizer.from_preset("gpt2_base_en").vocabulary_size()  # 50257 for gpt2_base_en; vocabulary_size() is an instance method
 
+# Create cerebros_base_model
+def build_cerebros_base_model(max_seq_length=96, embedding_dim=256, output_dim=VOCAB_SIZE):
+    input_layer = Input(shape=(), dtype=tf.string)  # Text input
+    token_ids = TokenizerLayer(max_seq_length)(input_layer)
+    # Build embedding layer with the GPT-2 tokenizer's vocabulary size (50257 for GPT2Base)
+    embedded = tf.keras.layers.Embedding(
+        input_dim=VOCAB_SIZE,    # Standard GPT-2 vocab size
+        output_dim=embedding_dim,
+        mask_zero=True,          # Handle <PAD> tokens
+        name="custom_embedding"
+    )(token_ids)
+
+    # Flatten for downstream models
+    flattened = Flatten()(embedded)
+    model = Model(inputs=input_layer, outputs=flattened)
+    return model
 
+# Example usage (outputs depend on parameters; set embedding_dim as desired)
+cerebros_base_model = build_cerebros_base_model(max_seq_length=96)
 
 
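A quick, illustrative shape check of the base model: Flatten collapses the (batch, max_seq_length, embedding_dim) embedding, so with the defaults each email becomes a 96 * 256 = 24576-wide feature vector.

    features = cerebros_base_model(tf.constant(["free prize, click the link now"]))
    print(features.shape)  # Expected: (1, 24576)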
+"""### Cerebros search for the best model"""
 
+#
+# Cerebros configurables
+#
+activation = 'gelu'
+predecessor_level_connection_affinity_factor_first = 49.9999
+predecessor_level_connection_affinity_factor_main = 0.31456
+max_consecutive_lateral_connections = 22
+p_lateral_connection = 0.39256
+num_lateral_connection_tries_per_unit = 10
+learning_rate = 0.0000511065
+epochs = 6  # [1, 100]
+batch_size = 13
+maximum_levels = 4  # [3, 7]
+maximum_units_per_level = 8  # [2, 10]
+maximum_neurons_per_unit = 5  # [2, 20]
+moities_to_try = 2
+tries_per_moity = 1
 
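One detail worth making explicit: the two knobs above fix the search budget, and the per-model timing printed at the end divides by their product (the derived variable name below is illustrative, not part of the commit):

    total_candidate_models = moities_to_try * tries_per_moity  # 2 * 1 == 2 models trained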
-
-# base_model = Model(inputs=input_layer, outputs=gpt2_layer)
-# base_model.summary()
-
-
-
-
-
-
-
-
-# """### Cerebros search for the best model"""
-
-# #
-# # Cerebros configurables
-# #
-# activation = 'gelu'
-# predecessor_level_connection_affinity_factor_first = 49.9999
-# predecessor_level_connection_affinity_factor_main = 0.31456
-# max_consecutive_lateral_connections = 22
-# p_lateral_connection = 0.39256
-# num_lateral_connection_tries_per_unit = 10
-# learning_rate = 0.0000511065
-# epochs = 6 # [1, 100]
-# batch_size = 13
-# maximum_levels = 4 # [3,7]
-# maximum_units_per_level = 8 # [2,10]
-# maximum_neurons_per_unit = 5 # [2,20]
-
-# #
-# # Logging
-# #
-# TIME = pendulum.now(tz='America/New_York').__str__()[:16]\
-#     .replace('T', '_')\
-#     .replace(':', '_')\
-#     .replace('-', '_')
-# PROJECT_NAME = f'{TIME}_cerebros_auto_ml_phishing_email_test'
-
-# meta_trial_number = 42 # irrelevant unless in distributed training
-
-# cerebros_automl = SimpleCerebrosRandomSearch(
-#     unit_type=DenseUnit,
-#     input_shapes=INPUT_SHAPES,
-#     output_shapes=OUTPUT_SHAPES,
-#     training_data=training_x,
-#     labels=train_labels,
-#     validation_split=0.35,
-#     direction='maximize',
-#     metric_to_rank_by="val_binary_accuracy",
-#     minimum_levels=2,
-#     maximum_levels=maximum_levels,
-#     minimum_units_per_level=1,
-#     maximum_units_per_level=maximum_units_per_level,
-#     minimum_neurons_per_unit=1,
-#     maximum_neurons_per_unit=maximum_neurons_per_unit,
-#     activation=activation,
-#     final_activation='sigmoid',
-#     number_of_architecture_moities_to_try=2,
-#     number_of_tries_per_architecture_moity=1,
-#     minimum_skip_connection_depth=1,
-#     maximum_skip_connection_depth=7,
-#     predecessor_level_connection_affinity_factor_first=predecessor_level_connection_affinity_factor_first,
-#     predecessor_level_connection_affinity_factor_first_rounding_rule='ceil',
-#     predecessor_level_connection_affinity_factor_main=predecessor_level_connection_affinity_factor_main,
-#     predecessor_level_connection_affinity_factor_main_rounding_rule='ceil',
-#     predecessor_level_connection_affinity_factor_decay_main=zero_7_exp_decay,
-#     seed=8675309,
-#     max_consecutive_lateral_connections=max_consecutive_lateral_connections,
-#     gate_after_n_lateral_connections=3,
-#     gate_activation_function=simple_sigmoid,
-#     p_lateral_connection=p_lateral_connection,
-#     p_lateral_connection_decay=zero_95_exp_decay,
-#     num_lateral_connection_tries_per_unit=num_lateral_connection_tries_per_unit,
-#     learning_rate=learning_rate,
-#     loss=tf.keras.losses.CategoricalHinge(),
-#     metrics=[tf.keras.metrics.BinaryAccuracy(),
-#              tf.keras.metrics.Precision(),
-#              tf.keras.metrics.Recall()],
-#     epochs=epochs,
-#     project_name=f"{PROJECT_NAME}_meta_{meta_trial_number}",
-#     model_graphs='model_graphs',
-#     batch_size=batch_size,
-#     meta_trial_number=meta_trial_number,
-#     base_models=[base_model],
-#     train_data_dtype=tf.string)
-
-# result = cerebros_automl.run_random_search()
-
-# print(f'Best accuracy achieved is {result}')
-# print(f'binary accuracy')
+#
+# Logging
+#
+TIME = pendulum.now(tz='America/New_York').__str__()[:16]\
+    .replace('T', '_')\
+    .replace(':', '_')\
+    .replace('-', '_')
+PROJECT_NAME = f'{TIME}_cerebros_auto_ml_phishing_email_test'
+
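Worked through with a hypothetical timestamp, the string munging above behaves like this:

    # pendulum.now(...).__str__()[:16] -> "2025-05-01T13:45"   (illustrative date)
    # after the three .replace() calls -> "2025_05_01_13_45"
    # PROJECT_NAME                     -> "2025_05_01_13_45_cerebros_auto_ml_phishing_email_test"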
+meta_trial_number = 42  # irrelevant unless in distributed training
+
+
+cerebros_automl = SimpleCerebrosRandomSearch(
+    unit_type=DenseUnit,
+    input_shapes=INPUT_SHAPES,
+    output_shapes=OUTPUT_SHAPES,
+    training_data=training_x,
+    labels=train_labels,
+    validation_split=0.35,
+    direction='maximize',
+    metric_to_rank_by="val_accuracy",
+    minimum_levels=2,
+    maximum_levels=maximum_levels,
+    minimum_units_per_level=1,
+    maximum_units_per_level=maximum_units_per_level,
+    minimum_neurons_per_unit=1,
+    maximum_neurons_per_unit=maximum_neurons_per_unit,
+    activation=activation,
+    final_activation='sigmoid',
+    number_of_architecture_moities_to_try=moities_to_try,
+    number_of_tries_per_architecture_moity=tries_per_moity,
+    minimum_skip_connection_depth=1,
+    maximum_skip_connection_depth=7,
+    predecessor_level_connection_affinity_factor_first=predecessor_level_connection_affinity_factor_first,
+    predecessor_level_connection_affinity_factor_first_rounding_rule='ceil',
+    predecessor_level_connection_affinity_factor_main=predecessor_level_connection_affinity_factor_main,
+    predecessor_level_connection_affinity_factor_main_rounding_rule='ceil',
+    predecessor_level_connection_affinity_factor_decay_main=zero_7_exp_decay,
+    seed=8675309,
+    max_consecutive_lateral_connections=max_consecutive_lateral_connections,
+    gate_after_n_lateral_connections=3,
+    gate_activation_function=simple_sigmoid,
+    p_lateral_connection=p_lateral_connection,
+    p_lateral_connection_decay=zero_95_exp_decay,
+    num_lateral_connection_tries_per_unit=num_lateral_connection_tries_per_unit,
+    learning_rate=learning_rate,
+    loss=tf.keras.losses.CategoricalHinge(),
+    metrics=[tf.keras.metrics.BinaryAccuracy(name='accuracy'),  # thresholded at 0.5; name keeps val_accuracy ranking valid
+             tf.keras.metrics.Precision(),
+             tf.keras.metrics.Recall()],
+    epochs=epochs,
+    project_name=f"{PROJECT_NAME}_meta_{meta_trial_number}",
+    model_graphs='model_graphs',
+    batch_size=batch_size,
+    meta_trial_number=meta_trial_number,
+    base_models=[cerebros_base_model],
+    train_data_dtype=tf.string)
+
+cerebros_t0 = time.time()
+result = cerebros_automl.run_random_search()
+cerebros_t1 = time.time()
+cerebros_time_all_models_min = (cerebros_t1 - cerebros_t0) / 60
+cerebros_time_per_model = cerebros_time_all_models_min / (moities_to_try * tries_per_moity)
+
+print(f"Cerebros trained 2 models FROM A COLD START in ONLY {cerebros_time_all_models_min:.2f} min, an average of {cerebros_time_per_model:.2f} min per model.")
+print(f"GPT2 took {gpt_time_on_one_model_min:.2f} min just to FINE TUNE one PRE-TRAINED model. Although this is a small-scale test, it illustrates the advantage of O(N) timing over O(N**2) timing.")
+
+
+print(f'Cerebros best accuracy achieved is {result}')
+print('(val set accuracy)')
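To make the comparison arithmetic concrete (hypothetical numbers only):

    # If the whole search took 30.0 wall-clock minutes over 2 * 1 == 2 candidates:
    #   cerebros_time_all_models_min == 30.0
    #   cerebros_time_per_model      == 30.0 / 2 == 15.0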
 
 # """### Testing the best model found"""
 
