Skip to content

Commit c7e8b30

Browse files
Update phishing_email_detection_gpt2.py
Added a baseline that fine-tunes the full GPT-2 model, for comparison against the Cerebros text classifier.
1 parent 8904966 commit c7e8b30

File tree

1 file changed

+161
-99
lines changed

1 file changed

+161
-99
lines changed

phishing_email_detection_gpt2.py

Lines changed: 161 additions & 99 deletions
Original file line numberDiff line numberDiff line change
@@ -65,8 +65,15 @@
6565
#
6666
# Tensors for training data and labels
6767
#
68-
training_x = [tf.constant(X_train)]
69-
train_labels = [tf.constant(y_train)]
68+
69+
# Training data for baseline model
70+
baseline_train_x = tf.constant(X_train)
71+
baseline_train_y = tf.constant(y_train)
72+
73+
# Packaged for Cerebros (multimodal, takes inputs as a list)
74+
training_x = [baseline_train_x]
75+
train_labels = [baseline_train_y]
76+
7077
#
7178
# Input and output shapes
7279
#
@@ -90,9 +97,9 @@ def __init__(self, max_seq_length, **kwargs):
9097
# Set whether the GPT2 model's layers are trainable
9198
#self.encoder.trainable = False
9299
for layer in self.encoder.layers:
93-
layer.trainable = False
100+
layer.trainable = True
94101
#
95-
self.encoder.layers[-2].trainable = True
102+
# self.encoder.layers[-2].trainable = True
96103
#
97104
# Set the maximum sequence length for tokenization
98105
self.max_seq_length = max_seq_length
@@ -121,101 +128,156 @@ def from_config(cls, config):
121128
# GPT2 configurables
122129
max_seq_length = 96
123130

124-
# Base model
131+
# GPT Baseline Model
125132
input_layer = Input(shape=(), dtype=tf.string)
126133
gpt2_layer = GPT2Layer(max_seq_length)(input_layer)
127134
#output = Flatten()(gpt2_layer)
128-
base_model = Model(inputs=input_layer, outputs=gpt2_layer)
129-
base_model.summary()
130-
131-
"""### Cerebros search for the best model"""
132-
133-
#
134-
# Cerebros configurables
135-
#
136-
activation = 'gelu'
137-
predecessor_level_connection_affinity_factor_first = 49.9999
138-
predecessor_level_connection_affinity_factor_main = 0.31456
139-
max_consecutive_lateral_connections = 22
140-
p_lateral_connection = 0.39256
141-
num_lateral_connection_tries_per_unit = 10
142-
learning_rate = 0.0000511065
143-
epochs = 6 # [1, 100]
144-
batch_size = 13
145-
maximum_levels = 4 # [3,7]
146-
maximum_units_per_level = 8 # [2,10]
147-
maximum_neurons_per_unit = 5 # [2,20]
148-
149-
#
150-
# Logging
151-
#
152-
TIME = pendulum.now(tz='America/New_York').__str__()[:16]\
153-
.replace('T', '_')\
154-
.replace(':', '_')\
155-
.replace('-', '_')
156-
PROJECT_NAME = f'{TIME}_cerebros_auto_ml_phishing_email_test'
157-
158-
meta_trial_number = 42 # irrelevant unless in distributed training
159-
160-
cerebros_automl = SimpleCerebrosRandomSearch(
161-
unit_type=DenseUnit,
162-
input_shapes=INPUT_SHAPES,
163-
output_shapes=OUTPUT_SHAPES,
164-
training_data=training_x,
165-
labels=train_labels,
166-
validation_split=0.35,
167-
direction='maximize',
168-
metric_to_rank_by="val_binary_accuracy",
169-
minimum_levels=2,
170-
maximum_levels=maximum_levels,
171-
minimum_units_per_level=1,
172-
maximum_units_per_level=maximum_units_per_level,
173-
minimum_neurons_per_unit=1,
174-
maximum_neurons_per_unit=maximum_neurons_per_unit,
175-
activation=activation,
176-
final_activation='sigmoid',
177-
number_of_architecture_moities_to_try=2,
178-
number_of_tries_per_architecture_moity=1,
179-
minimum_skip_connection_depth=1,
180-
maximum_skip_connection_depth=7,
181-
predecessor_level_connection_affinity_factor_first=predecessor_level_connection_affinity_factor_first,
182-
predecessor_level_connection_affinity_factor_first_rounding_rule='ceil',
183-
predecessor_level_connection_affinity_factor_main=predecessor_level_connection_affinity_factor_main,
184-
predecessor_level_connection_affinity_factor_main_rounding_rule='ceil',
185-
predecessor_level_connection_affinity_factor_decay_main=zero_7_exp_decay,
186-
seed=8675309,
187-
max_consecutive_lateral_connections=max_consecutive_lateral_connections,
188-
gate_after_n_lateral_connections=3,
189-
gate_activation_function=simple_sigmoid,
190-
p_lateral_connection=p_lateral_connection,
191-
p_lateral_connection_decay=zero_95_exp_decay,
192-
num_lateral_connection_tries_per_unit=num_lateral_connection_tries_per_unit,
193-
learning_rate=learning_rate,
194-
loss=tf.keras.losses.CategoricalHinge(),
195-
metrics=[tf.keras.metrics.BinaryAccuracy(),
196-
tf.keras.metrics.Precision(),
197-
tf.keras.metrics.Recall()],
198-
epochs=epochs,
199-
project_name=f"{PROJECT_NAME}_meta_{meta_trial_number}",
200-
model_graphs='model_graphs',
201-
batch_size=batch_size,
202-
meta_trial_number=meta_trial_number,
203-
base_models=[base_model],
204-
train_data_dtype=tf.string)
205-
206-
result = cerebros_automl.run_random_search()
207-
208-
print(f'Best accuracy achieved is {result}')
209-
print(f'binary accuracy')
210-
211-
"""### Testing the best model found"""
212-
213-
#
214-
# Load the best model (taking into account that it has a custom layer)
215-
#
216-
best_model_found =\
217-
tf.keras.models.load_model(cerebros_automl.best_model_path,\
218-
custom_objects={'GPT2Layer': GPT2Layer(max_seq_length)})
219-
220-
print('Evaluating on the test dataset')
221-
best_model_found.evaluate(X_test, y_test)
135+
binary_output = tf.keras.layers.Dense(1, activation='sigmoid')(gpt2_layer)
136+
gpt_baseline_model = Model(inputs=input_layer, outputs=binary_output)
137+
138+
gpt_baseline_model.compile(
139+
optimizer=Adam(learning_rate=1e-4), # Small LR since we're fine-tuning GPT
140+
loss='binary_crossentropy',
141+
metrics=['accuracy', tf.keras.metrics.AUC(name='auc')]
142+
)
143+
144+
history = gpt_baseline_model.fit(
145+
x=X_train, # Input data
146+
y=y_train, # Labels
147+
epochs=20, # Upper bound on passes over the training data; the EarlyStopping callback below can end training sooner
148+
batch_size=16, # Batch size small due to GPU memory constraints
149+
validation_split=0.2, # Hold out 20% of training data for validation
150+
shuffle=True, # Shuffle data at each epoch
151+
callbacks=[
152+
tf.keras.callbacks.EarlyStopping(
153+
monitor='val_loss',
154+
patience=3,
155+
restore_best_weights=True,
156+
min_delta=0.001
157+
),
158+
tf.keras.callbacks.ReduceLROnPlateau(
159+
monitor='val_loss',
160+
factor=0.2,
161+
patience=2,
162+
min_lr=1e-6
163+
)
164+
]
165+
)
166+
167+
hy = history.history  # fit() returns a History object (not subscriptable); the per-epoch metrics dict is its .history attribute
168+
hy_df = pd.DataFrame(hy)
169+
print(hy_df)
170+
171+
172+
173+
174+
175+
176+
177+
178+
179+
180+
181+
182+
183+
# base_model = Model(inputs=input_layer, outputs=gpt2_layer)
184+
# base_model.summary()
185+
186+
187+
188+
189+
190+
191+
192+
193+
# """### Cerebros search for the best model"""
194+
195+
# #
196+
# # Cerebros configurables
197+
# #
198+
# activation = 'gelu'
199+
# predecessor_level_connection_affinity_factor_first = 49.9999
200+
# predecessor_level_connection_affinity_factor_main = 0.31456
201+
# max_consecutive_lateral_connections = 22
202+
# p_lateral_connection = 0.39256
203+
# num_lateral_connection_tries_per_unit = 10
204+
# learning_rate = 0.0000511065
205+
# epochs = 6 # [1, 100]
206+
# batch_size = 13
207+
# maximum_levels = 4 # [3,7]
208+
# maximum_units_per_level = 8 # [2,10]
209+
# maximum_neurons_per_unit = 5 # [2,20]
210+
211+
# #
212+
# # Logging
213+
# #
214+
# TIME = pendulum.now(tz='America/New_York').__str__()[:16]\
215+
# .replace('T', '_')\
216+
# .replace(':', '_')\
217+
# .replace('-', '_')
218+
# PROJECT_NAME = f'{TIME}_cerebros_auto_ml_phishing_email_test'
219+
220+
# meta_trial_number = 42 # irrelevant unless in distributed training
221+
222+
# cerebros_automl = SimpleCerebrosRandomSearch(
223+
# unit_type=DenseUnit,
224+
# input_shapes=INPUT_SHAPES,
225+
# output_shapes=OUTPUT_SHAPES,
226+
# training_data=training_x,
227+
# labels=train_labels,
228+
# validation_split=0.35,
229+
# direction='maximize',
230+
# metric_to_rank_by="val_binary_accuracy",
231+
# minimum_levels=2,
232+
# maximum_levels=maximum_levels,
233+
# minimum_units_per_level=1,
234+
# maximum_units_per_level=maximum_units_per_level,
235+
# minimum_neurons_per_unit=1,
236+
# maximum_neurons_per_unit=maximum_neurons_per_unit,
237+
# activation=activation,
238+
# final_activation='sigmoid',
239+
# number_of_architecture_moities_to_try=2,
240+
# number_of_tries_per_architecture_moity=1,
241+
# minimum_skip_connection_depth=1,
242+
# maximum_skip_connection_depth=7,
243+
# predecessor_level_connection_affinity_factor_first=predecessor_level_connection_affinity_factor_first,
244+
# predecessor_level_connection_affinity_factor_first_rounding_rule='ceil',
245+
# predecessor_level_connection_affinity_factor_main=predecessor_level_connection_affinity_factor_main,
246+
# predecessor_level_connection_affinity_factor_main_rounding_rule='ceil',
247+
# predecessor_level_connection_affinity_factor_decay_main=zero_7_exp_decay,
248+
# seed=8675309,
249+
# max_consecutive_lateral_connections=max_consecutive_lateral_connections,
250+
# gate_after_n_lateral_connections=3,
251+
# gate_activation_function=simple_sigmoid,
252+
# p_lateral_connection=p_lateral_connection,
253+
# p_lateral_connection_decay=zero_95_exp_decay,
254+
# num_lateral_connection_tries_per_unit=num_lateral_connection_tries_per_unit,
255+
# learning_rate=learning_rate,
256+
# loss=tf.keras.losses.CategoricalHinge(),
257+
# metrics=[tf.keras.metrics.BinaryAccuracy(),
258+
# tf.keras.metrics.Precision(),
259+
# tf.keras.metrics.Recall()],
260+
# epochs=epochs,
261+
# project_name=f"{PROJECT_NAME}_meta_{meta_trial_number}",
262+
# model_graphs='model_graphs',
263+
# batch_size=batch_size,
264+
# meta_trial_number=meta_trial_number,
265+
# base_models=[base_model],
266+
# train_data_dtype=tf.string)
267+
268+
# result = cerebros_automl.run_random_search()
269+
270+
# print(f'Best accuracy achieved is {result}')
271+
# print(f'binary accuracy')
272+
273+
# """### Testing the best model found"""
274+
275+
# #
276+
# # Load the best model (taking into account that it has a custom layer)
277+
# #
278+
# best_model_found =\
279+
# tf.keras.models.load_model(cerebros_automl.best_model_path,\
280+
# custom_objects={'GPT2Layer': GPT2Layer(max_seq_length)})
281+
282+
# print('Evaluating on the test dataset')
283+
# best_model_found.evaluate(X_test, y_test)

0 commit comments

Comments
 (0)