
Commit 0cfb488

Update phishing_email_detection_gpt2.py
Amendments to the Cerebros model.
1 parent 15ec9c2 · commit 0cfb488

1 file changed: +66 −35 lines

phishing_email_detection_gpt2.py

Lines changed: 66 additions & 35 deletions
@@ -181,46 +181,71 @@ def from_config(cls, config):
 
 # TokenizerLayer class to handle tokenization and return only token_ids
 class TokenizerLayer(tf.keras.layers.Layer):
+
     def __init__(self, max_seq_length, **kwargs):
-        super().__init__(**kwargs)
-        self.tokenizer = GPT2Tokenizer.from_preset("gpt2_base_en")
-        self.preprocessor = GPT2Preprocessor(self.tokenizer, sequence_length=max_seq_length)
+        #
+        super(TokenizerLayer, self).__init__(**kwargs)
+        #
+        # Load the GPT2 tokenizer and preprocessor
+        self.tokenizer = GPT2Tokenizer.from_preset("gpt2_extra_large_en")  # "gpt2_base_en"
+        self.preprocessor = GPT2Preprocessor(self.tokenizer,
+                                             sequence_length=max_seq_length)
+        # self.encoder = GPT2Backbone.from_preset("gpt2_base_en")
+        #
+        # Set whether the GPT2 model's layers are trainable
+        # self.encoder.trainable = False
+        # for layer in self.encoder.layers:
+        #     layer.trainable = False
+        #
+        # self.encoder.layers[-2].trainable = True
+        #
+        # Set the maximum sequence length for tokenization
         self.max_seq_length = max_seq_length
 
     def call(self, inputs):
-        processed = self.preprocessor(inputs)  # Accepts tensor of strings, outputs {"token_ids": ...}
-        return processed["token_ids"]  # Output shape: (batch_size, max_seq_length)
+        #
+        # Tokenize the inputs and return only the token IDs
+        prep = self.preprocessor([inputs])
+        # embedding = self.encoder(prep)
+        # avg_pool = tf.reduce_mean(embedding, axis=1)
+        #
+        return prep['token_ids']
 
     def get_config(self):
-        base_config = super().get_config()
-        base_config.update({"max_seq_length": self.max_seq_length})
-        return base_config
+        #
+        config = super(TokenizerLayer, self).get_config()
+        config.update({'max_seq_length': self.max_seq_length})
+        #
+        return config
+
+    @classmethod
+    def from_config(cls, config):
+        #
+        return cls(max_seq_length=config['max_seq_length'])
 
+# GPT2 configurables
 
-VOCAB_SIZE = GPT2Tokenizer.vocabulary_size()
+max_seq_length = 900
 
-# Create cerebros_base_model
-def build_cerebros_base_model(max_seq_length=96, embedding_dim=256, output_dim=VOCAB_SIZE):
-    input_layer = Input(shape=(), dtype=tf.string)  # Text input
-    token_ids = TokenizerLayer(max_seq_length)(input_layer)
-    # Build embedding layer with GPT2 tokenizer's vocabulary size (50257 for GPT2Base)
-    embedded = tf.keras.layers.Embedding(
-        input_dim=GPT2Tokenizer.vocabulary_size(),  # Uses standard GPT-2 vocab size
-        output_dim=embedding_dim,
-        mask_zero=True,  # Handle <PAD> tokens
-        name="custom_embedding"
-    )(token_ids)
-
-    # Flatten for downstream models
-    flattened = Flatten()(embedded)
-    dropout = tf.keras.layers.Dropout(.6)(flattened)
-    model = Model(inputs=input_layer, outputs=dropout)
-    return model
+inp = tf.keras.layers.Input(shape=(), dtype=tf.string)
+gp2_tokenizer = TokenizerLayer(max_seq_length=max_seq_length)
+VOCABULARY_SIZE = gp2_tokenizer.tokenizer.vocabulary_size()
+tokens = gp2_tokenizer(inp)
 
 
-# Example usage (outputs depend on parameters, set embedding_dim as desired)
-cerebros_base_model = build_cerebros_base_model(max_seq_length=96)
+embedded = \
+    tf.keras.layers.Embedding(
+        input_dim=VOCABULARY_SIZE,
+        output_dim=15,
+        input_length=max_seq_length,
+        mask_zero=True)(tokens)
+dropout_embedded = tf.keras.layers.Dropout(0.6)(embedded)
+flattened = tf.keras.layers.Flatten()(dropout_embedded)
 
+cerebros_base_model = \
+    tf.keras.Model(
+        inputs=inp,
+        outputs=flattened)
 
 """### Cerebros search for the best model"""
 
@@ -234,11 +259,17 @@ def build_cerebros_base_model(max_seq_length=96, embedding_dim=256, output_dim=V
 p_lateral_connection = 0.39256
 num_lateral_connection_tries_per_unit = 10
 learning_rate = 0.0000511065
-epochs = 6  # [1, 100]
-batch_size = 13
-maximum_levels = 4  # [3,7]
-maximum_units_per_level = 8  # [2,10]
+epochs = 15  # [1, 100]
+batch_size = 20
+minimum_levels = 2
+maximum_levels = 4  # [3,7]
+
+minimum_units_per_level = 4
+maximum_units_per_level = 8
+
+minimum_neurons_per_unit = 1
 maximum_neurons_per_unit = 5  # [2,20]
+
 moities_to_try = 2
 tries_per_moity = 1
 
@@ -263,11 +294,11 @@ def build_cerebros_base_model(max_seq_length=96, embedding_dim=256, output_dim=V
     validation_split=0.35,
     direction='maximize',
     metric_to_rank_by="val_accuracy",
-    minimum_levels=2,
+    minimum_levels=minimum_levels,
     maximum_levels=maximum_levels,
-    minimum_units_per_level=1,
+    minimum_units_per_level=minimum_units_per_level,
     maximum_units_per_level=maximum_units_per_level,
-    minimum_neurons_per_unit=1,
+    minimum_neurons_per_unit=minimum_neurons_per_unit,
     maximum_neurons_per_unit=maximum_neurons_per_unit,
     activation=activation,
     final_activation='sigmoid',