From 30164c71dc542c1160ebfde09ac625078533e9f1 Mon Sep 17 00:00:00 2001
From: David Thrower
Date: Sat, 22 Mar 2025 14:16:42 -0400
Subject: [PATCH 001/100] Update automerge.yml

Temporarily disable the time-consuming workflows by commenting them out.
Comment out the BERT-based text classification workflow, possibly
permanently, as it is obsolete.
---
 .github/workflows/automerge.yml | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/.github/workflows/automerge.yml b/.github/workflows/automerge.yml
index 29b75cc..7e85568 100644
--- a/.github/workflows/automerge.yml
+++ b/.github/workflows/automerge.yml
@@ -46,12 +46,12 @@ jobs:
         run: python3 regression-example-ames-no-preproc.py
       - name: Test distributed random search Ames by running - Val set
         run: python3 regression-example-ames-no-preproc-val-set.py
-      - name: Test text classifier - random search - ham-spam
-        run: python3 text-class-ham-or-spam.py
-        timeout-minutes: 90
-      - name: Test image classifier - small subset of CIFAR10
-        timeout-minutes: 90
-        run: python3 cifar10-example.py
+      # - name: Test text classifier - random search - ham-spam
+      #   run: python3 text-class-ham-or-spam.py
+      #   timeout-minutes: 90
+      # - name: Test image classifier - small subset of CIFAR10 # add back
+      #   timeout-minutes: 90
+      #   run: python3 cifar10-example.py
       - name: Phishing email detection with GPT2 embedding
         timeout-minutes: 120
         run: python3 phishing_email_detection_gpt2.py

From 89049660678384c3039f5467471bfe0b97eec8d8 Mon Sep 17 00:00:00 2001
From: David Thrower
Date: Sat, 22 Mar 2025 14:17:59 -0400
Subject: [PATCH 002/100] Update automerge.yml

Add branch to workflow.
---
 .github/workflows/automerge.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/automerge.yml b/.github/workflows/automerge.yml
index 7e85568..82aba07 100644
--- a/.github/workflows/automerge.yml
+++ b/.github/workflows/automerge.yml
@@ -5,7 +5,7 @@ name: Python application

 on:
   push:
-    branches: [ "main", "148-tensorflow-upgrades" ]
+    branches: [ "main", "154-benchmark-inference-times---cerebros-model-vs-original-gpt-2" ]

 permissions:
   contents: read

From c7e8b3093577c99d712c70d29cea2bc70b5c8eeb Mon Sep 17 00:00:00 2001
From: David Thrower
Date: Sat, 22 Mar 2025 15:17:21 -0400
Subject: [PATCH 003/100] Update phishing_email_detection_gpt2.py

Added a baseline fine-tuning of the full GPT-2 model to compare against
the Cerebros text classifier.
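The baseline wires a sigmoid head onto a fully trainable GPT-2 encoder.
A minimal sketch of that idea, assuming only the keras_nlp presets this
script already imports (the two toy emails below are made up for
illustration):

    import tensorflow as tf
    from keras_nlp.models import GPT2Tokenizer, GPT2Preprocessor, GPT2Backbone

    tokenizer = GPT2Tokenizer.from_preset("gpt2_base_en")
    preprocessor = GPT2Preprocessor(tokenizer, sequence_length=96)
    encoder = GPT2Backbone.from_preset("gpt2_base_en")  # left fully trainable

    emails = tf.constant(["free crypto now!!!", "meeting moved to 10 am"])
    prep = preprocessor(emails)              # {"token_ids", "padding_mask"}
    hidden = encoder(prep)                   # (batch, seq, hidden)
    pooled = tf.reduce_mean(hidden, axis=1)  # mean-pool over the sequence
    prob = tf.keras.layers.Dense(1, activation="sigmoid")(pooled)

The full GPT2Layer wrapper and training loop this commit actually adds
appear in the diff below.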
---
 phishing_email_detection_gpt2.py | 260 +++++++++++++++++++------------
 1 file changed, 161 insertions(+), 99 deletions(-)

diff --git a/phishing_email_detection_gpt2.py b/phishing_email_detection_gpt2.py
index 9920ce9..d991b21 100644
--- a/phishing_email_detection_gpt2.py
+++ b/phishing_email_detection_gpt2.py
@@ -65,8 +65,15 @@
 #
 # Tensors for training data and labels
 #
-training_x = [tf.constant(X_train)]
-train_labels = [tf.constant(y_train)]
+
+# Training data for baseline model
+baseline_train_x = tf.constant(X_train)
+baseline_train_y = tf.constant(y_train)
+
+# Packaged for Cerebros (multimodal, takes inputs as a list)
+training_x = [baseline_train_x]
+train_labels = [baseline_train_y]
+
 #
 # Input and output shapes
 #
@@ -90,9 +97,9 @@ def __init__(self, max_seq_length, **kwargs):
         # Set whether the GPT2 model's layers are trainable
         #self.encoder.trainable = False
         for layer in self.encoder.layers:
-            layer.trainable = False
+            layer.trainable = True
         #
-        self.encoder.layers[-2].trainable = True
+        # self.encoder.layers[-2].trainable = True
         #
         # Set the maximum sequence length for tokenization
         self.max_seq_length = max_seq_length
@@ -121,101 +128,156 @@ def from_config(cls, config):
 # GPT2 configurables
 max_seq_length = 96

-# Base model
+# GPT Baseline Model
 input_layer = Input(shape=(), dtype=tf.string)
 gpt2_layer = GPT2Layer(max_seq_length)(input_layer)
 #output = Flatten()(gpt2_layer)
-base_model = Model(inputs=input_layer, outputs=gpt2_layer)
-base_model.summary()
-
-"""### Cerebros search for the best model"""
-
-#
-# Cerebros configurables
-#
-activation = 'gelu'
-predecessor_level_connection_affinity_factor_first = 49.9999
-predecessor_level_connection_affinity_factor_main = 0.31456
-max_consecutive_lateral_connections = 22
-p_lateral_connection = 0.39256
-num_lateral_connection_tries_per_unit = 10
-learning_rate = 0.0000511065
-epochs = 6 # [1, 100]
-batch_size = 13
-maximum_levels = 4 # [3,7]
-maximum_units_per_level = 8 # [2,10]
-maximum_neurons_per_unit = 5 # [2,20]
-
-#
-# Logging
-#
-TIME = pendulum.now(tz='America/New_York').__str__()[:16]\
-    .replace('T', '_')\
-    .replace(':', '_')\
-    .replace('-', '_')
-PROJECT_NAME = f'{TIME}_cerebros_auto_ml_phishing_email_test'
-
-meta_trial_number = 42 # irrelevant unless in distributed training
-
-cerebros_automl = SimpleCerebrosRandomSearch(
-    unit_type=DenseUnit,
-    input_shapes=INPUT_SHAPES,
-    output_shapes=OUTPUT_SHAPES,
-    training_data=training_x,
-    labels=train_labels,
-    validation_split=0.35,
-    direction='maximize',
-    metric_to_rank_by="val_binary_accuracy",
-    minimum_levels=2,
-    maximum_levels=maximum_levels,
-    minimum_units_per_level=1,
-    maximum_units_per_level=maximum_units_per_level,
-    minimum_neurons_per_unit=1,
-    maximum_neurons_per_unit=maximum_neurons_per_unit,
-    activation=activation,
-    final_activation='sigmoid',
-    number_of_architecture_moities_to_try=2,
-    number_of_tries_per_architecture_moity=1,
-    minimum_skip_connection_depth=1,
-    maximum_skip_connection_depth=7,
-    predecessor_level_connection_affinity_factor_first=predecessor_level_connection_affinity_factor_first,
-    predecessor_level_connection_affinity_factor_first_rounding_rule='ceil',
-    predecessor_level_connection_affinity_factor_main=predecessor_level_connection_affinity_factor_main,
-    predecessor_level_connection_affinity_factor_main_rounding_rule='ceil',
-    predecessor_level_connection_affinity_factor_decay_main=zero_7_exp_decay,
-    seed=8675309,
-    max_consecutive_lateral_connections=max_consecutive_lateral_connections,
-    gate_after_n_lateral_connections=3,
-    gate_activation_function=simple_sigmoid,
-    p_lateral_connection=p_lateral_connection,
-    p_lateral_connection_decay=zero_95_exp_decay,
-    num_lateral_connection_tries_per_unit=num_lateral_connection_tries_per_unit,
-    learning_rate=learning_rate,
-    loss=tf.keras.losses.CategoricalHinge(),
-    metrics=[tf.keras.metrics.BinaryAccuracy(),
-             tf.keras.metrics.Precision(),
-             tf.keras.metrics.Recall()],
-    epochs=epochs,
-    project_name=f"{PROJECT_NAME}_meta_{meta_trial_number}",
-    model_graphs='model_graphs',
-    batch_size=batch_size,
-    meta_trial_number=meta_trial_number,
-    base_models=[base_model],
-    train_data_dtype=tf.string)
-
-result = cerebros_automl.run_random_search()
-
-print(f'Best accuracy achieved is {result}')
-print(f'binary accuracy')
-
-"""### Testing the best model found"""
-
-#
-# Load the best model (taking into account that it has a custom layer)
-#
-best_model_found =\
-tf.keras.models.load_model(cerebros_automl.best_model_path,\
-custom_objects={'GPT2Layer': GPT2Layer(max_seq_length)})
-
-print('Evaluating on the test dataset')
-best_model_found.evaluate(X_test, y_test)
+binary_output = tf.keras.layers.Dense(1, activation='sigmoid')(gpt2_layer)
+gpt_baseline_model = Model(inputs=input_layer, outputs=binary_output)
+
+gpt_baseline_model.compile(
+    optimizer=Adam(learning_rate=1e-4),  # Small LR since we're fine-tuning GPT
+    loss='binary_crossentropy',
+    metrics=['accuracy', tf.keras.metrics.AUC(name='auc')]
+)
+
+history = gpt_baseline_model.fit(
+    x=X_train,             # Input data
+    y=y_train,             # Labels
+    epochs=20,             # Number of training iterations
+    batch_size=16,         # Batch size small due to GPU memory constraints
+    validation_split=0.2,  # Hold out 20% of training data for validation
+    shuffle=True,          # Shuffle data at each epoch
+    callbacks=[
+        tf.keras.callbacks.EarlyStopping(
+            monitor='val_loss',
+            patience=3,
+            restore_best_weights=True,
+            min_delta=0.001
+        ),
+        tf.keras.callbacks.ReduceLROnPlateau(
+            monitor='val_loss',
+            factor=0.2,
+            patience=2,
+            min_lr=1e-6
+        )
+    ]
+)
+
+hy = history["history"]
+hy_df = pd.DataFrame(hy)
+print(hy_df)
+
+
+
+
+
+
+
+# base_model = Model(inputs=input_layer, outputs=gpt2_layer)
+# base_model.summary()
+
+
+
+
+
+# """### Cerebros search for the best model"""
+
+# #
+# # Cerebros configurables
+# #
+# activation = 'gelu'
+# predecessor_level_connection_affinity_factor_first = 49.9999
+# predecessor_level_connection_affinity_factor_main = 0.31456
+# max_consecutive_lateral_connections = 22
+# p_lateral_connection = 0.39256
+# num_lateral_connection_tries_per_unit = 10
+# learning_rate = 0.0000511065
+# epochs = 6 # [1, 100]
+# batch_size = 13
+# maximum_levels = 4 # [3,7]
+# maximum_units_per_level = 8 # [2,10]
+# maximum_neurons_per_unit = 5 # [2,20]

+# #
+# # Logging
+# #
+# TIME = pendulum.now(tz='America/New_York').__str__()[:16]\
+#     .replace('T', '_')\
+#     .replace(':', '_')\
+#     .replace('-', '_')
+# PROJECT_NAME = f'{TIME}_cerebros_auto_ml_phishing_email_test'

+# meta_trial_number = 42 # irrelevant unless in distributed training

+# cerebros_automl = SimpleCerebrosRandomSearch(
+#     unit_type=DenseUnit,
+#     input_shapes=INPUT_SHAPES,
+#     output_shapes=OUTPUT_SHAPES,
+#     training_data=training_x,
+#     labels=train_labels,
+#     validation_split=0.35,
+#     direction='maximize',
+#     metric_to_rank_by="val_binary_accuracy",
+#     minimum_levels=2,
+#     maximum_levels=maximum_levels,
+#     minimum_units_per_level=1,
+#     maximum_units_per_level=maximum_units_per_level,
+#     minimum_neurons_per_unit=1,
+#     maximum_neurons_per_unit=maximum_neurons_per_unit,
+#     activation=activation,
+#     final_activation='sigmoid',
+#     number_of_architecture_moities_to_try=2,
+#     number_of_tries_per_architecture_moity=1,
+#     minimum_skip_connection_depth=1,
+#     maximum_skip_connection_depth=7,
+#     predecessor_level_connection_affinity_factor_first=predecessor_level_connection_affinity_factor_first,
+#     predecessor_level_connection_affinity_factor_first_rounding_rule='ceil',
+#     predecessor_level_connection_affinity_factor_main=predecessor_level_connection_affinity_factor_main,
+#     predecessor_level_connection_affinity_factor_main_rounding_rule='ceil',
+#     predecessor_level_connection_affinity_factor_decay_main=zero_7_exp_decay,
+#     seed=8675309,
+#     max_consecutive_lateral_connections=max_consecutive_lateral_connections,
+#     gate_after_n_lateral_connections=3,
+#     gate_activation_function=simple_sigmoid,
+#     p_lateral_connection=p_lateral_connection,
+#     p_lateral_connection_decay=zero_95_exp_decay,
+#     num_lateral_connection_tries_per_unit=num_lateral_connection_tries_per_unit,
+#     learning_rate=learning_rate,
+#     loss=tf.keras.losses.CategoricalHinge(),
+#     metrics=[tf.keras.metrics.BinaryAccuracy(),
+#              tf.keras.metrics.Precision(),
+#              tf.keras.metrics.Recall()],
+#     epochs=epochs,
+#     project_name=f"{PROJECT_NAME}_meta_{meta_trial_number}",
+#     model_graphs='model_graphs',
+#     batch_size=batch_size,
+#     meta_trial_number=meta_trial_number,
+#     base_models=[base_model],
+#     train_data_dtype=tf.string)

+# result = cerebros_automl.run_random_search()

+# print(f'Best accuracy achieved is {result}')
+# print(f'binary accuracy')

+# """### Testing the best model found"""

+# #
+# # Load the best model (taking into account that it has a custom layer)
+# #
+# best_model_found =\
+# tf.keras.models.load_model(cerebros_automl.best_model_path,\
+# custom_objects={'GPT2Layer': GPT2Layer(max_seq_length)})

+# print('Evaluating on the test dataset')
+# best_model_found.evaluate(X_test, y_test)

From b790e64cbe54dda9d16e30dd5d0f35fb2180243e Mon Sep 17 00:00:00 2001
From: David Thrower
Date: Sat, 22 Mar 2025 17:03:18 -0400
Subject: [PATCH 004/100] Update phishing_email_detection_gpt2.py
---
 phishing_email_detection_gpt2.py | 223 ++++++++++++++++++-------------
 1 file changed, 132 insertions(+), 91 deletions(-)

diff --git a/phishing_email_detection_gpt2.py b/phishing_email_detection_gpt2.py
index d991b21..2106461 100644
--- a/phishing_email_detection_gpt2.py
+++ b/phishing_email_detection_gpt2.py
@@ -29,6 +29,8 @@
 from cerebros.denseautomlstructuralcomponent.dense_automl_structural_component\
     import zero_7_exp_decay, zero_95_exp_decay, simple_sigmoid
 from ast import literal_eval
+import time
+
 #
 # Load the email data
@@ -141,10 +143,14 @@ def from_config(cls, config):
     metrics=['accuracy', tf.keras.metrics.AUC(name='auc')]
 )

+gpt_t0 = time.time()
+
+print(gpt_baseline_model.summary())
+
 history = gpt_baseline_model.fit(
     x=X_train,             # Input data
     y=y_train,             # Labels
-    epochs=20,             # Number of training iterations
+    epochs=4,              # Number of training iterations
     batch_size=16,         # Batch size small due to GPU memory constraints
     validation_split=0.2,  # Hold out 20% of training data for validation
     shuffle=True,          # Shuffle data at each epoch
@@ -164,111 +170,146 @@ def from_config(cls, config):
     ]
 )

+gpt_t1 = time.time()
+gpt_time_on_one_model_min = (gpt_t1 - gpt_t1) / 60
+
 hy = history["history"]
 hy_df = pd.DataFrame(hy)
 print(hy_df)

+### Cerebros model:
+
+# TokenizerLayer class to handle tokenization and return only token_ids
+class TokenizerLayer(tf.keras.layers.Layer):
+    def __init__(self, max_seq_length, **kwargs):
+        super().__init__(**kwargs)
+        self.tokenizer = GPT2Tokenizer.from_preset("gpt2_base_en")
+        self.preprocessor = GPT2Preprocessor(self.tokenizer, sequence_length=max_seq_length)
+        self.max_seq_length = max_seq_length
+
+    def call(self, inputs):
+        processed = self.preprocessor(inputs)  # Accepts tensor of strings, outputs {"token_ids": ...}
+        return processed["token_ids"]  # Output shape: (batch_size, max_seq_length)
+
+    def get_config(self):
+        base_config = super().get_config()
+        base_config.update({"max_seq_length": self.max_seq_length})
+        return base_config
+
+VOCAB_SIZE = GPT2Tokenizer.vocabulary_size()
+
+# Create cerebros_base_model
+def build_cerebros_base_model(max_seq_length=96, embedding_dim=256, output_dim=VOCAB_SIZE):
+    input_layer = Input(shape=(), dtype=tf.string)  # Text input
+    token_ids = TokenizerLayer(max_seq_length)(input_layer)
+
+    # Build embedding layer with GPT2 tokenizer's vocabulary size (50257 for GPT2Base)
+    embedded = tf.keras.layers.Embedding(
+        input_dim=GPT2Tokenizer.vocabulary_size(),  # Uses standard GPT-2 vocab size
+        output_dim=embedding_dim,
+        mask_zero=True,  # Handle tokens
+        name="custom_embedding"
+    )(token_ids)
+
+    # Flatten for downstream models
+    flattened = Flatten()(embedded)
+    model = Model(inputs=input_layer, outputs=flattened)
+    return model
+
+# Example usage (outputs depend on parameters, set embedding_dim as desired)
+cerebros_base_model = build_cerebros_base_model(max_seq_length=96)
+
+"""### Cerebros search for the best model"""
+
+#
+# Cerebros configurables
+#
+activation = 'gelu'
+predecessor_level_connection_affinity_factor_first = 49.9999
+predecessor_level_connection_affinity_factor_main = 0.31456
+max_consecutive_lateral_connections = 22
+p_lateral_connection = 0.39256
+num_lateral_connection_tries_per_unit = 10
+learning_rate = 0.0000511065
+epochs = 6 # [1, 100]
+batch_size = 13
+maximum_levels = 4 # [3,7]
+maximum_units_per_level = 8 # [2,10]
+maximum_neurons_per_unit = 5 # [2,20]
+moities_to_try = 2
+tries_per_moity = 1
-
-
-# base_model = Model(inputs=input_layer, outputs=gpt2_layer)
-# base_model.summary()
-
-
-
-
-# """### Cerebros search for the best model"""
-
-# #
-# # Cerebros configurables
-# #
-# activation = 'gelu'
-# predecessor_level_connection_affinity_factor_first = 49.9999
-# predecessor_level_connection_affinity_factor_main = 0.31456
-# max_consecutive_lateral_connections = 22
-# p_lateral_connection = 0.39256
-# num_lateral_connection_tries_per_unit = 10
-# learning_rate = 0.0000511065
-# epochs = 6 # [1, 100]
-# batch_size = 13
-# maximum_levels = 4 # [3,7]
-# maximum_units_per_level = 8 # [2,10]
-# maximum_neurons_per_unit = 5 # [2,20]

-# #
-# # Logging
-# #
-# TIME = pendulum.now(tz='America/New_York').__str__()[:16]\
-#     .replace('T', '_')\
-#     .replace(':', '_')\
-#     .replace('-', '_')
-# PROJECT_NAME = f'{TIME}_cerebros_auto_ml_phishing_email_test'

-# meta_trial_number = 42 # irrelevant unless in distributed training

-# cerebros_automl = SimpleCerebrosRandomSearch(
-#     unit_type=DenseUnit,
-#     input_shapes=INPUT_SHAPES,
-#     output_shapes=OUTPUT_SHAPES,
-#     training_data=training_x,
-#     labels=train_labels,
-#     validation_split=0.35,
-#     direction='maximize',
-#     metric_to_rank_by="val_binary_accuracy",
-#     minimum_levels=2,
-#     maximum_levels=maximum_levels,
-#     minimum_units_per_level=1,
-#     maximum_units_per_level=maximum_units_per_level,
-#     minimum_neurons_per_unit=1,
-#     maximum_neurons_per_unit=maximum_neurons_per_unit,
-#     activation=activation,
-#     final_activation='sigmoid',
-#     number_of_architecture_moities_to_try=2,
-#     number_of_tries_per_architecture_moity=1,
-#     minimum_skip_connection_depth=1,
-#     maximum_skip_connection_depth=7,
-#     predecessor_level_connection_affinity_factor_first=predecessor_level_connection_affinity_factor_first,
-#     predecessor_level_connection_affinity_factor_first_rounding_rule='ceil',
-#     predecessor_level_connection_affinity_factor_main=predecessor_level_connection_affinity_factor_main,
-#     predecessor_level_connection_affinity_factor_main_rounding_rule='ceil',
-#     predecessor_level_connection_affinity_factor_decay_main=zero_7_exp_decay,
-#     seed=8675309,
-#     max_consecutive_lateral_connections=max_consecutive_lateral_connections,
-#     gate_after_n_lateral_connections=3,
-#     gate_activation_function=simple_sigmoid,
-#     p_lateral_connection=p_lateral_connection,
-#     p_lateral_connection_decay=zero_95_exp_decay,
-#     num_lateral_connection_tries_per_unit=num_lateral_connection_tries_per_unit,
-#     learning_rate=learning_rate,
-#     loss=tf.keras.losses.CategoricalHinge(),
-#     metrics=[tf.keras.metrics.BinaryAccuracy(),
-#              tf.keras.metrics.Precision(),
-#              tf.keras.metrics.Recall()],
-#     epochs=epochs,
-#     project_name=f"{PROJECT_NAME}_meta_{meta_trial_number}",
-#     model_graphs='model_graphs',
-#     batch_size=batch_size,
-#     meta_trial_number=meta_trial_number,
-#     base_models=[base_model],
-#     train_data_dtype=tf.string)

-# result = cerebros_automl.run_random_search()

-# print(f'Best accuracy achieved is {result}')
-# print(f'binary accuracy')

-# """### Testing the best model found"""

-# #
-# # Load the best model (taking into account that it has a custom layer)
-# #
-# best_model_found =\
-# tf.keras.models.load_model(cerebros_automl.best_model_path,\
-# custom_objects={'GPT2Layer': GPT2Layer(max_seq_length)})

-# print('Evaluating on the test dataset')
-# best_model_found.evaluate(X_test, y_test)
+#
+# Logging
+#
+TIME = pendulum.now(tz='America/New_York').__str__()[:16]\
+    .replace('T', '_')\
+    .replace(':', '_')\
+    .replace('-', '_')
+PROJECT_NAME = f'{TIME}_cerebros_auto_ml_phishing_email_test'
+
+meta_trial_number = 42 # irrelevant unless in distributed training
+
+
+cerebros_automl = SimpleCerebrosRandomSearch(
+    unit_type=DenseUnit,
+    input_shapes=INPUT_SHAPES,
+    output_shapes=OUTPUT_SHAPES,
+    training_data=training_x,
+    labels=train_labels,
+    validation_split=0.35,
+    direction='maximize',
+    metric_to_rank_by="val_accuracy",
+    minimum_levels=2,
+    maximum_levels=maximum_levels,
+    minimum_units_per_level=1,
+    maximum_units_per_level=maximum_units_per_level,
+    minimum_neurons_per_unit=1,
+    maximum_neurons_per_unit=maximum_neurons_per_unit,
+    activation=activation,
+    final_activation='sigmoid',
+    number_of_architecture_moities_to_try=moities_to_try,
+    number_of_tries_per_architecture_moity=tries_per_moity,
+    minimum_skip_connection_depth=1,
+    maximum_skip_connection_depth=7,
+    predecessor_level_connection_affinity_factor_first=predecessor_level_connection_affinity_factor_first,
+    predecessor_level_connection_affinity_factor_first_rounding_rule='ceil',
+    predecessor_level_connection_affinity_factor_main=predecessor_level_connection_affinity_factor_main,
+    predecessor_level_connection_affinity_factor_main_rounding_rule='ceil',
+    predecessor_level_connection_affinity_factor_decay_main=zero_7_exp_decay,
+    seed=8675309,
+    max_consecutive_lateral_connections=max_consecutive_lateral_connections,
+    gate_after_n_lateral_connections=3,
+    gate_activation_function=simple_sigmoid,
+    p_lateral_connection=p_lateral_connection,
+    p_lateral_connection_decay=zero_95_exp_decay,
+    num_lateral_connection_tries_per_unit=num_lateral_connection_tries_per_unit,
+    learning_rate=learning_rate,
+    loss=tf.keras.losses.CategoricalHinge(),
+    metrics=[tf.keras.metrics.Accuracy(),
+             tf.keras.metrics.Precision(),
+             tf.keras.metrics.Recall()],
+    epochs=epochs,
+    project_name=f"{PROJECT_NAME}_meta_{meta_trial_number}",
+    model_graphs='model_graphs',
+    batch_size=batch_size,
+    meta_trial_number=meta_trial_number,
+    base_models=[cerebros_base_model],
+    train_data_dtype=tf.string)
+
+cerebros_t0 = time.time()
+result = cerebros_automl.run_random_search()
+cerebros_t1 = time.time()
+cerebros_time_all_models_min = (cerebros_t1 - cerebros_t0) / 60
+cerebros_time_per_model = cerebros_time_all_models_min / (moities_to_try * tries_per_moity)
+
+print(f"Cerebros trained 2 models FROM A COLD START in ONLY {cerebros_time_all_models_min} min. Cerebros took only {cerebros_time_per_model} minutes on average per model.")
+print(f"GPT2 took {gpt_time_on_one_model_min} just to FINE TUNE one PRE - TRAINED model. Although this is a small scale test, this shows the advantage of scaling in ON timing VS ON**2 timing.")
+
+
+print(f'Cerebros best accuracy achieved is {result}')
+print(f'val set accuracy')

 # """### Testing the best model found"""

From 15ec9c2c6444648610a68ffa9ebcf9e0e872c505 Mon Sep 17 00:00:00 2001
From: David Thrower
Date: Sat, 22 Mar 2025 17:09:42 -0400
Subject: [PATCH 005/100] Update phishing_email_detection_gpt2.py

Forgot to add dropout.
---
 phishing_email_detection_gpt2.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/phishing_email_detection_gpt2.py b/phishing_email_detection_gpt2.py
index 2106461..05f2af5 100644
--- a/phishing_email_detection_gpt2.py
+++ b/phishing_email_detection_gpt2.py
@@ -213,9 +213,11 @@ def build_cerebros_base_model(max_seq_length=96, embedding_dim=256, output_dim=V

     # Flatten for downstream models
     flattened = Flatten()(embedded)
-    model = Model(inputs=input_layer, outputs=flattened)
+    dropout = tf.keras.layers.Dropout(.6)(flattened)
+    model = Model(inputs=input_layer, outputs=dropout)
     return model

+
 # Example usage (outputs depend on parameters, set embedding_dim as desired)
 cerebros_base_model = build_cerebros_base_model(max_seq_length=96)

From 0cfb4889c1a9dadbb28c5bbfeac2aa211ce56ef4 Mon Sep 17 00:00:00 2001
From: David Thrower
Date: Sat, 22 Mar 2025 17:40:05 -0400
Subject: [PATCH 006/100] Update phishing_email_detection_gpt2.py

Amendments to Cerebros model.
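The amendment keeps only the GPT-2 tokenizer and trains a small embedding
from scratch instead of running the full backbone. A rough sketch of that
mechanism, with a made-up 16-unit width and a toy input string:

    import tensorflow as tf
    from keras_nlp.models import GPT2Tokenizer, GPT2Preprocessor

    tokenizer = GPT2Tokenizer.from_preset("gpt2_base_en")
    preprocessor = GPT2Preprocessor(tokenizer, sequence_length=8)

    prep = preprocessor(["hello world"])       # dict with token ids and mask
    token_ids = prep["token_ids"]              # shape (1, 8), integer dtype
    embedded = tf.keras.layers.Embedding(
        input_dim=tokenizer.vocabulary_size(), # 50257 for GPT-2
        output_dim=16,                         # small, trainable from scratch
        mask_zero=True)(token_ids)             # shape (1, 8, 16)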
---
 phishing_email_detection_gpt2.py | 101 ++++++++++++++++++++-----------
 1 file changed, 66 insertions(+), 35 deletions(-)

diff --git a/phishing_email_detection_gpt2.py b/phishing_email_detection_gpt2.py
index 05f2af5..71d34ee 100644
--- a/phishing_email_detection_gpt2.py
+++ b/phishing_email_detection_gpt2.py
@@ -181,46 +181,71 @@ def from_config(cls, config):

 # TokenizerLayer class to handle tokenization and return only token_ids
 class TokenizerLayer(tf.keras.layers.Layer):
+
     def __init__(self, max_seq_length, **kwargs):
-        super().__init__(**kwargs)
-        self.tokenizer = GPT2Tokenizer.from_preset("gpt2_base_en")
-        self.preprocessor = GPT2Preprocessor(self.tokenizer, sequence_length=max_seq_length)
+        #
+        super(GPT2Layer, self).__init__(**kwargs)
+        #
+        # Load the GPT2 tokenizer, preprocessor and model
+        self.tokenizer = GPT2Tokenizer.from_preset("gpt2_extra_large_en") # "gpt2_base_en"
+        self.preprocessor = GPT2Preprocessor(self.tokenizer,
+                                             sequence_length=max_seq_length)
+        # self.encoder = GPT2Backbone.from_preset("gpt2_base_en")
+        #
+        # Set whether the GPT2 model's layers are trainable
+        # self.encoder.trainable = False
+        # for layer in self.encoder.layers:
+        #     layer.trainable = False
+        #
+        # self.encoder.layers[-2].trainable = True
+        #
+        # Set the maximum sequence length for tokenization
         self.max_seq_length = max_seq_length

     def call(self, inputs):
-        processed = self.preprocessor(inputs)  # Accepts tensor of strings, outputs {"token_ids": ...}
-        return processed["token_ids"]  # Output shape: (batch_size, max_seq_length)
+        #
+        # Output the GPT2 embedding
+        prep = self.preprocessor([inputs])
+        # embedding = self.encoder(prep)
+        # avg_pool = tf.reduce_mean(embedding, axis=1)
+        #
+        return prep['token_ids']

     def get_config(self):
-        base_config = super().get_config()
-        base_config.update({"max_seq_length": self.max_seq_length})
-        return base_config
+        #
+        config = super(GPT2Layer, self).get_config()
+        config.update({'max_seq_length': self.max_seq_length})
+        #
+        return config
+
+    @classmethod
+    def from_config(cls, config):
+        #
+        return cls(max_seq_length=config['max_seq_length'])

+# GPT2 configurables
-VOCAB_SIZE = GPT2Tokenizer.vocabulary_size()
+max_seq_length = 900

-# Create cerebros_base_model
-def build_cerebros_base_model(max_seq_length=96, embedding_dim=256, output_dim=VOCAB_SIZE):
-    input_layer = Input(shape=(), dtype=tf.string)  # Text input
-    token_ids = TokenizerLayer(max_seq_length)(input_layer)
-    # Build embedding layer with GPT2 tokenizer's vocabulary size (50257 for GPT2Base)
-    embedded = tf.keras.layers.Embedding(
-        input_dim=GPT2Tokenizer.vocabulary_size(),  # Uses standard GPT-2 vocab size
-        output_dim=embedding_dim,
-        mask_zero=True,  # Handle tokens
-        name="custom_embedding"
-    )(token_ids)
-
-    # Flatten for downstream models
-    flattened = Flatten()(embedded)
-    dropout = tf.keras.layers.Dropout(.6)(flattened)
-    model = Model(inputs=input_layer, outputs=dropout)
-    return model
+inp = tf.keras.layers.Input(shape=(), dtype=tf.string)
+gp2_tokenizer = TokenizerLayer(max_seq_length=max_seq_length)
+VOCABULARY_SIZE = gp2_tokenizer.tokenizer.vocabulary_size()
+tokens = gp2_tokenizer(inp)

-# Example usage (outputs depend on parameters, set embedding_dim as desired)
-cerebros_base_model = build_cerebros_base_model(max_seq_length=96)
+embedded =\
+    tf.keras.layers.Embedding(
+        input_dim=VOCABULARY_SIZE,
+        output_dim=15,
+        input_length=max_seq_length,
+        mask_zero=True)(tokens)
+dropout_embedded = tf.keras.layers.Dropout(0.6)(embedded)
+flattened = tf.keras.layers.Flatten()(dropout_embedded)
+cerebros_base_model =\
+    tf.keras.Model(
+        inputs=inp,
+        outputs=flattened)

 """### Cerebros search for the best model"""
@@ -234,11 +259,17 @@
 p_lateral_connection = 0.39256
 num_lateral_connection_tries_per_unit = 10
 learning_rate = 0.0000511065
-epochs = 6 # [1, 100]
-batch_size = 13
-maximum_levels = 4 # [3,7]
-maximum_units_per_level = 8 # [2,10]
+epochs = 15 # [1, 100]
+batch_size = 20
+minimum_levels = 2
+maximum_levels = 4 # [3,7]
+
+minimum_units_per_level = 4
+maximum_units_per_level = 8
+
+minimum_neurons_per_unit = 1
 maximum_neurons_per_unit = 5 # [2,20]
+
 moities_to_try = 2
 tries_per_moity = 1
@@ -263,11 +294,11 @@
     validation_split=0.35,
     direction='maximize',
     metric_to_rank_by="val_accuracy",
-    minimum_levels=2,
+    minimum_levels=minimum_levels,
     maximum_levels=maximum_levels,
-    minimum_units_per_level=1,
+    minimum_units_per_level=minimum_units_per_level,
     maximum_units_per_level=maximum_units_per_level,
-    minimum_neurons_per_unit=1,
+    minimum_neurons_per_unit=minimum_neurons_per_unit,
     maximum_neurons_per_unit=maximum_neurons_per_unit,
     activation=activation,
     final_activation='sigmoid',

From 6f8695908a10a7b587ad2ed7bfc3e1f28776316a Mon Sep 17 00:00:00 2001
From: David Thrower
Date: Sat, 22 Mar 2025 17:43:56 -0400
Subject: [PATCH 007/100] Update phishing_email_detection_gpt2.py

Reduce seq length to accelerate job completion.
---
 phishing_email_detection_gpt2.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/phishing_email_detection_gpt2.py b/phishing_email_detection_gpt2.py
index 71d34ee..d5700b2 100644
--- a/phishing_email_detection_gpt2.py
+++ b/phishing_email_detection_gpt2.py
@@ -225,7 +225,9 @@ def from_config(cls, config):

 # GPT2 configurables
+# Optimal for accuracy:
 max_seq_length = 900
+max_seq_length = 250

 inp = tf.keras.layers.Input(shape=(), dtype=tf.string)
 gp2_tokenizer = TokenizerLayer(max_seq_length=max_seq_length)

From 830a2dcbf513f03f32027a39303a60b9392b4727 Mon Sep 17 00:00:00 2001
From: David Thrower
Date: Sat, 22 Mar 2025 17:44:43 -0400
Subject: [PATCH 008/100] Update automerge.yml

Up timeout to 300 min.
---
 .github/workflows/automerge.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/automerge.yml b/.github/workflows/automerge.yml
index 82aba07..9490c3e 100644
--- a/.github/workflows/automerge.yml
+++ b/.github/workflows/automerge.yml
@@ -53,5 +53,5 @@ jobs:
       #   timeout-minutes: 90
       #   run: python3 cifar10-example.py
       - name: Phishing email detection with GPT2 embedding
-        timeout-minutes: 120
+        timeout-minutes: 300
         run: python3 phishing_email_detection_gpt2.py

From 407f90cb8759eaeb745fb7362c490c23c8dbe954 Mon Sep 17 00:00:00 2001
From: David Thrower
Date: Sat, 22 Mar 2025 18:54:18 -0400
Subject: [PATCH 009/100] Update phishing_email_detection_gpt2.py

Correct history indexing error.
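Keras returns a History callback object from fit(); the per-epoch metrics
live on its .history attribute (a plain dict), not under a "history" key,
so the old history["history"] raised a TypeError. A minimal illustration
with a tiny made-up model, just to show the attribute:

    import tensorflow as tf
    import pandas as pd

    model = tf.keras.Sequential([tf.keras.layers.Dense(1, input_shape=(4,))])
    model.compile(optimizer="adam", loss="mse")
    history = model.fit(tf.zeros((8, 4)), tf.zeros((8, 1)), epochs=2, verbose=0)

    # history is a History object; history.history is the metrics dict
    print(pd.DataFrame(history.history))  # columns like 'loss', one row per epoch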
---
 phishing_email_detection_gpt2.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/phishing_email_detection_gpt2.py b/phishing_email_detection_gpt2.py
index d5700b2..125fd4d 100644
--- a/phishing_email_detection_gpt2.py
+++ b/phishing_email_detection_gpt2.py
@@ -173,8 +173,7 @@ def from_config(cls, config):
 gpt_t1 = time.time()
 gpt_time_on_one_model_min = (gpt_t1 - gpt_t1) / 60

-hy = history["history"]
-hy_df = pd.DataFrame(hy)
+hy_df = pd.DataFrame(history.history)
 print(hy_df)

 ### Cerebros model:

From d5bdbce83a96cf7345a0b950d95cc29de7b07920 Mon Sep 17 00:00:00 2001
From: David Thrower
Date: Sat, 22 Mar 2025 18:57:52 -0400
Subject: [PATCH 010/100] Update phishing_email_detection_gpt2.py

Temporary test to fast forward to cerebros model.
---
 phishing_email_detection_gpt2.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/phishing_email_detection_gpt2.py b/phishing_email_detection_gpt2.py
index 125fd4d..c721d34 100644
--- a/phishing_email_detection_gpt2.py
+++ b/phishing_email_detection_gpt2.py
@@ -137,6 +137,8 @@ def from_config(cls, config):
 binary_output = tf.keras.layers.Dense(1, activation='sigmoid')(gpt2_layer)
 gpt_baseline_model = Model(inputs=input_layer, outputs=binary_output)

+## Un - string out this
+"""
 gpt_baseline_model.compile(
     optimizer=Adam(learning_rate=1e-4),  # Small LR since we're fine-tuning GPT
     loss='binary_crossentropy',
     metrics=['accuracy', tf.keras.metrics.AUC(name='auc')]
 )
@@ -176,6 +178,8 @@ def from_config(cls, config):
 hy_df = pd.DataFrame(history.history)
 print(hy_df)

+"""
+
 ### Cerebros model:

From d8db0f1b9160ff73caef2f385def3f7a7c164b3a Mon Sep 17 00:00:00 2001
From: David Thrower
Date: Sat, 22 Mar 2025 19:01:16 -0400
Subject: [PATCH 011/100] Update phishing_email_detection_gpt2.py

Comment out an artifact of the GPT test so this can lint and run.
---
 phishing_email_detection_gpt2.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/phishing_email_detection_gpt2.py b/phishing_email_detection_gpt2.py
index c721d34..c00dfa4 100644
--- a/phishing_email_detection_gpt2.py
+++ b/phishing_email_detection_gpt2.py
@@ -343,7 +343,8 @@ def from_config(cls, config):
 cerebros_time_per_model = cerebros_time_all_models_min / (moities_to_try * tries_per_moity)

 print(f"Cerebros trained 2 models FROM A COLD START in ONLY {cerebros_time_all_models_min} min. Cerebros took only {cerebros_time_per_model} minutes on average per model.")
-print(f"GPT2 took {gpt_time_on_one_model_min} just to FINE TUNE one PRE - TRAINED model. Although this is a small scale test, this shows the advantage of scaling in ON timing VS ON**2 timing.")
+# Un-comment this !!!!!
+# print(f"GPT2 took {gpt_time_on_one_model_min} just to FINE TUNE one PRE - TRAINED model. Although this is a small scale test, this shows the advantage of scaling in ON timing VS ON**2 timing.")

 print(f'Cerebros best accuracy achieved is {result}')

From 014b3c3a48e369395404f64bcbd497e4bdb48b1d Mon Sep 17 00:00:00 2001
From: David Thrower
Date: Sat, 22 Mar 2025 19:16:06 -0400
Subject: [PATCH 012/100] Update phishing_email_detection_gpt2.py

Fix errors from trying to work too fast ...
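The errors being fixed are leftover super(GPT2Layer, self) calls that were
copy-pasted into TokenizerLayer. The pattern the fix settles on, shown
generically with a hypothetical stand-in layer (body trimmed to the
serialization plumbing only):

    import tensorflow as tf

    class ExampleLayer(tf.keras.layers.Layer):  # hypothetical stand-in name
        def __init__(self, max_seq_length, **kwargs):
            # The class named in super() must match the class being defined
            super(ExampleLayer, self).__init__(**kwargs)
            self.max_seq_length = max_seq_length

        def get_config(self):
            config = super(ExampleLayer, self).get_config()
            config.update({'max_seq_length': self.max_seq_length})
            return config

        @classmethod
        def from_config(cls, config):
            return cls(max_seq_length=config['max_seq_length'])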
---
 phishing_email_detection_gpt2.py | 30 ++++----------------------------
 1 file changed, 4 insertions(+), 26 deletions(-)

diff --git a/phishing_email_detection_gpt2.py b/phishing_email_detection_gpt2.py
index c00dfa4..787a8b2 100644
--- a/phishing_email_detection_gpt2.py
+++ b/phishing_email_detection_gpt2.py
@@ -186,44 +186,22 @@ class TokenizerLayer(tf.keras.layers.Layer):

     def __init__(self, max_seq_length, **kwargs):
-        #
-        super(GPT2Layer, self).__init__(**kwargs)
-        #
-        # Load the GPT2 tokenizer, preprocessor and model
-        self.tokenizer = GPT2Tokenizer.from_preset("gpt2_extra_large_en") # "gpt2_base_en"
-        self.preprocessor = GPT2Preprocessor(self.tokenizer,
-                                             sequence_length=max_seq_length)
-        # self.encoder = GPT2Backbone.from_preset("gpt2_base_en")
-        #
-        # Set whether the GPT2 model's layers are trainable
-        # self.encoder.trainable = False
-        # for layer in self.encoder.layers:
-        #     layer.trainable = False
-        #
-        # self.encoder.layers[-2].trainable = True
-        #
-        # Set the maximum sequence length for tokenization
+        super(TokenizerLayer, self).__init__(**kwargs)  # Update this line
+        self.tokenizer = GPT2Tokenizer.from_preset("gpt2_extra_large_en")
+        self.preprocessor = GPT2Preprocessor(self.tokenizer, sequence_length=max_seq_length)
         self.max_seq_length = max_seq_length

     def call(self, inputs):
-        #
-        # Output the GPT2 embedding
         prep = self.preprocessor([inputs])
-        # embedding = self.encoder(prep)
-        # avg_pool = tf.reduce_mean(embedding, axis=1)
-        #
         return prep['token_ids']

     def get_config(self):
-        #
-        config = super(GPT2Layer, self).get_config()
+        config = super(TokenizerLayer, self).get_config()
         config.update({'max_seq_length': self.max_seq_length})
-        #
         return config

     @classmethod
     def from_config(cls, config):
-        #
         return cls(max_seq_length=config['max_seq_length'])

 # GPT2 configurables

From 0b67f881f0d08befb4a3549acd71ce1333fbe5b5 Mon Sep 17 00:00:00 2001
From: David Thrower
Date: Sat, 22 Mar 2025 19:42:46 -0400
Subject: [PATCH 013/100] Update phishing_email_detection_gpt2.py

Re-corrected the BinaryAccuracy metrics to fix an AI-introduced error.
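The distinction behind the fix: tf.keras.metrics.Accuracy compares
predictions and labels for exact equality, so it reads near zero against
sigmoid probabilities, while BinaryAccuracy thresholds the probability
(default 0.5) first. A quick check of the difference on made-up values:

    import tensorflow as tf

    y_true = [1.0, 0.0, 1.0, 1.0]
    y_pred = [0.9, 0.2, 0.6, 0.4]   # probabilities from a sigmoid head

    acc = tf.keras.metrics.Accuracy()
    acc.update_state(y_true, y_pred)
    print(acc.result().numpy())      # 0.0 -- no prediction equals its label exactly

    bin_acc = tf.keras.metrics.BinaryAccuracy()
    bin_acc.update_state(y_true, y_pred)
    print(bin_acc.result().numpy())  # 0.75 -- thresholded at 0.5 first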
---
 phishing_email_detection_gpt2.py | 13 ++++++++-----
 1 file changed, 8 insertions(+), 5 deletions(-)

diff --git a/phishing_email_detection_gpt2.py b/phishing_email_detection_gpt2.py
index 787a8b2..202f54c 100644
--- a/phishing_email_detection_gpt2.py
+++ b/phishing_email_detection_gpt2.py
@@ -70,7 +70,7 @@

 # Training data for baseline model
 baseline_train_x = tf.constant(X_train)
-baseline_train_y = tf.constant(y_train)
+baseline_train_y = tf.constant(y_train, dtype=tf.int8)

 # Packaged for Cerebros (multimodal, takes inputs as a list)
 training_x = [baseline_train_x]
 train_labels = [baseline_train_y]
@@ -142,7 +142,10 @@ def from_config(cls, config):
 gpt_baseline_model.compile(
     optimizer=Adam(learning_rate=1e-4),  # Small LR since we're fine-tuning GPT
     loss='binary_crossentropy',
-    metrics=['accuracy', tf.keras.metrics.AUC(name='auc')]
+    # metrics=['accuracy', tf.keras.metrics.AUC(name='auc')]
+    metrics=[tf.keras.metrics.BinaryAccuracy(),
+             tf.keras.metrics.Precision(),
+             tf.keras.metrics.Recall()]
 )
@@ -303,9 +306,9 @@ def from_config(cls, config):
     num_lateral_connection_tries_per_unit=num_lateral_connection_tries_per_unit,
     learning_rate=learning_rate,
     loss=tf.keras.losses.CategoricalHinge(),
-    metrics=[tf.keras.metrics.Accuracy(),
-             tf.keras.metrics.Precision(),
-             tf.keras.metrics.Recall()],
+    metrics=[tf.keras.metrics.BinaryAccuracy(),
+             tf.keras.metrics.Precision(),
+             tf.keras.metrics.Recall()],
     epochs=epochs,
     project_name=f"{PROJECT_NAME}_meta_{meta_trial_number}",
     model_graphs='model_graphs',

From a480dfdbd7ca762fbfd3a60865045dc5f18b1858 Mon Sep 17 00:00:00 2001
From: David Thrower
Date: Sat, 22 Mar 2025 19:54:18 -0400
Subject: [PATCH 014/100] Update phishing_email_detection_gpt2.py

Correct metric to rank by (binary accuracy) ...
---
 phishing_email_detection_gpt2.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/phishing_email_detection_gpt2.py b/phishing_email_detection_gpt2.py
index 202f54c..273ded6 100644
--- a/phishing_email_detection_gpt2.py
+++ b/phishing_email_detection_gpt2.py
@@ -279,7 +279,7 @@ def from_config(cls, config):
     labels=train_labels,
     validation_split=0.35,
     direction='maximize',
-    metric_to_rank_by="val_accuracy",
+    metric_to_rank_by="val_binary_accuracy",
     minimum_levels=minimum_levels,
     maximum_levels=maximum_levels,
     minimum_units_per_level=minimum_units_per_level,

From 0e72e61db1d482a9e5b1c26b8f9904c3164bd5eb Mon Sep 17 00:00:00 2001
From: David Thrower
Date: Sat, 22 Mar 2025 19:57:11 -0400
Subject: [PATCH 015/100] Update phishing_email_detection_gpt2.py

Uncomment GPT test ...
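With both code paths live again, the script compares the two approaches
with plain wall-clock timestamps. The pattern, isolated (the model calls
here are placeholders for the ones in the script):

    import time

    t0 = time.time()
    # gpt_baseline_model.fit(...) or cerebros_automl.run_random_search() here
    t1 = time.time()

    elapsed_min = (t1 - t0) / 60
    models_tried = 5                 # moities_to_try * tries_per_moity
    per_model_min = elapsed_min / models_tried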
---
 phishing_email_detection_gpt2.py | 13 ++++++-------
 1 file changed, 6 insertions(+), 7 deletions(-)

diff --git a/phishing_email_detection_gpt2.py b/phishing_email_detection_gpt2.py
index 273ded6..c073350 100644
--- a/phishing_email_detection_gpt2.py
+++ b/phishing_email_detection_gpt2.py
@@ -135,10 +135,10 @@ def from_config(cls, config):
 gpt2_layer = GPT2Layer(max_seq_length)(input_layer)
 #output = Flatten()(gpt2_layer)
 binary_output = tf.keras.layers.Dense(1, activation='sigmoid')(gpt2_layer)
+
 gpt_baseline_model = Model(inputs=input_layer, outputs=binary_output)

-## Un - string out this
-"""
+
 gpt_baseline_model.compile(
@@ -181,7 +181,6 @@ def from_config(cls, config):
 hy_df = pd.DataFrame(history.history)
 print(hy_df)

-"""

 ### Cerebros model:
@@ -321,11 +320,11 @@ def from_config(cls, config):
 result = cerebros_automl.run_random_search()
 cerebros_t1 = time.time()
 cerebros_time_all_models_min = (cerebros_t1 - cerebros_t0) / 60
-cerebros_time_per_model = cerebros_time_all_models_min / (moities_to_try * tries_per_moity)
+models_tried = moities_to_try * tries_per_moity
+cerebros_time_per_model = cerebros_time_all_models_min / models_tried

-print(f"Cerebros trained 2 models FROM A COLD START in ONLY {cerebros_time_all_models_min} min. Cerebros took only {cerebros_time_per_model} minutes on average per model.")
-# Un-comment this !!!!!
-# print(f"GPT2 took {gpt_time_on_one_model_min} just to FINE TUNE one PRE - TRAINED model. Although this is a small scale test, this shows the advantage of scaling in ON timing VS ON**2 timing.")
+print(f"Cerebros trained {models_tried} models FROM A COLD START in ONLY {cerebros_time_all_models_min} min. Cerebros took only {cerebros_time_per_model} minutes on average per model.")
+print(f"GPT2 took {gpt_time_on_one_model_min} just to FINE TUNE one PRE - TRAINED model. Although this is a small scale test, this shows the advantage of scaling in ON timing VS ON**2 timing.")

 print(f'Cerebros best accuracy achieved is {result}')

From 3cd5945f9b3b07a20f3f536ade3153d4c3ffde7c Mon Sep 17 00:00:00 2001
From: David Thrower
Date: Sat, 22 Mar 2025 20:27:11 -0400
Subject: [PATCH 016/100] Update phishing_email_detection_gpt2.py

Upped number of trials to 5.
---
 phishing_email_detection_gpt2.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/phishing_email_detection_gpt2.py b/phishing_email_detection_gpt2.py
index c073350..0b45d24 100644
--- a/phishing_email_detection_gpt2.py
+++ b/phishing_email_detection_gpt2.py
@@ -255,7 +255,7 @@ def from_config(cls, config):
 minimum_neurons_per_unit = 1
 maximum_neurons_per_unit = 5 # [2,20]

-moities_to_try = 2
+moities_to_try = 5
 tries_per_moity = 1

 #

From 6a9e88d3799e91ba4ef2aead23dd1113713d6b89 Mon Sep 17 00:00:00 2001
From: David Thrower
Date: Sat, 22 Mar 2025 22:40:13 -0400
Subject: [PATCH 017/100] Update phishing_email_detection_gpt2.py

Make seq len 750, fix typo.
---
 phishing_email_detection_gpt2.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/phishing_email_detection_gpt2.py b/phishing_email_detection_gpt2.py
index 0b45d24..7159ced 100644
--- a/phishing_email_detection_gpt2.py
+++ b/phishing_email_detection_gpt2.py
@@ -176,7 +176,7 @@ def from_config(cls, config):
 )

 gpt_t1 = time.time()
-gpt_time_on_one_model_min = (gpt_t1 - gpt_t1) / 60
+gpt_time_on_one_model_min = (gpt_t1 - gpt_t0) / 60

 hy_df = pd.DataFrame(history.history)
 print(hy_df)
@@ -210,7 +210,7 @@ def from_config(cls, config):

 # Optimal for accuracy:
 max_seq_length = 900
-max_seq_length = 250
+max_seq_length = 750

From f24a8583b5fa5a35777b183d1186a643a94d5534 Mon Sep 17 00:00:00 2001
From: David Thrower
Date: Sun, 23 Mar 2025 15:11:46 -0400
Subject: [PATCH 018/100] Update phishing_email_detection_gpt2.py

Try 1024 seq len.
---
 phishing_email_detection_gpt2.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/phishing_email_detection_gpt2.py b/phishing_email_detection_gpt2.py
index 7159ced..32253d4 100644
--- a/phishing_email_detection_gpt2.py
+++ b/phishing_email_detection_gpt2.py
@@ -208,9 +208,9 @@ def from_config(cls, config):

 # GPT2 configurables
-# Optimal for accuracy:
-max_seq_length = 900
-max_seq_length = 750
+# Optimal for accuracy thus far:
+# max_seq_length = 900
+max_seq_length = 1024

 inp = tf.keras.layers.Input(shape=(), dtype=tf.string)
 gp2_tokenizer = TokenizerLayer(max_seq_length=max_seq_length)

From 4e157563ae7a1c2130561ae88df82139dbf3d42d Mon Sep 17 00:00:00 2001
From: David Thrower
Date: Sun, 23 Mar 2025 15:13:10 -0400
Subject: [PATCH 019/100] Update automerge.yml

Added branch to the workflow...
---
 .github/workflows/automerge.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/automerge.yml b/.github/workflows/automerge.yml
index 9490c3e..9ab79a0 100644
--- a/.github/workflows/automerge.yml
+++ b/.github/workflows/automerge.yml
@@ -5,7 +5,7 @@ name: Python application

 on:
   push:
-    branches: [ "main", "154-benchmark-inference-times---cerebros-model-vs-original-gpt-2" ]
+    branches: [ "main", "156-try-1024-seq-length-with-cerebros-model-from-154" ]

 permissions:
   contents: read

From 9a4db1554911173f37bd83c3c4cc2689228f5393 Mon Sep 17 00:00:00 2001
From: David Thrower
Date: Tue, 25 Mar 2025 18:49:07 -0400
Subject: [PATCH 020/100] Update phishing_email_detection_gpt2.py

Added a positional embedding and a LayerNorm to the text embedding.
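A compact sketch of the intended token-plus-position embedding block
(dimensions are illustrative; keras_nlp's PositionEmbedding is the layer
the later commits in this series settle on):

    import tensorflow as tf
    from keras_nlp.layers import PositionEmbedding

    seq_len, vocab, dim = 96, 50257, 16
    token_ids = tf.keras.Input(shape=(seq_len,), dtype=tf.int32)

    tok = tf.keras.layers.Embedding(vocab, dim)(token_ids)   # (batch, seq, dim)
    pos = PositionEmbedding(sequence_length=seq_len)(tok)    # (batch, seq, dim)

    x = tf.keras.layers.Add()([tok, pos])                    # merge the embeddings
    x = tf.keras.layers.LayerNormalization(epsilon=1e-6)(x)
    x = tf.keras.layers.Dropout(0.4)(x)
    model = tf.keras.Model(token_ids, x)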
---
 phishing_email_detection_gpt2.py | 35 ++++++++++++++++++++++++--------
 1 file changed, 27 insertions(+), 8 deletions(-)

diff --git a/phishing_email_detection_gpt2.py b/phishing_email_detection_gpt2.py
index 32253d4..0cc9b4d 100644
--- a/phishing_email_detection_gpt2.py
+++ b/phishing_email_detection_gpt2.py
@@ -84,6 +84,7 @@

 """### A custom GPT2 encoder layer for text embedding"""

+""" un - string out
 class GPT2Layer(tf.keras.layers.Layer):

     def __init__(self, max_seq_length, **kwargs):
@@ -181,6 +182,7 @@ def from_config(cls, config):
 hy_df = pd.DataFrame(history.history)
 print(hy_df)

+""" # end un - string out

 ### Cerebros model:
@@ -224,13 +226,28 @@ def from_config(cls, config):
         output_dim=15,
         input_length=max_seq_length,
         mask_zero=True)(tokens)
-dropout_embedded = tf.keras.layers.Dropout(0.6)(embedded)
-flattened = tf.keras.layers.Flatten()(dropout_embedded)

-cerebros_base_model =\
-    tf.keras.Model(
-        inputs=inp,
-        outputs=flattened)
+x = tf.keras.layers.add([embedded, position_embedding])
+x = tf.keras.layers.LayerNormalization(epsilon=1e-6)(x)
+x = tf.keras.layers.Dropout(0.6)(x)  # AI suggested 0.4
+flattened = tf.keras.layers.Flatten()(x)
+
+cerebros_base_model = tf.keras.Model(
+    inputs=inp,
+    outputs=flattened  # Output enhanced embeddings now
+)
+
+
+
+
+
+# dropout_embedded = tf.keras.layers.Dropout(0.6)(embedded)
+# flattened = tf.keras.layers.Flatten()(dropout_embedded)
+
+# cerebros_base_model =\
+#     tf.keras.Model(
+#         inputs=inp,
+#         outputs=flattened)

 """### Cerebros search for the best model"""
@@ -304,7 +321,8 @@ def from_config(cls, config):
     p_lateral_connection_decay=zero_95_exp_decay,
     num_lateral_connection_tries_per_unit=num_lateral_connection_tries_per_unit,
     learning_rate=learning_rate,
-    loss=tf.keras.losses.CategoricalHinge(),
+    loss=tf.keras.losses.BinaryCrossentropy(),
+    # loss=tf.keras.losses.CategoricalHinge(),
     metrics=[tf.keras.metrics.BinaryAccuracy(),
              tf.keras.metrics.Precision(),
              tf.keras.metrics.Recall()],
@@ -345,7 +363,8 @@ def from_config(cls, config):
 cerebros_time_per_model = cerebros_time_all_models_min / models_tried

 print(f"Cerebros trained {models_tried} models FROM A COLD START in ONLY {cerebros_time_all_models_min} min. Cerebros took only {cerebros_time_per_model} minutes on average per model.")
-print(f"GPT2 took {gpt_time_on_one_model_min} just to FINE TUNE one PRE - TRAINED model. Although this is a small scale test, this shows the advantage of scaling in ON timing VS ON**2 timing.")
+# Un-comment out the next line
+# print(f"GPT2 took {gpt_time_on_one_model_min} just to FINE TUNE one PRE - TRAINED model. Although this is a small scale test, this shows the advantage of scaling in ON timing VS ON**2 timing.")

 print(f'Cerebros best accuracy achieved is {result}')

From 59cfa23b5ee8c91d5431609f580f825470db55d4 Mon Sep 17 00:00:00 2001
From: David Thrower
Date: Tue, 25 Mar 2025 18:53:10 -0400
Subject: [PATCH 021/100] Update phishing_email_detection_gpt2.py

Missed position embedding in copy and paste ...
---
 phishing_email_detection_gpt2.py | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/phishing_email_detection_gpt2.py b/phishing_email_detection_gpt2.py
index 0cc9b4d..5336fc5 100644
--- a/phishing_email_detection_gpt2.py
+++ b/phishing_email_detection_gpt2.py
@@ -227,6 +227,12 @@ def from_config(cls, config):
         input_length=max_seq_length,
         mask_zero=True)(tokens)

+position_embedding = tf.keras.layers.PositionEmbedding(
+    input_dim=max_seq_length,
+    output_dim=EMBEDDING_DIM,
+    embeddings_initializer="uniform"
+)(embedded)
+
 x = tf.keras.layers.add([embedded, position_embedding])
 x = tf.keras.layers.LayerNormalization(epsilon=1e-6)(x)

From d928a5469cbbd3f922521bffa568b41c26dfa115 Mon Sep 17 00:00:00 2001
From: David Thrower
Date: Tue, 25 Mar 2025 19:10:16 -0400
Subject: [PATCH 022/100] Update phishing_email_detection_gpt2.py

Synchronize embedding dim across embeddings.
---
 phishing_email_detection_gpt2.py | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

diff --git a/phishing_email_detection_gpt2.py b/phishing_email_detection_gpt2.py
index 5336fc5..3fea678 100644
--- a/phishing_email_detection_gpt2.py
+++ b/phishing_email_detection_gpt2.py
@@ -220,12 +220,13 @@ def from_config(cls, config):
 tokens = gp2_tokenizer(inp)

-embedded =\
-    tf.keras.layers.Embedding(
-        input_dim=VOCABULARY_SIZE,
-        output_dim=15,
-        input_length=max_seq_length,
-        mask_zero=True)(tokens)
+EMBEDDING_DIM = 15  # Define EMBEDDING_DIM here, to match your embedding layer.
+
+embedded = tf.keras.layers.Embedding(
+    input_dim=VOCABULARY_SIZE,
+    output_dim=EMBEDDING_DIM,
+    input_length=max_seq_length,
+    mask_zero=True)(tokens)

 position_embedding = tf.keras.layers.PositionEmbedding(
     input_dim=max_seq_length,

From 3c25a2235fca470cb526b85bfb03ff24a3071e53 Mon Sep 17 00:00:00 2001
From: David Thrower
Date: Tue, 25 Mar 2025 19:46:32 -0400
Subject: [PATCH 023/100] Update phishing_email_detection_gpt2.py

Corrected import of PositionEmbedding.
---
 phishing_email_detection_gpt2.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/phishing_email_detection_gpt2.py b/phishing_email_detection_gpt2.py
index 3fea678..052670f 100644
--- a/phishing_email_detection_gpt2.py
+++ b/phishing_email_detection_gpt2.py
@@ -14,6 +14,7 @@
 import tensorflow as tf
 import tensorflow_text
 from keras_nlp.models import GPT2Tokenizer, GPT2Preprocessor, GPT2Backbone
+from keras_nlp.layers import PositionEmbedding
 from sklearn.model_selection import train_test_split
 from sklearn.utils import shuffle
 from tensorflow.keras.utils import to_categorical
@@ -228,10 +229,9 @@ def from_config(cls, config):
         input_length=max_seq_length,
         mask_zero=True)(tokens)

-position_embedding = tf.keras.layers.PositionEmbedding(
-    input_dim=max_seq_length,
-    output_dim=EMBEDDING_DIM,
-    embeddings_initializer="uniform"
+position_embedding = PositionEmbedding(
+    sequence_length=max_seq_length,
+    initializer="uniform",
 )(embedded)

 x = tf.keras.layers.add([embedded, position_embedding])

From 88a1bd57c385e5b40248a8ec6c9ac1ecb1125ead Mon Sep 17 00:00:00 2001
From: David Thrower
Date: Tue, 25 Mar 2025 22:50:03 -0400
Subject: [PATCH 024/100] Update phishing_email_detection_gpt2.py

Remove layernorm, concat instead of add.
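The trade being made here: add() keeps the feature width at EMBEDDING_DIM,
while Concatenate() doubles it, giving downstream layers separate access to
the token and position signals. A shape check with dummy tensors:

    import tensorflow as tf

    tok = tf.zeros((2, 96, 15))   # (batch, seq, EMBEDDING_DIM)
    pos = tf.zeros((2, 96, 15))

    added = tf.keras.layers.add([tok, pos])
    merged = tf.keras.layers.Concatenate()([tok, pos])

    print(added.shape)   # (2, 96, 15)
    print(merged.shape)  # (2, 96, 30) -- twice the width, also after flattening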
---
 phishing_email_detection_gpt2.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/phishing_email_detection_gpt2.py b/phishing_email_detection_gpt2.py
index 052670f..489edc7 100644
--- a/phishing_email_detection_gpt2.py
+++ b/phishing_email_detection_gpt2.py
@@ -234,8 +234,9 @@ def from_config(cls, config):
     initializer="uniform",
 )(embedded)

-x = tf.keras.layers.add([embedded, position_embedding])
-x = tf.keras.layers.LayerNormalization(epsilon=1e-6)(x)
+# x = tf.keras.layers.add([embedded, position_embedding])
+x = x = tf.keras.layers.Concatenate()([embedded, position_embedding])
+# x = tf.keras.layers.LayerNormalization(epsilon=1e-6)(x)
 x = tf.keras.layers.Dropout(0.6)(x)  # AI suggested 0.4
 flattened = tf.keras.layers.Flatten()(x)

From 42d9c4f0cddf200faea640b9301549ddefe90f20 Mon Sep 17 00:00:00 2001
From: David Thrower
Date: Wed, 26 Mar 2025 01:05:40 -0400
Subject: [PATCH 025/100] Update phishing_email_detection_gpt2.py

Try addition to merge embeddings without LayerNorm
---
 phishing_email_detection_gpt2.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/phishing_email_detection_gpt2.py b/phishing_email_detection_gpt2.py
index 489edc7..c40f922 100644
--- a/phishing_email_detection_gpt2.py
+++ b/phishing_email_detection_gpt2.py
@@ -234,8 +234,8 @@ def from_config(cls, config):
     initializer="uniform",
 )(embedded)

-# x = tf.keras.layers.add([embedded, position_embedding])
-x = x = tf.keras.layers.Concatenate()([embedded, position_embedding])
+x = tf.keras.layers.add([embedded, position_embedding])
+# x = x = tf.keras.layers.Concatenate()([embedded, position_embedding])
 # x = tf.keras.layers.LayerNormalization(epsilon=1e-6)(x)
 x = tf.keras.layers.Dropout(0.6)(x)  # AI suggested 0.4
 flattened = tf.keras.layers.Flatten()(x)

From ed4641e1d717863b4fadd36ecd511989f6de5ea1 Mon Sep 17 00:00:00 2001
From: David Thrower
Date: Wed, 26 Mar 2025 12:54:09 -0400
Subject: [PATCH 026/100] Update phishing_email_detection_gpt2.py

Restore optimal run with position embedding. Reduce max levels to fit the
optimal run and reduce overhead. Test this to see if it works. If
successful, add back the commented-out comparison and PR. Then open an
issue to optimize the params around this new model. We may need to run
this on Katib to optimize the hyperparameters, as the model is
fundamentally different from the original and can probably be optimized
considerably.
---
 phishing_email_detection_gpt2.py | 13 ++++++++-----
 1 file changed, 8 insertions(+), 5 deletions(-)

diff --git a/phishing_email_detection_gpt2.py b/phishing_email_detection_gpt2.py
index c40f922..b0b1d80 100644
--- a/phishing_email_detection_gpt2.py
+++ b/phishing_email_detection_gpt2.py
@@ -220,7 +220,8 @@ def from_config(cls, config):
 VOCABULARY_SIZE = gp2_tokenizer.tokenizer.vocabulary_size()
 tokens = gp2_tokenizer(inp)

-
+# On larger hardware, this could probably be increased considerably and
+# Probably would improve performance ...
 EMBEDDING_DIM = 15  # Define EMBEDDING_DIM here, to match your embedding layer.

 embedded = tf.keras.layers.Embedding(
@@ -234,9 +235,11 @@ def from_config(cls, config):
     initializer="uniform",
 )(embedded)

-x = tf.keras.layers.add([embedded, position_embedding])
-# x = x = tf.keras.layers.Concatenate()([embedded, position_embedding])
-# x = tf.keras.layers.LayerNormalization(epsilon=1e-6)(x)
+# As an FYI, we tried an add layer both with and without
+# LayerNorm ... It degraded accuracy
+# Just an FYI for anyone trying to apply conventional wisdom
+# to save you the time ...
+x = x = tf.keras.layers.Concatenate()([embedded, position_embedding])
 x = tf.keras.layers.Dropout(0.6)(x)  # AI suggested 0.4
 flattened = tf.keras.layers.Flatten()(x)
@@ -272,7 +275,7 @@ def from_config(cls, config):
 epochs = 15 # [1, 100]
 batch_size = 20
 minimum_levels = 2
-maximum_levels = 4 # [3,7]
+maximum_levels = 3 # [3,7]

 minimum_units_per_level = 4
 maximum_units_per_level = 8

From cdb445511a3fe81786b934827853ff2be1b4ee73 Mon Sep 17 00:00:00 2001
From: David Thrower
Date: Wed, 26 Mar 2025 14:50:24 -0400
Subject: [PATCH 027/100] Update phishing_email_detection_gpt2.py

Hard set levels to the known optimum.
---
 phishing_email_detection_gpt2.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/phishing_email_detection_gpt2.py b/phishing_email_detection_gpt2.py
index b0b1d80..f1105dc 100644
--- a/phishing_email_detection_gpt2.py
+++ b/phishing_email_detection_gpt2.py
@@ -274,7 +274,7 @@ def from_config(cls, config):
 learning_rate = 0.0000511065
 epochs = 15 # [1, 100]
 batch_size = 20
-minimum_levels = 2
+minimum_levels = 3
 maximum_levels = 3 # [3,7]

 minimum_units_per_level = 4

From 048eb1bf907adb339360e8057c32a0d8a43ae885 Mon Sep 17 00:00:00 2001
From: David Thrower
Date: Wed, 26 Mar 2025 15:34:54 -0400
Subject: [PATCH 028/100] Update phishing_email_detection_gpt2.py

Corrected hard set on levels to correct optima.
---
 phishing_email_detection_gpt2.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/phishing_email_detection_gpt2.py b/phishing_email_detection_gpt2.py
index f1105dc..9bb7e84 100644
--- a/phishing_email_detection_gpt2.py
+++ b/phishing_email_detection_gpt2.py
@@ -274,8 +274,8 @@ def from_config(cls, config):
 learning_rate = 0.0000511065
 epochs = 15 # [1, 100]
 batch_size = 20
-minimum_levels = 3
-maximum_levels = 3 # [3,7]
+minimum_levels = 4
+maximum_levels = 4 # [3,7]

 minimum_units_per_level = 4
 maximum_units_per_level = 8

From b800cf7664e342ccfccfdb6500ca34edea05df86 Mon Sep 17 00:00:00 2001
From: David Thrower
Date: Wed, 26 Mar 2025 20:53:13 -0400
Subject: [PATCH 029/100] Update phishing_email_detection_gpt2.py

Restore the best model yet.
---
 phishing_email_detection_gpt2.py | 10 ++++------
 1 file changed, 4 insertions(+), 6 deletions(-)

diff --git a/phishing_email_detection_gpt2.py b/phishing_email_detection_gpt2.py
index 9bb7e84..3d99c85 100644
--- a/phishing_email_detection_gpt2.py
+++ b/phishing_email_detection_gpt2.py
@@ -85,7 +85,7 @@

 """### A custom GPT2 encoder layer for text embedding"""

-""" un - string out
+
 class GPT2Layer(tf.keras.layers.Layer):

     def __init__(self, max_seq_length, **kwargs):
@@ -183,7 +183,6 @@ def from_config(cls, config):
 hy_df = pd.DataFrame(history.history)
 print(hy_df)

-""" # end un - string out

 ### Cerebros model:
@@ -274,8 +273,8 @@ def from_config(cls, config):
 learning_rate = 0.0000511065
 epochs = 15 # [1, 100]
 batch_size = 20
-minimum_levels = 4
-maximum_levels = 4 # [3,7]
+minimum_levels = 2
+maximum_levels = 3 # [3,7]

 minimum_units_per_level = 4
 maximum_units_per_level = 8
@@ -353,8 +352,7 @@ def from_config(cls, config):
 cerebros_time_per_model = cerebros_time_all_models_min / models_tried

 print(f"Cerebros trained {models_tried} models FROM A COLD START in ONLY {cerebros_time_all_models_min} min. Cerebros took only {cerebros_time_per_model} minutes on average per model.")
-# Un-comment out the next line
-# print(f"GPT2 took {gpt_time_on_one_model_min} just to FINE TUNE one PRE - TRAINED model. Although this is a small scale test, this shows the advantage of scaling in ON timing VS ON**2 timing.")
+print(f"GPT2 took {gpt_time_on_one_model_min} just to FINE TUNE one PRE - TRAINED model. Although this is a small scale test, this shows the advantage of scaling in ON timing VS ON**2 timing.")

 print(f'Cerebros best accuracy achieved is {result}')

From 7930a2d102c0a155bd8f386882d8adf342189bf3 Mon Sep 17 00:00:00 2001
From: David Thrower
Date: Wed, 26 Mar 2025 20:55:08 -0400
Subject: [PATCH 030/100] Update automerge.yml

Add back the CICD test for image CLS. Prepare for PR.
---
 .github/workflows/automerge.yml | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/.github/workflows/automerge.yml b/.github/workflows/automerge.yml
index 9ab79a0..8a3c062 100644
--- a/.github/workflows/automerge.yml
+++ b/.github/workflows/automerge.yml
@@ -49,9 +49,9 @@ jobs:
       # - name: Test text classifier - random search - ham-spam
       #   run: python3 text-class-ham-or-spam.py
       #   timeout-minutes: 90
-      # - name: Test image classifier - small subset of CIFAR10 # add back
-      #   timeout-minutes: 90
-      #   run: python3 cifar10-example.py
+      - name: Test image classifier - small subset of CIFAR10 # add back
+        timeout-minutes: 90
+        run: python3 cifar10-example.py
       - name: Phishing email detection with GPT2 embedding
         timeout-minutes: 300
         run: python3 phishing_email_detection_gpt2.py

From e6ae27ca25be65eb00890f7663d7ad59c7a6c8c1 Mon Sep 17 00:00:00 2001
From: David Thrower
Date: Sun, 30 Mar 2025 16:06:07 -0400
Subject: [PATCH 031/100] Update automerge.yml

Comment out workflows that we don't need in dev. Delete permanently
disused workflows.
---
 .github/workflows/automerge.yml | 27 +++++++++------------------
 1 file changed, 9 insertions(+), 18 deletions(-)

diff --git a/.github/workflows/automerge.yml b/.github/workflows/automerge.yml
index 8a3c062..0efdd14 100644
--- a/.github/workflows/automerge.yml
+++ b/.github/workflows/automerge.yml
@@ -5,7 +5,7 @@ name: Python application

 on:
   push:
-    branches: [ "main", "156-try-1024-seq-length-with-cerebros-model-from-154" ]
+    branches: [ "main", "158-try-adamw-optimizer" ]

 permissions:
   contents: read
@@ -33,25 +33,16 @@ jobs:
         flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics
         # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide
         # flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics
-#      - name: Test by running.
-#        run: python3 cerebros.py
-#      - name: Test distributed by running.
-#        run: python3 test_simple_cerebros_gridsearch.py
-#      - name: Test distributed random search wine by running.
-# run: python3 random_search.py -# - name: Test CerebrosRealNeuronNetwork -# run: python3 realnn-regression-example-ames-no-preproc.py -# timeout-minutes: 45 - - name: Test distributed random search Ames by running - run: python3 regression-example-ames-no-preproc.py - - name: Test distributed random search Ames by running - Val set - run: python3 regression-example-ames-no-preproc-val-set.py + # - name: Test distributed random search Ames by running + # run: python3 regression-example-ames-no-preproc.py + # - name: Test distributed random search Ames by running - Val set + # run: python3 regression-example-ames-no-preproc-val-set.py # - name: Test text classifier - random search - ham-spam # run: python3 text-class-ham-or-spam.py # timeout-minutes: 90 - - name: Test image classifier - small subset of CIFAR10 # add back - timeout-minutes: 90 - run: python3 cifar10-example.py + # - name: Test image classifier - small subset of CIFAR10 # add back + # timeout-minutes: 90 + # run: python3 cifar10-example.py - name: Phishing email detection with GPT2 embedding - timeout-minutes: 300 + timeout-minutes: 420 run: python3 phishing_email_detection_gpt2.py From 0eab09e7094dd355c417084b8e8f28f66b81a15c Mon Sep 17 00:00:00 2001 From: David Thrower Date: Sun, 30 Mar 2025 16:23:20 -0400 Subject: [PATCH 032/100] Update neural_network_future.py Made AdamW the default optimizer. We need to parameterize this and an optional hyperparameter for the weight_decay. --- cerebros/neuralnetworkfuture/neural_network_future.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/cerebros/neuralnetworkfuture/neural_network_future.py b/cerebros/neuralnetworkfuture/neural_network_future.py index b91adf6..4643794 100644 --- a/cerebros/neuralnetworkfuture/neural_network_future.py +++ b/cerebros/neuralnetworkfuture/neural_network_future.py @@ -332,8 +332,10 @@ def compile_neural_network(self): self.materialized_neural_network.compile( loss=self.loss, metrics=self.metrics, - optimizer=tf.keras.optimizers.Adam( - learning_rate=self.learning_rate), + optimizer=tf.keras.optimizers.AdamW( + learning_rate=self.learning_rate, + weight_decay=0.004 # Add weight decay parameter + ), jit_compile=jit_compile) def util_parse_connectivity_csv(self): From 8939f3cdb01668c7d57702f5484771124c4fc3f7 Mon Sep 17 00:00:00 2001 From: David Thrower Date: Sun, 30 Mar 2025 16:31:26 -0400 Subject: [PATCH 033/100] Update phishing_email_detection_gpt2.py Test with default params with AdamW. 
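The AdamW change in PATCH 032 above hard-codes `weight_decay=0.004` at the compile call, and its message asks for this to become a configurable. A minimal sketch of that parameterization, under the assumption that `weight_decay` would simply be threaded through as a keyword argument (the helper name and default here are illustrative, not the project's final API):

```
import tensorflow as tf

def compile_with_adamw(model, learning_rate, weight_decay=0.004,
                       loss="binary_crossentropy", metrics=None):
    # Expose weight_decay as a hyperparameter instead of hard-coding it
    # at the call site; 0.004 is only the current default from PATCH 032.
    model.compile(
        loss=loss,
        metrics=metrics if metrics is not None else [],
        optimizer=tf.keras.optimizers.AdamW(
            learning_rate=learning_rate,
            weight_decay=weight_decay,
        ),
    )
    return model
```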
--- phishing_email_detection_gpt2.py | 24 +++--------------------- 1 file changed, 3 insertions(+), 21 deletions(-) diff --git a/phishing_email_detection_gpt2.py b/phishing_email_detection_gpt2.py index 3d99c85..ed79320 100644 --- a/phishing_email_detection_gpt2.py +++ b/phishing_email_detection_gpt2.py @@ -85,6 +85,7 @@ """### A custom GPT2 encoder layer for text embedding""" +""" class GPT2Layer(tf.keras.layers.Layer): @@ -183,6 +184,7 @@ def from_config(cls, config): hy_df = pd.DataFrame(history.history) print(hy_df) +""" ### Cerebros model: @@ -248,17 +250,6 @@ def from_config(cls, config): ) - - - -# dropout_embedded = tf.keras.layers.Dropout(0.6)(embedded) -# flattened = tf.keras.layers.Flatten()(dropout_embedded) - -# cerebros_base_model =\ -# tf.keras.Model( -# inputs=inp, -# outputs=flattened) - """### Cerebros search for the best model""" # @@ -352,7 +343,7 @@ def from_config(cls, config): cerebros_time_per_model = cerebros_time_all_models_min / models_tried print(f"Cerebros trained {models_tried} models FROM A COLD START in ONLY {cerebros_time_all_models_min} min. Cerebros took only {cerebros_time_per_model} minutes on average per model.") -print(f"GPT2 took {gpt_time_on_one_model_min} just to FINE TUNE one PRE - TRAINED model. Although this is a small scale test, this shows the advantage of scaling in ON timing VS ON**2 timing.") +# print(f"GPT2 took {gpt_time_on_one_model_min} just to FINE TUNE one PRE - TRAINED model. Although this is a small scale test, this shows the advantage of scaling in ON timing VS ON**2 timing.") print(f'Cerebros best accuracy achieved is {result}') @@ -360,12 +351,3 @@ def from_config(cls, config): # """### Testing the best model found""" -# # -# # Load the best model (taking into account that it has a custom layer) -# # -# best_model_found =\ -# tf.keras.models.load_model(cerebros_automl.best_model_path,\ -# custom_objects={'GPT2Layer': GPT2Layer(max_seq_length)}) - -# print('Evaluating on the test dataset') -# best_model_found.evaluate(X_test, y_test) From 966f71451716e212cceab7bea4dc109a4224421c Mon Sep 17 00:00:00 2001 From: David Thrower Date: Wed, 2 Apr 2025 16:49:02 -0400 Subject: [PATCH 034/100] Update phishing_email_detection_gpt2.py Combined best hyperparams from the hyperparameter optimization study with AdamW optimizer. --- phishing_email_detection_gpt2.py | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/phishing_email_detection_gpt2.py b/phishing_email_detection_gpt2.py index ed79320..605e014 100644 --- a/phishing_email_detection_gpt2.py +++ b/phishing_email_detection_gpt2.py @@ -223,7 +223,7 @@ def from_config(cls, config): # On larger hardware, this could probably be increased considerably and # Probably would improve performance ... -EMBEDDING_DIM = 15 # Define EMBEDDING_DIM here, to match your embedding layer. +EMBEDDING_DIM = 23 # Define EMBEDDING_DIM here, to match your embedding layer. embedded = tf.keras.layers.Embedding( input_dim=VOCABULARY_SIZE, @@ -241,7 +241,7 @@ def from_config(cls, config): # Just an FYI for anyone trying to apply conventional wisdom # to save you the time ... 
x = x = tf.keras.layers.Concatenate()([embedded, position_embedding]) -x = tf.keras.layers.Dropout(0.6)(x) # AI suggested 0.4 +x = tf.keras.layers.Dropout(0.4)(x) # AI suggested 0.4 flattened = tf.keras.layers.Flatten()(x) cerebros_base_model = tf.keras.Model( @@ -255,23 +255,23 @@ def from_config(cls, config): # # Cerebros configurables # -activation = 'gelu' -predecessor_level_connection_affinity_factor_first = 49.9999 -predecessor_level_connection_affinity_factor_main = 0.31456 -max_consecutive_lateral_connections = 22 -p_lateral_connection = 0.39256 -num_lateral_connection_tries_per_unit = 10 -learning_rate = 0.0000511065 +activation = "relu" +predecessor_level_connection_affinity_factor_first = 10 +predecessor_level_connection_affinity_factor_main = 40 +max_consecutive_lateral_connections = 20 +p_lateral_connection = 30 +num_lateral_connection_tries_per_unit = 25 +learning_rate = 3 * 10 ** -3 epochs = 15 # [1, 100] -batch_size = 20 +batch_size = 17 minimum_levels = 2 -maximum_levels = 3 # [3,7] +maximum_levels = 2 # [3,7] minimum_units_per_level = 4 -maximum_units_per_level = 8 +maximum_units_per_level = 7 minimum_neurons_per_unit = 1 -maximum_neurons_per_unit = 5 # [2,20] +maximum_neurons_per_unit = 2 moities_to_try = 5 tries_per_moity = 1 From 9724e9d0f284e32e69ac4d6e7e15aff0105b0f8a Mon Sep 17 00:00:00 2001 From: David Thrower Date: Wed, 2 Apr 2025 16:49:44 -0400 Subject: [PATCH 035/100] Update automerge.yml Add branch to workflow to make it start. --- .github/workflows/automerge.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/automerge.yml b/.github/workflows/automerge.yml index 0efdd14..1e7a494 100644 --- a/.github/workflows/automerge.yml +++ b/.github/workflows/automerge.yml @@ -5,7 +5,7 @@ name: Python application on: push: - branches: [ "main", "158-try-adamw-optimizer" ] + branches: [ "main", "160-try-nlp-optima-from-2025-03-30-study-with-adamw" ] permissions: contents: read From 380928dc4ca8a2eaf08410b8df6089e59ac322b3 Mon Sep 17 00:00:00 2001 From: David Thrower Date: Wed, 2 Apr 2025 19:03:28 -0400 Subject: [PATCH 036/100] Update automerge.yml Add back all to be used workflows. --- .github/workflows/automerge.yml | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/.github/workflows/automerge.yml b/.github/workflows/automerge.yml index 1e7a494..f44c4b0 100644 --- a/.github/workflows/automerge.yml +++ b/.github/workflows/automerge.yml @@ -33,16 +33,16 @@ jobs: flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide # flake8 . 
--count --exit-zero --max-complexity=10 --max-line-length=127 --statistics
- # - name: Test distributed random search Ames by running
- # run: python3 regression-example-ames-no-preproc.py
- # - name: Test distributed random search Ames by running - Val set
- # run: python3 regression-example-ames-no-preproc-val-set.py
+ - name: Test distributed random search Ames by running
+ run: python3 regression-example-ames-no-preproc.py
+ - name: Test distributed random search Ames by running - Val set
+ run: python3 regression-example-ames-no-preproc-val-set.py
 # - name: Test text classifier - random search - ham-spam
 # run: python3 text-class-ham-or-spam.py
 # timeout-minutes: 90
- # - name: Test image classifier - small subset of CIFAR10 # add back
- # timeout-minutes: 90
- # run: python3 cifar10-example.py
+ - name: Test image classifier - small subset of CIFAR10 # add back
+ timeout-minutes: 90
+ run: python3 cifar10-example.py
 - name: Phishing email detection with GPT2 embedding
 timeout-minutes: 420
 run: python3 phishing_email_detection_gpt2.py

From 9323f5f880113c76d7f5329ff7e21e2f6d8018cc Mon Sep 17 00:00:00 2001
From: David Thrower
Date: Wed, 2 Apr 2025 19:05:24 -0400
Subject: [PATCH 037/100] Update phishing_email_detection_gpt2.py

Added back the GPT baseline model for comparison.
---
 phishing_email_detection_gpt2.py | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/phishing_email_detection_gpt2.py b/phishing_email_detection_gpt2.py
index 605e014..25cdbc6 100644
--- a/phishing_email_detection_gpt2.py
+++ b/phishing_email_detection_gpt2.py
@@ -85,7 +85,6 @@

 """### A custom GPT2 encoder layer for text embedding"""

-"""

 class GPT2Layer(tf.keras.layers.Layer):

@@ -184,7 +183,6 @@ def from_config(cls, config):

 hy_df = pd.DataFrame(history.history)
 print(hy_df)
-"""

 ### Cerebros model:

@@ -343,7 +341,7 @@ def from_config(cls, config):
 cerebros_time_per_model = cerebros_time_all_models_min / models_tried

 print(f"Cerebros trained {models_tried} models FROM A COLD START in ONLY {cerebros_time_all_models_min} min. Cerebros took only {cerebros_time_per_model} minutes on average per model.")
-# print(f"GPT2 took {gpt_time_on_one_model_min} just to FINE TUNE one PRE - TRAINED model. Although this is a small scale test, this shows the advantage of scaling in ON timing VS ON**2 timing.")
+print(f"GPT2 took {gpt_time_on_one_model_min} just to FINE TUNE one PRE - TRAINED model. Although this is a small scale test, this shows the advantage of scaling in ON timing VS ON**2 timing.")

 print(f'Cerebros best accuracy achieved is {result}')

From f683fb8e6c24ef9d92a06872573df6a19cef7fc6 Mon Sep 17 00:00:00 2001
From: David Thrower
Date: Wed, 2 Apr 2025 19:44:39 -0400
Subject: [PATCH 038/100] Update phishing_email_detection_gpt2.py

Optimize NLP workflow for time's sake.
--- phishing_email_detection_gpt2.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/phishing_email_detection_gpt2.py b/phishing_email_detection_gpt2.py index 25cdbc6..91c1451 100644 --- a/phishing_email_detection_gpt2.py +++ b/phishing_email_detection_gpt2.py @@ -157,7 +157,7 @@ def from_config(cls, config): history = gpt_baseline_model.fit( x=X_train, # Input data y=y_train, # Labels - epochs=4, # Number of training iterations + epochs=3, # Number of training iterations batch_size=16, # Batch size small due to GPU memory constraints validation_split=0.2, # Hold out 20% of training data for validation shuffle=True, # Shuffle data at each epoch @@ -341,7 +341,7 @@ def from_config(cls, config): cerebros_time_per_model = cerebros_time_all_models_min / models_tried print(f"Cerebros trained {models_tried} models FROM A COLD START in ONLY {cerebros_time_all_models_min} min. Cerebros took only {cerebros_time_per_model} minutes on average per model.") -print(f"GPT2 took {gpt_time_on_one_model_min} just to FINE TUNE one PRE - TRAINED model. Although this is a small scale test, this shows the advantage of scaling in ON timing VS ON**2 timing.") +print(f"GPT2 took {gpt_time_on_one_model_min} just to FINE TUNE one PRE - TRAINED model for 3 epochs. Although this is a small scale test, this shows the advantage of scaling in ON timing VS ON**2 timing.") print(f'Cerebros best accuracy achieved is {result}') From 69d9d1d1dcaec13a362b8ab255a4c6db19a693e9 Mon Sep 17 00:00:00 2001 From: David Thrower Date: Thu, 3 Apr 2025 15:03:32 -0400 Subject: [PATCH 039/100] Update requirements.txt Added tqdm 4.67.1 to requirements. --- requirements.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/requirements.txt b/requirements.txt index 1964f13..146b1e5 100644 --- a/requirements.txt +++ b/requirements.txt @@ -8,3 +8,4 @@ pyvis==0.3.2 plotly==5.20.0 matplotlib==3.8.4 imageio==2.34.0 +tqdm==4.67.1 From ffb0e901c3323422f792beb2d58304970bead1dc Mon Sep 17 00:00:00 2001 From: David Thrower Date: Thu, 3 Apr 2025 15:27:20 -0400 Subject: [PATCH 040/100] Update simple_cerebros_random_search.py Try adding a global progress bar. --- .../simple_cerebros_random_search.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/cerebros/simplecerebrosrandomsearch/simple_cerebros_random_search.py b/cerebros/simplecerebrosrandomsearch/simple_cerebros_random_search.py index 125582c..b0690f5 100644 --- a/cerebros/simplecerebrosrandomsearch/simple_cerebros_random_search.py +++ b/cerebros/simplecerebrosrandomsearch/simple_cerebros_random_search.py @@ -518,8 +518,11 @@ def run_moity_permutations(self, spec, subtrial_number, lock): return 0 def run_random_search(self): + iter_trial = 1 processes = [] - for i in np.arange(self.number_of_architecture_moities_to_try): + for i in tqdm(np.arange(self.number_of_architecture_moities_to_try), + desc=f"Moiety {iter_trial}" of {self.number_of_architecture_moities_to_try} running", + colour="#16ceeb"): self.parse_neural_network_structural_spec_random() spec = self.get_neural_network_spec() @@ -537,6 +540,7 @@ def run_random_search(self): p.start() for p in processes: p.join() + iter_trial += 1 # final_oracles = pd.concat(oracles, ignore_index=False) # if self.direction == "maximize": # return float(final_oracles[self.metric_to_rank_by].values.max()) From 9fafcccdc1ee7729d9945e200dc0d053904c415e Mon Sep 17 00:00:00 2001 From: David Thrower Date: Thu, 3 Apr 2025 15:28:23 -0400 Subject: [PATCH 041/100] Update automerge.yml Added branch to the workflow. 
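For context on the tqdm work above: the `desc=` f-string in PATCH 040 carries a stray quote (a SyntaxError that PATCH 042 corrects), and the value it interpolates is frozen when the bar is built. A sketch of the intended global bar in its simplest working form, with a local stand-in for the instance attribute:

```
import numpy as np
from tqdm import tqdm

number_of_architecture_moities_to_try = 5  # stand-in for the instance attribute

# One tick per architecture moiety; desc= is a plain string because an
# f-string passed here is rendered once, before the first iteration.
for i in tqdm(np.arange(number_of_architecture_moities_to_try),
              desc="Global task progress",
              colour="#16ceeb"):
    pass  # spec parsing and the training subprocesses go here
```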
--- .github/workflows/automerge.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/automerge.yml b/.github/workflows/automerge.yml index f44c4b0..d7ecd0a 100644 --- a/.github/workflows/automerge.yml +++ b/.github/workflows/automerge.yml @@ -5,7 +5,7 @@ name: Python application on: push: - branches: [ "main", "160-try-nlp-optima-from-2025-03-30-study-with-adamw" ] + branches: [ "main", "162-add-a-tqdm-global-progress-bar-to-nas-search-task" ] permissions: contents: read From aba7589ff4b06d770597b6b3aab12b16af7d5a7e Mon Sep 17 00:00:00 2001 From: David Thrower Date: Thu, 3 Apr 2025 15:31:44 -0400 Subject: [PATCH 042/100] Update simple_cerebros_random_search.py Syntax correction. --- .../simplecerebrosrandomsearch/simple_cerebros_random_search.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cerebros/simplecerebrosrandomsearch/simple_cerebros_random_search.py b/cerebros/simplecerebrosrandomsearch/simple_cerebros_random_search.py index b0690f5..0063f0e 100644 --- a/cerebros/simplecerebrosrandomsearch/simple_cerebros_random_search.py +++ b/cerebros/simplecerebrosrandomsearch/simple_cerebros_random_search.py @@ -521,7 +521,7 @@ def run_random_search(self): iter_trial = 1 processes = [] for i in tqdm(np.arange(self.number_of_architecture_moities_to_try), - desc=f"Moiety {iter_trial}" of {self.number_of_architecture_moities_to_try} running", + desc=f"Moiety {iter_trial} of {self.number_of_architecture_moities_to_try} running", colour="#16ceeb"): self.parse_neural_network_structural_spec_random() spec = self.get_neural_network_spec() From f6284986d50eeeb01780da4e649ef2be655d33cc Mon Sep 17 00:00:00 2001 From: David Thrower Date: Thu, 3 Apr 2025 15:33:47 -0400 Subject: [PATCH 043/100] Update simple_cerebros_random_search.py Added import statement ... --- .../simplecerebrosrandomsearch/simple_cerebros_random_search.py | 1 + 1 file changed, 1 insertion(+) diff --git a/cerebros/simplecerebrosrandomsearch/simple_cerebros_random_search.py b/cerebros/simplecerebrosrandomsearch/simple_cerebros_random_search.py index 0063f0e..b48c601 100644 --- a/cerebros/simplecerebrosrandomsearch/simple_cerebros_random_search.py +++ b/cerebros/simplecerebrosrandomsearch/simple_cerebros_random_search.py @@ -2,6 +2,7 @@ import numpy as np import pandas as pd import tensorflow as tf +from tqdm import tqdm from cerebros.denseautomlstructuralcomponent.\ dense_automl_structural_component \ import DenseAutoMlStructuralComponent, DenseLateralConnectivity, \ From 689f00394751ece3efeb0ab2d470576deed99614 Mon Sep 17 00:00:00 2001 From: David Thrower Date: Thu, 3 Apr 2025 15:44:11 -0400 Subject: [PATCH 044/100] Update simple_cerebros_random_search.py Try to deal with progress bar floating away. 
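A tqdm bar usually "floats away" when other output on the same stream (Keras epoch logs, child-process prints) pushes it off its line. The `ascii=True` tried below only swaps the Unicode bar glyphs for ASCII ones, which helps when a terminal mishandles wide characters; when the drift comes from interleaved prints, the usual remedy is to pin the bar and route messages through `tqdm.write`. A sketch of that alternative, not what the patch does:

```
from tqdm import tqdm

# position=0 pins the bar to one line; leave=True keeps it after the loop.
for i in tqdm(range(5), desc="Global task progress", position=0, leave=True):
    tqdm.write(f"trial {i} finished")  # printed above the bar, not through it
```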
--- .../simplecerebrosrandomsearch/simple_cerebros_random_search.py | 1 + 1 file changed, 1 insertion(+) diff --git a/cerebros/simplecerebrosrandomsearch/simple_cerebros_random_search.py b/cerebros/simplecerebrosrandomsearch/simple_cerebros_random_search.py index b48c601..1007557 100644 --- a/cerebros/simplecerebrosrandomsearch/simple_cerebros_random_search.py +++ b/cerebros/simplecerebrosrandomsearch/simple_cerebros_random_search.py @@ -523,6 +523,7 @@ def run_random_search(self): processes = [] for i in tqdm(np.arange(self.number_of_architecture_moities_to_try), desc=f"Moiety {iter_trial} of {self.number_of_architecture_moities_to_try} running", + ascii=True, colour="#16ceeb"): self.parse_neural_network_structural_spec_random() spec = self.get_neural_network_spec() From 40c35827748b86bc22e982078fa77a2a7214cdc5 Mon Sep 17 00:00:00 2001 From: David Thrower Date: Thu, 3 Apr 2025 16:17:19 -0400 Subject: [PATCH 045/100] Update simple_cerebros_random_search.py Fix increment of iter_trial. --- .../simplecerebrosrandomsearch/simple_cerebros_random_search.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cerebros/simplecerebrosrandomsearch/simple_cerebros_random_search.py b/cerebros/simplecerebrosrandomsearch/simple_cerebros_random_search.py index 1007557..52041e2 100644 --- a/cerebros/simplecerebrosrandomsearch/simple_cerebros_random_search.py +++ b/cerebros/simplecerebrosrandomsearch/simple_cerebros_random_search.py @@ -525,6 +525,7 @@ def run_random_search(self): desc=f"Moiety {iter_trial} of {self.number_of_architecture_moities_to_try} running", ascii=True, colour="#16ceeb"): + iter_trial += 1 self.parse_neural_network_structural_spec_random() spec = self.get_neural_network_spec() @@ -542,7 +543,6 @@ def run_random_search(self): p.start() for p in processes: p.join() - iter_trial += 1 # final_oracles = pd.concat(oracles, ignore_index=False) # if self.direction == "maximize": # return float(final_oracles[self.metric_to_rank_by].values.max()) From 642452ff4c2f1cc198c52e58be63f0660ed9db50 Mon Sep 17 00:00:00 2001 From: David Thrower Date: Thu, 3 Apr 2025 16:28:23 -0400 Subject: [PATCH 046/100] Update simple_cerebros_random_search.py Use self.trial_number as the basis for trial in tqdm. 
--- .../simple_cerebros_random_search.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/cerebros/simplecerebrosrandomsearch/simple_cerebros_random_search.py b/cerebros/simplecerebrosrandomsearch/simple_cerebros_random_search.py index 52041e2..06ba512 100644 --- a/cerebros/simplecerebrosrandomsearch/simple_cerebros_random_search.py +++ b/cerebros/simplecerebrosrandomsearch/simple_cerebros_random_search.py @@ -519,13 +519,12 @@ def run_moity_permutations(self, spec, subtrial_number, lock): return 0 def run_random_search(self): - iter_trial = 1 processes = [] for i in tqdm(np.arange(self.number_of_architecture_moities_to_try), - desc=f"Moiety {iter_trial} of {self.number_of_architecture_moities_to_try} running", + desc=f"Moiety {self.trial_number + 1} of {self.number_of_architecture_moities_to_try} running", ascii=True, colour="#16ceeb"): - iter_trial += 1 + self.parse_neural_network_structural_spec_random() spec = self.get_neural_network_spec() From 6f7c1f098e8174688acf583260c982360d90f698 Mon Sep 17 00:00:00 2001 From: David Thrower Date: Thu, 3 Apr 2025 16:33:31 -0400 Subject: [PATCH 047/100] Update simple_cerebros_random_search.py --- .../simple_cerebros_random_search.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/cerebros/simplecerebrosrandomsearch/simple_cerebros_random_search.py b/cerebros/simplecerebrosrandomsearch/simple_cerebros_random_search.py index 06ba512..a412f79 100644 --- a/cerebros/simplecerebrosrandomsearch/simple_cerebros_random_search.py +++ b/cerebros/simplecerebrosrandomsearch/simple_cerebros_random_search.py @@ -521,8 +521,7 @@ def run_moity_permutations(self, spec, subtrial_number, lock): def run_random_search(self): processes = [] for i in tqdm(np.arange(self.number_of_architecture_moities_to_try), - desc=f"Moiety {self.trial_number + 1} of {self.number_of_architecture_moities_to_try} running", - ascii=True, + desc=f"Global task progress", colour="#16ceeb"): self.parse_neural_network_structural_spec_random() From 713ac96a0f15755f42cf671cf1e67e961f4a9979 Mon Sep 17 00:00:00 2001 From: David Thrower Date: Thu, 3 Apr 2025 16:36:48 -0400 Subject: [PATCH 048/100] Update simple_cerebros_random_search.py F string with no arguments replaced with regular string. --- .../simplecerebrosrandomsearch/simple_cerebros_random_search.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cerebros/simplecerebrosrandomsearch/simple_cerebros_random_search.py b/cerebros/simplecerebrosrandomsearch/simple_cerebros_random_search.py index a412f79..0874e99 100644 --- a/cerebros/simplecerebrosrandomsearch/simple_cerebros_random_search.py +++ b/cerebros/simplecerebrosrandomsearch/simple_cerebros_random_search.py @@ -521,7 +521,7 @@ def run_moity_permutations(self, spec, subtrial_number, lock): def run_random_search(self): processes = [] for i in tqdm(np.arange(self.number_of_architecture_moities_to_try), - desc=f"Global task progress", + desc="Global task progress", colour="#16ceeb"): self.parse_neural_network_structural_spec_random() From 7db4b152155de3cc2c0e71f72f574746d40dfec9 Mon Sep 17 00:00:00 2001 From: David Thrower Date: Mon, 7 Apr 2025 20:02:57 -0400 Subject: [PATCH 049/100] Update phishing_email_detection_gpt2.py First attempt to integrate a rotary positional embedding. 
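Background for the patch below: rotary position embedding (RoPE) encodes position by treating consecutive pairs of feature channels as 2-D coordinates and rotating each pair by an angle proportional to the token position, with a different frequency per pair; unlike the learned `PositionEmbedding` it replaces, it adds no trainable parameters. A toy NumPy sketch of that rotation, independent of the TensorFlow layer in the diff:

```
import numpy as np

def rope(x, temperature=10000.0):
    # x: (seq_len, d_model), d_model even; rotate each channel pair
    # (x[2i], x[2i+1]) by angle position * inv_freq[i].
    seq_len, d_model = x.shape
    inv_freq = 1.0 / temperature ** (np.arange(0, d_model, 2) / d_model)
    angles = np.outer(np.arange(seq_len), inv_freq)  # (seq_len, d_model // 2)
    sin, cos = np.sin(angles), np.cos(angles)
    x1, x2 = x[:, 0::2], x[:, 1::2]
    out = np.empty_like(x)
    out[:, 0::2] = x1 * cos - x2 * sin  # a standard 2-D rotation per pair
    out[:, 1::2] = x1 * sin + x2 * cos
    return out

print(rope(np.random.randn(8, 6)).shape)  # (8, 6); position 0 is left unrotated
```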
--- phishing_email_detection_gpt2.py | 40 +++++++++++++++++++++++++++++--- 1 file changed, 37 insertions(+), 3 deletions(-) diff --git a/phishing_email_detection_gpt2.py b/phishing_email_detection_gpt2.py index 91c1451..56c6d79 100644 --- a/phishing_email_detection_gpt2.py +++ b/phishing_email_detection_gpt2.py @@ -208,6 +208,38 @@ def get_config(self): def from_config(cls, config): return cls(max_seq_length=config['max_seq_length']) + +class RotaryPositionEmbedding(tf.keras.layers.Layer): + def __init__(self, max_seq_length, d_model, **kwargs): + super().__init__(**kwargs) + self.max_seq_length = max_seq_length + self.d_model = d_model + assert d_model % 2 == 0, "d_model must be even" + + # Precompute rotation matrices + inv_freq = 1.0 / (10000 ** (tf.range(0, d_model, 2, dtype=tf.float32) / d_model)) + positions = tf.range(max_seq_length, dtype=tf.float32) + sinusoid = tf.einsum('i,j->ij', positions, inv_freq) + + self.sin = tf.sin(sinusoid) + self.cos = tf.cos(sinusoid) + + def call(self, x): + batch_size = tf.shape(x)[0] + seq_len = tf.shape(x)[1] + + # Split dimensions into pairs + x = tf.reshape(x, [batch_size, seq_len, self.d_model//2, 2]) + + # Apply rotation + x_rot = tf.stack([ + x[..., 0] * self.cos[:seq_len] - x[..., 1] * self.sin[:seq_len], + x[..., 0] * self.sin[:seq_len] + x[..., 1] * self.cos[:seq_len] + ], axis=-1) + + return tf.reshape(x_rot, [batch_size, seq_len, self.d_model]) + + # GPT2 configurables # Optimal for accuracy thus far: @@ -221,7 +253,8 @@ def from_config(cls, config): # On larger hardware, this could probably be increased considerably and # Probably would improve performance ... -EMBEDDING_DIM = 23 # Define EMBEDDING_DIM here, to match your embedding layer. +EMBEDDING_N = 12 # Define EMBEDDING_DIM here, to match your embedding layer. +EMBEDDING_DIM = int(EMBEDDING_N * 2) embedded = tf.keras.layers.Embedding( input_dim=VOCABULARY_SIZE, @@ -229,9 +262,10 @@ def from_config(cls, config): input_length=max_seq_length, mask_zero=True)(tokens) -position_embedding = PositionEmbedding( +position_embedding = RotaryPositionEmbedding( sequence_length=max_seq_length, - initializer="uniform", + sequence_length=EMBEDDING_DIM + # initializer="uniform", )(embedded) # As an FYI, we tried an add layer both with and without From e3ad67d49dc18b5803587ccbe2e799a5903af036 Mon Sep 17 00:00:00 2001 From: David Thrower Date: Mon, 7 Apr 2025 20:04:01 -0400 Subject: [PATCH 050/100] Update automerge.yml Add branch to workflow. --- .github/workflows/automerge.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/automerge.yml b/.github/workflows/automerge.yml index d7ecd0a..ba7af25 100644 --- a/.github/workflows/automerge.yml +++ b/.github/workflows/automerge.yml @@ -5,7 +5,7 @@ name: Python application on: push: - branches: [ "main", "162-add-a-tqdm-global-progress-bar-to-nas-search-task" ] + branches: [ "main", "166-try-rope-embedding" ] permissions: contents: read From 2389efd1da81e0d203719210c18652d0de5ac9c0 Mon Sep 17 00:00:00 2001 From: David Thrower Date: Mon, 7 Apr 2025 20:38:49 -0400 Subject: [PATCH 051/100] Update phishing_email_detection_gpt2.py Fix name of arg. 
--- phishing_email_detection_gpt2.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/phishing_email_detection_gpt2.py b/phishing_email_detection_gpt2.py index 56c6d79..55f4b5d 100644 --- a/phishing_email_detection_gpt2.py +++ b/phishing_email_detection_gpt2.py @@ -264,7 +264,7 @@ def call(self, x): position_embedding = RotaryPositionEmbedding( sequence_length=max_seq_length, - sequence_length=EMBEDDING_DIM + d_model=EMBEDDING_DIM, # initializer="uniform", )(embedded) From 3bd57f38283780702db734f9f8fdcb8cf59a1c94 Mon Sep 17 00:00:00 2001 From: David Thrower Date: Mon, 7 Apr 2025 21:56:16 -0400 Subject: [PATCH 052/100] Update phishing_email_detection_gpt2.py Syntax: Arg names ... --- phishing_email_detection_gpt2.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/phishing_email_detection_gpt2.py b/phishing_email_detection_gpt2.py index 55f4b5d..c1b1c58 100644 --- a/phishing_email_detection_gpt2.py +++ b/phishing_email_detection_gpt2.py @@ -263,7 +263,7 @@ def call(self, x): mask_zero=True)(tokens) position_embedding = RotaryPositionEmbedding( - sequence_length=max_seq_length, + max_seq_length=max_seq_length, d_model=EMBEDDING_DIM, # initializer="uniform", )(embedded) From f88afbd297b71f30c334e782824003dfd4e62bce Mon Sep 17 00:00:00 2001 From: David Thrower Date: Tue, 8 Apr 2025 00:08:52 -0400 Subject: [PATCH 053/100] Update phishing_email_detection_gpt2.py --- phishing_email_detection_gpt2.py | 29 +++++++++++++++++------------ 1 file changed, 17 insertions(+), 12 deletions(-) diff --git a/phishing_email_detection_gpt2.py b/phishing_email_detection_gpt2.py index c1b1c58..720bbf1 100644 --- a/phishing_email_detection_gpt2.py +++ b/phishing_email_detection_gpt2.py @@ -209,6 +209,7 @@ def from_config(cls, config): return cls(max_seq_length=config['max_seq_length']) + class RotaryPositionEmbedding(tf.keras.layers.Layer): def __init__(self, max_seq_length, d_model, **kwargs): super().__init__(**kwargs) @@ -218,32 +219,36 @@ def __init__(self, max_seq_length, d_model, **kwargs): # Precompute rotation matrices inv_freq = 1.0 / (10000 ** (tf.range(0, d_model, 2, dtype=tf.float32) / d_model)) + self.inv_freq = tf.cast(inv_freq, tf.float32) positions = tf.range(max_seq_length, dtype=tf.float32) - sinusoid = tf.einsum('i,j->ij', positions, inv_freq) - - self.sin = tf.sin(sinusoid) - self.cos = tf.cos(sinusoid) + self.sin = tf.sin(tf.einsum('i,j->ij', positions, inv_freq)) + self.cos = tf.cos(tf.einsum('i,j->ij', positions, inv_freq)) def call(self, x): batch_size = tf.shape(x)[0] seq_len = tf.shape(x)[1] - # Split dimensions into pairs - x = tf.reshape(x, [batch_size, seq_len, self.d_model//2, 2]) + # Compute sine and cosine matrices for current sequence length + sinusoid = tf.einsum('i,j->ij', tf.range(seq_len, dtype=tf.float32), self.inv_freq) + current_sin = tf.sin(sinusoid) + current_cos = tf.cos(sinusoid) - # Apply rotation - x_rot = tf.stack([ - x[..., 0] * self.cos[:seq_len] - x[..., 1] * self.sin[:seq_len], - x[..., 0] * self.sin[:seq_len] + x[..., 1] * self.cos[:seq_len] + # Split dimensions and apply rotation using einsum + x = tf.reshape(x, [batch_size, seq_len, self.d_model//2, 2]) + rotated = tf.stack([ + x[..., 0] * current_cos - x[..., 1] * current_sin, + x[..., 0] * current_sin + x[..., 1] * current_cos ], axis=-1) - return tf.reshape(x_rot, [batch_size, seq_len, self.d_model]) + # Reshape back and apply dropout + return tf.reshape(rotated, [batch_size, seq_len, self.d_model]) + + # GPT2 configurables # Optimal for accuracy thus far: -# max_seq_length 
= 900 max_seq_length = 1024 inp = tf.keras.layers.Input(shape=(), dtype=tf.string) From fec5b036f2536f183fc742122cb4a3e8aacceb07 Mon Sep 17 00:00:00 2001 From: David Thrower Date: Tue, 8 Apr 2025 00:09:55 -0400 Subject: [PATCH 054/100] Update phishing_email_detection_gpt2.py String out baseline model for quick testing of ROPE model. --- phishing_email_detection_gpt2.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/phishing_email_detection_gpt2.py b/phishing_email_detection_gpt2.py index 720bbf1..a104d02 100644 --- a/phishing_email_detection_gpt2.py +++ b/phishing_email_detection_gpt2.py @@ -85,7 +85,7 @@ """### A custom GPT2 encoder layer for text embedding""" - +""" class GPT2Layer(tf.keras.layers.Layer): def __init__(self, max_seq_length, **kwargs): @@ -182,7 +182,7 @@ def from_config(cls, config): hy_df = pd.DataFrame(history.history) print(hy_df) - +""" ### Cerebros model: From 5ca0ec7d583c39408fedc41170e1250d16f7496e Mon Sep 17 00:00:00 2001 From: David Thrower Date: Tue, 8 Apr 2025 00:11:41 -0400 Subject: [PATCH 055/100] Update automerge.yml Rapid test of the RoPE model. --- .github/workflows/automerge.yml | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/.github/workflows/automerge.yml b/.github/workflows/automerge.yml index ba7af25..d1415f3 100644 --- a/.github/workflows/automerge.yml +++ b/.github/workflows/automerge.yml @@ -5,7 +5,7 @@ name: Python application on: push: - branches: [ "main", "166-try-rope-embedding" ] + branches: [ "main", "167-fix-computational-issues-with-rope-embedding" ] permissions: contents: read @@ -33,16 +33,16 @@ jobs: flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide # flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics - - name: Test distributed random search Ames by running - run: python3 regression-example-ames-no-preproc.py - - name: Test distributed random search Ames by running - Val set - run: python3 regression-example-ames-no-preproc-val-set.py + # - name: Test distributed random search Ames by running + # run: python3 regression-example-ames-no-preproc.py + # - name: Test distributed random search Ames by running - Val set + # run: python3 regression-example-ames-no-preproc-val-set.py # - name: Test text classifier - random search - ham-spam # run: python3 text-class-ham-or-spam.py # timeout-minutes: 90 - - name: Test image classifier - small subset of CIFAR10 # add back - timeout-minutes: 90 - run: python3 cifar10-example.py + # - name: Test image classifier - small subset of CIFAR10 # add back + # timeout-minutes: 90 + # run: python3 cifar10-example.py - name: Phishing email detection with GPT2 embedding timeout-minutes: 420 run: python3 phishing_email_detection_gpt2.py From 46293949716d7c5ff5f6b70ce31f07b58f4266c7 Mon Sep 17 00:00:00 2001 From: David Thrower Date: Tue, 8 Apr 2025 00:18:58 -0400 Subject: [PATCH 056/100] Update phishing_email_detection_gpt2.py Comment out extraneous print. 
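One note on the PATCH 053 rewrite above: it derives the sin/cos tables inside `call`, so they are recomputed on every forward pass even though they depend only on the layer's static configuration. The next attempt (PATCH 057 below) moves that work into `build` and slices the cached tables per call; a minimal sketch of the caching pattern, with an illustrative class name:

```
import tensorflow as tf

class CachedSinCos(tf.keras.layers.Layer):
    # Precompute the sin/cos tables once in build(), then slice to the
    # batch's sequence length in call() instead of re-deriving them.
    def __init__(self, dim, max_seq_len=1024, temperature=10000.0, **kwargs):
        super().__init__(**kwargs)
        self.dim = dim
        self.max_seq_len = max_seq_len
        self.temperature = temperature

    def build(self, input_shape):
        super().build(input_shape)
        inv_freq = 1.0 / (self.temperature ** (
            tf.range(0, self.dim, 2, dtype=tf.float32) / self.dim))
        positions = tf.range(self.max_seq_len, dtype=tf.float32)
        table = tf.einsum("i,j->ij", positions, inv_freq)
        self.sin_cache = tf.sin(table)
        self.cos_cache = tf.cos(table)

    def call(self, x):
        seq_len = tf.shape(x)[1]
        return self.sin_cache[:seq_len], self.cos_cache[:seq_len]
```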
--- phishing_email_detection_gpt2.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/phishing_email_detection_gpt2.py b/phishing_email_detection_gpt2.py index a104d02..8fd4ee5 100644 --- a/phishing_email_detection_gpt2.py +++ b/phishing_email_detection_gpt2.py @@ -380,7 +380,7 @@ def call(self, x): cerebros_time_per_model = cerebros_time_all_models_min / models_tried print(f"Cerebros trained {models_tried} models FROM A COLD START in ONLY {cerebros_time_all_models_min} min. Cerebros took only {cerebros_time_per_model} minutes on average per model.") -print(f"GPT2 took {gpt_time_on_one_model_min} just to FINE TUNE one PRE - TRAINED model for 3 epochs. Although this is a small scale test, this shows the advantage of scaling in ON timing VS ON**2 timing.") +# print(f"GPT2 took {gpt_time_on_one_model_min} just to FINE TUNE one PRE - TRAINED model for 3 epochs. Although this is a small scale test, this shows the advantage of scaling in ON timing VS ON**2 timing.") print(f'Cerebros best accuracy achieved is {result}') From ee85cae701d21f9642b656ac11f3f7318c8f4dde Mon Sep 17 00:00:00 2001 From: David Thrower Date: Wed, 9 Apr 2025 00:21:51 -0400 Subject: [PATCH 057/100] Update phishing_email_detection_gpt2.py Try Irope layer. --- phishing_email_detection_gpt2.py | 86 +++++++++++++++++++++----------- 1 file changed, 56 insertions(+), 30 deletions(-) diff --git a/phishing_email_detection_gpt2.py b/phishing_email_detection_gpt2.py index 8fd4ee5..f9e29ca 100644 --- a/phishing_email_detection_gpt2.py +++ b/phishing_email_detection_gpt2.py @@ -210,38 +210,64 @@ def from_config(cls, config): -class RotaryPositionEmbedding(tf.keras.layers.Layer): - def __init__(self, max_seq_length, d_model, **kwargs): +import tensorflow as tf +from tensorflow import keras +from tensorflow.keras import layers + +class RotaryEmbedding(keras.layers.Layer): + def __init__(self, dim, max_seq_len=1024, temperature=10000.0, **kwargs): super().__init__(**kwargs) - self.max_seq_length = max_seq_length - self.d_model = d_model - assert d_model % 2 == 0, "d_model must be even" - - # Precompute rotation matrices - inv_freq = 1.0 / (10000 ** (tf.range(0, d_model, 2, dtype=tf.float32) / d_model)) - self.inv_freq = tf.cast(inv_freq, tf.float32) - positions = tf.range(max_seq_length, dtype=tf.float32) - self.sin = tf.sin(tf.einsum('i,j->ij', positions, inv_freq)) - self.cos = tf.cos(tf.einsum('i,j->ij', positions, inv_freq)) - + self.dim = dim + self.max_seq_len = max_seq_len + self.temperature = temperature + + def build(self, input_shape): + super().build(input_shape) + inv_freq = 1.0 / (self.temperature ** (tf.range(0, self.dim, 2, dtype=tf.float32) / self.dim)) + position = tf.range(self.max_seq_len, dtype=tf.float32) + sinusoid = tf.einsum("i,j->ij", position, inv_freq) + self.sin_cache = tf.sin(sinusoid) + self.cos_cache = tf.cos(sinusoid) + + def call(self, x, seq_len=None): + batch_size = tf.shape(x)[0] + seq_len = tf.shape(x)[1] if seq_len is None else seq_len + sin = self.sin_cache[:seq_len] + cos = self.cos_cache[:seq_len] + return tf.cast(sin, x.dtype), tf.cast(cos, x.dtype) + +def split_alternate(x): + shape = tf.shape(x) + x = tf.reshape(x, [shape[0], shape[1], shape[2] // 2, 2]) + x = tf.transpose(x, [0, 1, 3, 2]) + x = tf.reshape(x, [shape[0], shape[1], -1]) + return x + +def rotate_half(x): + x = split_alternate(x) + d = x.shape[-1] + return x[..., d//2:] + +def apply_rotary_pos_emb(x, sin, cos): + x_rotated = x * cos + rotate_half(x) * sin + return x_rotated + +class InterleavedRoPE(layers.Layer): + 
def __init__(self, dim, max_seq_len=1024, **kwargs): + super().__init__(**kwargs) + self.dim = dim + self.max_seq_len = max_seq_len + self.rotary_emb = RotaryEmbedding(dim, max_seq_len) + def call(self, x): batch_size = tf.shape(x)[0] seq_len = tf.shape(x)[1] - # Compute sine and cosine matrices for current sequence length - sinusoid = tf.einsum('i,j->ij', tf.range(seq_len, dtype=tf.float32), self.inv_freq) - current_sin = tf.sin(sinusoid) - current_cos = tf.cos(sinusoid) - - # Split dimensions and apply rotation using einsum - x = tf.reshape(x, [batch_size, seq_len, self.d_model//2, 2]) - rotated = tf.stack([ - x[..., 0] * current_cos - x[..., 1] * current_sin, - x[..., 0] * current_sin + x[..., 1] * current_cos - ], axis=-1) - - # Reshape back and apply dropout - return tf.reshape(rotated, [batch_size, seq_len, self.d_model]) + sin, cos = self.rotary_emb(x, seq_len) + x = apply_rotary_pos_emb(x, sin, cos) + return x + + @@ -249,7 +275,7 @@ def call(self, x): # GPT2 configurables # Optimal for accuracy thus far: -max_seq_length = 1024 +max_seq_length = 1024 * 2 inp = tf.keras.layers.Input(shape=(), dtype=tf.string) gp2_tokenizer = TokenizerLayer(max_seq_length=max_seq_length) @@ -267,9 +293,9 @@ def call(self, x): input_length=max_seq_length, mask_zero=True)(tokens) -position_embedding = RotaryPositionEmbedding( +position_embedding = InterleavedRoPE( + dim=EMBEDDING_DIM, max_seq_length=max_seq_length, - d_model=EMBEDDING_DIM, # initializer="uniform", )(embedded) From 51ee5e995ee9ff6c20dd3c94273b43f7f6822795 Mon Sep 17 00:00:00 2001 From: David Thrower Date: Wed, 9 Apr 2025 00:23:23 -0400 Subject: [PATCH 058/100] Update automerge.yml Add branch to workflow. --- .github/workflows/automerge.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/automerge.yml b/.github/workflows/automerge.yml index d1415f3..441e93e 100644 --- a/.github/workflows/automerge.yml +++ b/.github/workflows/automerge.yml @@ -5,7 +5,7 @@ name: Python application on: push: - branches: [ "main", "167-fix-computational-issues-with-rope-embedding" ] + branches: [ "main", "168-try-integrating-a-irope-layer" ] permissions: contents: read From 7e7cd36e713c6f891a870e940c95fd6f6b3e94e0 Mon Sep 17 00:00:00 2001 From: David Thrower Date: Wed, 9 Apr 2025 00:28:33 -0400 Subject: [PATCH 059/100] Update phishing_email_detection_gpt2.py Corrected name of arg max_seq_len --- phishing_email_detection_gpt2.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/phishing_email_detection_gpt2.py b/phishing_email_detection_gpt2.py index f9e29ca..fbc90ae 100644 --- a/phishing_email_detection_gpt2.py +++ b/phishing_email_detection_gpt2.py @@ -295,7 +295,7 @@ def call(self, x): position_embedding = InterleavedRoPE( dim=EMBEDDING_DIM, - max_seq_length=max_seq_length, + max_seq_len=max_seq_length, # initializer="uniform", )(embedded) From 87270f87128163cabe42955f98acdb99d0dbc40e Mon Sep 17 00:00:00 2001 From: David Thrower Date: Wed, 9 Apr 2025 00:31:16 -0400 Subject: [PATCH 060/100] Update phishing_email_detection_gpt2.py Remove duplicate imports ... 
--- phishing_email_detection_gpt2.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/phishing_email_detection_gpt2.py b/phishing_email_detection_gpt2.py index fbc90ae..49bea71 100644 --- a/phishing_email_detection_gpt2.py +++ b/phishing_email_detection_gpt2.py @@ -210,11 +210,8 @@ def from_config(cls, config): -import tensorflow as tf -from tensorflow import keras -from tensorflow.keras import layers -class RotaryEmbedding(keras.layers.Layer): +class RotaryEmbedding(tf.keras.layers.Layer): def __init__(self, dim, max_seq_len=1024, temperature=10000.0, **kwargs): super().__init__(**kwargs) self.dim = dim @@ -252,7 +249,7 @@ def apply_rotary_pos_emb(x, sin, cos): x_rotated = x * cos + rotate_half(x) * sin return x_rotated -class InterleavedRoPE(layers.Layer): +class InterleavedRoPE(tf.keras.layers.Layer): def __init__(self, dim, max_seq_len=1024, **kwargs): super().__init__(**kwargs) self.dim = dim From ccb19b1fd5bb9fe8942398df8beb48ffe0d8757b Mon Sep 17 00:00:00 2001 From: David Thrower Date: Wed, 9 Apr 2025 00:42:44 -0400 Subject: [PATCH 061/100] Update phishing_email_detection_gpt2.py Fix issue: Max sequence length supported by tokenizer ... --- phishing_email_detection_gpt2.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/phishing_email_detection_gpt2.py b/phishing_email_detection_gpt2.py index 49bea71..f62c8f1 100644 --- a/phishing_email_detection_gpt2.py +++ b/phishing_email_detection_gpt2.py @@ -272,7 +272,7 @@ def call(self, x): # GPT2 configurables # Optimal for accuracy thus far: -max_seq_length = 1024 * 2 +max_seq_length = 1024 inp = tf.keras.layers.Input(shape=(), dtype=tf.string) gp2_tokenizer = TokenizerLayer(max_seq_length=max_seq_length) From 10a48c19e560b3e7d3057932991de9aa0ecfed4d Mon Sep 17 00:00:00 2001 From: David Thrower Date: Wed, 9 Apr 2025 00:53:00 -0400 Subject: [PATCH 062/100] Update phishing_email_detection_gpt2.py Fix dimensionality mismatch ... --- phishing_email_detection_gpt2.py | 744 +++++++++++++++++++++++++++++++ 1 file changed, 744 insertions(+) diff --git a/phishing_email_detection_gpt2.py b/phishing_email_detection_gpt2.py index f62c8f1..03b988f 100644 --- a/phishing_email_detection_gpt2.py +++ b/phishing_email_detection_gpt2.py @@ -211,6 +211,542 @@ def from_config(cls, config): + +# Please explain the error: + +``` +Traceback (most recent call last): + File "/home/runner/work/cerebros-core-algorithm-alpha/cerebros-core-algorithm-alpha/phishing_email_detection_gpt2.py", line 293, in + position_embedding = InterleavedRoPE( + ^^^^^^^^^^^^^^^^ + File "/opt/hostedtoolcache/Python/3.11.11/x64/lib/python3.11/site-packages/keras/src/utils/traceback_utils.py", line 70, in error_handler + raise e.with_traceback(filtered_tb) from None + File "/tmp/__autograph_generated_file1glgbmnr.py", line 13, in tf__call + x = ag__.converted_call(ag__.ld(apply_rotary_pos_emb), (ag__.ld(x), ag__.ld(sin), ag__.ld(cos)), None, fscope) + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + File "/tmp/__autograph_generated_fileqgikeir3.py", line 10, in tf__apply_rotary_pos_emb + x_rotated = ag__.ld(x) * ag__.ld(cos) + ag__.converted_call(ag__.ld(rotate_half), (ag__.ld(x),), None, fscope) * ag__.ld(sin) + ~~~~~~~~~~~^~~~~~~~~~~~~~ +ValueError: Exception encountered when calling layer "interleaved_ro_pe" (type InterleavedRoPE). 
+in user code:
+
+    File "/home/runner/work/cerebros-core-algorithm-alpha/cerebros-core-algorithm-alpha/phishing_email_detection_gpt2.py", line 264, in call *
+        x = apply_rotary_pos_emb(x, sin, cos)
+    File "/home/runner/work/cerebros-core-algorithm-alpha/cerebros-core-algorithm-alpha/phishing_email_detection_gpt2.py", line 249, in apply_rotary_pos_emb *
+        x_rotated = x * cos + rotate_half(x) * sin
+
+    ValueError: Dimensions must be equal, but are 24 and 12 for '{{node interleaved_ro_pe/mul}} = Mul[T=DT_FLOAT](Placeholder, interleaved_ro_pe/rotary_embedding/strided_slice_2)' with input shapes: [?,1024,24], [1024,12].
+
+Call arguments received by layer "interleaved_ro_pe" (type InterleavedRoPE):
+  • x=tf.Tensor(shape=(None, 1024, 24), dtype=float32)
+```
@@ -406,6 +942,214 @@ def call(self, x):
 # print(f"GPT2 took {gpt_time_on_one_model_min}
just to FINE TUNE one PRE - TRAINED model for 3 epochs. Although this is a small scale test, this shows the advantage of scaling in ON timing VS ON**2 timing.") +print(f'Cerebros best accuracy achieved is {result}') +print(f'val set accuracy') + +# """### Testing the best model found""" +``` + + + + + class RotaryEmbedding(tf.keras.layers.Layer): + def __init__(self, dim, max_seq_len=1024, temperature=10000.0, **kwargs): + super().__init__(**kwargs) + self.dim = dim + self.max_seq_len = max_seq_len + self.temperature = temperature + + def build(self, input_shape): + super().build(input_shape) + inv_freq = 1.0 / (self.temperature ** (tf.range(0, self.dim // 2, dtype=tf.float32) / (self.dim // 2))) + position = tf.range(self.max_seq_len, dtype=tf.float32) + sinusoid = tf.einsum("i,j->ij", position, inv_freq) + sin = tf.sin(sinusoid) + cos = tf.cos(sinusoid) + self.sin_cache = tf.concat([sin, sin], axis=-1) + self.cos_cache = tf.concat([cos, cos], axis=-1) + + def call(self, x, seq_len=None): + batch_size = tf.shape(x)[0] + seq_len = tf.shape(x)[1] if seq_len is None else seq_len + sin = self.sin_cache[:seq_len] + cos = self.cos_cache[:seq_len] + return tf.cast(sin, x.dtype), tf.cast(cos, x.dtype) + + + +def split_alternate(x): + shape = tf.shape(x) + x = tf.reshape(x, [shape[0], shape[1], shape[2] // 2, 2]) + x = tf.transpose(x, [0, 1, 3, 2]) + x = tf.reshape(x, [shape[0], shape[1], -1]) + return x + +def rotate_half(x): + x = split_alternate(x) + d = x.shape[-1] + return x[..., d//2:] + +def apply_rotary_pos_emb(x, sin, cos): + x_rotated = x * cos + rotate_half(x) * sin + return x_rotated + +class InterleavedRoPE(tf.keras.layers.Layer): + def __init__(self, dim, max_seq_len=1024, **kwargs): + super().__init__(**kwargs) + self.dim = dim + self.max_seq_len = max_seq_len + self.rotary_emb = RotaryEmbedding(dim, max_seq_len) + + def call(self, x): + batch_size = tf.shape(x)[0] + seq_len = tf.shape(x)[1] + + sin, cos = self.rotary_emb(x, seq_len) + x = apply_rotary_pos_emb(x, sin, cos) + return x + + + + + + +# GPT2 configurables + +# Optimal for accuracy thus far: +max_seq_length = 1024 + +inp = tf.keras.layers.Input(shape=(), dtype=tf.string) +gp2_tokenizer = TokenizerLayer(max_seq_length=max_seq_length) +VOCABULARY_SIZE = gp2_tokenizer.tokenizer.vocabulary_size() +tokens = gp2_tokenizer(inp) + +# On larger hardware, this could probably be increased considerably and +# Probably would improve performance ... +EMBEDDING_N = 12 # Define EMBEDDING_DIM here, to match your embedding layer. +EMBEDDING_DIM = int(EMBEDDING_N * 2) + +embedded = tf.keras.layers.Embedding( + input_dim=VOCABULARY_SIZE, + output_dim=EMBEDDING_DIM, + input_length=max_seq_length, + mask_zero=True)(tokens) + +position_embedding = InterleavedRoPE( + dim=EMBEDDING_DIM, + max_seq_len=max_seq_length, + # initializer="uniform", +)(embedded) + +# As an FYI, we tried an add layer both with and without +# LayerNorm ... It degraded accuracy +# Just an FYI for anyone trying to apply conventional wisdom +# to save you the time ... 
+x = x = tf.keras.layers.Concatenate()([embedded, position_embedding]) +x = tf.keras.layers.Dropout(0.4)(x) # AI suggested 0.4 +flattened = tf.keras.layers.Flatten()(x) + +cerebros_base_model = tf.keras.Model( + inputs=inp, + outputs=flattened # Output enhanced embeddings now +) + + +"""### Cerebros search for the best model""" + +# +# Cerebros configurables +# +activation = "relu" +predecessor_level_connection_affinity_factor_first = 10 +predecessor_level_connection_affinity_factor_main = 40 +max_consecutive_lateral_connections = 20 +p_lateral_connection = 30 +num_lateral_connection_tries_per_unit = 25 +learning_rate = 3 * 10 ** -3 +epochs = 15 # [1, 100] +batch_size = 17 +minimum_levels = 2 +maximum_levels = 2 # [3,7] + +minimum_units_per_level = 4 +maximum_units_per_level = 7 + +minimum_neurons_per_unit = 1 +maximum_neurons_per_unit = 2 + +moities_to_try = 5 +tries_per_moity = 1 + +# +# Logging +# +TIME = pendulum.now(tz='America/New_York').__str__()[:16]\ + .replace('T', '_')\ + .replace(':', '_')\ + .replace('-', '_') +PROJECT_NAME = f'{TIME}_cerebros_auto_ml_phishing_email_test' + +meta_trial_number = 42 # irrelevant unless in distributed training + + +cerebros_automl = SimpleCerebrosRandomSearch( + unit_type=DenseUnit, + input_shapes=INPUT_SHAPES, + output_shapes=OUTPUT_SHAPES, + training_data=training_x, + labels=train_labels, + validation_split=0.35, + direction='maximize', + metric_to_rank_by="val_binary_accuracy", + minimum_levels=minimum_levels, + maximum_levels=maximum_levels, + minimum_units_per_level=minimum_units_per_level, + maximum_units_per_level=maximum_units_per_level, + minimum_neurons_per_unit=minimum_neurons_per_unit, + maximum_neurons_per_unit=maximum_neurons_per_unit, + activation=activation, + final_activation='sigmoid', + number_of_architecture_moities_to_try=moities_to_try, + number_of_tries_per_architecture_moity=tries_per_moity, + minimum_skip_connection_depth=1, + maximum_skip_connection_depth=7, + predecessor_level_connection_affinity_factor_first=predecessor_level_connection_affinity_factor_first, + predecessor_level_connection_affinity_factor_first_rounding_rule='ceil', + predecessor_level_connection_affinity_factor_main=predecessor_level_connection_affinity_factor_main, + predecessor_level_connection_affinity_factor_main_rounding_rule='ceil', + predecessor_level_connection_affinity_factor_decay_main=zero_7_exp_decay, + seed=8675309, + max_consecutive_lateral_connections=max_consecutive_lateral_connections, + gate_after_n_lateral_connections=3, + gate_activation_function=simple_sigmoid, + p_lateral_connection=p_lateral_connection, + p_lateral_connection_decay=zero_95_exp_decay, + num_lateral_connection_tries_per_unit=num_lateral_connection_tries_per_unit, + learning_rate=learning_rate, + loss=tf.keras.losses.BinaryCrossentropy(), + # loss=tf.keras.losses.CategoricalHinge(), + metrics=[tf.keras.metrics.BinaryAccuracy(), + tf.keras.metrics.Precision(), + tf.keras.metrics.Recall()], + epochs=epochs, + project_name=f"{PROJECT_NAME}_meta_{meta_trial_number}", + model_graphs='model_graphs', + batch_size=batch_size, + meta_trial_number=meta_trial_number, + base_models=[cerebros_base_model], + train_data_dtype=tf.string) + +cerebros_t0 = time.time() +result = cerebros_automl.run_random_search() +cerebros_t1 = time.time() +cerebros_time_all_models_min = (cerebros_t1 - cerebros_t0) / 60 +models_tried = moities_to_try * tries_per_moity +cerebros_time_per_model = cerebros_time_all_models_min / models_tried + +print(f"Cerebros trained {models_tried} models FROM A COLD 
START in ONLY {cerebros_time_all_models_min} min. Cerebros took only {cerebros_time_per_model} minutes on average per model.") +# print(f"GPT2 took {gpt_time_on_one_model_min} just to FINE TUNE one PRE - TRAINED model for 3 epochs. Although this is a small scale test, this shows the advantage of scaling in ON timing VS ON**2 timing.") + + print(f'Cerebros best accuracy achieved is {result}') print(f'val set accuracy') From b231398af85c318c26f88ea3082a966eb68183e4 Mon Sep 17 00:00:00 2001 From: David Thrower Date: Wed, 9 Apr 2025 00:58:34 -0400 Subject: [PATCH 063/100] Update phishing_email_detection_gpt2.py Correction of copy and paste error --- phishing_email_detection_gpt2.py | 742 ------------------------------- 1 file changed, 742 deletions(-) diff --git a/phishing_email_detection_gpt2.py b/phishing_email_detection_gpt2.py index 03b988f..2ea5b1b 100644 --- a/phishing_email_detection_gpt2.py +++ b/phishing_email_detection_gpt2.py @@ -211,542 +211,6 @@ def from_config(cls, config): - -# Please explain the error: - -``` -Traceback (most recent call last): - File "/home/runner/work/cerebros-core-algorithm-alpha/cerebros-core-algorithm-alpha/phishing_email_detection_gpt2.py", line 293, in - position_embedding = InterleavedRoPE( - ^^^^^^^^^^^^^^^^ - File "/opt/hostedtoolcache/Python/3.11.11/x64/lib/python3.11/site-packages/keras/src/utils/traceback_utils.py", line 70, in error_handler - raise e.with_traceback(filtered_tb) from None - File "/tmp/__autograph_generated_file1glgbmnr.py", line 13, in tf__call - x = ag__.converted_call(ag__.ld(apply_rotary_pos_emb), (ag__.ld(x), ag__.ld(sin), ag__.ld(cos)), None, fscope) - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - File "/tmp/__autograph_generated_fileqgikeir3.py", line 10, in tf__apply_rotary_pos_emb - x_rotated = ag__.ld(x) * ag__.ld(cos) + ag__.converted_call(ag__.ld(rotate_half), (ag__.ld(x),), None, fscope) * ag__.ld(sin) - ~~~~~~~~~~~^~~~~~~~~~~~~~ -ValueError: Exception encountered when calling layer "interleaved_ro_pe" (type InterleavedRoPE). -in user code: - File "/home/runner/work/cerebros-core-algorithm-alpha/cerebros-core-algorithm-alpha/phishing_email_detection_gpt2.py", line 264, in call * - x = apply_rotary_pos_emb(x, sin, cos) - File "/home/runner/work/cerebros-core-algorithm-alpha/cerebros-core-algorithm-alpha/phishing_email_detection_gpt2.py", line 249, in apply_rotary_pos_emb * - x_rotated = x * cos + rotate_half(x) * sin - ValueError: Dimensions must be equal, but are 24 and 12 for '{{node interleaved_ro_pe/mul}} = Mul[T=DT_FLOAT](Placeholder, interleaved_ro_pe/rotary_embedding/strided_slice_2)' with input shapes: [?,1024,24], [1024,12]. 
-Call arguments received by layer "interleaved_ro_pe" (type InterleavedRoPE): - • x=tf.Tensor(shape=(None, 1024, 24), dtype=float32) -``` - - -# From the code - -``` -import tensorflow as tf -import tensorflow_text -from keras_nlp.models import GPT2Tokenizer, GPT2Preprocessor, GPT2Backbone -from keras_nlp.layers import PositionEmbedding -from sklearn.model_selection import train_test_split -from sklearn.utils import shuffle -from tensorflow.keras.utils import to_categorical -from tensorflow.keras.optimizers import Adam -from tensorflow.keras.models import Model -from tensorflow.keras.layers import Input, Flatten -import pandas as pd -import numpy as np -from cerebros.simplecerebrosrandomsearch.simple_cerebros_random_search\ - import SimpleCerebrosRandomSearch -import pendulum -from cerebros.units.units import DenseUnit -from cerebros.denseautomlstructuralcomponent.dense_automl_structural_component\ - import zero_7_exp_decay, zero_95_exp_decay, simple_sigmoid -from ast import literal_eval -import time - - -# -# Load the email data -# -df = pd.read_csv("Phishing_Email.csv") -# -# Get the rows where 'Email Text' is a string, remove everything else -# -df = df[df['Email Text'].apply(lambda x: isinstance(x, str))] -# -# Reset the index -# -df.reset_index(drop=True, inplace=True) - -# -# Binary label for email type: positive type is "phishing" -# -label_mapping = {"Safe Email": 0, "Phishing Email": 1} -df["Binary Label"] = df["Email Type"].map(label_mapping) -# -# Data and labels ready -# -X = df["Email Text"].to_numpy() -y = df["Binary Label"].to_numpy() -# -# Shuffle the data -# -X, y = shuffle(X, y) - -# Train / test split : we give 85% of the data for *testing* -X_train, X_test, y_train, y_test = \ -train_test_split(X, y, test_size=0.85, shuffle=False) - -# -# Tensors for training data and labels -# - -# Training data for baseline model -baseline_train_x = tf.constant(X_train) -baseline_train_y = tf.constant(y_train, dtype=tf.int8) - -# Packaged for Cerebros (multimodal, takes inputs as a list) -training_x = [baseline_train_x] -train_labels = [baseline_train_y] - -# -# Input and output shapes -# -INPUT_SHAPES = [()] -OUTPUT_SHAPES = [1] - -"""### A custom GPT2 encoder layer for text embedding""" - -""" -class GPT2Layer(tf.keras.layers.Layer): - - def __init__(self, max_seq_length, **kwargs): - # - super(GPT2Layer, self).__init__(**kwargs) - # - # Load the GPT2 tokenizer, preprocessor and model - self.tokenizer = GPT2Tokenizer.from_preset("gpt2_base_en") - self.preprocessor = GPT2Preprocessor(self.tokenizer, - sequence_length=max_seq_length) - self.encoder = GPT2Backbone.from_preset("gpt2_base_en") - # - # Set whether the GPT2 model's layers are trainable - #self.encoder.trainable = False - for layer in self.encoder.layers: - layer.trainable = True - # - # self.encoder.layers[-2].trainable = True - # - # Set the maximum sequence length for tokenization - self.max_seq_length = max_seq_length - - def call(self, inputs): - # - # Output the GPT2 embedding - prep = self.preprocessor([inputs]) - embedding = self.encoder(prep) - avg_pool = tf.reduce_mean(embedding, axis=1) - # - return avg_pool - - def get_config(self): - # - config = super(GPT2Layer, self).get_config() - config.update({'max_seq_length': self.max_seq_length}) - # - return config - - @classmethod - def from_config(cls, config): - # - return cls(max_seq_length=config['max_seq_length']) - -# GPT2 configurables -max_seq_length = 96 - -# GPT Baseline Model -input_layer = Input(shape=(), dtype=tf.string) -gpt2_layer = 
GPT2Layer(max_seq_length)(input_layer) -#output = Flatten()(gpt2_layer) -binary_output = tf.keras.layers.Dense(1, activation='sigmoid')(gpt2_layer) - -gpt_baseline_model = Model(inputs=input_layer, outputs=binary_output) - - -gpt_baseline_model.compile( - optimizer=Adam(learning_rate=1e-4), # Small LR since we're fine-tuning GPT - loss='binary_crossentropy', - # metrics=['accuracy', tf.keras.metrics.AUC(name='auc')] - metrics=[tf.keras.metrics.BinaryAccuracy(), - tf.keras.metrics.Precision(), - tf.keras.metrics.Recall()] -) - -gpt_t0 = time.time() - -print(gpt_baseline_model.summary()) - -history = gpt_baseline_model.fit( - x=X_train, # Input data - y=y_train, # Labels - epochs=3, # Number of training iterations - batch_size=16, # Batch size small due to GPU memory constraints - validation_split=0.2, # Hold out 20% of training data for validation - shuffle=True, # Shuffle data at each epoch - callbacks=[ - tf.keras.callbacks.EarlyStopping( - monitor='val_loss', - patience=3, - restore_best_weights=True, - min_delta=0.001 - ), - tf.keras.callbacks.ReduceLROnPlateau( - monitor='val_loss', - factor=0.2, - patience=2, - min_lr=1e-6 - ) - ] -) - -gpt_t1 = time.time() -gpt_time_on_one_model_min = (gpt_t1 - gpt_t0) / 60 - -hy_df = pd.DataFrame(history.history) -print(hy_df) -""" - -### Cerebros model: - -# TokenizerLayer class to handle tokenization and return only token_ids -class TokenizerLayer(tf.keras.layers.Layer): - - def __init__(self, max_seq_length, **kwargs): - super(TokenizerLayer, self).__init__(**kwargs) # Update this line - self.tokenizer = GPT2Tokenizer.from_preset("gpt2_extra_large_en") - self.preprocessor = GPT2Preprocessor(self.tokenizer, sequence_length=max_seq_length) - self.max_seq_length = max_seq_length - - def call(self, inputs): - prep = self.preprocessor([inputs]) - return prep['token_ids'] - - def get_config(self): - config = super(TokenizerLayer, self).get_config() - config.update({'max_seq_length': self.max_seq_length}) - return config - - @classmethod - def from_config(cls, config): - return cls(max_seq_length=config['max_seq_length']) - - - - -class RotaryEmbedding(tf.keras.layers.Layer): - def __init__(self, dim, max_seq_len=1024, temperature=10000.0, **kwargs): - super().__init__(**kwargs) - self.dim = dim - self.max_seq_len = max_seq_len - self.temperature = temperature - - def build(self, input_shape): - super().build(input_shape) - inv_freq = 1.0 / (self.temperature ** (tf.range(0, self.dim, 2, dtype=tf.float32) / self.dim)) - position = tf.range(self.max_seq_len, dtype=tf.float32) - sinusoid = tf.einsum("i,j->ij", position, inv_freq) - self.sin_cache = tf.sin(sinusoid) - self.cos_cache = tf.cos(sinusoid) - - def call(self, x, seq_len=None): - batch_size = tf.shape(x)[0] - seq_len = tf.shape(x)[1] if seq_len is None else seq_len - sin = self.sin_cache[:seq_len] - cos = self.cos_cache[:seq_len] - return tf.cast(sin, x.dtype), tf.cast(cos, x.dtype) - -def split_alternate(x): - shape = tf.shape(x) - x = tf.reshape(x, [shape[0], shape[1], shape[2] // 2, 2]) - x = tf.transpose(x, [0, 1, 3, 2]) - x = tf.reshape(x, [shape[0], shape[1], -1]) - return x - -def rotate_half(x): - x = split_alternate(x) - d = x.shape[-1] - return x[..., d//2:] - -def apply_rotary_pos_emb(x, sin, cos): - x_rotated = x * cos + rotate_half(x) * sin - return x_rotated - -class InterleavedRoPE(tf.keras.layers.Layer): - def __init__(self, dim, max_seq_len=1024, **kwargs): - super().__init__(**kwargs) - self.dim = dim - self.max_seq_len = max_seq_len - self.rotary_emb = RotaryEmbedding(dim, 
max_seq_len) - - def call(self, x): - batch_size = tf.shape(x)[0] - seq_len = tf.shape(x)[1] - - sin, cos = self.rotary_emb(x, seq_len) - x = apply_rotary_pos_emb(x, sin, cos) - return x - - - - - - -# GPT2 configurables - -# Optimal for accuracy thus far: -max_seq_length = 1024 - -inp = tf.keras.layers.Input(shape=(), dtype=tf.string) -gp2_tokenizer = TokenizerLayer(max_seq_length=max_seq_length) -VOCABULARY_SIZE = gp2_tokenizer.tokenizer.vocabulary_size() -tokens = gp2_tokenizer(inp) - -# On larger hardware, this could probably be increased considerably and -# Probably would improve performance ... -EMBEDDING_N = 12 # Define EMBEDDING_DIM here, to match your embedding layer. -EMBEDDING_DIM = int(EMBEDDING_N * 2) - -embedded = tf.keras.layers.Embedding( - input_dim=VOCABULARY_SIZE, - output_dim=EMBEDDING_DIM, - input_length=max_seq_length, - mask_zero=True)(tokens) - -position_embedding = InterleavedRoPE( - dim=EMBEDDING_DIM, - max_seq_len=max_seq_length, - # initializer="uniform", -)(embedded) - -# As an FYI, we tried an add layer both with and without -# LayerNorm ... It degraded accuracy -# Just an FYI for anyone trying to apply conventional wisdom -# to save you the time ... -x = x = tf.keras.layers.Concatenate()([embedded, position_embedding]) -x = tf.keras.layers.Dropout(0.4)(x) # AI suggested 0.4 -flattened = tf.keras.layers.Flatten()(x) - -cerebros_base_model = tf.keras.Model( - inputs=inp, - outputs=flattened # Output enhanced embeddings now -) - - -"""### Cerebros search for the best model""" - -# -# Cerebros configurables -# -activation = "relu" -predecessor_level_connection_affinity_factor_first = 10 -predecessor_level_connection_affinity_factor_main = 40 -max_consecutive_lateral_connections = 20 -p_lateral_connection = 30 -num_lateral_connection_tries_per_unit = 25 -learning_rate = 3 * 10 ** -3 -epochs = 15 # [1, 100] -batch_size = 17 -minimum_levels = 2 -maximum_levels = 2 # [3,7] - -minimum_units_per_level = 4 -maximum_units_per_level = 7 - -minimum_neurons_per_unit = 1 -maximum_neurons_per_unit = 2 - -moities_to_try = 5 -tries_per_moity = 1 - -# -# Logging -# -TIME = pendulum.now(tz='America/New_York').__str__()[:16]\ - .replace('T', '_')\ - .replace(':', '_')\ - .replace('-', '_') -PROJECT_NAME = f'{TIME}_cerebros_auto_ml_phishing_email_test' - -meta_trial_number = 42 # irrelevant unless in distributed training - - -cerebros_automl = SimpleCerebrosRandomSearch( - unit_type=DenseUnit, - input_shapes=INPUT_SHAPES, - output_shapes=OUTPUT_SHAPES, - training_data=training_x, - labels=train_labels, - validation_split=0.35, - direction='maximize', - metric_to_rank_by="val_binary_accuracy", - minimum_levels=minimum_levels, - maximum_levels=maximum_levels, - minimum_units_per_level=minimum_units_per_level, - maximum_units_per_level=maximum_units_per_level, - minimum_neurons_per_unit=minimum_neurons_per_unit, - maximum_neurons_per_unit=maximum_neurons_per_unit, - activation=activation, - final_activation='sigmoid', - number_of_architecture_moities_to_try=moities_to_try, - number_of_tries_per_architecture_moity=tries_per_moity, - minimum_skip_connection_depth=1, - maximum_skip_connection_depth=7, - predecessor_level_connection_affinity_factor_first=predecessor_level_connection_affinity_factor_first, - predecessor_level_connection_affinity_factor_first_rounding_rule='ceil', - predecessor_level_connection_affinity_factor_main=predecessor_level_connection_affinity_factor_main, - predecessor_level_connection_affinity_factor_main_rounding_rule='ceil', - 
predecessor_level_connection_affinity_factor_decay_main=zero_7_exp_decay, - seed=8675309, - max_consecutive_lateral_connections=max_consecutive_lateral_connections, - gate_after_n_lateral_connections=3, - gate_activation_function=simple_sigmoid, - p_lateral_connection=p_lateral_connection, - p_lateral_connection_decay=zero_95_exp_decay, - num_lateral_connection_tries_per_unit=num_lateral_connection_tries_per_unit, - learning_rate=learning_rate, - loss=tf.keras.losses.BinaryCrossentropy(), - # loss=tf.keras.losses.CategoricalHinge(), - metrics=[tf.keras.metrics.BinaryAccuracy(), - tf.keras.metrics.Precision(), - tf.keras.metrics.Recall()], - epochs=epochs, - project_name=f"{PROJECT_NAME}_meta_{meta_trial_number}", - model_graphs='model_graphs', - batch_size=batch_size, - meta_trial_number=meta_trial_number, - base_models=[cerebros_base_model], - train_data_dtype=tf.string) - -cerebros_t0 = time.time() -result = cerebros_automl.run_random_search() -cerebros_t1 = time.time() -cerebros_time_all_models_min = (cerebros_t1 - cerebros_t0) / 60 -models_tried = moities_to_try * tries_per_moity -cerebros_time_per_model = cerebros_time_all_models_min / models_tried - -print(f"Cerebros trained {models_tried} models FROM A COLD START in ONLY {cerebros_time_all_models_min} min. Cerebros took only {cerebros_time_per_model} minutes on average per model.") -# print(f"GPT2 took {gpt_time_on_one_model_min} just to FINE TUNE one PRE - TRAINED model for 3 epochs. Although this is a small scale test, this shows the advantage of scaling in ON timing VS ON**2 timing.") - - -print(f'Cerebros best accuracy achieved is {result}') -print(f'val set accuracy') - -# """### Testing the best model found""" -``` - - -From the code -``` -import tensorflow as tf -import tensorflow_text -from keras_nlp.models import GPT2Tokenizer, GPT2Preprocessor, GPT2Backbone -from keras_nlp.layers import PositionEmbedding -from sklearn.model_selection import train_test_split -from sklearn.utils import shuffle -from tensorflow.keras.utils import to_categorical -from tensorflow.keras.optimizers import Adam -from tensorflow.keras.models import Model -from tensorflow.keras.layers import Input, Flatten -import pandas as pd -import numpy as np -from cerebros.simplecerebrosrandomsearch.simple_cerebros_random_search\ - import SimpleCerebrosRandomSearch -import pendulum -from cerebros.units.units import DenseUnit -from cerebros.denseautomlstructuralcomponent.dense_automl_structural_component\ - import zero_7_exp_decay, zero_95_exp_decay, simple_sigmoid -from ast import literal_eval -import time - - -# -# Load the email data -# -df = pd.read_csv("Phishing_Email.csv") -# -# Get the rows where 'Email Text' is a string, remove everything else -# -df = df[df['Email Text'].apply(lambda x: isinstance(x, str))] -# -# Reset the index -# -df.reset_index(drop=True, inplace=True) - -# -# Binary label for email type: positive type is "phishing" -# -label_mapping = {"Safe Email": 0, "Phishing Email": 1} -df["Binary Label"] = df["Email Type"].map(label_mapping) -# -# Data and labels ready -# -X = df["Email Text"].to_numpy() -y = df["Binary Label"].to_numpy() -# -# Shuffle the data -# -X, y = shuffle(X, y) - -# Train / test split : we give 85% of the data for *testing* -X_train, X_test, y_train, y_test = \ -train_test_split(X, y, test_size=0.85, shuffle=False) - -# -# Tensors for training data and labels -# - -# Training data for baseline model -baseline_train_x = tf.constant(X_train) -baseline_train_y = tf.constant(y_train, dtype=tf.int8) - -# Packaged for 
Cerebros (multimodal, takes inputs as a list) -training_x = [baseline_train_x] -train_labels = [baseline_train_y] - -# -# Input and output shapes -# -INPUT_SHAPES = [()] -OUTPUT_SHAPES = [1] - - - -### Cerebros model: - -# TokenizerLayer class to handle tokenization and return only token_ids -class TokenizerLayer(tf.keras.layers.Layer): - - def __init__(self, max_seq_length, **kwargs): - super(TokenizerLayer, self).__init__(**kwargs) # Update this line - self.tokenizer = GPT2Tokenizer.from_preset("gpt2_extra_large_en") - self.preprocessor = GPT2Preprocessor(self.tokenizer, sequence_length=max_seq_length) - self.max_seq_length = max_seq_length - - def call(self, inputs): - prep = self.preprocessor([inputs]) - return prep['token_ids'] - - def get_config(self): - config = super(TokenizerLayer, self).get_config() - config.update({'max_seq_length': self.max_seq_length}) - return config - - @classmethod - def from_config(cls, config): - return cls(max_seq_length=config['max_seq_length']) - - - - class RotaryEmbedding(tf.keras.layers.Layer): def __init__(self, dim, max_seq_len=1024, temperature=10000.0, **kwargs): super().__init__(**kwargs) @@ -754,210 +218,6 @@ def __init__(self, dim, max_seq_len=1024, temperature=10000.0, **kwargs): self.max_seq_len = max_seq_len self.temperature = temperature - def build(self, input_shape): - super().build(input_shape) - inv_freq = 1.0 / (self.temperature ** (tf.range(0, self.dim, 2, dtype=tf.float32) / self.dim)) - position = tf.range(self.max_seq_len, dtype=tf.float32) - sinusoid = tf.einsum("i,j->ij", position, inv_freq) - self.sin_cache = tf.sin(sinusoid) - self.cos_cache = tf.cos(sinusoid) - - def call(self, x, seq_len=None): - batch_size = tf.shape(x)[0] - seq_len = tf.shape(x)[1] if seq_len is None else seq_len - sin = self.sin_cache[:seq_len] - cos = self.cos_cache[:seq_len] - return tf.cast(sin, x.dtype), tf.cast(cos, x.dtype) - -def split_alternate(x): - shape = tf.shape(x) - x = tf.reshape(x, [shape[0], shape[1], shape[2] // 2, 2]) - x = tf.transpose(x, [0, 1, 3, 2]) - x = tf.reshape(x, [shape[0], shape[1], -1]) - return x - -def rotate_half(x): - x = split_alternate(x) - d = x.shape[-1] - return x[..., d//2:] - -def apply_rotary_pos_emb(x, sin, cos): - x_rotated = x * cos + rotate_half(x) * sin - return x_rotated - -class InterleavedRoPE(tf.keras.layers.Layer): - def __init__(self, dim, max_seq_len=1024, **kwargs): - super().__init__(**kwargs) - self.dim = dim - self.max_seq_len = max_seq_len - self.rotary_emb = RotaryEmbedding(dim, max_seq_len) - - def call(self, x): - batch_size = tf.shape(x)[0] - seq_len = tf.shape(x)[1] - - sin, cos = self.rotary_emb(x, seq_len) - x = apply_rotary_pos_emb(x, sin, cos) - return x - - - - - - -# GPT2 configurables - -# Optimal for accuracy thus far: -max_seq_length = 1024 - -inp = tf.keras.layers.Input(shape=(), dtype=tf.string) -gp2_tokenizer = TokenizerLayer(max_seq_length=max_seq_length) -VOCABULARY_SIZE = gp2_tokenizer.tokenizer.vocabulary_size() -tokens = gp2_tokenizer(inp) - -# On larger hardware, this could probably be increased considerably and -# Probably would improve performance ... -EMBEDDING_N = 12 # Define EMBEDDING_DIM here, to match your embedding layer. 
-EMBEDDING_DIM = int(EMBEDDING_N * 2) - -embedded = tf.keras.layers.Embedding( - input_dim=VOCABULARY_SIZE, - output_dim=EMBEDDING_DIM, - input_length=max_seq_length, - mask_zero=True)(tokens) - -position_embedding = InterleavedRoPE( - dim=EMBEDDING_DIM, - max_seq_len=max_seq_length, - # initializer="uniform", -)(embedded) - -# As an FYI, we tried an add layer both with and without -# LayerNorm ... It degraded accuracy -# Just an FYI for anyone trying to apply conventional wisdom -# to save you the time ... -x = x = tf.keras.layers.Concatenate()([embedded, position_embedding]) -x = tf.keras.layers.Dropout(0.4)(x) # AI suggested 0.4 -flattened = tf.keras.layers.Flatten()(x) - -cerebros_base_model = tf.keras.Model( - inputs=inp, - outputs=flattened # Output enhanced embeddings now -) - - -"""### Cerebros search for the best model""" - -# -# Cerebros configurables -# -activation = "relu" -predecessor_level_connection_affinity_factor_first = 10 -predecessor_level_connection_affinity_factor_main = 40 -max_consecutive_lateral_connections = 20 -p_lateral_connection = 30 -num_lateral_connection_tries_per_unit = 25 -learning_rate = 3 * 10 ** -3 -epochs = 15 # [1, 100] -batch_size = 17 -minimum_levels = 2 -maximum_levels = 2 # [3,7] - -minimum_units_per_level = 4 -maximum_units_per_level = 7 - -minimum_neurons_per_unit = 1 -maximum_neurons_per_unit = 2 - -moities_to_try = 5 -tries_per_moity = 1 - -# -# Logging -# -TIME = pendulum.now(tz='America/New_York').__str__()[:16]\ - .replace('T', '_')\ - .replace(':', '_')\ - .replace('-', '_') -PROJECT_NAME = f'{TIME}_cerebros_auto_ml_phishing_email_test' - -meta_trial_number = 42 # irrelevant unless in distributed training - - -cerebros_automl = SimpleCerebrosRandomSearch( - unit_type=DenseUnit, - input_shapes=INPUT_SHAPES, - output_shapes=OUTPUT_SHAPES, - training_data=training_x, - labels=train_labels, - validation_split=0.35, - direction='maximize', - metric_to_rank_by="val_binary_accuracy", - minimum_levels=minimum_levels, - maximum_levels=maximum_levels, - minimum_units_per_level=minimum_units_per_level, - maximum_units_per_level=maximum_units_per_level, - minimum_neurons_per_unit=minimum_neurons_per_unit, - maximum_neurons_per_unit=maximum_neurons_per_unit, - activation=activation, - final_activation='sigmoid', - number_of_architecture_moities_to_try=moities_to_try, - number_of_tries_per_architecture_moity=tries_per_moity, - minimum_skip_connection_depth=1, - maximum_skip_connection_depth=7, - predecessor_level_connection_affinity_factor_first=predecessor_level_connection_affinity_factor_first, - predecessor_level_connection_affinity_factor_first_rounding_rule='ceil', - predecessor_level_connection_affinity_factor_main=predecessor_level_connection_affinity_factor_main, - predecessor_level_connection_affinity_factor_main_rounding_rule='ceil', - predecessor_level_connection_affinity_factor_decay_main=zero_7_exp_decay, - seed=8675309, - max_consecutive_lateral_connections=max_consecutive_lateral_connections, - gate_after_n_lateral_connections=3, - gate_activation_function=simple_sigmoid, - p_lateral_connection=p_lateral_connection, - p_lateral_connection_decay=zero_95_exp_decay, - num_lateral_connection_tries_per_unit=num_lateral_connection_tries_per_unit, - learning_rate=learning_rate, - loss=tf.keras.losses.BinaryCrossentropy(), - # loss=tf.keras.losses.CategoricalHinge(), - metrics=[tf.keras.metrics.BinaryAccuracy(), - tf.keras.metrics.Precision(), - tf.keras.metrics.Recall()], - epochs=epochs, - 
project_name=f"{PROJECT_NAME}_meta_{meta_trial_number}", - model_graphs='model_graphs', - batch_size=batch_size, - meta_trial_number=meta_trial_number, - base_models=[cerebros_base_model], - train_data_dtype=tf.string) - -cerebros_t0 = time.time() -result = cerebros_automl.run_random_search() -cerebros_t1 = time.time() -cerebros_time_all_models_min = (cerebros_t1 - cerebros_t0) / 60 -models_tried = moities_to_try * tries_per_moity -cerebros_time_per_model = cerebros_time_all_models_min / models_tried - -print(f"Cerebros trained {models_tried} models FROM A COLD START in ONLY {cerebros_time_all_models_min} min. Cerebros took only {cerebros_time_per_model} minutes on average per model.") -# print(f"GPT2 took {gpt_time_on_one_model_min} just to FINE TUNE one PRE - TRAINED model for 3 epochs. Although this is a small scale test, this shows the advantage of scaling in ON timing VS ON**2 timing.") - - -print(f'Cerebros best accuracy achieved is {result}') -print(f'val set accuracy') - -# """### Testing the best model found""" -``` - - - - - class RotaryEmbedding(tf.keras.layers.Layer): - def __init__(self, dim, max_seq_len=1024, temperature=10000.0, **kwargs): - super().__init__(**kwargs) - self.dim = dim - self.max_seq_len = max_seq_len - self.temperature = temperature - def build(self, input_shape): super().build(input_shape) inv_freq = 1.0 / (self.temperature ** (tf.range(0, self.dim // 2, dtype=tf.float32) / (self.dim // 2))) @@ -975,8 +235,6 @@ def call(self, x, seq_len=None): cos = self.cos_cache[:seq_len] return tf.cast(sin, x.dtype), tf.cast(cos, x.dtype) - - def split_alternate(x): shape = tf.shape(x) x = tf.reshape(x, [shape[0], shape[1], shape[2] // 2, 2]) From 436825950b10ae7d51c74145df2c4210268a3249 Mon Sep 17 00:00:00 2001 From: David Thrower Date: Wed, 9 Apr 2025 01:03:19 -0400 Subject: [PATCH 064/100] Update phishing_email_detection_gpt2.py More detail corrections. --- phishing_email_detection_gpt2.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/phishing_email_detection_gpt2.py b/phishing_email_detection_gpt2.py index 2ea5b1b..29d1175 100644 --- a/phishing_email_detection_gpt2.py +++ b/phishing_email_detection_gpt2.py @@ -244,7 +244,7 @@ def split_alternate(x): def rotate_half(x): x = split_alternate(x) - d = x.shape[-1] + d = tf.shape(x)[-1] return x[..., d//2:] def apply_rotary_pos_emb(x, sin, cos): From 5398ce7592538f8080964b9d80dba768e603e1e6 Mon Sep 17 00:00:00 2001 From: David Thrower Date: Wed, 9 Apr 2025 01:09:41 -0400 Subject: [PATCH 065/100] Update phishing_email_detection_gpt2.py More dimensionality debugging... 
--- phishing_email_detection_gpt2.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/phishing_email_detection_gpt2.py b/phishing_email_detection_gpt2.py index 29d1175..182363e 100644 --- a/phishing_email_detection_gpt2.py +++ b/phishing_email_detection_gpt2.py @@ -211,7 +211,7 @@ def from_config(cls, config): -class RotaryEmbedding(tf.keras.layers.Layer): +def RotaryEmbedding(tf.keras.layers.Layer): def __init__(self, dim, max_seq_len=1024, temperature=10000.0, **kwargs): super().__init__(**kwargs) self.dim = dim @@ -225,15 +225,18 @@ def build(self, input_shape): sinusoid = tf.einsum("i,j->ij", position, inv_freq) sin = tf.sin(sinusoid) cos = tf.cos(sinusoid) - self.sin_cache = tf.concat([sin, sin], axis=-1) - self.cos_cache = tf.concat([cos, cos], axis=-1) + self.sin_cache = sin + self.cos_cache = cos def call(self, x, seq_len=None): batch_size = tf.shape(x)[0] seq_len = tf.shape(x)[1] if seq_len is None else seq_len sin = self.sin_cache[:seq_len] cos = self.cos_cache[:seq_len] - return tf.cast(sin, x.dtype), tf.cast(cos, x.dtype) + sin = tf.cast(tf.repeat(sin[..., tf.newaxis], self.dim // 2, axis=-1), x.dtype) + cos = tf.cast(tf.repeat(cos[..., tf.newaxis], self.dim // 2, axis=-1), x.dtype) + return sin, cos + def split_alternate(x): shape = tf.shape(x) From 2aeba283fb9fe5507dfbf946e3ef42eac6d5a92d Mon Sep 17 00:00:00 2001 From: David Thrower Date: Wed, 9 Apr 2025 01:12:36 -0400 Subject: [PATCH 066/100] Update phishing_email_detection_gpt2.py Syntax error / hallucination correction ... --- phishing_email_detection_gpt2.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/phishing_email_detection_gpt2.py b/phishing_email_detection_gpt2.py index 182363e..9a721d9 100644 --- a/phishing_email_detection_gpt2.py +++ b/phishing_email_detection_gpt2.py @@ -211,7 +211,7 @@ def from_config(cls, config): -def RotaryEmbedding(tf.keras.layers.Layer): +class RotaryEmbedding(tf.keras.layers.Layer): def __init__(self, dim, max_seq_len=1024, temperature=10000.0, **kwargs): super().__init__(**kwargs) self.dim = dim From 4315c515ba1a707149e1e8ce9db88205af5305e6 Mon Sep 17 00:00:00 2001 From: David Thrower Date: Wed, 9 Apr 2025 01:16:24 -0400 Subject: [PATCH 067/100] Update phishing_email_detection_gpt2.py ... --- phishing_email_detection_gpt2.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/phishing_email_detection_gpt2.py b/phishing_email_detection_gpt2.py index 9a721d9..9b5653b 100644 --- a/phishing_email_detection_gpt2.py +++ b/phishing_email_detection_gpt2.py @@ -251,6 +251,8 @@ def rotate_half(x): return x[..., d//2:] def apply_rotary_pos_emb(x, sin, cos): + cos = tf.reshape(cos, [tf.shape(cos)[0], tf.shape(cos)[1], -1]) + sin = tf.reshape(sin, [tf.shape(sin)[0], tf.shape(sin)[1], -1]) x_rotated = x * cos + rotate_half(x) * sin return x_rotated From d2d0f72ab33923215d1a561c7d941680be6c3cc0 Mon Sep 17 00:00:00 2001 From: David Thrower Date: Wed, 9 Apr 2025 01:20:48 -0400 Subject: [PATCH 068/100] Update phishing_email_detection_gpt2.py ... 
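
This attempt rebuilds the cached tables by repeating each half-width frequency twice (interleaving) and then tiling across the batch, rather than concatenating two half-width blocks. A sketch of the intended shape flow, under assumed toy sizes:

```python
import tensorflow as tf

# Toy sizes for illustration only.
batch_size, seq_len, dim = 2, 8, 24
half = tf.ones([seq_len, dim // 2])                             # cached [seq_len, dim/2]
sin = tf.repeat(half[..., tf.newaxis], 2, axis=-1)              # [seq_len, dim/2, 2]
sin = tf.reshape(sin, [seq_len, dim])                           # interleaved [seq_len, dim]
sin = tf.tile(tf.expand_dims(sin, axis=0), [batch_size, 1, 1])  # [batch, seq_len, dim]
print(sin.shape)  # (2, 8, 24)
```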
--- phishing_email_detection_gpt2.py | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/phishing_email_detection_gpt2.py b/phishing_email_detection_gpt2.py index 9b5653b..b4f393c 100644 --- a/phishing_email_detection_gpt2.py +++ b/phishing_email_detection_gpt2.py @@ -211,7 +211,7 @@ def from_config(cls, config): -class RotaryEmbedding(tf.keras.layers.Layer): +def RotaryEmbedding(tf.keras.layers.Layer): def __init__(self, dim, max_seq_len=1024, temperature=10000.0, **kwargs): super().__init__(**kwargs) self.dim = dim @@ -233,11 +233,18 @@ def call(self, x, seq_len=None): seq_len = tf.shape(x)[1] if seq_len is None else seq_len sin = self.sin_cache[:seq_len] cos = self.cos_cache[:seq_len] - sin = tf.cast(tf.repeat(sin[..., tf.newaxis], self.dim // 2, axis=-1), x.dtype) - cos = tf.cast(tf.repeat(cos[..., tf.newaxis], self.dim // 2, axis=-1), x.dtype) + sin = tf.cast(tf.repeat(sin[..., tf.newaxis], 2, axis=-1), x.dtype) + cos = tf.cast(tf.repeat(cos[..., tf.newaxis], 2, axis=-1), x.dtype) + sin = tf.reshape(sin, [seq_len, self.dim]) + cos = tf.reshape(cos, [seq_len, self.dim]) + sin = tf.expand_dims(sin, axis=0) + cos = tf.expand_dims(cos, axis=0) + sin = tf.tile(sin, [batch_size, 1, 1]) + cos = tf.tile(cos, [batch_size, 1, 1]) return sin, cos + def split_alternate(x): shape = tf.shape(x) x = tf.reshape(x, [shape[0], shape[1], shape[2] // 2, 2]) From 432df193ecd6481de21135024af79520446a9cee Mon Sep 17 00:00:00 2001 From: David Thrower Date: Wed, 9 Apr 2025 01:23:10 -0400 Subject: [PATCH 069/100] Update phishing_email_detection_gpt2.py Correct AI hallucination based error ... --- phishing_email_detection_gpt2.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/phishing_email_detection_gpt2.py b/phishing_email_detection_gpt2.py index b4f393c..60e24f4 100644 --- a/phishing_email_detection_gpt2.py +++ b/phishing_email_detection_gpt2.py @@ -211,7 +211,7 @@ def from_config(cls, config): -def RotaryEmbedding(tf.keras.layers.Layer): +class RotaryEmbedding(tf.keras.layers.Layer): def __init__(self, dim, max_seq_len=1024, temperature=10000.0, **kwargs): super().__init__(**kwargs) self.dim = dim From 116b888ae56683a40e4633be2473c39ed17beafc Mon Sep 17 00:00:00 2001 From: David Thrower Date: Wed, 9 Apr 2025 01:27:00 -0400 Subject: [PATCH 070/100] Update phishing_email_detection_gpt2.py ... --- phishing_email_detection_gpt2.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/phishing_email_detection_gpt2.py b/phishing_email_detection_gpt2.py index 60e24f4..7556f85 100644 --- a/phishing_email_detection_gpt2.py +++ b/phishing_email_detection_gpt2.py @@ -252,10 +252,15 @@ def split_alternate(x): x = tf.reshape(x, [shape[0], shape[1], -1]) return x + def rotate_half(x): x = split_alternate(x) d = tf.shape(x)[-1] - return x[..., d//2:] + x1 = x[..., :d//2] + x2 = x[..., d//2:] + rotated_x = tf.concat([-x2, x1], axis=-1) + return tf.reshape(rotated_x, tf.shape(x)[:-2] + [-1]) + def apply_rotary_pos_emb(x, sin, cos): cos = tf.reshape(cos, [tf.shape(cos)[0], tf.shape(cos)[1], -1]) From b8204d6fa5cf9ff134e3d622f55d640e1ca1f63b Mon Sep 17 00:00:00 2001 From: David Thrower Date: Wed, 9 Apr 2025 01:32:30 -0400 Subject: [PATCH 071/100] Update phishing_email_detection_gpt2.py ... 
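
The rotate_half revisions in this patch and the next circle the same identity: RoPE splits the feature vector in half and recombines it as (x1, x2) -> (-x2, x1). A minimal eager example of that mapping, with made-up values and the interleaving step omitted (shapes are static here, so plain x.shape suffices; the patches use tf.shape for dynamic dims):

```python
import tensorflow as tf

x = tf.constant([[1.0, 2.0, 3.0, 4.0]])  # [batch=1, dim=4], made-up values
d = x.shape[-1]                          # static in this toy example
rotated = tf.concat([-x[..., d // 2:], x[..., :d // 2]], axis=-1)
print(rotated.numpy())  # [[-3. -4.  1.  2.]]
```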
--- phishing_email_detection_gpt2.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/phishing_email_detection_gpt2.py b/phishing_email_detection_gpt2.py index 7556f85..5f7b532 100644 --- a/phishing_email_detection_gpt2.py +++ b/phishing_email_detection_gpt2.py @@ -9,7 +9,10 @@ ## GPT2 + Cerebros for Phishing email detection Initialization -""" +def rotate_half(x): + x = split_alternate(x) + rotated_x = tf.concat([-x[..., x.shape[-1]//2:], x[..., :x.shape[-1]//2]], axis=-1) + return tf.reshape(rotated_x, tf.shape(x))""" import tensorflow as tf import tensorflow_text @@ -255,11 +258,8 @@ def split_alternate(x): def rotate_half(x): x = split_alternate(x) - d = tf.shape(x)[-1] - x1 = x[..., :d//2] - x2 = x[..., d//2:] - rotated_x = tf.concat([-x2, x1], axis=-1) - return tf.reshape(rotated_x, tf.shape(x)[:-2] + [-1]) + rotated_x = tf.concat([-x[..., x.shape[-1]//2:], x[..., :x.shape[-1]//2]], axis=-1) + return tf.reshape(rotated_x, tf.shape(x)) def apply_rotary_pos_emb(x, sin, cos): @@ -268,6 +268,7 @@ def apply_rotary_pos_emb(x, sin, cos): x_rotated = x * cos + rotate_half(x) * sin return x_rotated + class InterleavedRoPE(tf.keras.layers.Layer): def __init__(self, dim, max_seq_len=1024, **kwargs): super().__init__(**kwargs) From 7a039dde82a0232d4117134588007d5c8d64cd4e Mon Sep 17 00:00:00 2001 From: David Thrower Date: Wed, 9 Apr 2025 01:37:53 -0400 Subject: [PATCH 072/100] Update phishing_email_detection_gpt2.py ... --- phishing_email_detection_gpt2.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/phishing_email_detection_gpt2.py b/phishing_email_detection_gpt2.py index 5f7b532..207843b 100644 --- a/phishing_email_detection_gpt2.py +++ b/phishing_email_detection_gpt2.py @@ -258,7 +258,8 @@ def split_alternate(x): def rotate_half(x): x = split_alternate(x) - rotated_x = tf.concat([-x[..., x.shape[-1]//2:], x[..., :x.shape[-1]//2]], axis=-1) + d = tf.shape(x)[-1] + rotated_x = tf.concat([-x[..., d//2:], x[..., :d//2]], axis=-1) return tf.reshape(rotated_x, tf.shape(x)) From 5fa7a9dc556a2c04bcbbf0dfc8a19b72920a5c5c Mon Sep 17 00:00:00 2001 From: David Thrower Date: Wed, 9 Apr 2025 16:17:25 -0400 Subject: [PATCH 073/100] Update cicd-requirements.txt To requirements, add transformers v 4.51.1 --- cicd-requirements.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/cicd-requirements.txt b/cicd-requirements.txt index 749d089..cf8d996 100644 --- a/cicd-requirements.txt +++ b/cicd-requirements.txt @@ -3,3 +3,4 @@ tensorflow-text==2.15.0 keras-nlp==0.9.1 scikit-learn==1.4.1.post1 tensorflow-hub==0.16.1 +transformers==4.51.1 From 096fa27607ebc5baca7709739ac345d0973fa20d Mon Sep 17 00:00:00 2001 From: David Thrower Date: Wed, 9 Apr 2025 17:00:15 -0400 Subject: [PATCH 074/100] Update phishing_email_detection_gpt2.py Test longer seq tokenizer. 
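
The change below replaces the KerasNLP GPT-2 preprocessor with a Hugging Face AutoTokenizer. The call pattern it depends on is padding to a fixed length and returning TensorFlow tensors; a small sketch, using "gpt2" purely as a stand-in checkpoint (the patch itself loads "HuggingFaceTB/SmolLM2-1.7B-Instruct"):

```python
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("gpt2")  # stand-in checkpoint
if tok.pad_token is None:                    # GPT-2-style tokenizers ship without one
    tok.pad_token = tok.eos_token
enc = tok(["Dear user, verify your account now", "Lunch at noon?"],
          max_length=16, padding="max_length", truncation=True,
          return_tensors="tf")
print(enc["input_ids"].shape)                # (2, 16)
```

Padding every batch out to max_length keeps the tensors rectangular, which the downstream Embedding/Flatten stack requires.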
--- phishing_email_detection_gpt2.py | 72 +++++++++++++++++++++++++------- 1 file changed, 56 insertions(+), 16 deletions(-) diff --git a/phishing_email_detection_gpt2.py b/phishing_email_detection_gpt2.py index 207843b..d070666 100644 --- a/phishing_email_detection_gpt2.py +++ b/phishing_email_detection_gpt2.py @@ -18,6 +18,7 @@ def rotate_half(x): import tensorflow_text from keras_nlp.models import GPT2Tokenizer, GPT2Preprocessor, GPT2Backbone from keras_nlp.layers import PositionEmbedding +from transformers import AutoTokenizer from sklearn.model_selection import train_test_split from sklearn.utils import shuffle from tensorflow.keras.utils import to_categorical @@ -189,27 +190,64 @@ def from_config(cls, config): ### Cerebros model: -# TokenizerLayer class to handle tokenization and return only token_ids -class TokenizerLayer(tf.keras.layers.Layer): - - def __init__(self, max_seq_length, **kwargs): - super(TokenizerLayer, self).__init__(**kwargs) # Update this line - self.tokenizer = GPT2Tokenizer.from_preset("gpt2_extra_large_en") - self.preprocessor = GPT2Preprocessor(self.tokenizer, sequence_length=max_seq_length) +class NewTokenizerLayer(tf.keras.layers.Layer): + """ + A Keras layer that tokenizes input text using a specified tokenizer. + """ + def __init__(self, max_seq_length, tokenizer_checkpoint, **kwargs): + """ + Initializes the NewTokenizerLayer. + Args: + - max_seq_length (int): The maximum sequence length for tokenization. + - tokenizer_checkpoint (str): The checkpoint for the tokenizer to use. + - **kwargs: Additional keyword arguments for the layer. + """ + super(NewTokenizerLayer, self).__init__(**kwargs) + self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_checkpoint) self.max_seq_length = max_seq_length - def call(self, inputs): - prep = self.preprocessor([inputs]) - return prep['token_ids'] - + """ + Tokenizes the input text. + Args: + - inputs: The input text to tokenize. + Returns: + - The tokenized input IDs. + """ + # Check if inputs is a tensor + if isinstance(inputs, tf.Tensor): + # Convert tensor to a list of strings + inputs = inputs.numpy().astype("U").tolist() + # Tokenize each input string separately + tokenized = self.tokenizer(inputs, + max_length=self.max_seq_length, + padding='max_length', + truncation=True, + return_tensors='tf', + return_overflowing_tokens=False) + # Return the tokenized input IDs + return tokenized['input_ids'] def get_config(self): - config = super(TokenizerLayer, self).get_config() - config.update({'max_seq_length': self.max_seq_length}) + """ + Returns the configuration for the layer. + Returns: + - A dictionary containing the layer's configuration. + """ + config = super(NewTokenizerLayer, self).get_config() + config.update({ + 'max_seq_length': self.max_seq_length, + 'tokenizer_checkpoint': self.tokenizer.name_or_path + }) return config - @classmethod def from_config(cls, config): - return cls(max_seq_length=config['max_seq_length']) + """ + Creates a new instance of the layer from a configuration. + Args: + - config: The configuration dictionary. + Returns: + - A new instance of the layer. 
+        """
+        return cls(max_seq_length=config['max_seq_length'], tokenizer_checkpoint=config['tokenizer_checkpoint'])



@@ -294,9 +332,11 @@ def call(self, x):

 # Optimal for accuracy thus far:
 max_seq_length = 1024
+tokenizer_checkpoint = "HuggingFaceTB/SmolLM2-1.7B-Instruct"

 inp = tf.keras.layers.Input(shape=(), dtype=tf.string)
-gp2_tokenizer = TokenizerLayer(max_seq_length=max_seq_length)
+# gp2_tokenizer = TokenizerLayer(max_seq_length=max_seq_length)
+gp2_tokenizer = NewTokenizerLayer(max_seq_length=max_seq_length,tokenizer_checkpoint=tokenizer_checkpoint)
 VOCABULARY_SIZE = gp2_tokenizer.tokenizer.vocabulary_size()
 tokens = gp2_tokenizer(inp)

From 6951f19592651e46921184977eb0e523a7a5445c Mon Sep 17 00:00:00 2001
From: David Thrower
Date: Wed, 9 Apr 2025 17:01:18 -0400
Subject: [PATCH 075/100] Update automerge.yml

Add branch to tests

---
 .github/workflows/automerge.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/automerge.yml b/.github/workflows/automerge.yml
index 441e93e..969dfd5 100644
--- a/.github/workflows/automerge.yml
+++ b/.github/workflows/automerge.yml
@@ -5,7 +5,7 @@ name: Python application

 on:
   push:
-    branches: [ "main", "168-try-integrating-a-irope-layer" ]
+    branches: [ "main", "169-use-a-more-scalable-tokenizer" ]

 permissions:
   contents: read

From ebb22a7865b9ffce3c219c603a49aa698e272aad Mon Sep 17 00:00:00 2001
From: David Thrower
Date: Wed, 9 Apr 2025 17:12:34 -0400
Subject: [PATCH 076/100] Update phishing_email_detection_gpt2.py

Fix access to .vocab_size for new tokenizer.

---
 phishing_email_detection_gpt2.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/phishing_email_detection_gpt2.py b/phishing_email_detection_gpt2.py
index d070666..defd230 100644
--- a/phishing_email_detection_gpt2.py
+++ b/phishing_email_detection_gpt2.py
@@ -337,7 +337,7 @@ def call(self, x):
 inp = tf.keras.layers.Input(shape=(), dtype=tf.string)
 # gp2_tokenizer = TokenizerLayer(max_seq_length=max_seq_length)
 gp2_tokenizer = NewTokenizerLayer(max_seq_length=max_seq_length,tokenizer_checkpoint=tokenizer_checkpoint)
-VOCABULARY_SIZE = gp2_tokenizer.tokenizer.vocabulary_size()
+VOCABULARY_SIZE = gp2_tokenizer.tokenizer.vocab_size
 tokens = gp2_tokenizer(inp)

 # On larger hardware, this could probably be increased considerably and
 # Probably would improve performance ...

From fdc481239401006bfbd6ea819a3cd054788cd7ff Mon Sep 17 00:00:00 2001
From: David Thrower
Date: Wed, 9 Apr 2025 17:16:13 -0400
Subject: [PATCH 077/100] Update phishing_email_detection_gpt2.py

Remove incompatible functionality...

---
 phishing_email_detection_gpt2.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/phishing_email_detection_gpt2.py b/phishing_email_detection_gpt2.py
index defd230..39a96b2 100644
--- a/phishing_email_detection_gpt2.py
+++ b/phishing_email_detection_gpt2.py
@@ -214,9 +214,9 @@ def call(self, inputs):
        - The tokenized input IDs.
""" # Check if inputs is a tensor - if isinstance(inputs, tf.Tensor): - # Convert tensor to a list of strings - inputs = inputs.numpy().astype("U").tolist() + # if isinstance(inputs, tf.Tensor): + # # Convert tensor to a list of strings + # inputs = inputs.numpy().astype("U").tolist() # Tokenize each input string separately tokenized = self.tokenizer(inputs, max_length=self.max_seq_length, From 7a38e20055a94560550bd765a3d78369a37c0868 Mon Sep 17 00:00:00 2001 From: David Thrower Date: Wed, 9 Apr 2025 17:20:06 -0400 Subject: [PATCH 078/100] Update phishing_email_detection_gpt2.py Try to fix issue with batch_size and dtype with string tokenization... --- phishing_email_detection_gpt2.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/phishing_email_detection_gpt2.py b/phishing_email_detection_gpt2.py index 39a96b2..33f0fec 100644 --- a/phishing_email_detection_gpt2.py +++ b/phishing_email_detection_gpt2.py @@ -217,7 +217,8 @@ def call(self, inputs): # if isinstance(inputs, tf.Tensor): # # Convert tensor to a list of strings # inputs = inputs.numpy().astype("U").tolist() - # Tokenize each input string separately + + inputs = [x.decode('utf-8') for x in inputs.numpy()] tokenized = self.tokenizer(inputs, max_length=self.max_seq_length, padding='max_length', From a7515beac990452c6f0198c3c0fc8908ff8357dc Mon Sep 17 00:00:00 2001 From: David Thrower Date: Wed, 9 Apr 2025 17:23:55 -0400 Subject: [PATCH 079/100] Update phishing_email_detection_gpt2.py ... --- phishing_email_detection_gpt2.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/phishing_email_detection_gpt2.py b/phishing_email_detection_gpt2.py index 33f0fec..2776cfd 100644 --- a/phishing_email_detection_gpt2.py +++ b/phishing_email_detection_gpt2.py @@ -218,7 +218,7 @@ def call(self, inputs): # # Convert tensor to a list of strings # inputs = inputs.numpy().astype("U").tolist() - inputs = [x.decode('utf-8') for x in inputs.numpy()] + inputs = [x.decode('utf-8') for x in inputs] tokenized = self.tokenizer(inputs, max_length=self.max_seq_length, padding='max_length', From 1dcc2450be0ea87a236f314df951d251f89813d0 Mon Sep 17 00:00:00 2001 From: David Thrower Date: Wed, 9 Apr 2025 18:28:01 -0400 Subject: [PATCH 080/100] Update phishing_email_detection_gpt2.py Try with list[str] --- phishing_email_detection_gpt2.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/phishing_email_detection_gpt2.py b/phishing_email_detection_gpt2.py index 2776cfd..96975ae 100644 --- a/phishing_email_detection_gpt2.py +++ b/phishing_email_detection_gpt2.py @@ -74,11 +74,11 @@ def rotate_half(x): # # Training data for baseline model -baseline_train_x = tf.constant(X_train) +# baseline_train_x = tf.constant(X_train) baseline_train_y = tf.constant(y_train, dtype=tf.int8) # Packaged for Cerebros (multimodal, takes inputs as a list) -training_x = [baseline_train_x] +training_x = [X_train.tolist()] train_labels = [baseline_train_y] # @@ -218,7 +218,7 @@ def call(self, inputs): # # Convert tensor to a list of strings # inputs = inputs.numpy().astype("U").tolist() - inputs = [x.decode('utf-8') for x in inputs] + # inputs = [x.decode('utf-8') for x in inputs] tokenized = self.tokenizer(inputs, max_length=self.max_seq_length, padding='max_length', From 5cf14344375b38c02ffec98e4612a2b17758897c Mon Sep 17 00:00:00 2001 From: David Thrower Date: Wed, 9 Apr 2025 18:37:24 -0400 Subject: [PATCH 081/100] Update phishing_email_detection_gpt2.py Try inputs = tf.strings.unicode_encode(inputs, 'UTF-8') ... 
--- phishing_email_detection_gpt2.py | 1 + 1 file changed, 1 insertion(+) diff --git a/phishing_email_detection_gpt2.py b/phishing_email_detection_gpt2.py index 96975ae..f83cac5 100644 --- a/phishing_email_detection_gpt2.py +++ b/phishing_email_detection_gpt2.py @@ -219,6 +219,7 @@ def call(self, inputs): # inputs = inputs.numpy().astype("U").tolist() # inputs = [x.decode('utf-8') for x in inputs] + inputs = tf.strings.unicode_encode(inputs, 'UTF-8') tokenized = self.tokenizer(inputs, max_length=self.max_seq_length, padding='max_length', From f229a476ac63cc95018635af961313424802eaa6 Mon Sep 17 00:00:00 2001 From: David Thrower Date: Wed, 9 Apr 2025 18:40:52 -0400 Subject: [PATCH 082/100] Update phishing_email_detection_gpt2.py ... --- phishing_email_detection_gpt2.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/phishing_email_detection_gpt2.py b/phishing_email_detection_gpt2.py index f83cac5..a4b9c0d 100644 --- a/phishing_email_detection_gpt2.py +++ b/phishing_email_detection_gpt2.py @@ -74,11 +74,11 @@ def rotate_half(x): # # Training data for baseline model -# baseline_train_x = tf.constant(X_train) +baseline_train_x = tf.constant(X_train) baseline_train_y = tf.constant(y_train, dtype=tf.int8) # Packaged for Cerebros (multimodal, takes inputs as a list) -training_x = [X_train.tolist()] +training_x = [baseline_train_x] train_labels = [baseline_train_y] # From 5ce1eb86e4d96da497cc03829e5edcc82b45cec7 Mon Sep 17 00:00:00 2001 From: David Thrower Date: Wed, 9 Apr 2025 18:47:20 -0400 Subject: [PATCH 083/100] Update phishing_email_detection_gpt2.py ... --- phishing_email_detection_gpt2.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/phishing_email_detection_gpt2.py b/phishing_email_detection_gpt2.py index a4b9c0d..5ee9934 100644 --- a/phishing_email_detection_gpt2.py +++ b/phishing_email_detection_gpt2.py @@ -74,7 +74,7 @@ def rotate_half(x): # # Training data for baseline model -baseline_train_x = tf.constant(X_train) +baseline_train_x = tf.constant(X_train, dtype=tf.string) baseline_train_y = tf.constant(y_train, dtype=tf.int8) # Packaged for Cerebros (multimodal, takes inputs as a list) From dc4dcc6162a3cc0ab23e03d6b1c053b05f677238 Mon Sep 17 00:00:00 2001 From: David Thrower Date: Wed, 9 Apr 2025 18:56:27 -0400 Subject: [PATCH 084/100] Update phishing_email_detection_gpt2.py ... --- phishing_email_detection_gpt2.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/phishing_email_detection_gpt2.py b/phishing_email_detection_gpt2.py index 5ee9934..26f47d2 100644 --- a/phishing_email_detection_gpt2.py +++ b/phishing_email_detection_gpt2.py @@ -219,8 +219,9 @@ def call(self, inputs): # inputs = inputs.numpy().astype("U").tolist() # inputs = [x.decode('utf-8') for x in inputs] - inputs = tf.strings.unicode_encode(inputs, 'UTF-8') - tokenized = self.tokenizer(inputs, + # inputs = tf.strings.unicode_encode(inputs, 'UTF-8') + + tokenized = self.tokenizer(inputs.numpy().astype("U").tolist(), max_length=self.max_seq_length, padding='max_length', truncation=True, From 9893bfc55d4f7b753eff79e0c5c70e4992c61085 Mon Sep 17 00:00:00 2001 From: David Thrower Date: Wed, 9 Apr 2025 20:53:35 -0400 Subject: [PATCH 085/100] Update phishing_email_detection_gpt2.py AI suggested attempt at a better tokenizer... 
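
One detail worth noting in the rewrite below: `tf.py_function` erases static shape information, which is why the layer follows it with `input_ids.set_shape([None, self.max_seq_length])` before the tensor reaches downstream Keras layers. A small sketch of that behavior, with a stand-in tokenizer and a hypothetical sequence length of 8:

```python
import tensorflow as tf

@tf.function
def tokenize(batch):
    ids = tf.py_function(
        lambda b: tf.zeros([tf.shape(b)[0], 8], tf.int32),  # stand-in tokenizer
        [batch], Tout=tf.int32)
    # Inside a graph, py_function outputs carry no static shape; restore at
    # least the sequence dimension so Flatten/Dense can infer their sizes.
    ids.set_shape([None, 8])
    return ids

print(tokenize(tf.constant(["a", "b"])).shape)  # (2, 8)
```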
--- phishing_email_detection_gpt2.py | 93 +++++++++++++++----------------- 1 file changed, 44 insertions(+), 49 deletions(-) diff --git a/phishing_email_detection_gpt2.py b/phishing_email_detection_gpt2.py index 26f47d2..61629a9 100644 --- a/phishing_email_detection_gpt2.py +++ b/phishing_email_detection_gpt2.py @@ -190,67 +190,62 @@ def from_config(cls, config): ### Cerebros model: +from transformers import AutoTokenizer +import tensorflow as tf + class NewTokenizerLayer(tf.keras.layers.Layer): - """ - A Keras layer that tokenizes input text using a specified tokenizer. - """ def __init__(self, max_seq_length, tokenizer_checkpoint, **kwargs): - """ - Initializes the NewTokenizerLayer. - Args: - - max_seq_length (int): The maximum sequence length for tokenization. - - tokenizer_checkpoint (str): The checkpoint for the tokenizer to use. - - **kwargs: Additional keyword arguments for the layer. - """ - super(NewTokenizerLayer, self).__init__(**kwargs) - self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_checkpoint) + super().__init__(**kwargs) self.max_seq_length = max_seq_length + self.tokenizer_checkpoint = tokenizer_checkpoint + self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_checkpoint) + + # Ensure tokenizer has a padding token + if self.tokenizer.pad_token is None: + self.tokenizer.pad_token = self.tokenizer.eos_token + def call(self, inputs): - """ - Tokenizes the input text. - Args: - - inputs: The input text to tokenize. - Returns: - - The tokenized input IDs. - """ - # Check if inputs is a tensor - # if isinstance(inputs, tf.Tensor): - # # Convert tensor to a list of strings - # inputs = inputs.numpy().astype("U").tolist() - - # inputs = [x.decode('utf-8') for x in inputs] - # inputs = tf.strings.unicode_encode(inputs, 'UTF-8') + def tokenize_py_fn(inputs): + # Convert TensorFlow bytes to Python strings + texts = [text.decode('utf-8') for text in inputs.numpy()] + + # Tokenize with Hugging Face tokenizer + tokenized = self.tokenizer( + texts, + max_length=self.max_seq_length, + padding='max_length', + truncation=True, + return_tensors='tf' + ) + return tokenized['input_ids'].numpy() + + # Wrap Python function in TensorFlow operation + input_ids = tf.py_function( + tokenize_py_fn, + [inputs], + Tout=tf.int32 + ) + + # Set shape for downstream layers + batch_size = tf.shape(inputs)[0] + input_ids.set_shape([None, self.max_seq_length]) - tokenized = self.tokenizer(inputs.numpy().astype("U").tolist(), - max_length=self.max_seq_length, - padding='max_length', - truncation=True, - return_tensors='tf', - return_overflowing_tokens=False) - # Return the tokenized input IDs - return tokenized['input_ids'] + return input_ids + def get_config(self): - """ - Returns the configuration for the layer. - Returns: - - A dictionary containing the layer's configuration. - """ - config = super(NewTokenizerLayer, self).get_config() + config = super().get_config() config.update({ 'max_seq_length': self.max_seq_length, - 'tokenizer_checkpoint': self.tokenizer.name_or_path + 'tokenizer_checkpoint': self.tokenizer_checkpoint }) return config + @classmethod def from_config(cls, config): - """ - Creates a new instance of the layer from a configuration. - Args: - - config: The configuration dictionary. - Returns: - - A new instance of the layer. 
- """ - return cls(max_seq_length=config['max_seq_length'], tokenizer_checkpoint=config['tokenizer_checkpoint']) + return cls( + max_seq_length=config['max_seq_length'], + tokenizer_checkpoint=config['tokenizer_checkpoint'] + ) From 286ba81a1e51493d748ded727bd602a4398248a8 Mon Sep 17 00:00:00 2001 From: David Thrower Date: Wed, 9 Apr 2025 22:08:27 -0400 Subject: [PATCH 086/100] Update phishing_email_detection_gpt2.py Up seq_len --- phishing_email_detection_gpt2.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/phishing_email_detection_gpt2.py b/phishing_email_detection_gpt2.py index 61629a9..b61945b 100644 --- a/phishing_email_detection_gpt2.py +++ b/phishing_email_detection_gpt2.py @@ -329,7 +329,7 @@ def call(self, x): # GPT2 configurables # Optimal for accuracy thus far: -max_seq_length = 1024 +max_seq_length = int(1024 * 1.5) tokenizer_checkpoint = "HuggingFaceTB/SmolLM2-1.7B-Instruct" inp = tf.keras.layers.Input(shape=(), dtype=tf.string) From 6df20aaefce55bc8ac9c3473c91d3a22913fe87c Mon Sep 17 00:00:00 2001 From: David Thrower Date: Thu, 10 Apr 2025 11:08:09 -0400 Subject: [PATCH 087/100] Update phishing_email_detection_gpt2.py Added back baseline workflow in best trial thus far. --- phishing_email_detection_gpt2.py | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/phishing_email_detection_gpt2.py b/phishing_email_detection_gpt2.py index b61945b..be59db5 100644 --- a/phishing_email_detection_gpt2.py +++ b/phishing_email_detection_gpt2.py @@ -89,7 +89,7 @@ def rotate_half(x): """### A custom GPT2 encoder layer for text embedding""" -""" + class GPT2Layer(tf.keras.layers.Layer): def __init__(self, max_seq_length, **kwargs): @@ -186,7 +186,7 @@ def from_config(cls, config): hy_df = pd.DataFrame(history.history) print(hy_df) -""" + ### Cerebros model: @@ -329,11 +329,10 @@ def call(self, x): # GPT2 configurables # Optimal for accuracy thus far: -max_seq_length = int(1024 * 1.5) +max_seq_length = 1536 tokenizer_checkpoint = "HuggingFaceTB/SmolLM2-1.7B-Instruct" inp = tf.keras.layers.Input(shape=(), dtype=tf.string) -# gp2_tokenizer = TokenizerLayer(max_seq_length=max_seq_length) gp2_tokenizer = NewTokenizerLayer(max_seq_length=max_seq_length,tokenizer_checkpoint=tokenizer_checkpoint) VOCABULARY_SIZE = gp2_tokenizer.tokenizer.vocab_size tokens = gp2_tokenizer(inp) @@ -381,7 +380,7 @@ def call(self, x): p_lateral_connection = 30 num_lateral_connection_tries_per_unit = 25 learning_rate = 3 * 10 ** -3 -epochs = 15 # [1, 100] +epochs = 15 # batch_size = 17 minimum_levels = 2 maximum_levels = 2 # [3,7] @@ -462,11 +461,10 @@ def call(self, x): cerebros_time_per_model = cerebros_time_all_models_min / models_tried print(f"Cerebros trained {models_tried} models FROM A COLD START in ONLY {cerebros_time_all_models_min} min. Cerebros took only {cerebros_time_per_model} minutes on average per model.") -# print(f"GPT2 took {gpt_time_on_one_model_min} just to FINE TUNE one PRE - TRAINED model for 3 epochs. Although this is a small scale test, this shows the advantage of scaling in ON timing VS ON**2 timing.") +print(f"GPT2 took {gpt_time_on_one_model_min} just to FINE TUNE one PRE - TRAINED model for 3 epochs. 
Although this is a small scale test, this shows the advantage of scaling in ON timing VS ON**2 timing.") print(f'Cerebros best accuracy achieved is {result}') print(f'val set accuracy') # """### Testing the best model found""" - From 794fc237b8739c1dc2c7deb4964a4618e2f3b119 Mon Sep 17 00:00:00 2001 From: David Thrower Date: Thu, 10 Apr 2025 11:10:50 -0400 Subject: [PATCH 088/100] Update automerge.yml Added all CICD tests to be used back to best NLP configuration. --- .github/workflows/automerge.yml | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/.github/workflows/automerge.yml b/.github/workflows/automerge.yml index 969dfd5..959a804 100644 --- a/.github/workflows/automerge.yml +++ b/.github/workflows/automerge.yml @@ -33,16 +33,16 @@ jobs: flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide # flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics - # - name: Test distributed random search Ames by running - # run: python3 regression-example-ames-no-preproc.py - # - name: Test distributed random search Ames by running - Val set - # run: python3 regression-example-ames-no-preproc-val-set.py + - name: Test distributed random search Ames by running + run: python3 regression-example-ames-no-preproc.py + - name: Test distributed random search Ames by running - Val set + run: python3 regression-example-ames-no-preproc-val-set.py # - name: Test text classifier - random search - ham-spam # run: python3 text-class-ham-or-spam.py # timeout-minutes: 90 - # - name: Test image classifier - small subset of CIFAR10 # add back - # timeout-minutes: 90 - # run: python3 cifar10-example.py + - name: Test image classifier - small subset of CIFAR10 # add back + timeout-minutes: 90 + run: python3 cifar10-example.py - name: Phishing email detection with GPT2 embedding timeout-minutes: 420 run: python3 phishing_email_detection_gpt2.py From 3e467fe6a24f62a699b641a11c90f14e0bab9a35 Mon Sep 17 00:00:00 2001 From: David Thrower Date: Thu, 10 Apr 2025 12:13:40 -0400 Subject: [PATCH 089/100] Update requirements.txt Upgrade tf --- requirements.txt | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/requirements.txt b/requirements.txt index 146b1e5..251eb0f 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,7 +1,7 @@ -jax==0.4.26 -jaxlib==0.4.26 +jax==jax 0.5.3 +jaxlib==jax 0.5.3 pendulum==3.0.0 -tensorflow==2.15.0 +tensorflow==2.19.0 numpy==1.26.4 pandas==2.2.1 pyvis==0.3.2 From 96897ae80f2d2d41fa8382d9716f08ae484de86c Mon Sep 17 00:00:00 2001 From: David Thrower Date: Thu, 10 Apr 2025 12:15:10 -0400 Subject: [PATCH 090/100] Update cicd-requirements.txt Upgrade tensorflow-text to v 2.19.0 --- cicd-requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cicd-requirements.txt b/cicd-requirements.txt index cf8d996..3bc545e 100644 --- a/cicd-requirements.txt +++ b/cicd-requirements.txt @@ -1,5 +1,5 @@ matplotlib==3.8.4 -tensorflow-text==2.15.0 +tensorflow-text==2.19.0 keras-nlp==0.9.1 scikit-learn==1.4.1.post1 tensorflow-hub==0.16.1 From a66e6a63148ef35266c60ce2f19deaca1ec1e771 Mon Sep 17 00:00:00 2001 From: David Thrower Date: Thu, 10 Apr 2025 12:16:19 -0400 Subject: [PATCH 091/100] Update automerge.yml Add branch to workflows. 
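Note (illustrative sketch, not part of this commit): with the stack pinned to tensorflow 2.19.0 and jax/jaxlib 0.5.3 in the two patches above, a quick pre-flight check can catch bad pins (such as the malformed jax lines, corrected in patch 092 below) before the long CI workflows start. A hypothetical helper, not present in the repo:

from importlib.metadata import PackageNotFoundError, version

pins = {"tensorflow": "2.19.0", "jax": "0.5.3", "jaxlib": "0.5.3"}

for pkg, want in pins.items():
    try:
        got = version(pkg)
    except PackageNotFoundError:
        print(f"{pkg}: not installed (want {want})")
        continue
    print(f"{pkg} {got}: " + ("OK" if got == want else f"mismatch, want {want}"))
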
--- .github/workflows/automerge.yml | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/.github/workflows/automerge.yml b/.github/workflows/automerge.yml index 959a804..72b8417 100644 --- a/.github/workflows/automerge.yml +++ b/.github/workflows/automerge.yml @@ -5,7 +5,7 @@ name: Python application on: push: - branches: [ "main", "169-use-a-more-scalable-tokenizer" ] + branches: [ "main", "171-upgrade-tf-2190" ] permissions: contents: read @@ -35,14 +35,14 @@ jobs: # flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics - name: Test distributed random search Ames by running run: python3 regression-example-ames-no-preproc.py - - name: Test distributed random search Ames by running - Val set - run: python3 regression-example-ames-no-preproc-val-set.py + # - name: Test distributed random search Ames by running - Val set + # run: python3 regression-example-ames-no-preproc-val-set.py # - name: Test text classifier - random search - ham-spam # run: python3 text-class-ham-or-spam.py # timeout-minutes: 90 - - name: Test image classifier - small subset of CIFAR10 # add back - timeout-minutes: 90 - run: python3 cifar10-example.py + # - name: Test image classifier - small subset of CIFAR10 # add back + # timeout-minutes: 90 + # run: python3 cifar10-example.py - name: Phishing email detection with GPT2 embedding timeout-minutes: 420 run: python3 phishing_email_detection_gpt2.py From 5c58d65e5a2af754ecd4e06c58993ff2b7e78a1e Mon Sep 17 00:00:00 2001 From: David Thrower Date: Thu, 10 Apr 2025 12:17:29 -0400 Subject: [PATCH 092/100] Update requirements.txt Typo on requirements.txt --- requirements.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/requirements.txt b/requirements.txt index 251eb0f..e1e3563 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,5 @@ -jax==jax 0.5.3 -jaxlib==jax 0.5.3 +jax==0.5.3 +jaxlib==0.5.3 pendulum==3.0.0 tensorflow==2.19.0 numpy==1.26.4 From 2c417fbf8b574d43f2ac2eaf0772b0939198120a Mon Sep 17 00:00:00 2001 From: David Thrower Date: Thu, 10 Apr 2025 12:25:59 -0400 Subject: [PATCH 093/100] Update phishing_email_detection_gpt2.py Test to fast forward to Cerebros NLP test and check for compatibility. --- phishing_email_detection_gpt2.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/phishing_email_detection_gpt2.py b/phishing_email_detection_gpt2.py index be59db5..552755e 100644 --- a/phishing_email_detection_gpt2.py +++ b/phishing_email_detection_gpt2.py @@ -89,7 +89,7 @@ def rotate_half(x): """### A custom GPT2 encoder layer for text embedding""" - +""" class GPT2Layer(tf.keras.layers.Layer): def __init__(self, max_seq_length, **kwargs): @@ -187,6 +187,7 @@ def from_config(cls, config): hy_df = pd.DataFrame(history.history) print(hy_df) +""" ### Cerebros model: @@ -461,7 +462,7 @@ def call(self, x): cerebros_time_per_model = cerebros_time_all_models_min / models_tried print(f"Cerebros trained {models_tried} models FROM A COLD START in ONLY {cerebros_time_all_models_min} min. Cerebros took only {cerebros_time_per_model} minutes on average per model.") -print(f"GPT2 took {gpt_time_on_one_model_min} just to FINE TUNE one PRE - TRAINED model for 3 epochs. Although this is a small scale test, this shows the advantage of scaling in ON timing VS ON**2 timing.") +# print(f"GPT2 took {gpt_time_on_one_model_min} just to FINE TUNE one PRE - TRAINED model for 3 epochs. 
Although this is a small scale test, this shows the advantage of scaling in ON timing VS ON**2 timing.") print(f'Cerebros best accuracy achieved is {result}') From 44854bee325ec28e1f4cb4fe5e9734473a169934 Mon Sep 17 00:00:00 2001 From: David Thrower Date: Thu, 10 Apr 2025 12:37:28 -0400 Subject: [PATCH 094/100] Update phishing_email_detection_gpt2.py Attempt to correct issue with tf v 2.19.0 graph scope. --- phishing_email_detection_gpt2.py | 85 +++++++++++++++++++++++++------- 1 file changed, 66 insertions(+), 19 deletions(-) diff --git a/phishing_email_detection_gpt2.py b/phishing_email_detection_gpt2.py index 552755e..4cd3f5b 100644 --- a/phishing_email_detection_gpt2.py +++ b/phishing_email_detection_gpt2.py @@ -255,34 +255,81 @@ class RotaryEmbedding(tf.keras.layers.Layer): def __init__(self, dim, max_seq_len=1024, temperature=10000.0, **kwargs): super().__init__(**kwargs) self.dim = dim - self.max_seq_len = max_seq_len + self.max_seq_len = max_seq_len # Still useful for potential pre-allocation if needed, but not for caching tensors self.temperature = temperature + # No caching in __init__ or build anymore def build(self, input_shape): + # Build is primarily for creating weights. We don't have trainable weights here. + # We can calculate inv_freq here if desired, as it doesn't depend on input shape directly + # and is constant. However, calculating it in call() is also fine. + # Let's calculate it once here to avoid recomputing constants. + # Ensure dim is even + if self.dim % 2 != 0: + raise ValueError(f"Embedding dimension `dim` ({self.dim}) must be even for RotaryEmbedding.") + + inv_freq_base = tf.range(0, self.dim, 2, dtype=tf.float32) # Corrected range for pair dimension + inv_freq = 1.0 / (self.temperature ** (inv_freq_base / self.dim)) # Corrected calculation + self.inv_freq = inv_freq # Store the constant factor super().build(input_shape) - inv_freq = 1.0 / (self.temperature ** (tf.range(0, self.dim // 2, dtype=tf.float32) / (self.dim // 2))) - position = tf.range(self.max_seq_len, dtype=tf.float32) - sinusoid = tf.einsum("i,j->ij", position, inv_freq) - sin = tf.sin(sinusoid) - cos = tf.cos(sinusoid) - self.sin_cache = sin - self.cos_cache = cos - + def call(self, x, seq_len=None): - batch_size = tf.shape(x)[0] - seq_len = tf.shape(x)[1] if seq_len is None else seq_len - sin = self.sin_cache[:seq_len] - cos = self.cos_cache[:seq_len] - sin = tf.cast(tf.repeat(sin[..., tf.newaxis], 2, axis=-1), x.dtype) - cos = tf.cast(tf.repeat(cos[..., tf.newaxis], 2, axis=-1), x.dtype) - sin = tf.reshape(sin, [seq_len, self.dim]) - cos = tf.reshape(cos, [seq_len, self.dim]) - sin = tf.expand_dims(sin, axis=0) - cos = tf.expand_dims(cos, axis=0) + shape = tf.shape(x) + batch_size = shape[0] + # Determine sequence length dynamically from input tensor 'x' + actual_seq_len = shape[1] + + # Use actual_seq_len for calculations + position = tf.range(actual_seq_len, dtype=tf.float32) + # Calculate sinusoid input using einsum or broadcasting + # Einsum approach: + sinusoid_inp = tf.einsum("i,j->ij", position, self.inv_freq) + # Broadcasting approach (might be clearer): + # sinusoid_inp = tf.expand_dims(position, axis=-1) * tf.expand_dims(self.inv_freq, axis=0) + + # Calculate sin and cos based on the actual sequence length + sin = tf.sin(sinusoid_inp) + cos = tf.cos(sinusoid_inp) + + # Repeat sin/cos for interleaving: [a, b] -> [a, a, b, b] + # Original code used repeat then reshape, which might be slightly different + # from direct interleaving depending on interpretation. 
Let's stick to the + # original logic's apparent intent which leads to pairing. + # We need shape [actual_seq_len, dim] + # sin/cos currently [actual_seq_len, dim/2] + sin = tf.repeat(sin, 2, axis=-1) # Repeat along the last dimension + cos = tf.repeat(cos, 2, axis=-1) # Repeat along the last dimension + + # Expand dims for batch and tile + # Output shape needs to be [batch_size, actual_seq_len, dim] + sin = tf.expand_dims(sin, axis=0) # Shape [1, actual_seq_len, dim] + cos = tf.expand_dims(cos, axis=0) # Shape [1, actual_seq_len, dim] + + # Tile to match the batch size sin = tf.tile(sin, [batch_size, 1, 1]) cos = tf.tile(cos, [batch_size, 1, 1]) + + # Ensure dtype matches input tensor x + sin = tf.cast(sin, x.dtype) + cos = tf.cast(cos, x.dtype) + + # Return sin and cos needed by InterleavedRoPE return sin, cos + def get_config(self): + config = super().get_config() + config.update({ + "dim": self.dim, + "max_seq_len": self.max_seq_len, + "temperature": self.temperature, + }) + return config + + @classmethod + def from_config(cls, config): + return cls(**config) + + def split_alternate(x): From f46ad78a5e7708d356552dcb3f0b0edcb545a1fd Mon Sep 17 00:00:00 2001 From: David Thrower Date: Thu, 10 Apr 2025 12:51:18 -0400 Subject: [PATCH 095/100] Update phishing_email_detection_gpt2.py Another attempt to resolve tf v 2.19.0 graph scope compatibility... --- phishing_email_detection_gpt2.py | 97 +++++++++++++++++++------------- 1 file changed, 57 insertions(+), 40 deletions(-) diff --git a/phishing_email_detection_gpt2.py b/phishing_email_detection_gpt2.py index 4cd3f5b..ebf3dc8 100644 --- a/phishing_email_detection_gpt2.py +++ b/phishing_email_detection_gpt2.py @@ -251,67 +251,63 @@ def from_config(cls, config): +# --- Updated RotaryEmbedding --- class RotaryEmbedding(tf.keras.layers.Layer): def __init__(self, dim, max_seq_len=1024, temperature=10000.0, **kwargs): super().__init__(**kwargs) self.dim = dim - self.max_seq_len = max_seq_len # Still useful for potential pre-allocation if needed, but not for caching tensors + # Ensure dim is even right at initialization + if self.dim % 2 != 0: + raise ValueError(f"Embedding dimension `dim` ({self.dim}) must be even for RotaryEmbedding.") + self.max_seq_len = max_seq_len self.temperature = temperature - # No caching in __init__ or build anymore + # *** No calculation or storage of inv_freq here or in build *** def build(self, input_shape): - # Build is primarily for creating weights. We don't have trainable weights here. - # We can calculate inv_freq here if desired, as it doesn't depend on input shape directly - # and is constant. However, calculating it in call() is also fine. - # Let's calculate it once here to avoid recomputing constants. - # Ensure dim is even - if self.dim % 2 != 0: - raise ValueError(f"Embedding dimension `dim` ({self.dim}) must be even for RotaryEmbedding.") - - inv_freq_base = tf.range(0, self.dim, 2, dtype=tf.float32) # Corrected range for pair dimension - inv_freq = 1.0 / (self.temperature ** (inv_freq_base / self.dim)) # Corrected calculation - self.inv_freq = inv_freq # Store the constant factor + # Build should primarily be for creating trainable weights, which we don't have. + # Call super().build() for Keras compatibility. 
super().build(input_shape) - def call(self, x, seq_len=None): + def call(self, x): # Removed seq_len argument, calculate from x shape = tf.shape(x) batch_size = shape[0] - # Determine sequence length dynamically from input tensor 'x' actual_seq_len = shape[1] + # *** Calculate inv_freq inside call *** + inv_freq_base = tf.range(0, self.dim, 2, dtype=tf.float32) + inv_freq = 1.0 / (self.temperature ** (inv_freq_base / self.dim)) + # Ensure inv_freq has the correct shape [dim/2] + inv_freq = tf.cast(inv_freq, dtype=x.dtype) # Match dtype early + # Use actual_seq_len for calculations - position = tf.range(actual_seq_len, dtype=tf.float32) + position = tf.range(actual_seq_len, dtype=x.dtype) # Match dtype + # Calculate sinusoid input using einsum or broadcasting - # Einsum approach: - sinusoid_inp = tf.einsum("i,j->ij", position, self.inv_freq) - # Broadcasting approach (might be clearer): - # sinusoid_inp = tf.expand_dims(position, axis=-1) * tf.expand_dims(self.inv_freq, axis=0) + # Einsum approach: Ensure correct dimensions [seq_len, dim/2] + sinusoid_inp = tf.einsum("i,j->ij", position, inv_freq) # Calculate sin and cos based on the actual sequence length sin = tf.sin(sinusoid_inp) cos = tf.cos(sinusoid_inp) # Repeat sin/cos for interleaving: [a, b] -> [a, a, b, b] - # Original code used repeat then reshape, which might be slightly different - # from direct interleaving depending on interpretation. Let's stick to the - # original logic's apparent intent which leads to pairing. - # We need shape [actual_seq_len, dim] - # sin/cos currently [actual_seq_len, dim/2] - sin = tf.repeat(sin, 2, axis=-1) # Repeat along the last dimension - cos = tf.repeat(cos, 2, axis=-1) # Repeat along the last dimension + # Result needs shape [actual_seq_len, dim] + sin = tf.repeat(sin, 2, axis=-1) + cos = tf.repeat(cos, 2, axis=-1) # Expand dims for batch and tile # Output shape needs to be [batch_size, actual_seq_len, dim] - sin = tf.expand_dims(sin, axis=0) # Shape [1, actual_seq_len, dim] - cos = tf.expand_dims(cos, axis=0) # Shape [1, actual_seq_len, dim] + # Add batch dimension: [1, actual_seq_len, dim] + sin = tf.expand_dims(sin, axis=0) + cos = tf.expand_dims(cos, axis=0) - # Tile to match the batch size + # Tile to match the batch size: [batch_size, actual_seq_len, dim] sin = tf.tile(sin, [batch_size, 1, 1]) cos = tf.tile(cos, [batch_size, 1, 1]) - # Ensure dtype matches input tensor x - sin = tf.cast(sin, x.dtype) - cos = tf.cast(cos, x.dtype) + # Casting to x.dtype was already done for inv_freq, sin/cos will inherit + # sin = tf.cast(sin, x.dtype) # Already done via calculation chain + # cos = tf.cast(cos, x.dtype) # Already done via calculation chain # Return sin and cos needed by InterleavedRoPE return sin, cos @@ -332,6 +328,7 @@ def from_config(cls, config): + def split_alternate(x): shape = tf.shape(x) x = tf.reshape(x, [shape[0], shape[1], shape[2] // 2, 2]) @@ -357,17 +354,37 @@ def apply_rotary_pos_emb(x, sin, cos): class InterleavedRoPE(tf.keras.layers.Layer): def __init__(self, dim, max_seq_len=1024, **kwargs): super().__init__(**kwargs) + if dim % 2 != 0: + raise ValueError(f"Embedding dimension `dim` ({dim}) must be even for InterleavedRoPE.") self.dim = dim self.max_seq_len = max_seq_len - self.rotary_emb = RotaryEmbedding(dim, max_seq_len) + # Instantiate the RotaryEmbedding layer + # Ensure the name is consistent if needed for saving/loading + self.rotary_emb = RotaryEmbedding(dim, max_seq_len, name="rotary_embedding") def call(self, x): - batch_size = tf.shape(x)[0] - seq_len = 
tf.shape(x)[1] - - sin, cos = self.rotary_emb(x, seq_len) - x = apply_rotary_pos_emb(x, sin, cos) - return x + # Get sin and cos from the RotaryEmbedding layer's call method + # *** Pass only 'x'. RotaryEmbedding calculates seq_len internally. *** + sin, cos = self.rotary_emb(x) + + # Apply the positional embeddings + x_embedded = apply_rotary_pos_emb(x, sin, cos) + return x_embedded + + def get_config(self): + config = super().get_config() + config.update({ + "dim": self.dim, + "max_seq_len": self.max_seq_len, + }) + # Keras handles nested layer serialization automatically + return config + + @classmethod + def from_config(cls, config): + # Keras handles nested layer restoration automatically + return cls(**config) + From 4806719ace6570b16f1ebfc21c2ad30c4dee3c25 Mon Sep 17 00:00:00 2001 From: David Thrower Date: Thu, 10 Apr 2025 22:13:40 -0400 Subject: [PATCH 096/100] Update automerge.yml Run a full CICD run. --- .github/workflows/automerge.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/automerge.yml b/.github/workflows/automerge.yml index 72b8417..596dbc0 100644 --- a/.github/workflows/automerge.yml +++ b/.github/workflows/automerge.yml @@ -40,9 +40,9 @@ jobs: # - name: Test text classifier - random search - ham-spam # run: python3 text-class-ham-or-spam.py # timeout-minutes: 90 - # - name: Test image classifier - small subset of CIFAR10 # add back - # timeout-minutes: 90 - # run: python3 cifar10-example.py + - name: Test image classifier - small subset of CIFAR10 # add back + timeout-minutes: 90 + run: python3 cifar10-example.py - name: Phishing email detection with GPT2 embedding timeout-minutes: 420 run: python3 phishing_email_detection_gpt2.py From 783368ee6c7d3c1cf2b54c8e2aec8b331b4fd0d9 Mon Sep 17 00:00:00 2001 From: David Thrower Date: Thu, 10 Apr 2025 22:58:38 -0400 Subject: [PATCH 097/100] Update cifar10-example.py AI suggested tf 2.15.0 -> 2.19.0 compat fix. --- cifar10-example.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cifar10-example.py b/cifar10-example.py index 13bd2a4..f890cb6 100644 --- a/cifar10-example.py +++ b/cifar10-example.py @@ -88,8 +88,8 @@ def make_dataset(dataset): last_relevant_layer = base_new.layers[-2] # last_relevant_layer_extracted = last_relevant_layer #.output[0][0][0] -base_embedding = tf.keras.Model(inputs=base_new.layers[0].input, - outputs=last_relevant_layer.output) +base_embedding = tf.keras.Model(inputs=base_new.input, + outputs=last_relevant_layer.output) image_input_0 = tf.keras.layers.Input(shape=INPUT_SHAPES[0]) From 3b9ffc0f2150cd3073390795101ca6c4b2b5b7ce Mon Sep 17 00:00:00 2001 From: David Thrower Date: Fri, 11 Apr 2025 08:20:27 -0400 Subject: [PATCH 098/100] Update phishing_email_detection_gpt2.py Add back the baseline GPT2 task. 
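Note (illustrative sketch, not part of this commit): the RotaryEmbedding rework in patches 094 and 095 moves all sin/cos computation into call() so that every tensor is created inside the active graph under TF 2.19.0. Numerically, what those layers apply reduces to the standard interleaved RoPE rotation; a minimal sketch, assuming the pairwise interleaving that this script's split_alternate/rotate_half helpers implement:

import tensorflow as tf

dim, seq_len, temperature = 4, 3, 10000.0

# Same frequency schedule as RotaryEmbedding.call()
inv_freq = 1.0 / (temperature ** (tf.range(0, dim, 2, dtype=tf.float32) / dim))
angles = tf.einsum("i,j->ij", tf.range(seq_len, dtype=tf.float32), inv_freq)
sin = tf.repeat(tf.sin(angles), 2, axis=-1)  # [seq_len, dim], pattern s0,s0,s1,s1
cos = tf.repeat(tf.cos(angles), 2, axis=-1)

x = tf.random.normal([1, seq_len, dim])
x_even, x_odd = x[..., 0::2], x[..., 1::2]  # interleaved pairs (x0,x1), (x2,x3), ...
# Rotate each pair by its position-dependent angle:
# (a, b) -> (a*cos - b*sin, b*cos + a*sin)
rotated = tf.reshape(tf.stack([-x_odd, x_even], axis=-1), tf.shape(x))
x_rope = x * cos + rotated * sin
print(x_rope.shape)  # (1, 3, 4)
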
--- phishing_email_detection_gpt2.py | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/phishing_email_detection_gpt2.py b/phishing_email_detection_gpt2.py index ebf3dc8..9bc4894 100644 --- a/phishing_email_detection_gpt2.py +++ b/phishing_email_detection_gpt2.py @@ -8,11 +8,6 @@ ## GPT2 + Cerebros for Phishing email detection -Initialization -def rotate_half(x): - x = split_alternate(x) - rotated_x = tf.concat([-x[..., x.shape[-1]//2:], x[..., :x.shape[-1]//2]], axis=-1) - return tf.reshape(rotated_x, tf.shape(x))""" import tensorflow as tf import tensorflow_text @@ -89,7 +84,7 @@ def rotate_half(x): """### A custom GPT2 encoder layer for text embedding""" -""" + class GPT2Layer(tf.keras.layers.Layer): def __init__(self, max_seq_length, **kwargs): @@ -187,7 +182,7 @@ def from_config(cls, config): hy_df = pd.DataFrame(history.history) print(hy_df) -""" + ### Cerebros model: @@ -526,7 +521,7 @@ def from_config(cls, config): cerebros_time_per_model = cerebros_time_all_models_min / models_tried print(f"Cerebros trained {models_tried} models FROM A COLD START in ONLY {cerebros_time_all_models_min} min. Cerebros took only {cerebros_time_per_model} minutes on average per model.") -# print(f"GPT2 took {gpt_time_on_one_model_min} just to FINE TUNE one PRE - TRAINED model for 3 epochs. Although this is a small scale test, this shows the advantage of scaling in ON timing VS ON**2 timing.") +print(f"GPT2 took {gpt_time_on_one_model_min} just to FINE TUNE one PRE - TRAINED model for 3 epochs. Although this is a small scale test, this shows the advantage of scaling in ON timing VS ON**2 timing.") print(f'Cerebros best accuracy achieved is {result}') From 97591d3fea5c0db3ec30b98c47befc6029ad45a4 Mon Sep 17 00:00:00 2001 From: David Thrower Date: Fri, 11 Apr 2025 08:25:53 -0400 Subject: [PATCH 099/100] Update phishing_email_detection_gpt2.py Fix a typo in string termination ... --- phishing_email_detection_gpt2.py | 1 + 1 file changed, 1 insertion(+) diff --git a/phishing_email_detection_gpt2.py b/phishing_email_detection_gpt2.py index 9bc4894..0c94f77 100644 --- a/phishing_email_detection_gpt2.py +++ b/phishing_email_detection_gpt2.py @@ -5,6 +5,7 @@ Original file is located at https://colab.research.google.com/drive/10KKTHjBkdfKBpT9OLIj2eZs533BuCS6h +""" ## GPT2 + Cerebros for Phishing email detection From 1de46fce7d6bce5c4ec48948daee4f17ffcd860c Mon Sep 17 00:00:00 2001 From: David Thrower Date: Fri, 11 Apr 2025 18:39:48 -0400 Subject: [PATCH 100/100] Update automerge.yml Uncommented out CICD test that was left commented out by error. --- .github/workflows/automerge.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/automerge.yml b/.github/workflows/automerge.yml index 596dbc0..d11affb 100644 --- a/.github/workflows/automerge.yml +++ b/.github/workflows/automerge.yml @@ -35,8 +35,8 @@ jobs: # flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics - name: Test distributed random search Ames by running run: python3 regression-example-ames-no-preproc.py - # - name: Test distributed random search Ames by running - Val set - # run: python3 regression-example-ames-no-preproc-val-set.py + - name: Test distributed random search Ames by running - Val set + run: python3 regression-example-ames-no-preproc-val-set.py # - name: Test text classifier - random search - ham-spam # run: python3 text-class-ham-or-spam.py # timeout-minutes: 90