diff --git a/.github/workflows/automerge.yml b/.github/workflows/automerge.yml
index 29b75cc..d11affb 100644
--- a/.github/workflows/automerge.yml
+++ b/.github/workflows/automerge.yml
@@ -5,7 +5,7 @@ name: Python application
 
 on:
   push:
-    branches: [ "main", "148-tensorflow-upgrades" ]
+    branches: [ "main", "171-upgrade-tf-2190" ]
 
 permissions:
   contents: read
@@ -33,25 +33,16 @@ jobs:
         flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics
         # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide
         # flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics
-#    - name: Test by running.
-#      run: python3 cerebros.py
-#    - name: Test distributed by running.
-#      run: python3 test_simple_cerebros_gridsearch.py
-#    - name: Test distributed random search wine by running.
-#      run: python3 random_search.py
-#    - name: Test CerebrosRealNeuronNetwork
-#      run: python3 realnn-regression-example-ames-no-preproc.py
-#      timeout-minutes: 45
     - name: Test distributed random search Ames by running
       run: python3 regression-example-ames-no-preproc.py
     - name: Test distributed random search Ames by running - Val set
       run: python3 regression-example-ames-no-preproc-val-set.py
-    - name: Test text classifier - random search - ham-spam
-      run: python3 text-class-ham-or-spam.py
-      timeout-minutes: 90
+    # - name: Test text classifier - random search - ham-spam
+    #   run: python3 text-class-ham-or-spam.py
+    #   timeout-minutes: 90
+    - name: Test image classifier - small subset of CIFAR10 # add back timeout-minutes: 90
       run: python3 cifar10-example.py
     - name: Phishing email detection with GPT2 embedding
-      timeout-minutes: 120
+      timeout-minutes: 420
       run: python3 phishing_email_detection_gpt2.py
diff --git a/cerebros/neuralnetworkfuture/neural_network_future.py b/cerebros/neuralnetworkfuture/neural_network_future.py
index b91adf6..4643794 100644
--- a/cerebros/neuralnetworkfuture/neural_network_future.py
+++ b/cerebros/neuralnetworkfuture/neural_network_future.py
@@ -332,8 +332,10 @@ def compile_neural_network(self):
         self.materialized_neural_network.compile(
             loss=self.loss,
             metrics=self.metrics,
-            optimizer=tf.keras.optimizers.Adam(
-                learning_rate=self.learning_rate),
+            optimizer=tf.keras.optimizers.AdamW(
+                learning_rate=self.learning_rate,
+                weight_decay=0.004  # Add weight decay parameter
+            ),
             jit_compile=jit_compile)
 
     def util_parse_connectivity_csv(self):
diff --git a/cerebros/simplecerebrosrandomsearch/simple_cerebros_random_search.py b/cerebros/simplecerebrosrandomsearch/simple_cerebros_random_search.py
index 125582c..0874e99 100644
--- a/cerebros/simplecerebrosrandomsearch/simple_cerebros_random_search.py
+++ b/cerebros/simplecerebrosrandomsearch/simple_cerebros_random_search.py
@@ -2,6 +2,7 @@
 import numpy as np
 import pandas as pd
 import tensorflow as tf
+from tqdm import tqdm
 from cerebros.denseautomlstructuralcomponent.\
     dense_automl_structural_component \
     import DenseAutoMlStructuralComponent, DenseLateralConnectivity, \
@@ -519,7 +520,10 @@ def run_moity_permutations(self, spec, subtrial_number, lock):
 
     def run_random_search(self):
         processes = []
-        for i in np.arange(self.number_of_architecture_moities_to_try):
+        for i in tqdm(np.arange(self.number_of_architecture_moities_to_try),
+                      desc="Global task progress",
+                      colour="#16ceeb"):
+
             self.parse_neural_network_structural_spec_random()
             spec = self.get_neural_network_spec()
diff --git a/cicd-requirements.txt b/cicd-requirements.txt
index 749d089..3bc545e 100644
--- a/cicd-requirements.txt
+++ b/cicd-requirements.txt
@@ -1,5 +1,6 @@
 matplotlib==3.8.4
-tensorflow-text==2.15.0
+tensorflow-text==2.19.0
 keras-nlp==0.9.1
 scikit-learn==1.4.1.post1
 tensorflow-hub==0.16.1
+transformers==4.51.1
diff --git a/cifar10-example.py b/cifar10-example.py
index 13bd2a4..f890cb6 100644
--- a/cifar10-example.py
+++ b/cifar10-example.py
@@ -88,8 +88,8 @@ def make_dataset(dataset):
 
 last_relevant_layer = base_new.layers[-2]
 # last_relevant_layer_extracted = last_relevant_layer #.output[0][0][0]
-base_embedding = tf.keras.Model(inputs=base_new.layers[0].input,
-                                outputs=last_relevant_layer.output)
+base_embedding = tf.keras.Model(inputs=base_new.input,
+                                outputs=last_relevant_layer.output)
 
 
 image_input_0 = tf.keras.layers.Input(shape=INPUT_SHAPES[0])
diff --git a/phishing_email_detection_gpt2.py b/phishing_email_detection_gpt2.py
index 9920ce9..0c94f77 100644
--- a/phishing_email_detection_gpt2.py
+++ b/phishing_email_detection_gpt2.py
@@ -5,15 +5,16 @@
 
 Original file is located at
     https://colab.research.google.com/drive/10KKTHjBkdfKBpT9OLIj2eZs533BuCS6h
+"""
 
 ## GPT2 + Cerebros for Phishing email detection
 
-Initialization
-"""
 
 import tensorflow as tf
 import tensorflow_text
 from keras_nlp.models import GPT2Tokenizer, GPT2Preprocessor, GPT2Backbone
+from keras_nlp.layers import PositionEmbedding
+from transformers import AutoTokenizer
 from sklearn.model_selection import train_test_split
 from sklearn.utils import shuffle
 from tensorflow.keras.utils import to_categorical
@@ -29,6 +30,8 @@ from cerebros.denseautomlstructuralcomponent.dense_automl_structural_component\
     import zero_7_exp_decay, zero_95_exp_decay, simple_sigmoid
 from ast import literal_eval
+import time
+
 
 #
 # Load the email data
 #
@@ -65,8 +68,15 @@
 #
 # Tensors for training data and labels
 #
-training_x = [tf.constant(X_train)]
-train_labels = [tf.constant(y_train)]
+
+# Training data for baseline model
+baseline_train_x = tf.constant(X_train, dtype=tf.string)
+baseline_train_y = tf.constant(y_train, dtype=tf.int8)
+
+# Packaged for Cerebros (multimodal, takes inputs as a list)
+training_x = [baseline_train_x]
+train_labels = [baseline_train_y]
+
 #
 # Input and output shapes
 #
@@ -75,6 +85,7 @@
 
 """### A custom GPT2 encoder layer for text embedding"""
 
+
 class GPT2Layer(tf.keras.layers.Layer):
 
     def __init__(self, max_seq_length, **kwargs):
@@ -90,9 +101,9 @@ def __init__(self, max_seq_length, **kwargs):
         # Set whether the GPT2 model's layers are trainable
         #self.encoder.trainable = False
         for layer in self.encoder.layers:
-            layer.trainable = False
+            layer.trainable = True
         #
-        self.encoder.layers[-2].trainable = True
+        # self.encoder.layers[-2].trainable = True
         #
         # Set the maximum sequence length for tokenization
         self.max_seq_length = max_seq_length
@@ -121,30 +132,328 @@ def from_config(cls, config):
 
 # GPT2 configurables
 max_seq_length = 96
-# Base model
+# GPT Baseline Model
 input_layer = Input(shape=(), dtype=tf.string)
 gpt2_layer = GPT2Layer(max_seq_length)(input_layer)
 #output = Flatten()(gpt2_layer)
-base_model = Model(inputs=input_layer, outputs=gpt2_layer)
-base_model.summary()
+binary_output = tf.keras.layers.Dense(1, activation='sigmoid')(gpt2_layer)
+
+gpt_baseline_model = Model(inputs=input_layer, outputs=binary_output)
+
+
+gpt_baseline_model.compile(
+    optimizer=Adam(learning_rate=1e-4),  # Small LR since we're fine-tuning GPT
+    loss='binary_crossentropy',
+    # metrics=['accuracy', tf.keras.metrics.AUC(name='auc')]
+    metrics=[tf.keras.metrics.BinaryAccuracy(),
+             tf.keras.metrics.Precision(),
+             tf.keras.metrics.Recall()]
+)
+
+gpt_t0 = time.time()
+
+gpt_baseline_model.summary()
+
+history = gpt_baseline_model.fit(
+    x=X_train,             # Input data
+    y=y_train,             # Labels
+    epochs=3,              # Number of training iterations
+    batch_size=16,         # Batch size small due to GPU memory constraints
+    validation_split=0.2,  # Hold out 20% of training data for validation
+    shuffle=True,          # Shuffle data at each epoch
+    callbacks=[
+        tf.keras.callbacks.EarlyStopping(
+            monitor='val_loss',
+            patience=3,
+            restore_best_weights=True,
+            min_delta=0.001
+        ),
+        tf.keras.callbacks.ReduceLROnPlateau(
+            monitor='val_loss',
+            factor=0.2,
+            patience=2,
+            min_lr=1e-6
+        )
+    ]
+)
+
+gpt_t1 = time.time()
+gpt_time_on_one_model_min = (gpt_t1 - gpt_t0) / 60
+
+hy_df = pd.DataFrame(history.history)
+print(hy_df)
+
+
+
+### Cerebros model:
+
+from transformers import AutoTokenizer
+import tensorflow as tf
+
+
+class NewTokenizerLayer(tf.keras.layers.Layer):
+    def __init__(self, max_seq_length, tokenizer_checkpoint, **kwargs):
+        super().__init__(**kwargs)
+        self.max_seq_length = max_seq_length
+        self.tokenizer_checkpoint = tokenizer_checkpoint
+        self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_checkpoint)
+
+        # Ensure tokenizer has a padding token
+        if self.tokenizer.pad_token is None:
+            self.tokenizer.pad_token = self.tokenizer.eos_token
+
+    def call(self, inputs):
+        def tokenize_py_fn(inputs):
+            # Convert TensorFlow bytes to Python strings
+            texts = [text.decode('utf-8') for text in inputs.numpy()]
+
+            # Tokenize with Hugging Face tokenizer
+            tokenized = self.tokenizer(
+                texts,
+                max_length=self.max_seq_length,
+                padding='max_length',
+                truncation=True,
+                return_tensors='tf'
+            )
+            return tokenized['input_ids'].numpy()
+
+        # Wrap Python function in TensorFlow operation
+        input_ids = tf.py_function(
+            tokenize_py_fn,
+            [inputs],
+            Tout=tf.int32
+        )
+
+        # Set shape for downstream layers
+        batch_size = tf.shape(inputs)[0]
+        input_ids.set_shape([None, self.max_seq_length])
+
+        return input_ids
+
+    def get_config(self):
+        config = super().get_config()
+        config.update({
+            'max_seq_length': self.max_seq_length,
+            'tokenizer_checkpoint': self.tokenizer_checkpoint
+        })
+        return config
+
+    @classmethod
+    def from_config(cls, config):
+        return cls(
+            max_seq_length=config['max_seq_length'],
+            tokenizer_checkpoint=config['tokenizer_checkpoint']
+        )
+
+
+
+
+# --- Updated RotaryEmbedding ---
+class RotaryEmbedding(tf.keras.layers.Layer):
+    def __init__(self, dim, max_seq_len=1024, temperature=10000.0, **kwargs):
+        super().__init__(**kwargs)
+        self.dim = dim
+        # Ensure dim is even right at initialization
+        if self.dim % 2 != 0:
+            raise ValueError(f"Embedding dimension `dim` ({self.dim}) must be even for RotaryEmbedding.")
+        self.max_seq_len = max_seq_len
+        self.temperature = temperature
+        # *** No calculation or storage of inv_freq here or in build ***
+
+    def build(self, input_shape):
+        # Build should primarily be for creating trainable weights, which we don't have.
+        # Call super().build() for Keras compatibility.
+        super().build(input_shape)
+
+    def call(self, x):  # Removed seq_len argument, calculate from x
+        shape = tf.shape(x)
+        batch_size = shape[0]
+        actual_seq_len = shape[1]
+
+        # *** Calculate inv_freq inside call ***
+        inv_freq_base = tf.range(0, self.dim, 2, dtype=tf.float32)
+        inv_freq = 1.0 / (self.temperature ** (inv_freq_base / self.dim))
+        # Ensure inv_freq has the correct shape [dim/2]
+        inv_freq = tf.cast(inv_freq, dtype=x.dtype)  # Match dtype early
+
+        # Use actual_seq_len for calculations
+        position = tf.range(actual_seq_len, dtype=x.dtype)  # Match dtype
+
+        # Calculate sinusoid input using einsum or broadcasting
+        # Einsum approach: Ensure correct dimensions [seq_len, dim/2]
+        sinusoid_inp = tf.einsum("i,j->ij", position, inv_freq)
+
+        # Calculate sin and cos based on the actual sequence length
+        sin = tf.sin(sinusoid_inp)
+        cos = tf.cos(sinusoid_inp)
+
+        # Repeat sin/cos for interleaving: [a, b] -> [a, a, b, b]
+        # Result needs shape [actual_seq_len, dim]
+        sin = tf.repeat(sin, 2, axis=-1)
+        cos = tf.repeat(cos, 2, axis=-1)
+
+        # Expand dims for batch and tile
+        # Output shape needs to be [batch_size, actual_seq_len, dim]
+        # Add batch dimension: [1, actual_seq_len, dim]
+        sin = tf.expand_dims(sin, axis=0)
+        cos = tf.expand_dims(cos, axis=0)
+
+        # Tile to match the batch size: [batch_size, actual_seq_len, dim]
+        sin = tf.tile(sin, [batch_size, 1, 1])
+        cos = tf.tile(cos, [batch_size, 1, 1])
+
+        # Casting to x.dtype was already done for inv_freq, sin/cos will inherit
+        # sin = tf.cast(sin, x.dtype)  # Already done via calculation chain
+        # cos = tf.cast(cos, x.dtype)  # Already done via calculation chain
+
+        # Return sin and cos needed by InterleavedRoPE
+        return sin, cos
+
+    def get_config(self):
+        config = super().get_config()
+        config.update({
+            "dim": self.dim,
+            "max_seq_len": self.max_seq_len,
+            "temperature": self.temperature,
+        })
+        return config
+
+    @classmethod
+    def from_config(cls, config):
+        return cls(**config)
+
+
+
+
+
+def split_alternate(x):
+    shape = tf.shape(x)
+    x = tf.reshape(x, [shape[0], shape[1], shape[2] // 2, 2])
+    x = tf.transpose(x, [0, 1, 3, 2])
+    x = tf.reshape(x, [shape[0], shape[1], -1])
+    return x
+
+
+def rotate_half(x):
+    x = split_alternate(x)
+    d = tf.shape(x)[-1]
+    rotated_x = tf.concat([-x[..., d//2:], x[..., :d//2]], axis=-1)
+    return tf.reshape(rotated_x, tf.shape(x))
+
+
+def apply_rotary_pos_emb(x, sin, cos):
+    cos = tf.reshape(cos, [tf.shape(cos)[0], tf.shape(cos)[1], -1])
+    sin = tf.reshape(sin, [tf.shape(sin)[0], tf.shape(sin)[1], -1])
+    x_rotated = x * cos + rotate_half(x) * sin
+    return x_rotated
+
+
+class InterleavedRoPE(tf.keras.layers.Layer):
+    def __init__(self, dim, max_seq_len=1024, **kwargs):
+        super().__init__(**kwargs)
+        if dim % 2 != 0:
+            raise ValueError(f"Embedding dimension `dim` ({dim}) must be even for InterleavedRoPE.")
+        self.dim = dim
+        self.max_seq_len = max_seq_len
+        # Instantiate the RotaryEmbedding layer
+        # Ensure the name is consistent if needed for saving/loading
+        self.rotary_emb = RotaryEmbedding(dim, max_seq_len, name="rotary_embedding")
+
+    def call(self, x):
+        # Get sin and cos from the RotaryEmbedding layer's call method
+        # *** Pass only 'x'. RotaryEmbedding calculates seq_len internally. ***
+        sin, cos = self.rotary_emb(x)
+
+        # Apply the positional embeddings
+        x_embedded = apply_rotary_pos_emb(x, sin, cos)
+        return x_embedded
+
+    def get_config(self):
+        config = super().get_config()
+        config.update({
+            "dim": self.dim,
+            "max_seq_len": self.max_seq_len,
+        })
+        # Keras handles nested layer serialization automatically
+        return config
+
+    @classmethod
+    def from_config(cls, config):
+        # Keras handles nested layer restoration automatically
+        return cls(**config)
+
+
+
+
+
+
+
+# Tokenizer configurables
+
+# Optimal for accuracy thus far:
+max_seq_length = 1536
+tokenizer_checkpoint = "HuggingFaceTB/SmolLM2-1.7B-Instruct"
+
+inp = tf.keras.layers.Input(shape=(), dtype=tf.string)
+gp2_tokenizer = NewTokenizerLayer(max_seq_length=max_seq_length, tokenizer_checkpoint=tokenizer_checkpoint)
+VOCABULARY_SIZE = gp2_tokenizer.tokenizer.vocab_size
+tokens = gp2_tokenizer(inp)
+
+# On larger hardware, this could probably be increased considerably and
+# probably would improve performance ...
+EMBEDDING_N = 12  # Define EMBEDDING_DIM here, to match your embedding layer.
+EMBEDDING_DIM = int(EMBEDDING_N * 2)
+
+embedded = tf.keras.layers.Embedding(
+    input_dim=VOCABULARY_SIZE,
+    output_dim=EMBEDDING_DIM,
+    input_length=max_seq_length,
+    mask_zero=True)(tokens)
+
+position_embedding = InterleavedRoPE(
+    dim=EMBEDDING_DIM,
+    max_seq_len=max_seq_length,
+    # initializer="uniform",
+)(embedded)
+
+# As an FYI, we tried an add layer both with and without
+# LayerNorm ... It degraded accuracy.
+# Just an FYI for anyone trying to apply conventional wisdom,
+# to save you the time ...
+x = tf.keras.layers.Concatenate()([embedded, position_embedding])
+x = tf.keras.layers.Dropout(0.4)(x)  # AI suggested 0.4
+flattened = tf.keras.layers.Flatten()(x)
+
+cerebros_base_model = tf.keras.Model(
+    inputs=inp,
+    outputs=flattened  # Output enhanced embeddings now
+)
+
 
 """### Cerebros search for the best model"""
 
 #
 # Cerebros configurables
 #
-activation = 'gelu'
-predecessor_level_connection_affinity_factor_first = 49.9999
-predecessor_level_connection_affinity_factor_main = 0.31456
-max_consecutive_lateral_connections = 22
-p_lateral_connection = 0.39256
-num_lateral_connection_tries_per_unit = 10
-learning_rate = 0.0000511065
-epochs = 6  # [1, 100]
-batch_size = 13
-maximum_levels = 4  # [3,7]
-maximum_units_per_level = 8  # [2,10]
-maximum_neurons_per_unit = 5  # [2,20]
+activation = "relu"
+predecessor_level_connection_affinity_factor_first = 10
+predecessor_level_connection_affinity_factor_main = 40
+max_consecutive_lateral_connections = 20
+p_lateral_connection = 30
+num_lateral_connection_tries_per_unit = 25
+learning_rate = 3 * 10 ** -3
+epochs = 15
+batch_size = 17
+minimum_levels = 2
+maximum_levels = 2  # [3,7]
+
+minimum_units_per_level = 4
+maximum_units_per_level = 7
+
+minimum_neurons_per_unit = 1
+maximum_neurons_per_unit = 2
+
+moities_to_try = 5
+tries_per_moity = 1
 
 #
 # Logging
@@ -157,6 +466,7 @@ def from_config(cls, config):
 
 meta_trial_number = 42  # irrelevant unless in distributed training
 
+
 cerebros_automl = SimpleCerebrosRandomSearch(
     unit_type=DenseUnit,
     input_shapes=INPUT_SHAPES,
@@ -166,16 +476,16 @@ def from_config(cls, config):
     validation_split=0.35,
     direction='maximize',
     metric_to_rank_by="val_binary_accuracy",
-    minimum_levels=2,
+    minimum_levels=minimum_levels,
     maximum_levels=maximum_levels,
-    minimum_units_per_level=1,
+    minimum_units_per_level=minimum_units_per_level,
     maximum_units_per_level=maximum_units_per_level,
-    minimum_neurons_per_unit=1,
+    minimum_neurons_per_unit=minimum_neurons_per_unit,
     maximum_neurons_per_unit=maximum_neurons_per_unit,
     activation=activation,
     final_activation='sigmoid',
-    number_of_architecture_moities_to_try=2,
-    number_of_tries_per_architecture_moity=1,
+    number_of_architecture_moities_to_try=moities_to_try,
+    number_of_tries_per_architecture_moity=tries_per_moity,
     minimum_skip_connection_depth=1,
     maximum_skip_connection_depth=7,
     predecessor_level_connection_affinity_factor_first=predecessor_level_connection_affinity_factor_first,
@@ -191,31 +501,31 @@ def from_config(cls, config):
     p_lateral_connection_decay=zero_95_exp_decay,
     num_lateral_connection_tries_per_unit=num_lateral_connection_tries_per_unit,
     learning_rate=learning_rate,
-    loss=tf.keras.losses.CategoricalHinge(),
-    metrics=[tf.keras.metrics.BinaryAccuracy(),
-             tf.keras.metrics.Precision(),
-             tf.keras.metrics.Recall()],
+    loss=tf.keras.losses.BinaryCrossentropy(),
+    # loss=tf.keras.losses.CategoricalHinge(),
+    metrics=[tf.keras.metrics.BinaryAccuracy(),
+             tf.keras.metrics.Precision(),
+             tf.keras.metrics.Recall()],
     epochs=epochs,
     project_name=f"{PROJECT_NAME}_meta_{meta_trial_number}",
     model_graphs='model_graphs',
     batch_size=batch_size,
     meta_trial_number=meta_trial_number,
-    base_models=[base_model],
+    base_models=[cerebros_base_model],
     train_data_dtype=tf.string)
 
+cerebros_t0 = time.time()
 result = cerebros_automl.run_random_search()
+cerebros_t1 = time.time()
+cerebros_time_all_models_min = (cerebros_t1 - cerebros_t0) / 60
+models_tried = moities_to_try * tries_per_moity
+cerebros_time_per_model = cerebros_time_all_models_min / models_tried
 
-print(f'Best accuracy achieved is {result}')
-print(f'binary accuracy')
+print(f"Cerebros trained {models_tried} models FROM A COLD START in ONLY {cerebros_time_all_models_min} min. Cerebros took only {cerebros_time_per_model} minutes on average per model.")
+print(f"GPT2 took {gpt_time_on_one_model_min} min just to FINE TUNE one PRE-TRAINED model for 3 epochs. Although this is a small-scale test, it shows the advantage of O(N) scaling vs O(N**2) scaling.")
 
-"""### Testing the best model found"""
-#
-# Load the best model (taking into account that it has a custom layer)
-#
-best_model_found =\
-tf.keras.models.load_model(cerebros_automl.best_model_path,\
-custom_objects={'GPT2Layer': GPT2Layer(max_seq_length)})
+print(f'Cerebros best accuracy achieved is {result}')
+print('(val set binary accuracy)')
 
-print('Evaluating on the test dataset')
-best_model_found.evaluate(X_test, y_test)
+# """### Testing the best model found"""
diff --git a/requirements.txt b/requirements.txt
index 1964f13..e1e3563 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,10 +1,11 @@
-jax==0.4.26
-jaxlib==0.4.26
+jax==0.5.3
+jaxlib==0.5.3
 pendulum==3.0.0
-tensorflow==2.15.0
+tensorflow==2.19.0
 numpy==1.26.4
 pandas==2.2.1
 pyvis==0.3.2
 plotly==5.20.0
 matplotlib==3.8.4
 imageio==2.34.0
+tqdm==4.67.1