diff --git a/.github/workflows/automerge.yml b/.github/workflows/automerge.yml
index 29b75cc..d11affb 100644
--- a/.github/workflows/automerge.yml
+++ b/.github/workflows/automerge.yml
@@ -5,7 +5,7 @@ name: Python application
 
 on:
   push:
-    branches: [ "main", "148-tensorflow-upgrades" ]
+    branches: [ "main", "171-upgrade-tf-2190" ]
 
 permissions:
   contents: read
@@ -33,25 +33,16 @@ jobs:
         flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics
         # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide
         # flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics
-#    - name: Test by running.
-#      run: python3 cerebros.py
-#    - name: Test distributed by running.
-#      run: python3 test_simple_cerebros_gridsearch.py
-#    - name: Test distributed random search wine by running.
-#      run: python3 random_search.py
-#    - name: Test CerebrosRealNeuronNetwork
-#      run: python3 realnn-regression-example-ames-no-preproc.py
-#      timeout-minutes: 45
     - name: Test distributed random search Ames by running
       run: python3 regression-example-ames-no-preproc.py
     - name: Test distributed random search Ames by running - Val set
       run: python3 regression-example-ames-no-preproc-val-set.py
-    - name: Test text classifier - random search - ham-spam
-      run: python3 text-class-ham-or-spam.py
-      timeout-minutes: 90
+    # - name: Test text classifier - random search - ham-spam
+    #   run: python3 text-class-ham-or-spam.py
+    #   timeout-minutes: 90
+    - name: Test image classifier - small subset of CIFAR10 # add back timeout-minutes: 90
       run: python3 cifar10-example.py
     - name: Phishing email detection with GPT2 embedding
-      timeout-minutes: 120
+      timeout-minutes: 420
       run: python3 phishing_email_detection_gpt2.py
diff --git a/cerebros/neuralnetworkfuture/neural_network_future.py b/cerebros/neuralnetworkfuture/neural_network_future.py
index b91adf6..4643794 100644
--- a/cerebros/neuralnetworkfuture/neural_network_future.py
+++ b/cerebros/neuralnetworkfuture/neural_network_future.py
@@ -332,8 +332,10 @@ def compile_neural_network(self):
         self.materialized_neural_network.compile(
             loss=self.loss,
             metrics=self.metrics,
-            optimizer=tf.keras.optimizers.Adam(
-                learning_rate=self.learning_rate),
+            optimizer=tf.keras.optimizers.AdamW(
+                learning_rate=self.learning_rate,
+                weight_decay=0.004  # Add weight decay parameter
+            ),
             jit_compile=jit_compile)
 
     def util_parse_connectivity_csv(self):
diff --git a/cerebros/simplecerebrosrandomsearch/simple_cerebros_random_search.py b/cerebros/simplecerebrosrandomsearch/simple_cerebros_random_search.py
index 125582c..0874e99 100644
--- a/cerebros/simplecerebrosrandomsearch/simple_cerebros_random_search.py
+++ b/cerebros/simplecerebrosrandomsearch/simple_cerebros_random_search.py
@@ -2,6 +2,7 @@
 import numpy as np
 import pandas as pd
 import tensorflow as tf
+from tqdm import tqdm
 from cerebros.denseautomlstructuralcomponent.\
     dense_automl_structural_component \
     import DenseAutoMlStructuralComponent, DenseLateralConnectivity, \
@@ -519,7 +520,10 @@ def run_moity_permutations(self, spec, subtrial_number, lock):
 
     def run_random_search(self):
         processes = []
-        for i in np.arange(self.number_of_architecture_moities_to_try):
+        for i in tqdm(np.arange(self.number_of_architecture_moities_to_try),
+                      desc="Global task progress",
+                      colour="#16ceeb"):
+
             self.parse_neural_network_structural_spec_random()
             spec = self.get_neural_network_spec()
diff --git a/cicd-requirements.txt b/cicd-requirements.txt
index 749d089..3bc545e 100644
--- a/cicd-requirements.txt
+++ b/cicd-requirements.txt
@@ -1,5 +1,6 @@
 matplotlib==3.8.4
-tensorflow-text==2.15.0
+tensorflow-text==2.19.0
 keras-nlp==0.9.1
 scikit-learn==1.4.1.post1
 tensorflow-hub==0.16.1
+transformers==4.51.1
diff --git a/cifar10-example.py b/cifar10-example.py
index 13bd2a4..f890cb6 100644
--- a/cifar10-example.py
+++ b/cifar10-example.py
@@ -88,8 +88,8 @@ def make_dataset(dataset):
 
 last_relevant_layer = base_new.layers[-2]
 # last_relevant_layer_extracted = last_relevant_layer #.output[0][0][0]
-base_embedding = tf.keras.Model(inputs=base_new.layers[0].input,
-                                outputs=last_relevant_layer.output)
+base_embedding = tf.keras.Model(inputs=base_new.input,
+                                outputs=last_relevant_layer.output)
 
 
 image_input_0 = tf.keras.layers.Input(shape=INPUT_SHAPES[0])
diff --git a/phishing_email_detection_gpt2.py b/phishing_email_detection_gpt2.py
index 9920ce9..0c94f77 100644
--- a/phishing_email_detection_gpt2.py
+++ b/phishing_email_detection_gpt2.py
@@ -5,15 +5,16 @@
 
 Original file is located at
     https://colab.research.google.com/drive/10KKTHjBkdfKBpT9OLIj2eZs533BuCS6h
+"""
 
 ## GPT2 + Cerebros for Phishing email detection
 
-Initialization
-"""
 
 import tensorflow as tf
 import tensorflow_text
 from keras_nlp.models import GPT2Tokenizer, GPT2Preprocessor, GPT2Backbone
+from keras_nlp.layers import PositionEmbedding
+from transformers import AutoTokenizer
 from sklearn.model_selection import train_test_split
 from sklearn.utils import shuffle
 from tensorflow.keras.utils import to_categorical
@@ -29,6 +30,8 @@ from cerebros.denseautomlstructuralcomponent.dense_automl_structural_component\
     import zero_7_exp_decay, zero_95_exp_decay, simple_sigmoid
 from ast import literal_eval
+import time
+
 
 #
 # Load the email data
 #
@@ -65,8 +68,15 @@
 #
 # Tensors for training data and labels
 #
-training_x = [tf.constant(X_train)]
-train_labels = [tf.constant(y_train)]
+
+# Training data for baseline model
+baseline_train_x = tf.constant(X_train, dtype=tf.string)
+baseline_train_y = tf.constant(y_train, dtype=tf.int8)
+
+# Packaged for Cerebros (multimodal, takes inputs as a list)
+training_x = [baseline_train_x]
+train_labels = [baseline_train_y]
+
 #
 # Input and output shapes
 #
@@ -75,6 +85,7 @@
 
 """### A custom GPT2 encoder layer for text embedding"""
 
+
 class GPT2Layer(tf.keras.layers.Layer):
 
     def __init__(self, max_seq_length, **kwargs):
@@ -90,9 +101,9 @@ def __init__(self, max_seq_length, **kwargs):
         # Set whether the GPT2 model's layers are trainable
         #self.encoder.trainable = False
         for layer in self.encoder.layers:
-            layer.trainable = False
+            layer.trainable = True
         #
-        self.encoder.layers[-2].trainable = True
+        # self.encoder.layers[-2].trainable = True
         #
         # Set the maximum sequence length for tokenization
         self.max_seq_length = max_seq_length
@@ -121,30 +132,328 @@ def from_config(cls, config):
 
 # GPT2 configurables
 max_seq_length = 96
-# Base model
+# GPT Baseline Model
 input_layer = Input(shape=(), dtype=tf.string)
 gpt2_layer = GPT2Layer(max_seq_length)(input_layer)
 #output = Flatten()(gpt2_layer)
-base_model = Model(inputs=input_layer, outputs=gpt2_layer)
-base_model.summary()
+binary_output = tf.keras.layers.Dense(1, activation='sigmoid')(gpt2_layer)
+
+gpt_baseline_model = Model(inputs=input_layer, outputs=binary_output)
+
+
+gpt_baseline_model.compile(
+    optimizer=Adam(learning_rate=1e-4),  # Small LR since we're fine-tuning GPT
+    loss='binary_crossentropy',
+    # metrics=['accuracy', tf.keras.metrics.AUC(name='auc')]
+    metrics=[tf.keras.metrics.BinaryAccuracy(),
+             tf.keras.metrics.Precision(),
+             tf.keras.metrics.Recall()]
+)
+
+gpt_t0 = time.time()
+
+gpt_baseline_model.summary()
+
+history = gpt_baseline_model.fit(
+    x=X_train,             # Input data
+    y=y_train,             # Labels
+    epochs=3,              # Number of training iterations
+    batch_size=16,         # Batch size small due to GPU memory constraints
+    validation_split=0.2,  # Hold out 20% of training data for validation
+    shuffle=True,          # Shuffle data at each epoch
+    callbacks=[
+        tf.keras.callbacks.EarlyStopping(
+            monitor='val_loss',
+            patience=3,
+            restore_best_weights=True,
+            min_delta=0.001
+        ),
+        tf.keras.callbacks.ReduceLROnPlateau(
+            monitor='val_loss',
+            factor=0.2,
+            patience=2,
+            min_lr=1e-6
+        )
+    ]
+)
+
+gpt_t1 = time.time()
+gpt_time_on_one_model_min = (gpt_t1 - gpt_t0) / 60
+
+hy_df = pd.DataFrame(history.history)
+print(hy_df)
+
+
+
+### Cerebros model:
+
+from transformers import AutoTokenizer
+import tensorflow as tf
+
+
+class NewTokenizerLayer(tf.keras.layers.Layer):
+    def __init__(self, max_seq_length, tokenizer_checkpoint, **kwargs):
+        super().__init__(**kwargs)
+        self.max_seq_length = max_seq_length
+        self.tokenizer_checkpoint = tokenizer_checkpoint
+        self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_checkpoint)
+
+        # Ensure tokenizer has a padding token
+        if self.tokenizer.pad_token is None:
+            self.tokenizer.pad_token = self.tokenizer.eos_token
+
+    def call(self, inputs):
+        def tokenize_py_fn(inputs):
+            # Convert TensorFlow bytes to Python strings
+            texts = [text.decode('utf-8') for text in inputs.numpy()]
+
+            # Tokenize with Hugging Face tokenizer
+            tokenized = self.tokenizer(
+                texts,
+                max_length=self.max_seq_length,
+                padding='max_length',
+                truncation=True,
+                return_tensors='tf'
+            )
+            return tokenized['input_ids'].numpy()
+
+        # Wrap Python function in TensorFlow operation
+        input_ids = tf.py_function(
+            tokenize_py_fn,
+            [inputs],
+            Tout=tf.int32
+        )
+
+        # Set shape for downstream layers
+        batch_size = tf.shape(inputs)[0]
+        input_ids.set_shape([None, self.max_seq_length])
+
+        return input_ids
+
+    def get_config(self):
+        config = super().get_config()
+        config.update({
+            'max_seq_length': self.max_seq_length,
+            'tokenizer_checkpoint': self.tokenizer_checkpoint
+        })
+        return config
+
+    @classmethod
+    def from_config(cls, config):
+        return cls(
+            max_seq_length=config['max_seq_length'],
+            tokenizer_checkpoint=config['tokenizer_checkpoint']
+        )
+
+
+
+
+# --- Updated RotaryEmbedding ---
+class RotaryEmbedding(tf.keras.layers.Layer):
+    def __init__(self, dim, max_seq_len=1024, temperature=10000.0, **kwargs):
+        super().__init__(**kwargs)
+        self.dim = dim
+        # Ensure dim is even right at initialization
+        if self.dim % 2 != 0:
+            raise ValueError(f"Embedding dimension `dim` ({self.dim}) must be even for RotaryEmbedding.")
+        self.max_seq_len = max_seq_len
+        self.temperature = temperature
+        # *** No calculation or storage of inv_freq here or in build ***
+
+    def build(self, input_shape):
+        # Build should primarily be for creating trainable weights, which we don't have.
+        # Call super().build() for Keras compatibility.
+        super().build(input_shape)
+
+    def call(self, x):  # Removed seq_len argument, calculate from x
+        shape = tf.shape(x)
+        batch_size = shape[0]
+        actual_seq_len = shape[1]
+
+        # *** Calculate inv_freq inside call ***
+        inv_freq_base = tf.range(0, self.dim, 2, dtype=tf.float32)
+        inv_freq = 1.0 / (self.temperature ** (inv_freq_base / self.dim))
+        # Ensure inv_freq has the correct shape [dim/2]
+        inv_freq = tf.cast(inv_freq, dtype=x.dtype)  # Match dtype early
+
+        # Use actual_seq_len for calculations
+        position = tf.range(actual_seq_len, dtype=x.dtype)  # Match dtype
+
+        # Calculate sinusoid input using einsum or broadcasting
+        # Einsum approach: Ensure correct dimensions [seq_len, dim/2]
+        sinusoid_inp = tf.einsum("i,j->ij", position, inv_freq)
+
+        # Calculate sin and cos based on the actual sequence length
+        sin = tf.sin(sinusoid_inp)
+        cos = tf.cos(sinusoid_inp)
+
+        # Repeat sin/cos for interleaving: [a, b] -> [a, a, b, b]
+        # Result needs shape [actual_seq_len, dim]
+        sin = tf.repeat(sin, 2, axis=-1)
+        cos = tf.repeat(cos, 2, axis=-1)
+
+        # Expand dims for batch and tile
+        # Output shape needs to be [batch_size, actual_seq_len, dim]
+        # Add batch dimension: [1, actual_seq_len, dim]
+        sin = tf.expand_dims(sin, axis=0)
+        cos = tf.expand_dims(cos, axis=0)
+
+        # Tile to match the batch size: [batch_size, actual_seq_len, dim]
+        sin = tf.tile(sin, [batch_size, 1, 1])
+        cos = tf.tile(cos, [batch_size, 1, 1])
+
+        # Casting to x.dtype was already done for inv_freq, sin/cos will inherit
+        # sin = tf.cast(sin, x.dtype)  # Already done via calculation chain
+        # cos = tf.cast(cos, x.dtype)  # Already done via calculation chain
+
+        # Return sin and cos needed by InterleavedRoPE
+        return sin, cos
+
+    def get_config(self):
+        config = super().get_config()
+        config.update({
+            "dim": self.dim,
+            "max_seq_len": self.max_seq_len,
+            "temperature": self.temperature,
+        })
+        return config
+
+    @classmethod
+    def from_config(cls, config):
+        return cls(**config)
+
+
+
+
+
+def split_alternate(x):
+    shape = tf.shape(x)
+    x = tf.reshape(x, [shape[0], shape[1], shape[2] // 2, 2])
+    x = tf.transpose(x, [0, 1, 3, 2])
+    x = tf.reshape(x, [shape[0], shape[1], -1])
+    return x
+
+
+def rotate_half(x):
+    x = split_alternate(x)
+    d = tf.shape(x)[-1]
+    rotated_x = tf.concat([-x[..., d//2:], x[..., :d//2]], axis=-1)
+    return tf.reshape(rotated_x, tf.shape(x))
+
+
+def apply_rotary_pos_emb(x, sin, cos):
+    cos = tf.reshape(cos, [tf.shape(cos)[0], tf.shape(cos)[1], -1])
+    sin = tf.reshape(sin, [tf.shape(sin)[0], tf.shape(sin)[1], -1])
+    x_rotated = x * cos + rotate_half(x) * sin
+    return x_rotated
+
+
+class InterleavedRoPE(tf.keras.layers.Layer):
+    def __init__(self, dim, max_seq_len=1024, **kwargs):
+        super().__init__(**kwargs)
+        if dim % 2 != 0:
+            raise ValueError(f"Embedding dimension `dim` ({dim}) must be even for InterleavedRoPE.")
+        self.dim = dim
+        self.max_seq_len = max_seq_len
+        # Instantiate the RotaryEmbedding layer
+        # Ensure the name is consistent if needed for saving/loading
+        self.rotary_emb = RotaryEmbedding(dim, max_seq_len, name="rotary_embedding")
+
+    def call(self, x):
+        # Get sin and cos from the RotaryEmbedding layer's call method
+        # *** Pass only 'x'. RotaryEmbedding calculates seq_len internally. ***
+        sin, cos = self.rotary_emb(x)
+
+        # Apply the positional embeddings
+        x_embedded = apply_rotary_pos_emb(x, sin, cos)
+        return x_embedded
+
+    def get_config(self):
+        config = super().get_config()
+        config.update({
+            "dim": self.dim,
+            "max_seq_len": self.max_seq_len,
+        })
+        # Keras handles nested layer serialization automatically
+        return config
+
+    @classmethod
+    def from_config(cls, config):
+        # Keras handles nested layer restoration automatically
+        return cls(**config)
+
+
+
+
+
+
+
+# Tokenizer configurables
+
+# Optimal for accuracy thus far:
+max_seq_length = 1536
+tokenizer_checkpoint = "HuggingFaceTB/SmolLM2-1.7B-Instruct"
+
+inp = tf.keras.layers.Input(shape=(), dtype=tf.string)
+gp2_tokenizer = NewTokenizerLayer(max_seq_length=max_seq_length, tokenizer_checkpoint=tokenizer_checkpoint)
+VOCABULARY_SIZE = gp2_tokenizer.tokenizer.vocab_size
+tokens = gp2_tokenizer(inp)
+
+# On larger hardware, this could probably be increased considerably and
+# probably would improve performance ...
+EMBEDDING_N = 12  # Define EMBEDDING_DIM here, to match your embedding layer.
+EMBEDDING_DIM = int(EMBEDDING_N * 2)
+
+embedded = tf.keras.layers.Embedding(
+    input_dim=VOCABULARY_SIZE,
+    output_dim=EMBEDDING_DIM,
+    input_length=max_seq_length,
+    mask_zero=True)(tokens)
+
+position_embedding = InterleavedRoPE(
+    dim=EMBEDDING_DIM,
+    max_seq_len=max_seq_length,
+    # initializer="uniform",
+)(embedded)
+
+# As an FYI, we tried an add layer both with and without
+# LayerNorm ... It degraded accuracy.
+# Just an FYI for anyone trying to apply conventional wisdom,
+# to save you the time ...
+x = tf.keras.layers.Concatenate()([embedded, position_embedding])
+x = tf.keras.layers.Dropout(0.4)(x)  # AI suggested 0.4
+flattened = tf.keras.layers.Flatten()(x)
+
+cerebros_base_model = tf.keras.Model(
+    inputs=inp,
+    outputs=flattened  # Output enhanced embeddings now
+)
+
 
 """### Cerebros search for the best model"""
 
 #
 # Cerebros configurables
 #
-activation = 'gelu'
-predecessor_level_connection_affinity_factor_first = 49.9999
-predecessor_level_connection_affinity_factor_main = 0.31456
-max_consecutive_lateral_connections = 22
-p_lateral_connection = 0.39256
-num_lateral_connection_tries_per_unit = 10
-learning_rate = 0.0000511065
-epochs = 6  # [1, 100]
-batch_size = 13
-maximum_levels = 4  # [3,7]
-maximum_units_per_level = 8  # [2,10]
-maximum_neurons_per_unit = 5  # [2,20]
+activation = "relu"
+predecessor_level_connection_affinity_factor_first = 10
+predecessor_level_connection_affinity_factor_main = 40
+max_consecutive_lateral_connections = 20
+p_lateral_connection = 30
+num_lateral_connection_tries_per_unit = 25
+learning_rate = 3 * 10 ** -3
+epochs = 15
+batch_size = 17
+minimum_levels = 2
+maximum_levels = 2  # [3,7]
+
+minimum_units_per_level = 4
+maximum_units_per_level = 7
+
+minimum_neurons_per_unit = 1
+maximum_neurons_per_unit = 2
+
+moities_to_try = 5
+tries_per_moity = 1
 
 #
 # Logging
@@ -157,6 +466,7 @@ def from_config(cls, config):
 
 meta_trial_number = 42  # irrelevant unless in distributed training
 
+
 cerebros_automl = SimpleCerebrosRandomSearch(
     unit_type=DenseUnit,
     input_shapes=INPUT_SHAPES,
@@ -166,16 +476,16 @@ def from_config(cls, config):
     validation_split=0.35,
     direction='maximize',
     metric_to_rank_by="val_binary_accuracy",
-    minimum_levels=2,
+    minimum_levels=minimum_levels,
     maximum_levels=maximum_levels,
-    minimum_units_per_level=1,
+    minimum_units_per_level=minimum_units_per_level,
     maximum_units_per_level=maximum_units_per_level,
-    minimum_neurons_per_unit=1,
+    minimum_neurons_per_unit=minimum_neurons_per_unit,
     maximum_neurons_per_unit=maximum_neurons_per_unit,
     activation=activation,
     final_activation='sigmoid',
-    number_of_architecture_moities_to_try=2,
-    number_of_tries_per_architecture_moity=1,
+    number_of_architecture_moities_to_try=moities_to_try,
+    number_of_tries_per_architecture_moity=tries_per_moity,
     minimum_skip_connection_depth=1,
     maximum_skip_connection_depth=7,
     predecessor_level_connection_affinity_factor_first=predecessor_level_connection_affinity_factor_first,
@@ -191,31 +501,31 @@ def from_config(cls, config):
     p_lateral_connection_decay=zero_95_exp_decay,
     num_lateral_connection_tries_per_unit=num_lateral_connection_tries_per_unit,
     learning_rate=learning_rate,
-    loss=tf.keras.losses.CategoricalHinge(),
-    metrics=[tf.keras.metrics.BinaryAccuracy(),
-             tf.keras.metrics.Precision(),
-             tf.keras.metrics.Recall()],
+    loss=tf.keras.losses.BinaryCrossentropy(),
+    # loss=tf.keras.losses.CategoricalHinge(),
+    metrics=[tf.keras.metrics.BinaryAccuracy(),
+             tf.keras.metrics.Precision(),
+             tf.keras.metrics.Recall()],
     epochs=epochs,
     project_name=f"{PROJECT_NAME}_meta_{meta_trial_number}",
     model_graphs='model_graphs',
     batch_size=batch_size,
     meta_trial_number=meta_trial_number,
-    base_models=[base_model],
+    base_models=[cerebros_base_model],
     train_data_dtype=tf.string)
 
+cerebros_t0 = time.time()
 result = cerebros_automl.run_random_search()
+cerebros_t1 = time.time()
+cerebros_time_all_models_min = (cerebros_t1 - cerebros_t0) / 60
+models_tried = moities_to_try * tries_per_moity
+cerebros_time_per_model = cerebros_time_all_models_min / models_tried
 
-print(f'Best accuracy achieved is {result}')
-print(f'binary accuracy')
+print(f"Cerebros trained {models_tried} models FROM A COLD START in ONLY {cerebros_time_all_models_min} min. Cerebros took only {cerebros_time_per_model} minutes on average per model.")
+print(f"GPT2 took {gpt_time_on_one_model_min} min just to FINE TUNE one PRE-TRAINED model for 3 epochs. Although this is a small-scale test, it shows the advantage of O(N) scaling vs O(N**2) scaling.")
 
-"""### Testing the best model found"""
-#
-# Load the best model (taking into account that it has a custom layer)
-#
-best_model_found =\
-tf.keras.models.load_model(cerebros_automl.best_model_path,\
-custom_objects={'GPT2Layer': GPT2Layer(max_seq_length)})
+print(f'Cerebros best accuracy achieved is {result}')
+print('(val set binary accuracy)')
 
-print('Evaluating on the test dataset')
-best_model_found.evaluate(X_test, y_test)
+# """### Testing the best model found"""
diff --git a/requirements.txt b/requirements.txt
index 1964f13..e1e3563 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,10 +1,11 @@
-jax==0.4.26
-jaxlib==0.4.26
+jax==0.5.3
+jaxlib==0.5.3
 pendulum==3.0.0
-tensorflow==2.15.0
+tensorflow==2.19.0
 numpy==1.26.4
 pandas==2.2.1
 pyvis==0.3.2
 plotly==5.20.0
 matplotlib==3.8.4
 imageio==2.34.0
+tqdm==4.67.1