@@ -181,46 +181,71 @@ def from_config(cls, config):

# TokenizerLayer class to handle tokenization and return only token_ids
class TokenizerLayer(tf.keras.layers.Layer):
+
    def __init__(self, max_seq_length, **kwargs):
-        super().__init__(**kwargs)
-        self.tokenizer = GPT2Tokenizer.from_preset("gpt2_base_en")
-        self.preprocessor = GPT2Preprocessor(self.tokenizer, sequence_length=max_seq_length)
+        #
+        super(TokenizerLayer, self).__init__(**kwargs)
+        #
+        # Load the GPT2 tokenizer, preprocessor and model
+        self.tokenizer = GPT2Tokenizer.from_preset("gpt2_extra_large_en")  # "gpt2_base_en"
+        self.preprocessor = GPT2Preprocessor(self.tokenizer,
+                                             sequence_length=max_seq_length)
+        # self.encoder = GPT2Backbone.from_preset("gpt2_base_en")
+        #
+        # Set whether the GPT2 model's layers are trainable
+        # self.encoder.trainable = False
+        # for layer in self.encoder.layers:
+        #     layer.trainable = False
+        #
+        # self.encoder.layers[-2].trainable = True
+        #
+        # Set the maximum sequence length for tokenization
        self.max_seq_length = max_seq_length

    def call(self, inputs):
-        processed = self.preprocessor(inputs)  # Accepts tensor of strings, outputs {"token_ids": ...}
-        return processed["token_ids"]  # Output shape: (batch_size, max_seq_length)
+        #
+        # Output the GPT2 embedding
+        prep = self.preprocessor([inputs])
+        # embedding = self.encoder(prep)
+        # avg_pool = tf.reduce_mean(embedding, axis=1)
+        #
+        return prep['token_ids']

    def get_config(self):
-        base_config = super().get_config()
-        base_config.update({"max_seq_length": self.max_seq_length})
-        return base_config
+        #
+        config = super(TokenizerLayer, self).get_config()
+        config.update({'max_seq_length': self.max_seq_length})
+        #
+        return config
+
+    @classmethod
+    def from_config(cls, config):
+        #
+        return cls(max_seq_length=config['max_seq_length'])

+# GPT2 configurables

-VOCAB_SIZE = GPT2Tokenizer.vocabulary_size()
+max_seq_length = 900

-# Create cerebros_base_model
-def build_cerebros_base_model(max_seq_length=96, embedding_dim=256, output_dim=VOCAB_SIZE):
-    input_layer = Input(shape=(), dtype=tf.string)  # Text input
-    token_ids = TokenizerLayer(max_seq_length)(input_layer)
-    # Build embedding layer with GPT2 tokenizer's vocabulary size (50257 for GPT2Base)
-    embedded = tf.keras.layers.Embedding(
-        input_dim=GPT2Tokenizer.vocabulary_size(),  # Uses standard GPT-2 vocab size
-        output_dim=embedding_dim,
-        mask_zero=True,  # Handle <PAD> tokens
-        name="custom_embedding"
-    )(token_ids)
-
-    # Flatten for downstream models
-    flattened = Flatten()(embedded)
-    dropout = tf.keras.layers.Dropout(.6)(flattened)
-    model = Model(inputs=input_layer, outputs=dropout)
-    return model
+inp = tf.keras.layers.Input(shape=(), dtype=tf.string)
+gp2_tokenizer = TokenizerLayer(max_seq_length=max_seq_length)
+VOCABULARY_SIZE = gp2_tokenizer.tokenizer.vocabulary_size()
+tokens = gp2_tokenizer(inp)


-# Example usage (outputs depend on parameters, set embedding_dim as desired)
-cerebros_base_model = build_cerebros_base_model(max_seq_length=96)
+embedded = \
+    tf.keras.layers.Embedding(
+        input_dim=VOCABULARY_SIZE,
+        output_dim=15,
+        input_length=max_seq_length,
+        mask_zero=True)(tokens)
+dropout_embedded = tf.keras.layers.Dropout(0.6)(embedded)
+flattened = tf.keras.layers.Flatten()(dropout_embedded)

+cerebros_base_model = \
+    tf.keras.Model(
+        inputs=inp,
+        outputs=flattened)

"""### Cerebros search for the best model"""

@@ -234,11 +259,17 @@ def build_cerebros_base_model(max_seq_length=96, embedding_dim=256, output_dim=V
p_lateral_connection = 0.39256
num_lateral_connection_tries_per_unit = 10
learning_rate = 0.0000511065
-epochs = 6  # [1, 100]
-batch_size = 13
-maximum_levels = 4  # [3,7]
-maximum_units_per_level = 8  # [2,10]
+epochs = 15  # [1, 100]
+batch_size = 20
+minimum_levels = 2
+maximum_levels = 4  # [3,7]
+
+minimum_units_per_level = 4
+maximum_units_per_level = 8
+
+minimum_neurons_per_unit = 1
maximum_neurons_per_unit = 5  # [2,20]
+
moities_to_try = 2
tries_per_moity = 1

@@ -263,11 +294,11 @@ def build_cerebros_base_model(max_seq_length=96, embedding_dim=256, output_dim=V
    validation_split=0.35,
    direction='maximize',
    metric_to_rank_by="val_accuracy",
-    minimum_levels=2,
+    minimum_levels=minimum_levels,
    maximum_levels=maximum_levels,
-    minimum_units_per_level=1,
+    minimum_units_per_level=minimum_units_per_level,
    maximum_units_per_level=maximum_units_per_level,
-    minimum_neurons_per_unit=1,
+    minimum_neurons_per_unit=minimum_neurons_per_unit,
    maximum_neurons_per_unit=maximum_neurons_per_unit,
    activation=activation,
    final_activation='sigmoid',
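A minimal usage sketch of the new preprocessing path, assuming keras-nlp's GPT2Tokenizer and GPT2Preprocessor presets; the smaller "gpt2_base_en" preset and a short sequence length are substituted here only to keep the check lightweight:

# Sketch: the TokenizerLayer path should yield integer token ids of shape
# (batch_size, max_seq_length), which the downstream Embedding layer consumes.
import tensorflow as tf
from keras_nlp.models import GPT2Tokenizer, GPT2Preprocessor

max_seq_length = 16  # shortened from 900 for this sketch
tokenizer = GPT2Tokenizer.from_preset("gpt2_base_en")
preprocessor = GPT2Preprocessor(tokenizer, sequence_length=max_seq_length)

batch = tf.constant(["Cerebros searches for architectures.",
                     "GPT2 tokenization feeds a small embedding."])
token_ids = preprocessor(batch)["token_ids"]
print(token_ids.shape)  # (2, 16)

embedded = tf.keras.layers.Embedding(
    input_dim=tokenizer.vocabulary_size(),
    output_dim=15,
    mask_zero=True)(token_ids)
print(embedded.shape)  # (2, 16, 15)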