@@ -1,124 +1,78 @@
 # imports
-import transformers
+import gc
 from transformers import (AutoModelForCausalLM,
                           AutoTokenizer,
                           TrainingArguments,
+                          BitsAndBytesConfig
                           )
-from trl import SFTTrainer
-from peft import LoraConfig
+from peft import LoraConfig, prepare_model_for_kbit_training, get_peft_model
 from datasets import load_dataset
-from transformers import AutoTokenizer, AutoModelForCausalLM
 from huggingface_hub import HfApi, login
-from transformers.hyperparameter_search import HPSearchBackend
-from transformers.trainer import *
-import optuna
-import gc
-
+import torch
+import CustomSFTTrainer
+import random
 import os
 HF_TOKEN = os.getenv('HF_TOKEN', 'add_hf_token')
 api = HfApi()
 login(HF_TOKEN, add_to_git_credential=True)

-
 gc.collect()
 torch.cuda.empty_cache()
-
-
-def run_hp_search_optuna(trainer, n_trials, direction, **kwargs):
-
-    def _objective(trial, checkpoint_dir=None):
-        checkpoint = None
-        if checkpoint_dir:
-            for subdir in os.listdir(checkpoint_dir):
-                if subdir.startswith(PREFIX_CHECKPOINT_DIR):
-                    checkpoint = os.path.join(checkpoint_dir, subdir)
-        #################
-        # UPDATES START
-        #################
-        if not checkpoint:
-            # free GPU memory
-            del trainer.model
-            gc.collect()
-            torch.cuda.empty_cache()
-        trainer.objective = None
-        trainer.train(resume_from_checkpoint=checkpoint, trial=trial)
-        # If there hasn't been any evaluation during the training loop.
-        if getattr(trainer, "objective", None) is None:
-            metrics = trainer.evaluate()
-            trainer.objective = trainer.compute_objective(metrics)
-        return trainer.objective
-
-    timeout = kwargs.pop("timeout", None)
-    n_jobs = kwargs.pop("n_jobs", 1)
-    study = optuna.create_study(direction=direction, **kwargs)
-    study.optimize(_objective, n_trials=n_trials,
-                   timeout=timeout, n_jobs=n_jobs)
-    best_trial = study.best_trial
-    return BestRun(str(best_trial.number), best_trial.value, best_trial.params)
-
-
-def hyperparameter_search(
-    self,
-    hp_space,
-    n_trials,
-    direction,
-    compute_objective=default_compute_objective,
-) -> Union[BestRun, List[BestRun]]:
-
-    trainer.hp_search_backend = HPSearchBackend.OPTUNA
-    self.hp_space = hp_space
-    trainer.hp_name = None
-    trainer.compute_objective = compute_objective
-    best_run = run_hp_search_optuna(trainer, n_trials, direction)
-    self.hp_search_backend = None
-    return best_run
-
-
-transformers.trainer.Trainer.hyperparameter_search = hyperparameter_search
-
-
 # defining hyperparameter search space for optuna


 def optuna_hp_space(trial):
     return {
         "learning_rate": trial.suggest_float("learning_rate", 1e-6, 1e-4, log=True),
-        "per_device_train_batch_size": trial.suggest_categorical("per_device_train_batch_size", [16, 32, 64]),
         "num_train_epochs": trial.suggest_int("num_train_epochs", 3, 15),
         "weight_decay": trial.suggest_loguniform("weight_decay", 1e-6, 1e-2),
-        "gradient_clipping": trial.suggest_float("gradient_clipping", 0.1, 0.5),
     }

 # Define a function to calculate BLEU score


 # configuration arguments
-model_id = "google/gemma-2-27b-it"
+model_id = "google/gemma-2-9b-it"

-# model init function for the trainer
+# bits and bytes config
+bnb_config = BitsAndBytesConfig(
+    load_in_4bit=True,
+    bnb_4bit_quant_type="nf4",
+    bnb_4bit_compute_dtype=torch.bfloat16
+)


 def model_init(trial):
-
-    return AutoModelForCausalLM.from_pretrained(model_id, device_map="auto")
+    model = AutoModelForCausalLM.from_pretrained(
+        model_id, quantization_config=bnb_config, device_map="auto")
+    model = prepare_model_for_kbit_training(model)
+    model = get_peft_model(model, lora_config)
+    return model


 # tokenizer load
 tokenizer = AutoTokenizer.from_pretrained(model_id, padding_side='right')

-# Loading training and evaluation data
-training_dataset = load_dataset(
-    "Kubermatic/cncf-question-and-answer-dataset-for-llm-training", split="train[:7500]")
-eval_dataset = load_dataset(
-    "Kubermatic/cncf-question-and-answer-dataset-for-llm-training", split="train[7500:8000]")
+dataset = load_dataset(
+    "Kubermatic/Merged_QAs", split="train")
+
+random.seed(42)
+random_indices = random.sample(range(len(dataset)), k=500)
+
+training_indices = random_indices[:400]
+eval_indices = random_indices[400:500]
+training_dataset = dataset.filter(
+    lambda _, idx: idx in training_indices, with_indices=True)
+eval_dataset = dataset.filter(
+    lambda _, idx: idx in eval_indices, with_indices=True)

 max_seq_length = 1024


 output_dir = "trained_model"
 training_arguments = TrainingArguments(
     output_dir=output_dir,
-    num_train_epochs=1,
+    num_train_epochs=3,
     gradient_checkpointing=True,
     per_device_train_batch_size=1,
     gradient_accumulation_steps=8,
@@ -163,11 +117,14 @@ def formatting_func(example):
     output_texts.append(text)
     return output_texts

-# instantiation of the trainer
+
+# Passing model
+model = model_init(None)


-trainer = SFTTrainer(
-    model=model_id,
+# instantiation of the trainer
+trainer = CustomSFTTrainer(
+    model=model,
     train_dataset=training_dataset,
     eval_dataset=eval_dataset,
     args=training_arguments,
@@ -178,10 +135,13 @@ def formatting_func(example):
     model_init=model_init,
 )

+# avoid placing model on device as it is already placed on device in model_init
+trainer.place_model_on_device = False
+
 best_trial = trainer.hyperparameter_search(
     direction="minimize",
     hp_space=optuna_hp_space,
-    n_trials=20,
+    n_trials=5,
 )

 print(best_trial)
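Note on `CustomSFTTrainer`: this commit removes the inline Optuna monkey-patch of `Trainer.hyperparameter_search` and instead imports a `CustomSFTTrainer`, whose definition is not part of the diff. The sketch below is only a guess at what that class might contain, assuming it subclasses `trl.SFTTrainer` and re-hosts the removed memory-freeing Optuna search; every name and detail not visible in the diff is an assumption, not the committed implementation.

```python
# Hypothetical reconstruction of CustomSFTTrainer -- not part of this commit.
# It mirrors the removed run_hp_search_optuna helper: run Optuna trials,
# free GPU memory between trials, and rebuild the model via model_init.
import gc
import os

import optuna
import torch
from transformers.trainer_utils import (
    BestRun,
    HPSearchBackend,
    PREFIX_CHECKPOINT_DIR,
    default_compute_objective,
)
from trl import SFTTrainer


class CustomSFTTrainer(SFTTrainer):
    def hyperparameter_search(self, hp_space, n_trials, direction,
                              compute_objective=default_compute_objective,
                              **kwargs):
        # Tell the base Trainer that Optuna drives the search so that
        # hp_space(trial) is applied and model_init(trial) is called per trial.
        self.hp_search_backend = HPSearchBackend.OPTUNA
        self.hp_space = hp_space
        self.hp_name = None
        self.compute_objective = compute_objective

        def _objective(trial, checkpoint_dir=None):
            checkpoint = None
            if checkpoint_dir:
                for subdir in os.listdir(checkpoint_dir):
                    if subdir.startswith(PREFIX_CHECKPOINT_DIR):
                        checkpoint = os.path.join(checkpoint_dir, subdir)
            if not checkpoint:
                # free GPU memory before the next trial, as in the removed helper
                del self.model
                gc.collect()
                torch.cuda.empty_cache()
            self.objective = None
            self.train(resume_from_checkpoint=checkpoint, trial=trial)
            # If there hasn't been any evaluation during the training loop.
            if getattr(self, "objective", None) is None:
                metrics = self.evaluate()
                self.objective = self.compute_objective(metrics)
            return self.objective

        study = optuna.create_study(direction=direction, **kwargs)
        study.optimize(_objective, n_trials=n_trials)
        self.hp_search_backend = None
        best = study.best_trial
        return BestRun(str(best.number), best.value, best.params)
```

Under that assumption, the `trainer.hyperparameter_search(direction="minimize", hp_space=optuna_hp_space, n_trials=5)` call at the end of the script works unchanged, and `trainer.place_model_on_device = False` keeps the quantized, device-mapped model from being moved again.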