@@ -1,11 +1,11 @@
 from torch.utils.data import DataLoader, RandomSampler
-import torch, os, sys, time, argparse, numpy as np
 from utils_dataset import SQLDataset, HDF5Dataset
+import torch, os, time, argparse, numpy as np
 from transformers.optimization import AdamW
 from model_generator import GeneTransformer
-from datetime import datetime, timedelta
-from utils_logplot import LogPlot
 import utils_misc, utils_tokenizer
+from utils_logplot import LogPlot
+from datetime import datetime

 from model_coverage import KeywordCoverage
 from model_guardrails import PatternPenalty, LengthPenalty, RepeatPenalty
@@ -17,7 +17,6 @@
 parser.add_argument("--experiment", type=str, required=True, help="Experiment name. Will be used to save a model file and a log file.")
 parser.add_argument("--dataset_file", type=str, required=True, help="Which dataset file to use. Can be full path or the root folder will be attached.")

-parser.add_argument("--root_folder", type=str, default="/home/" + user + "/")
 parser.add_argument("--train_batch_size", type=int, default=5, help="Training batch size.")
 parser.add_argument("--n_epochs", type=int, default=3, help="Number of epochs to run over the data.")
 parser.add_argument("--optim_every", type=int, default=4, help="Optimize every x backprops. A multiplier to the true batch size.")
@@ -34,8 +33,8 @@
 os.environ["CUDA_VISIBLE_DEVICES"] = "" + str(freer_gpu)
 args.experiment += "_" + freer_gpu

-models_folder = "/home/ubuntu/models/"
-log_folder = "/home/ubuntu/logs/"
+models_folder = "/home/phillab/models/"
+log_folder = "/home/phillab/logs/"

 summarizer_model_start = os.path.join(models_folder, "gpt2_copier23.bin")

@@ -65,6 +64,7 @@ def collate_func(inps):
     else:
         return [inp[0].decode() for inp in inps]

+
 param_optimizer = list(summarizer.model.named_parameters())
 no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
 optimizer_grouped_parameters = [
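Note for orientation (not part of the commit): the hunk above ends in the middle of the parameter-group list, and the rest of it is not shown in this diff. The sketch below is a generic reconstruction of the usual AdamW grouping that such a `no_decay` list implies, where biases and LayerNorm weights get no weight decay. The 0.01 decay and 2e-5 learning rate are assumed placeholder values, not values taken from this diff.

# Sketch only: standard AdamW parameter grouping keyed on the `no_decay` names above.
import torch
from transformers.optimization import AdamW

def build_optimizer(model, lr=2e-5):
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        # parameters that do get weight decay (assumed 0.01)
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
        # biases and LayerNorm parameters: no weight decay
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0},
    ]
    return AdamW(optimizer_grouped_parameters, lr=lr)

optimizer = build_optimizer(torch.nn.Linear(8, 8))  # any nn.Module works for the sketch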
@@ -88,9 +88,9 @@ def collate_func(inps):

 print("Loading scorers")

-coverage_model_file = os.path.join(models_folder, "bert_coverage.bin")
+coverage_model_file = os.path.join(models_folder, "bert_coverage_google_cnndm_length15_1.bin")
 coverage_keyword_model_file = os.path.join(models_folder, "keyword_extractor.joblib")
-fluency_news_model_file = os.path.join(models_folder, "fluency_news_bs32.bin")
+fluency_news_model_file = os.path.join(models_folder, "news_gpt2_bs32.bin")

 scorers = [{"name": "coverage", "importance": 10.0, "sign": 1.0, "model": KeywordCoverage(args.device, keyword_model_file=coverage_keyword_model_file, model_file=coverage_model_file)},
            {"name": "fluency", "importance": 2.0, "sign": 1.0, "model": GeneTransformer(max_output_length=args.max_output_length, device=args.device, starter_model=fluency_news_model_file)},
@@ -102,6 +102,7 @@ def collate_func(inps):
 def background_tokenizer(bodies, out_queue):
     out_queue.put([bert_tokenizer.encode(body) for body in bodies])

+
 my_queue = queue.Queue()
 print("Started training")

@@ -116,7 +117,7 @@ def background_tokenizer(bodies, out_queue):
 dataloader = DataLoader(dataset=dataset, batch_size=args.train_batch_size, sampler=RandomSampler(dataset), drop_last=True, collate_fn=collate_func)

 for epi in range(n_epochs):
-    print("=================== EPOCH",epi, "===================")
+    print("=================== EPOCH", epi, "===================")
     for ib, documents in enumerate(dataloader):
         Timer = {}

@@ -126,7 +127,7 @@ def background_tokenizer(bodies, out_queue):
         bodies = [" ".join(doc.split(" ")[:300]) for doc in documents]

         # We run tokenization in the background, as it is BERT tokenization only used after the summarizer has run. Saves about 5% of time.
-        thread1 = threading.Thread(target=background_tokenizer, args=(bodies, my_queue))
+        thread1 = threading.Thread(target=background_tokenizer, args=(bodies, my_queue))
         # bodies_bert_tokenized = [bert_tokenizer.encode(body) for body in bodies] # This is the non-background version
         thread1.start()

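Note for orientation (not part of the commit): the hunk above shows the background-tokenization pattern, where a worker thread tokenizes the bodies while the main thread runs the summarizer, and the result is later read back from the queue. The retrieval call is not visible in this diff, so the `my_queue.get()` below is an assumption, and `fake_tokenize` is a hypothetical stand-in for `bert_tokenizer.encode`.

# Sketch only: tokenize in a background thread, do other work, then collect the result.
import queue, threading

def fake_tokenize(text):
    return text.split()  # stand-in for bert_tokenizer.encode

def background_tokenizer(bodies, out_queue):
    out_queue.put([fake_tokenize(body) for body in bodies])

bodies = ["first document", "second document"]
my_queue = queue.Queue()
thread1 = threading.Thread(target=background_tokenizer, args=(bodies, my_queue))
thread1.start()
# ... run the summarizer on `bodies` here ...
bodies_tokenized = my_queue.get()  # blocks until the worker has finished (assumed usage)
thread1.join()
print(bodies_tokenized)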
@@ -159,11 +160,11 @@ def background_tokenizer(bodies, out_queue):
             sampled_scores = torch.FloatTensor(sampled_scores).to(args.device)

             argmax_scores, _ = scorer['model'].score(argmax_summaries, bodies, bodies_tokenized=bodies_bert_tokenized, extra=extra, lengths=argmax_end_idxs)
-            argmax_scores = torch.FloatTensor(argmax_scores).to(args.device)
+            argmax_scores = torch.FloatTensor(argmax_scores).to(args.device)

             Timer["scores_" + scorer['name']] = time.time() - T
             total_sampled_scores += (scorer['sign']) * (scorer['importance']) * sampled_scores
-            total_argmax_scores += (scorer['sign']) * (scorer['importance']) * argmax_scores
+            total_argmax_scores += (scorer['sign']) * (scorer['importance']) * argmax_scores

             log_obj[scorer['name'] + "_score"] = sampled_scores.mean().item()
             scores_track[scorer['name'] + "_scores"] = sampled_scores

@@ -180,7 +181,7 @@ def background_tokenizer(bodies, out_queue):
         T6 = time.time()
         Timer['backward'] = T6 - T5

-        if ib % args.optim_every == 0:
+        if ib % args.optim_every == 0:
             optimizer.step()
             optimizer.zero_grad()

@@ -220,7 +221,7 @@ def background_tokenizer(bodies, out_queue):

         if ckpt_every > 0 and len(total_score_history) > ckpt_lookback:
             current_score = np.mean(total_score_history[-ckpt_lookback:])
-
+
             if time.time() - time_ckpt > ckpt_every:
                 revert_ckpt = best_ckpt_score is not None and current_score < min(1.2 * best_ckpt_score, 0.8 * best_ckpt_score) # Could be negative or positive
                 print("================================== CKPT TIME, " + str(datetime.now()) + " =================================")
@@ -232,7 +233,7 @@ def background_tokenizer(bodies, out_queue):
                     optimizer.load_state_dict(torch.load(ckpt_optimizer_file))
                 time_ckpt = time.time()
                 print("==============================================================================")
-
+
             if best_ckpt_score is None or current_score > best_ckpt_score:
                 print("[CKPT] Saved new best at: %.3f %s" % (current_score, "[" + str(datetime.now()) + "]"))
                 best_ckpt_score = current_score
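Closing note (not part of the commit): the checkpoint logic touched by the last two hunks reads as follows: once enough scores have accumulated, take the rolling mean of the last `ckpt_lookback` scores; every `ckpt_every` seconds, revert to the saved checkpoint if that mean has collapsed relative to the best checkpoint; and save whenever a new best is reached. The sketch below stubs out the model and optimizer file I/O; `ckpt_every` and `ckpt_lookback` values are assumed, while the 1.2x / 0.8x thresholds and the rolling-mean window follow the diff above.

# Sketch only: time-based checkpoint / revert decision logic with I/O stubbed out.
import time
import numpy as np

ckpt_every, ckpt_lookback = 600, 100      # assumed values, not taken from this diff
total_score_history = [0.1, 0.2, 0.3] * 50
best_ckpt_score = None
time_ckpt = time.time()

if ckpt_every > 0 and len(total_score_history) > ckpt_lookback:
    current_score = np.mean(total_score_history[-ckpt_lookback:])

    if time.time() - time_ckpt > ckpt_every:
        # Scores can be negative or positive, hence the min of 1.2x and 0.8x.
        revert_ckpt = best_ckpt_score is not None and current_score < min(1.2 * best_ckpt_score, 0.8 * best_ckpt_score)
        if revert_ckpt:
            pass  # reload the summarizer and optimizer state_dicts from the checkpoint files
        time_ckpt = time.time()

    if best_ckpt_score is None or current_score > best_ckpt_score:
        best_ckpt_score = current_score
        # save the summarizer and optimizer state_dicts to the checkpoint files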