import argparse
import os
import sys
+
sys.path.append('./')
import time
import json
    '--seed',
    type=int, default=42, help='Seed for sampling the calibration data.'
)
-parser.add_argument("--approach", type=str, default='static',
+parser.add_argument("--approach", type=str, default='static',
                    help="Select from ['dynamic', 'static', 'weight-only']")
parser.add_argument("--int8", action="store_true")
parser.add_argument("--ipex", action="store_true", help="Use intel extension for pytorch.")
parser.add_argument("--calib_iters", default=512, type=int,
                    help="calibration iters.")
parser.add_argument("--tasks", nargs='+', default=["lambada_openai",
-                    "hellaswag","winogrande","piqa","wikitext"],
-                    type=str, help="tasks list for accuracy validation")
+                    "hellaswag", "winogrande", "piqa", "wikitext"],
+                    type=str, help="tasks list for accuracy validation")
parser.add_argument("--peft_model_id", type=str, default=None, help="model_name_or_path of peft model")
# ============SmoothQuant configs==============
parser.add_argument("--sq", action="store_true")
parser.add_argument("--alpha", default="auto", help="Smooth quant parameter.")
# ============WeightOnly configs===============
-parser.add_argument("--woq_algo", default="RTN", choices=['RTN', 'AWQ', 'TEQ', 'GPTQ'],
+parser.add_argument("--woq_algo", default="RTN", choices=['RTN', 'AWQ', 'TEQ', 'GPTQ'],
                    help="Weight-only parameter.")
parser.add_argument("--woq_bits", type=int, default=8)
parser.add_argument("--woq_group_size", type=int, default=-1)
parser.add_argument("--woq_scheme", default="sym")
parser.add_argument("--woq_enable_mse_search", action="store_true")
parser.add_argument("--woq_enable_full_range", action="store_true")
# =============GPTQ configs====================
-parser.add_argument("--gptq_actorder", action="store_true", help="Whether to apply the activation order GPTQ heuristic.")
-parser.add_argument('--gptq_percdamp', type=float, default=.01, help='Percent of the average Hessian diagonal to use for dampening.')
+parser.add_argument("--gptq_actorder", action="store_true",
+                    help="Whether to apply the activation order GPTQ heuristic.")
+parser.add_argument('--gptq_percdamp', type=float, default=.01,
+                    help='Percent of the average Hessian diagonal to use for dampening.')
parser.add_argument('--gptq_block_size', type=int, default=128, help='Block size. sub weight matrix size to run GPTQ.')
parser.add_argument('--gptq_nsamples', type=int, default=128, help='Number of calibration data samples.')
-parser.add_argument('--gptq_use_max_length', action="store_true", help='Set all sequence length to be same length of args.gptq_pad_max_length')
+parser.add_argument('--gptq_use_max_length', action="store_true",
+                    help='Set all sequence length to be same length of args.gptq_pad_max_length')
parser.add_argument('--gptq_pad_max_length', type=int, default=2048, help='Calibration dataset sequence max length, \
                    this should align with your model config, \
                    and your dataset builder args: args.pad_max_length')
parser.add_argument('--gptq_debug', action='store_true', help='Whether to use debug model ')
-parser.add_argument('--gptq_gpu', action='store_true', help='Whether to use gpu')
# =======================================

args = parser.parse_args()
if args.ipex:
    import intel_extension_for_pytorch as ipex
calib_size = 1
+
class Evaluator:
    def __init__(self, dataset, tokenizer, batch_size=8, pad_val=1, pad_max=196, is_calib=False):
        self.dataset = dataset
@@ -149,7 +153,7 @@ def evaluate(self, model):
            pred = last_token_logits.argmax(dim=-1)
            total += label.size(0)
            hit += (pred == label).sum().item()
-            if (i + 1) % 50 == 0:
+            if (i + 1) % 50 == 0:
                print(hit / total)
                print("Processed minibatch:", i)
@@ -187,6 +191,7 @@ def get_user_model():
    user_model.eval()
    return user_model, tokenizer

+
if args.quantize:
    # dataset
    user_model, tokenizer = get_user_model()
@@ -201,43 +206,46 @@ def get_user_model():
        collate_fn=calib_evaluator.collate_batch,
    )

+
    def calib_func(prepared_model):
        for i, calib_input in enumerate(calib_dataloader):
            if i > args.calib_iters:
                break
            prepared_model(calib_input[0])

+
    recipes = {}
    eval_func = None
    from neural_compressor import PostTrainingQuantConfig, quantization
+
    # specify the op_type_dict and op_name_dict
    if args.approach == 'weight_only':
        op_type_dict = {
-            '.*':{ # re.match
+            '.*': {  # re.match
                "weight": {
-                    'bits': args.woq_bits, # 1-8 bits
+                    'bits': args.woq_bits,  # 1-8 bits
                    'group_size': args.woq_group_size,  # -1 (per-channel)
-                    'scheme': args.woq_scheme, # sym/asym
-                    'algorithm': args.woq_algo, # RTN/AWQ/TEQ
+                    'scheme': args.woq_scheme,  # sym/asym
+                    'algorithm': args.woq_algo,  # RTN/AWQ/TEQ
                },
            },
        }
-        op_name_dict = {
-            'lm_head':{"weight": {'dtype': 'fp32'},},
-            'embed_out':{"weight": {'dtype': 'fp32'},}, # for dolly_v2
+        op_name_dict = {
+            'lm_head': {"weight": {'dtype': 'fp32'}, },
+            'embed_out': {"weight": {'dtype': 'fp32'}, },  # for dolly_v2
        }
        recipes["rtn_args"] = {
            "enable_mse_search": args.woq_enable_mse_search,
            "enable_full_range": args.woq_enable_full_range,
        }
        recipes['gptq_args'] = {
-            'percdamp': args.gptq_percdamp,
-            'act_order':args.gptq_actorder,
-            'block_size': args.gptq_block_size,
-            'nsamples': args.gptq_nsamples,
-            'use_max_length': args.gptq_use_max_length,
-            'pad_max_length': args.gptq_pad_max_length
-            }
+            'percdamp': args.gptq_percdamp,
+            'act_order': args.gptq_actorder,
+            'block_size': args.gptq_block_size,
+            'nsamples': args.gptq_nsamples,
+            'use_max_length': args.gptq_use_max_length,
+            'pad_max_length': args.gptq_pad_max_length
+        }
        # GPTQ: use assistive functions to modify calib_dataloader and calib_func
        # TEQ: set calib_func=None, use default training func as calib_func
        if args.woq_algo in ["GPTQ", "TEQ"]:
@@ -253,30 +261,32 @@ def calib_func(prepared_model):
        # for tests on various models, keep the code that calls gptq_quantize directly
        if args.gptq_debug:
            from neural_compressor.adaptor.torch_utils.weight_only import gptq_quantize
+
            conf = {
-                ".*":{
-                    'wbits': args.woq_bits, # 1-8 bits
+                ".*": {
+                    'wbits': args.woq_bits,  # 1-8 bits
                    'group_size': args.woq_group_size,  # -1 (per-channel)
                    'sym': (args.woq_scheme == "sym"),
                    'act_order': args.gptq_actorder,
                }
-                }
+            }
            q_model_gptq_debug, gptq_config = gptq_quantize(
-                user_model,
-                weight_config=conf,
-                dataloader=calib_dataloader,
-                nsamples=args.gptq_nsamples,
-                use_max_length=args.gptq_use_max_length,
-                pad_max_length=args.gptq_pad_max_length
+                user_model,
+                weight_config=conf,
+                dataloader=calib_dataloader,
+                nsamples=args.gptq_nsamples,
+                use_max_length=args.gptq_use_max_length,
+                pad_max_length=args.gptq_pad_max_length
            )
            from intel_extension_for_transformers.llm.evaluation.lm_eval import evaluate
+
            results = evaluate(
                model="hf-causal",
-                model_args='pretrained='+args.model+',tokenizer='+args.model+',dtype=float32',
+                model_args='pretrained=' + args.model + ',tokenizer=' + args.model + ',dtype=float32',
                user_model=q_model_gptq_debug, tasks=["lambada_openai"],
-                device=DEV.type,
                batch_size=4
            )
+            exit(0)

    else:
        if re.search("gpt", user_model.config.model_type):
@@ -306,6 +316,8 @@ def calib_func(prepared_model):
    if isinstance(args.alpha, list):
        eval_dataset = load_dataset('lambada', split='validation')
        evaluator = Evaluator(eval_dataset, tokenizer)
+
+
        def eval_func(model):
            acc = evaluator.evaluate(model)
            return acc
@@ -323,6 +335,7 @@ def eval_func(model):
if args.int8 or args.int8_bf16_mixed:
    print("load int8 model")
    from neural_compressor.utils.pytorch import load
+
    if args.ipex:
        user_model = load(os.path.abspath(os.path.expanduser(args.output_dir)))
    else:
@@ -335,9 +348,10 @@ def eval_func(model):
if args.accuracy:
    user_model.eval()
    from intel_extension_for_transformers.llm.evaluation.lm_eval import evaluate
+
    results = evaluate(
        model="hf-causal",
-        model_args='pretrained='+args.model+',tokenizer='+args.model+',dtype=float32',
+        model_args='pretrained=' + args.model + ',tokenizer=' + args.model + ',dtype=float32',
        user_model=user_model,
        batch_size=args.batch_size,
        tasks=args.tasks,
@@ -358,11 +372,12 @@ def eval_func(model):
    user_model.eval()
    from intel_extension_for_transformers.llm.evaluation.lm_eval import evaluate
    import time
+
    samples = args.iters * args.batch_size
    start = time.time()
    results = evaluate(
        model="hf-causal",
-        model_args='pretrained='+args.model+',tokenizer='+args.model+',dtype=float32',
+        model_args='pretrained=' + args.model + ',tokenizer=' + args.model + ',dtype=float32',
        user_model=user_model,
        batch_size=args.batch_size,
        tasks=args.tasks,
@@ -376,5 +391,5 @@ def eval_func(model):
        acc = results["results"][task_name]["acc"]
        print("Accuracy: %.5f" % acc)
    print('Throughput: %.3f samples/sec' % (samples / (end - start)))
-    print('Latency: %.3f ms' % ((end - start)*1000/samples))
+    print('Latency: %.3f ms' % ((end - start) * 1000 / samples))
    print('Batch size = %d' % args.batch_size)