                     help="calibration iters.")
 parser.add_argument("--tasks", nargs='+', default=["lambada_openai",
                     "hellaswag", "winogrande", "piqa", "wikitext"],
-                    type=str, help="tasks list for accuracy validation")
+                    type=str, help="tasks list for accuracy validation; text-generation and code-generation tasks are different.")
 parser.add_argument("--peft_model_id", type=str, default=None, help="model_name_or_path of peft model")
 # ============SmoothQuant configs==============
 parser.add_argument("--sq", action="store_true")
@@ -78,7 +78,40 @@
                     this should align with your model config, \
                     and your dataset builder args: args.pad_max_length')
 parser.add_argument('--gptq_debug', action='store_true', help='Whether to use debug model')
-# =======================================
+# ==============code generation args===========
+parser.add_argument("--code_generation", action="store_true")
+parser.add_argument("--n_samples", default=200, type=int)
+parser.add_argument(
+    "--limit", default=None, type=int, help="Limit number of samples to eval"
+)
+parser.add_argument("--allow_code_execution", action="store_true")
+parser.add_argument("--prefix", default="")
+parser.add_argument("--generation_only", action="store_true")
+parser.add_argument("--postprocess", action="store_false")
+parser.add_argument("--save_references", action="store_true")
+parser.add_argument("--save_generations", action="store_true")
+parser.add_argument("--instruction_tokens", default=None)
+parser.add_argument("--save_generations_path", default="generations.json")
+parser.add_argument("--load_generations_path", default=None)
+parser.add_argument("--metric_output_path", default="evaluation_results.json")
+parser.add_argument("--max_length_generation", default=512, type=int)
+parser.add_argument("--temperature", default=0.8, type=float)
+parser.add_argument("--top_p", default=0.8, type=float)
+parser.add_argument("--top_k", default=0, type=int)
+parser.add_argument("--do_sample", action="store_true")
+parser.add_argument("--check_references", action="store_true")
+parser.add_argument("--max_memory_per_gpu", type=str, default=None)
+parser.add_argument(
+    "--modeltype",
+    default="causal",
+    help="AutoModel to use; it can be causal or seq2seq",
+)
+parser.add_argument(
+    "--limit_start",
+    type=int,
+    default=0,
+    help="Optional offset to start from when limiting the number of samples",
+)
 
 args = parser.parse_args()
 if args.ipex:
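
Note: the sampling flags introduced above (--do_sample, --temperature, --top_p, --top_k, --max_length_generation) are presumably forwarded to text generation inside the code-generation evaluator. A minimal sketch of how they could be bundled into generation kwargs, assuming the parsed args namespace above (the helper name build_generation_kwargs is hypothetical):

# Hypothetical helper: bundle the new sampling flags into kwargs for generate().
def build_generation_kwargs(args):
    return {
        "do_sample": args.do_sample,                # greedy decoding unless --do_sample is set
        "temperature": args.temperature,            # defaults to 0.8 above
        "top_p": args.top_p,                        # nucleus sampling threshold, defaults to 0.8
        "top_k": args.top_k,                        # 0 disables top-k filtering
        "max_length": args.max_length_generation,   # defaults to 512
    }
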
@@ -262,7 +295,7 @@ def calib_func(prepared_model):
 if args.gptq_debug:
     from neural_compressor.adaptor.torch_utils.weight_only import gptq_quantize
 
-    conf = {
+    gptq_conf = {
         ".*": {
             'wbits': args.woq_bits,  # 1-8 bits
             'group_size': args.woq_group_size,  # -1 (per-channel)
@@ -272,20 +305,16 @@ def calib_func(prepared_model):
     }
     q_model_gptq_debug, gptq_config = gptq_quantize(
         user_model,
-        weight_config=conf,
+        weight_config=gptq_conf,
         dataloader=calib_dataloader,
         nsamples=args.gptq_nsamples,
         use_max_length=args.gptq_use_max_length,
-        pad_max_length=args.gptq_pad_max_length
+        pad_max_length=args.gptq_pad_max_length,
     )
-    from intel_extension_for_transformers.llm.evaluation.lm_eval import evaluate
 
-    results = evaluate(
-        model="hf-causal",
-        model_args='pretrained=' + args.model + ',tokenizer=' + args.model + ',dtype=float32',
-        user_model=q_model_gptq_debug, tasks=["lambada_openai"],
-        batch_size=4
-    )
+    # save the fake quantized model
+    os.makedirs(args.output_dir, exist_ok=True)
+    torch.save(q_model_gptq_debug, os.path.join(args.output_dir, "gptq_best_model.pt"))
     exit(0)
 
 else:
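
The fake-quantized model dumped here is picked up again in the accuracy branch in the final hunk below, so the debug path no longer evaluates inline. A minimal sketch of that round trip, assuming args.output_dir matches the directory used above:

import os
import torch

# Reload the fake-quantized GPTQ model saved by the --gptq_debug branch.
gptq_ckpt = os.path.join(args.output_dir, "gptq_best_model.pt")
user_model = torch.load(gptq_ckpt)
user_model.eval()
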
@@ -317,7 +346,6 @@ def calib_func(prepared_model):
 eval_dataset = load_dataset('lambada', split='validation')
 evaluator = Evaluator(eval_dataset, tokenizer)
 
-
 def eval_func(model):
     acc = evaluator.evaluate(model)
     return acc
@@ -347,15 +375,29 @@ def eval_func(model):
 
 if args.accuracy:
     user_model.eval()
-    from intel_extension_for_transformers.llm.evaluation.lm_eval import evaluate
+    if args.gptq_debug:
+        user_model = torch.load(os.path.join(args.output_dir, "gptq_best_model.pt"))
+    if args.code_generation:
+        from intel_extension_for_transformers.llm.evaluation.lm_code_eval import evaluate
+        from transformers import AutoTokenizer
+        tokenizer = AutoTokenizer.from_pretrained(args.model)
+        results = evaluate(
+            model=user_model,
+            tokenizer=tokenizer,
+            tasks=",".join(args.tasks),
+            batch_size=args.batch_size,
+            args=args,
+        )
+    else:
+        from intel_extension_for_transformers.llm.evaluation.lm_eval import evaluate
+        results = evaluate(
+            model="hf-causal",
+            model_args='pretrained=' + args.model + ',tokenizer=' + args.model + ',dtype=float32',
+            user_model=user_model,
+            batch_size=args.batch_size,
+            tasks=args.tasks,
+        )
 
-    results = evaluate(
-        model="hf-causal",
-        model_args='pretrained=' + args.model + ',tokenizer=' + args.model + ',dtype=float32',
-        user_model=user_model,
-        batch_size=args.batch_size,
-        tasks=args.tasks,
-    )
     dumped = json.dumps(results, indent=2)
     if args.save_accuracy_path:
         with open(args.save_accuracy_path, "w") as f:
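
Note that the two evaluation paths consume --tasks differently: the code-generation evaluator receives a single comma-joined string, while the lm-eval path takes the list as-is, which is why the --tasks help text warns that text-generation and code-generation task names differ. An illustrative split (the task names here are examples only, not defaults added by this change):

# Illustrative only: the two evaluate() paths expect --tasks in different shapes.
if args.code_generation:
    task_arg = ",".join(args.tasks)   # e.g. ["humaneval"] -> "humaneval"
else:
    task_arg = args.tasks             # e.g. ["lambada_openai", "piqa"] stays a list
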