Commit aad03d5

xin3he and xinhe3 authored
Enhance example for HPU performance (#2043)
* Enhance example for HPU performance
* Update run_clm_no_trainer.py
* Remove wikitext to avoid OOM for llama2-7b at bs=8

Signed-off-by: xinhe3 <xinhe3@habana.ai>
Co-authored-by: xinhe3 <xinhe3@habana.ai>
1 parent c186708 · commit aad03d5

File tree

2 files changed: +20 −10 lines


examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/weight_only/README.md

Lines changed: 9 additions & 9 deletions
@@ -55,22 +55,23 @@ python run_clm_no_trainer.py \
 ```
 ### Evaluation
 
+> Note: `SRAM_SLICER_SHARED_MME_INPUT_EXPANSION_ENABLED=false` is an experimental flag that yields better performance for uint4; it will be removed in a future release.
+
 ```bash
 # original model
 python run_clm_no_trainer.py \
     --model meta-llama/Llama-2-7b-hf \
     --accuracy \
     --batch_size 8 \
-    --tasks "lambada_openai,wikitext" \
-    --output_dir saved_results
+    --tasks "lambada_openai"
 
 # quantized model
-python run_clm_no_trainer.py \
+SRAM_SLICER_SHARED_MME_INPUT_EXPANSION_ENABLED=false ENABLE_EXPERIMENTAL_FLAGS=1 python run_clm_no_trainer.py \
     --model meta-llama/Llama-2-7b-hf \
-    --load \
     --accuracy \
     --batch_size 8 \
-    --tasks "lambada_openai,wikitext" \
+    --tasks "lambada_openai" \
+    --load \
     --output_dir saved_results
 ```
 
@@ -81,15 +82,14 @@ python run_clm_no_trainer.py \
 python run_clm_no_trainer.py \
     --model meta-llama/Llama-2-7b-hf \
     --performance \
-    --batch_size 8 \
-    --output_dir saved_results
+    --batch_size 8
 
 # quantized model
-python run_clm_no_trainer.py \
+SRAM_SLICER_SHARED_MME_INPUT_EXPANSION_ENABLED=false ENABLE_EXPERIMENTAL_FLAGS=1 python run_clm_no_trainer.py \
     --model meta-llama/Llama-2-7b-hf \
-    --load \
     --performance \
     --batch_size 8 \
+    --load \
     --output_dir saved_results
 ```

examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/weight_only/run_clm_no_trainer.py

Lines changed: 11 additions & 1 deletion
@@ -14,6 +14,10 @@
 from transformers import AutoModelForCausalLM, AutoConfig, AutoTokenizer
 from neural_compressor.torch.utils import is_hpex_available
 
+if is_hpex_available():
+    import habana_frameworks.torch.core as htcore  # pylint: disable=E0401
+    htcore.hpu_set_inference_env()
+
 parser = argparse.ArgumentParser()
 parser.add_argument(
     "--model", nargs="?", default="EleutherAI/gpt-j-6b"
@@ -44,7 +48,7 @@
                     help="Pad input ids to max length.")
 parser.add_argument("--calib_iters", default=512, type=int,
                     help="calibration iters.")
-parser.add_argument("--tasks", default="lambada_openai,hellaswag,winogrande,piqa,wikitext",
+parser.add_argument("--tasks", default="lambada_openai,hellaswag,winogrande,piqa",
                     type=str, help="tasks for accuracy validation")
 parser.add_argument("--peft_model_id", type=str, default=None, help="model_name_or_path of peft model")
 # ============WeightOnly configs===============
@@ -501,6 +505,12 @@ def run_fn_for_gptq(model, dataloader_for_calibration, *args):
 user_model, tokenizer = get_user_model()
 
 
+if is_hpex_available():
+    from habana_frameworks.torch.hpu import wrap_in_hpu_graph
+    user_model = user_model.to(torch.bfloat16)
+    wrap_in_hpu_graph(user_model, max_graphs=10)
+
+
 if args.accuracy:
     user_model.eval()
     from neural_compressor.evaluation.lm_eval import evaluate, LMEvalParser
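Read together, the two Python hunks implement a common HPU inference recipe: call `htcore.hpu_set_inference_env()` before the model is instantiated, then cast the loaded model to bfloat16 and wrap it in HPU graphs to cut per-step host launch overhead. Below is a minimal standalone sketch of that recipe, not part of this commit: the model name, the `.to("hpu")` device moves, the reassignment of `wrap_in_hpu_graph`'s return value, and the smoke-test forward pass are illustrative assumptions, and it presumes a Gaudi machine with the Habana PyTorch stack installed.

```python
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from neural_compressor.torch.utils import is_hpex_available

if is_hpex_available():
    import habana_frameworks.torch.core as htcore  # pylint: disable=E0401
    # As in the diff: prepare the HPU backend for inference *before*
    # any model is created.
    htcore.hpu_set_inference_env()

model_name = "meta-llama/Llama-2-7b-hf"  # placeholder; any causal LM works
model = AutoModelForCausalLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

if is_hpex_available():
    from habana_frameworks.torch.hpu import wrap_in_hpu_graph
    # bf16 is the fast native dtype on Gaudi; wrap_in_hpu_graph captures and
    # replays computation graphs, with max_graphs bounding the graph cache.
    model = model.to(torch.bfloat16).to("hpu")  # device move is an assumption
    model = wrap_in_hpu_graph(model, max_graphs=10)

model.eval()
inputs = tokenizer("Habana Gaudi is", return_tensors="pt")
if is_hpex_available():
    inputs = {k: v.to("hpu") for k, v in inputs.items()}
with torch.no_grad():
    logits = model(**inputs).logits  # single forward pass as a smoke test
print(logits.shape)
```

Graph capture happens on the first call for a given input shape, so the usual practice is to run a warm-up iteration before taking performance numbers.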

0 commit comments
