add llama2 examples for smoothquant (#1470)

chensuyue · yintong-lu · web-flow · commit 111b3cee0858 · 2023-12-15T15:06:43.000+08:00
Signed-off-by: chensuyue <suyue.chen@intel.com>
Co-authored-by: Lu, Yintong <yintong.lu@intel.com>
diff --git a/docs/source/smooth_quant.md b/docs/source/smooth_quant.md
@@ -324,7 +324,7 @@ IPEX (Intel Extension for PyTorch): 2.0/2.1
 
 Dataset: lambada_openai
 
-Task: text-generation
+Task: text-generation provided by [ITREX](https://github.com/intel/intel-extension-for-transformers/tree/main/examples/huggingface/pytorch/text-generation/quantization)
 
 alpha [0.4, 0.6] is sweet spot region in SmoothQuant paper.
 
@@ -370,6 +370,13 @@ A list of models that achieved a <1% accuracy drop is shown below.
 | databricks/dolly-v2-3b* | 0.6297 | 0.6247 | alpha=0.5, Ipex 2.1 |
 | tiiuae/falcon-7b-instruct | 0.6437 | 0.6392 | alpha=0.7, Pytorch |
 
+The results listed below were obtained with IPEX `optimize_transformers` applied at model initialization for better performance. Please refer to the step-by-step [instructions](../../examples/pytorch/nlp/huggingface_models/language-modeling/quantization/llm/ipex/README.md) for details.
+| Model/Last token accuracy |  FP32 Accuracy   | INT8 (w/ SmoothQuant) | Notes |
+|:----------:|:------:|:------:|-----------------------------------|
+| LLaMa-2-7b-hf* | 0.7392 | 0.7332  | alpha=Auto, Ipex 2.1 |
+| LLaMa-2-13b-hf* | 0.7677 | 0.7632  | alpha=Auto, Ipex 2.1 |
+
+
 Please note that for models with asterisk(*), we have set all add ops to FP32 during quantization step to achieve desirable results.
 ## Example
 
diff --git a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/llm/ipex/README.md b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/llm/ipex/README.md
@@ -0,0 +1,30 @@
+Step-by-Step
+============
+This document provides step-by-step instructions for running SmoothQuant on Llama 2 models with Intel® Neural Compressor and Intel® Extension for PyTorch.
+
+# Prerequisite
+```
+# Install dependencies
+pip install -r requirements.txt
+```
+
+# Run Quantization
+
+## Llama-2-7b
+```bash
+python run_llama2_sq.py \
+    --model-id meta-llama/Llama-2-7b-hf \
+    --batch-size 56 \
+    --sq-recipes "llama2-7b"
+```
+## Llama-2-13b
+```bash
+python run_llama2_sq.py \
+    --model-id meta-llama/Llama-2-13b-hf \
+    --batch-size 56 \
+    --sq-recipes "llama2-13b" \
+    --padding
+```
+> Notes:  
+> - The INT8 model is saved to "./saved_results", including "./saved_results/best_configure.json" and "./saved_results/best_model.pt", which can be loaded and evaluated with IPEX.  
+> - The "--sq-recipes" parameter selects the recipe used for quantization; details can be found in the script `run_llama2_sq.py`.
diff --git a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/llm/ipex/requirements.txt b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/llm/ipex/requirements.txt
@@ -0,0 +1,10 @@
+neural-compressor==2.4
+transformers==4.32.0
+datasets
+accelerate
+sentencepiece
+protobuf
+--extra-index-url https://download.pytorch.org/whl/cpu
+torch==2.1.0+cpu
+intel-extension-for-pytorch==2.1.0
+
diff --git a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/llm/ipex/run_llama2_sq.py b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/llm/ipex/run_llama2_sq.py
@@ -0,0 +1,240 @@
+import argparse
+
+from datasets import load_dataset
+from transformers import LlamaForCausalLM, LlamaTokenizer, AutoConfig
+
+import torch
+from torch.nn.functional import pad
+from torch.utils.data import DataLoader
+
+import intel_extension_for_pytorch as ipex
+
+parser = argparse.ArgumentParser('LLaMA generation script (int8 path)', add_help=False)
+
+parser.add_argument(
+    "-m", "--model-id", default=None, type=str, required=True, help="your llama model"
+)
+parser.add_argument(
+    "--sq-recipes", default=None, type=str, required=True, help="llama2-7b or llama2-13b"
+)
+parser.add_argument(
+    "--max-new-tokens", default=32, type=int, help="output max new tokens"
+)
+parser.add_argument("--dataset", nargs="?", default="NeelNanda/pile-10k")
+parser.add_argument("--output-dir", nargs="?", default="./saved_results")
+
+parser.add_argument(
+    "--int8-bf16-mixed",
+    action="store_true",
+    help="by default it is int8-fp32 mixed, to enable int8 mixed amp bf16 (work on platforms like SPR)",
+)
+parser.add_argument("--input-tokens", default="32", type=str)
+parser.add_argument("--prompt", default=None, type=str)
+parser.add_argument("--padding", action="store_true", help="whether do padding in calib_dataloader")
+parser.add_argument("--batch-size", default=1, type=int, help="batch size")
+parser.add_argument("--alpha", default=0.8, type=float, help="alpha value for smoothquant")
+parser.add_argument("--greedy", action="store_true")
+
+args = parser.parse_args()
+
+try:
+    ipex._C.disable_jit_linear_repack()
+except Exception:
+    pass
+
+# amp autocast
+if args.int8_bf16_mixed:
+    amp_enabled = True
+    amp_dtype = torch.bfloat16
+else:
+    amp_enabled = False
+    amp_dtype = torch.float32
+
+num_beams = 1 if args.greedy else 4
+
+# load model
+config = AutoConfig.from_pretrained(args.model_id, torchscript=True)
+if not hasattr(config, "text_max_length") and args.prompt is None:
+    config.text_max_length = int(args.input_tokens) + int(args.max_new_tokens)
+
+user_model = LlamaForCausalLM.from_pretrained(
+    args.model_id, config=config, low_cpu_mem_usage=True, torch_dtype=torch.float
+)
+
+tokenizer = LlamaTokenizer.from_pretrained(args.model_id)
+print("Data type of the model:", user_model.dtype)
+
+# dummy past key value
+beam_idx_tmp = torch.zeros(
+    (2048, int(args.batch_size * num_beams)), dtype=torch.long
+).contiguous()
+global_past_key_value = [
+    (
+        torch.zeros(1, 0, 0, 1, dtype=torch.long).contiguous(),
+        torch.zeros(
+            [
+                1,
+                user_model.config.num_attention_heads,
+                1,
+                int(
+                    user_model.config.hidden_size
+                    / user_model.config.num_attention_heads
+                ),
+            ]
+        ).contiguous(),
+        torch.zeros(
+            [
+                1,
+                user_model.config.num_attention_heads,
+                1,
+                int(
+                    user_model.config.hidden_size
+                    / user_model.config.num_attention_heads
+                ),
+            ]
+        ).contiguous(),
+        beam_idx_tmp,
+    )
+    for i in range(user_model.config.num_hidden_layers)
+]
+
+
+class Evaluator:
+
+    def __init__(self, dataset, tokenizer, batch_size=1, pad_val=1, pad_max=512):
+        self.dataset = dataset
+        self.tokenizer = tokenizer
+        self.batch_size = batch_size
+        self.pad_val = pad_val
+        self.pad_max = pad_max
+
+        # tokenize the dataset
+        self.dataset = self.dataset.map(self.tokenize_function, batched=True)
+        self.dataset.set_format(type="torch", columns=["input_ids"])
+
+    @torch.no_grad()
+    def tokenize_function(self, examples):
+        if "prompt" in examples:
+            example = self.tokenizer(examples["prompt"])
+        elif "text" in examples:
+            example = self.tokenizer(examples["text"])
+        elif "code" in examples:
+            example = self.tokenizer(examples["code"])
+        return example
+
+    @torch.no_grad()
+    def collate_batch(self, batch):
+        position_ids_padded = []
+        input_ids_padded = []
+        last_ind = []
+        attention_mask_padded = []
+        for text in batch:
+            input_ids = text["input_ids"]
+            if not args.padding:
+                input_ids = (
+                    input_ids[: int(self.pad_max)]
+                    if len(input_ids) > int(self.pad_max)
+                    else input_ids
+                ) #no_padding
+            else:
+                pad_len = self.pad_max - input_ids.shape[0] 
+                input_ids = pad(input_ids, (0, pad_len), value=self.pad_val)
+            last_ind.append(input_ids.shape[0] - 1)
+            attention_mask = torch.ones(len(input_ids))
+            position_ids = torch.arange(len(input_ids))
+            input_ids_padded.append(input_ids)
+            attention_mask_padded.append(attention_mask)
+            position_ids_padded.append(position_ids)
+        return (
+            (
+                torch.vstack(input_ids_padded),
+                torch.vstack(attention_mask_padded),
+                torch.vstack(position_ids_padded),
+                tuple(global_past_key_value),
+            ),
+            torch.tensor(last_ind),
+        )
+
+
+calib_dataset = load_dataset(args.dataset, split="train")
+user_model.eval()
+if args.sq_recipes == "llama2-7b":
+    pad_max = 2048
+elif args.sq_recipes == "llama2-13b":
+    pad_max = 1024
+else:
+    pad_max = 512
+calib_evaluator = Evaluator(calib_dataset, tokenizer, args.batch_size, pad_max=pad_max)
+calib_dataloader = DataLoader(
+    calib_evaluator.dataset,
+    batch_size=1,
+    shuffle=False,
+    collate_fn=calib_evaluator.collate_batch,
+)
+
+
+def calib_func(prepared_model):
+    for i, (
+            (input_ids, attention_mask, position_ids, past_key_values),
+            last_ind,
+    ) in enumerate(calib_dataloader):
+        if i == 512:
+            break
+        prepared_model(
+            input_ids,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            past_key_values=past_key_values,
+        )
+
+
+example_inputs = None
+for i, (
+        (input_ids, attention_mask, position_ids, past_key_values),
+        last_ind,
+) in enumerate(calib_dataloader):
+    example_inputs = (input_ids, attention_mask, position_ids, past_key_values)
+    break
+
+qconfig = ipex.quantization.get_smooth_quant_qconfig_mapping(alpha=args.alpha)
+user_model = ipex.optimize_transformers(
+    user_model.eval(),
+    dtype=amp_dtype,
+    quantization_config=qconfig,
+    inplace=True,
+    deployment_mode=False,
+)
+
+# steps for SmoothQuant with Intel® Neural Compressor
+from neural_compressor import PostTrainingQuantConfig, quantization
+
+# quantization recipes
+excluded_precisions = [] if args.int8_bf16_mixed else ["bf16"]
+op_type_dict = {"add": {"weight": {"dtype": ["fp32"]}, "activation": {"dtype": ["fp32"]}}}
+recipes = {}
+if args.sq_recipes == "llama2-7b":
+    recipes = {"smooth_quant": True, "smooth_quant_args": {'alpha': 'auto', 'folding': False, 'default_alpha': 0.8,
+                                                           'auto_alpha_args': {"alpha_min": 0.8, "alpha_max": 0.99,
+                                                                               "alpha_step": 0.01,
+                                                                               "shared_criterion": "mean"}}}
+elif args.sq_recipes == "llama2-13b":
+    recipes = {"smooth_quant": True, "smooth_quant_args": {'alpha': 'auto', 'folding': False, 'default_alpha': 0.8,
+                                                        'auto_alpha_args': {"alpha_min": 0.75, "alpha_max": 0.99,
+                                                                            "alpha_step": 0.01,
+                                                                            "shared_criterion": "max"}}}
+
+
+conf = PostTrainingQuantConfig(
+    backend="ipex",
+    excluded_precisions=excluded_precisions,
+    op_type_dict=op_type_dict,
+    recipes=recipes,
+    example_inputs=example_inputs,
+)
+q_model = quantization.fit(
+    user_model,
+    conf,
+    calib_dataloader=calib_dataloader,
+    calib_func=calib_func,
+)
+q_model.save(args.output_dir)