@@ -22,55 +22,6 @@

 BATCH_SIZE = 1

-import argparse
-
-parser = argparse.ArgumentParser()
-parser.add_argument(
-    "--hf_auth_token", type=str, help="The Hugging Face auth token, required"
-)
-parser.add_argument("--compile_to", type=str, help="torch, linalg, vmfb")
-parser.add_argument(
-    "--hf_model_name",
-    type=str,
-    help="HF model name",
-    default="Trelis/Llama-2-7b-chat-hf-function-calling-v2",
-)
-parser.add_argument("--quantization", type=str, default="unquantized")
-parser.add_argument("--external_weight_file", type=str, default="")
-parser.add_argument(
-    "--vmfb_path", type=str, default=None, help="Path/name to store compiled vmfb."
-)
-parser.add_argument(
-    "--external_weights",
-    type=str,
-    default=None,
-    help="saves ir/vmfb without global weights for size and readability, options [gguf, safetensors]",
-)
-parser.add_argument(
-    "--precision", type=str, default="fp16", help="dtype of model [f16, f32]"
-)
-parser.add_argument(
-    "--device", type=str, default="llvm-cpu", help="llvm-cpu, cuda, vulkan, rocm"
-)
-# TODO: Bring in detection for target triple
-parser.add_argument(
-    "--iree_target_triple",
-    type=str,
-    default="host",
-    help="Specify vulkan target triple or rocm/cuda target device.",
-)
-parser.add_argument("--vulkan_max_allocation", type=str, default="4294967296")
-parser.add_argument(
-    "--streaming_llm",
-    action="store_true",
-    help="Compile LLM with StreamingLLM optimizations",
-)
-parser.add_argument(
-    "--decomp_attn",
-    action="store_true",
-    help="Decompose attention ops at fx graph level.",
-)
-

 def generate_schema(num_layers):
     null = None
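This hunk deletes the module-level argparse setup. For orientation, here is a condensed, runnable sketch of the CLI surface the refactored class below still consumes; flag names and defaults are copied from the deleted lines, not a new API:

# Condensed sketch of the CLI flags this hunk removes, trimmed to the
# options the refactored class still uses; names and defaults are taken
# verbatim from the deleted parser.
import argparse

parser = argparse.ArgumentParser()
parser.add_argument("--hf_auth_token", type=str, help="The Hugging Face auth token")
parser.add_argument(
    "--hf_model_name",
    type=str,
    default="Trelis/Llama-2-7b-chat-hf-function-calling-v2",
)
parser.add_argument("--precision", type=str, default="fp16")
parser.add_argument("--device", type=str, default="llvm-cpu")
parser.add_argument("--iree_target_triple", type=str, default="host")
parser.add_argument("--streaming_llm", action="store_true")
args = parser.parse_args()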
@@ -519,51 +470,31 @@ def evict_kvcache_space(self):
 }


-class StatelessLlamaPipeline:
+class StatelessLlama:
     def __init__(
         self,
         hf_model_name: str,
-        scheduler_id: str,
-        height: int,
-        width: int,
         precision: str,
-        max_length: int,
-        batch_size: int,
-        num_inference_steps: int,
         device: str,
         iree_target_triple: str,
         ireec_flags: list = [],
-        attn_spec: str = None,
-        decomp_attn: bool = False,
         pipeline_dir: str | Path = "./shark_vmfbs",
         external_weights_dir: str | Path = "./shark_weights",
         external_weights: str = "safetensors",
-        custom_vae: str = None,
-        vae_decomp_attn: bool = True,
         hf_auth_token: str = None,
+        streaming_llm: bool = False,
     ):
         self.hf_model_name = hf_model_name
         self.iree_dtype = "float32" if precision == "fp32" else "float16"
         self.torch_dtype = torch.float32 if precision == "fp32" else torch.float16
         self.cpu_scheduling = True
-        self.scheduler_id = scheduler_id
-        self.height = height
-        self.width = width
         self.precision = precision
-        self.max_length = max_length
-        self.model_max_length = max_length
-        self.batch_size = batch_size
-        self.num_inference_steps = num_inference_steps
         self.device = device
         self.iree_target_triple = iree_target_triple
         self.ireec_flags = ireec_flags
-        self.attn_spec = attn_spec
-        self.decomp_attn = decomp_attn
         self.pipeline_dir = pipeline_dir
         self.external_weights_dir = external_weights_dir
         self.external_weights = external_weights
-        self.custom_vae = custom_vae
-        self.vae_decomp_attn = vae_decomp_attn

         self.first_input = True
         self.max_tokens = llm_model_map[self.hf_model_name]["max_tokens"]
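The constructor sheds the Stable Diffusion leftovers (scheduler, image dimensions, VAE and attention options) and gains a streaming_llm switch. A minimal construction under the new signature might look like the sketch below, reusing the defaults from the deleted argparse flags; llm_model_map must contain an entry for the model name:

# Minimal sketch, assuming the defaults from the removed CLI flags;
# values are illustrative, not prescribed by the diff.
llama = StatelessLlama(
    hf_model_name="Trelis/Llama-2-7b-chat-hf-function-calling-v2",
    precision="fp16",
    device="llvm-cpu",
    iree_target_triple="host",
    ireec_flags=[],
    pipeline_dir="./shark_vmfbs",
    external_weights_dir="./shark_weights",
    external_weights="safetensors",
    hf_auth_token=None,
    streaming_llm=True,
)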
@@ -582,10 +513,11 @@ def __init__(
         )
         self.model = None
         self.hf_auth_token = hf_auth_token
+        self.streaming_llm = streaming_llm

     # FILE MANAGEMENT AND PIPELINE SETUP

-    def check_prepared(
+    def prepare_pipeline(
         self,
         mlir: str,
         vmfb: str,
@@ -660,8 +592,8 @@ def export(
         weights_only: bool = False,
     ):
         safe_name = self.hf_model_name.replace("-", "_").replace("/", "_")
-        # if self.streaming_llm:
-        safe_name += "_streaming"
+        if self.streaming_llm:
+            safe_name += "_streaming"

         if not os.path.exists(self.pipeline_dir):
             os.makedirs(self.pipeline_dir)
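Guarding the suffix on self.streaming_llm keeps streaming and non-streaming artifacts from sharing a file name. This sketch traces the stem export() now produces in each mode, reusing the default model name from above:

# Sketch of the cache-file stem computed by export() in each mode.
hf_model_name = "Trelis/Llama-2-7b-chat-hf-function-calling-v2"
for streaming_llm in (False, True):
    safe_name = hf_model_name.replace("-", "_").replace("/", "_")
    if streaming_llm:
        safe_name += "_streaming"
    print(safe_name)
# Trelis_Llama_2_7b_chat_hf_function_calling_v2
# Trelis_Llama_2_7b_chat_hf_function_calling_v2_streaming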
@@ -698,7 +630,7 @@ def export(
             device=self.device,
             target_triple=self.iree_target_triple,
             vulkan_max_allocation=None,
-            streaming_llm=True,
+            streaming_llm=self.streaming_llm,
             vmfb_path=os.path.join(self.pipeline_dir, safe_name + ".vmfb"),
             upload_ir=False,
             mod=None,
@@ -732,9 +664,12 @@ def format_out(results):

         history = []
         for iter in range(self.max_tokens):
-            # if self.streaming_llm:
-            token_slice = max(self.prev_token_len - 1, 0)
-            input_tensor = input_tensor[:, token_slice:]
+            if self.streaming_llm:
+                token_slice = max(self.prev_token_len - 1, 0)
+                input_tensor = input_tensor[:, token_slice:]
+            else:
+                # TODO
+                pass
             # if self.streaming_llm and self.model["get_seq_step"]() > 600:
             if self.model["get_seq_step"]() > 600:
                 print("Evicting cache space!")
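With streaming enabled, each iteration feeds the model only the tokens it has not consumed yet: prev_token_len counts tokens already fed, and the window starts one token earlier so decoding continues from the last seen token. A standalone sketch of the slice, with illustrative values:

import torch

# Illustrative values: 5 tokens already fed, 3 new ones appended.
prev_token_len = 5
input_tensor = torch.arange(8).unsqueeze(0)   # shape (1, 8)

token_slice = max(prev_token_len - 1, 0)      # 4: last seen token onward
input_tensor = input_tensor[:, token_slice:]
print(input_tensor)  # tensor([[4, 5, 6, 7]])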
@@ -743,7 +678,7 @@ def format_out(results):
             device_inputs = [
                 ireert.asdevicearray(self.device, input_tensor)
             ]
-            if self.first_input:  # or not self.streaming_llm:
+            if self.first_input or not self.streaming_llm:
                 st_time = time.time()
                 token = self.model["run_initialize"](*device_inputs)
                 total_time = time.time() - st_time
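Restoring or not self.streaming_llm means the non-streaming path reruns full-prompt initialization on every iteration, while streaming mode runs it only once and then decodes incrementally. A self-contained sketch of that dispatch; the stub dict stands in for the loaded vmfb module, and "run_forward" is a hypothetical name for the decode entry point, which sits outside this diff:

# Stub standing in for the loaded vmfb module; "run_forward" is a
# hypothetical decode entry point (only run_initialize appears in the diff).
model = {
    "run_initialize": lambda *inputs: "first token",
    "run_forward": lambda *inputs: "next token",
}
first_input, streaming_llm = True, True
device_inputs = []

if first_input or not streaming_llm:
    token = model["run_initialize"](*device_inputs)  # consume the whole prompt
    first_input = False
else:
    token = model["run_forward"](*device_inputs)     # decode one more token
print(token)  # first token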
@@ -820,33 +755,17 @@ def format_out(results):
     if not args.external_weights_dir and args.external_weights:
         args.external_weights_dir = args.pipeline_dir

-    sd_pipe = StatelessLlamaPipeline(
+    llama = StatelessLlama(
         args.hf_model_name,
-        args.scheduler_id,
-        args.height,
-        args.width,
         args.precision,
-        args.max_length,
-        args.batch_size,
-        args.num_inference_steps,
         args.device,
         args.iree_target_triple,
         flags,
-        args.attn_spec,
-        args.decomp_attn,
         args.pipeline_dir,
         args.external_weights_dir,
         args.external_weights,
-        args.vae_decomp_attn,
         args.hf_auth_token,
+        True,
     )
-    vmfb, weight = sd_pipe.check_prepared(mlir, vmfb, weight, interactive=False, quantization="int4")
-    sd_pipe.load_pipeline(vmfb, weight, args.rt_device, args.compiled_pipeline)
-    sd_pipe.generate_images(
-        args.prompt,
-        args.negative_prompt,
-        args.batch_count,
-        args.guidance_scale,
-        args.seed,
-        False,
-    )
+    vmfb, weight = llama.prepare_pipeline(mlir, vmfb, weight, interactive=False, quantization="int4")
+    llama.load_pipeline(vmfb, weight, args.rt_device, args.compiled_pipeline)