
Commit 2325984

Stateless llama testing + SDXL CI Job (#513)
This commit cleans up stateless llama testing and improves memory efficiency by creating the model on the meta device (with empty weights) and then loading the state dict into it (shard by shard in the case of a sharded checkpoint). The SD test for `vae_encode` throws an error; I took a look at the generated MLIR, and the `encode_inp` func is there with valid-looking inputs and return. It looks like this is related to the IREE bump (the `FuncConversion` pass). This commit also adds a CI job to run Jinchen's SDXL script nightly (one failure right now, hopefully fixed soon).
1 parent 1cccf4d commit 2325984
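
As context for the memory note above, here is a minimal sketch of the loading pattern the new test uses (it mirrors the `setUpClass` changes in `stateless_llama_test.py`; `low_cpu_mem_usage=True` is what makes `transformers` build the model with empty weights on the meta device and then stream the checkpoint in shard by shard):

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

MODEL_ID = "Trelis/Llama-2-7b-chat-hf-function-calling-v2"

tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, use_fast=False)

# Build the model skeleton with empty weights, then load the state dict into it
# (shard by shard for a sharded checkpoint) instead of materializing random
# weights first and overwriting them.
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    torch_dtype=torch.float,
    low_cpu_mem_usage=True,
    device_map="auto",
)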

File tree

6 files changed: +143 -47 lines

.github/workflows/test_models.yml

Lines changed: 1 addition & 1 deletion
@@ -20,7 +20,7 @@ jobs:
     strategy:
       matrix:
         version: [3.11]
-        os: [nodai-ubuntu-builder-large]
+        os: [nodai-amdgpu-w7900-x86-64]
 
     runs-on: ${{matrix.os}}
     steps:

.github/workflows/test_sdxl.yml

Lines changed: 50 additions & 0 deletions
@@ -0,0 +1,50 @@
+name: SDXL Models Nightly
+
+on:
+  schedule:
+    - cron: '30 6 * * *'
+
+jobs:
+  test-sdxl-models:
+    strategy:
+      matrix:
+        version: [3.11]
+        os: [nodai-amdgpu-w7900-x86-64]
+
+    runs-on: ${{matrix.os}}
+    steps:
+      - name: "Setting up Python"
+        uses: actions/setup-python@75f3110429a8c05be0e1bf360334e4cced2b63fa # v2.3.3
+        with:
+          python-version: ${{matrix.version}}
+
+      - name: "Checkout Code"
+        uses: actions/checkout@v2
+        with:
+          ref: ean-sd-fp16
+
+      - name: Sync source deps
+        # build IREE from source with -DIREE_BUILD_TRACY=ON if getting tracy profile
+        run: |
+          python -m pip install --upgrade pip
+          # Note: We install in three steps in order to satisfy requirements
+          # from non default locations first. Installing the PyTorch CPU
+          # wheels saves multiple minutes and a lot of bandwidth on runner setup.
+          pip install --index-url https://download.pytorch.org/whl/cpu \
+            -r core/pytorch-cpu-requirements.txt \
+            -r core/torchvision-requirements.txt
+          pip install --upgrade -r core/requirements.txt
+          pip install -e core[testing,torch-cpu-nightly]
+          pip install --upgrade -r models/requirements.txt
+          pip install -e models
+
+      - name: Show current free memory
+        run: |
+          free -mh
+
+      - name: Run sdxl tests
+        run: |
+          pip install --upgrade --pre torch torchvision --index-url https://download.pytorch.org/whl/nightly/cpu
+          pytest models/turbine_models/tests/sdxl_test.py --device cpu --rt_device local-task --iree_target_triple x86_64-linux-gnu
+          pytest models/turbine_models/tests/sdxl_test.py --device vulkan --rt_device vulkan --iree_target_triple rdna3-unknown-linux
+          pytest models/turbine_models/tests/sdxl_test.py --device rocm --rt_device rocm --iree_target_triple gfx90a

models/turbine_models/custom_models/llm_runner.py

Lines changed: 14 additions & 20 deletions
@@ -168,12 +168,14 @@ def run_llm(
     streaming_llm=False,
     chat_mode=False,
     chat_sys_prompt=DEFAULT_CHAT_SYS_PROMPT,
+    tokenizer=None,
 ):
-    tokenizer = AutoTokenizer.from_pretrained(
-        hf_model_name,
-        use_fast=False,
-        token=hf_auth_token,
-    )
+    if tokenizer == None:
+        tokenizer = AutoTokenizer.from_pretrained(
+            hf_model_name,
+            use_fast=False,
+            token=hf_auth_token,
+        )
     llm = SharkLLM(
         device=device,
         vmfb_path=vmfb_path,
@@ -204,43 +206,35 @@ def run_torch_llm(
     prompt,
     streaming_llm=False,
     chat_sys_prompt=DEFAULT_CHAT_SYS_PROMPT,
+    model=None,
+    tokenizer=None,
 ):
-    from turbine_models.model_builder import HFTransformerBuilder
-    from transformers import AutoModelForCausalLM
-
-    model_builder = HFTransformerBuilder(
-        example_input=None,
-        hf_id=hf_model_name,
-        auto_model=AutoModelForCausalLM,
-        hf_auth_token=hf_auth_token,
-        auto_tokenizer=AutoTokenizer,
-    )
     if streaming_llm is True:
-        enable_llama_pos_shift_attention(model_builder.model)
+        enable_llama_pos_shift_attention(model)
 
     def get_token_from_logits(logits):
         return torch.argmax(logits[:, -1, :], dim=1)
 
     prompt = append_user_prompt(chat_sys_prompt, prompt)
-    initial_input = model_builder.tokenizer(prompt, return_tensors="pt")
+    initial_input = tokenizer(prompt, return_tensors="pt")
     example_input_id = initial_input.input_ids
 
-    model_results = model_builder.model.forward(example_input_id)
+    model_results = model.forward(example_input_id)
     model_token = get_token_from_logits(model_results.logits)
 
     pkv = model_results.past_key_values
 
     torch_results = []
     torch_results.append(int(model_token))
     while model_token != 2:
-        model_results = model_builder.model.forward(
+        model_results = model.forward(
             torch.unsqueeze(model_token, 0), past_key_values=pkv
         )
         model_token = get_token_from_logits(model_results.logits)
         pkv = model_results.past_key_values
         torch_results.append(int(model_token[0]))
 
-    return model_builder.tokenizer.decode(torch_results)
+    return tokenizer.decode(torch_results)
 
 
 if __name__ == "__main__":
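
With these changes `run_llm` and `run_torch_llm` accept an already-loaded tokenizer (and, for the torch path, a model) instead of rebuilding them on every call. A usage sketch, mirroring how the updated test invokes it; `model`, `tokenizer`, and `DEFAULT_PROMPT` are assumed to be set up by the caller, and the second positional argument is the HF auth token:

from turbine_models.custom_models import llm_runner

torch_str = llm_runner.run_torch_llm(
    "Trelis/Llama-2-7b-chat-hf-function-calling-v2",
    None,  # hf_auth_token
    DEFAULT_PROMPT,
    streaming_llm=False,
    model=model,          # preloaded AutoModelForCausalLM, reused across calls
    tokenizer=tokenizer,  # preloaded AutoTokenizer, reused across calls
)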

models/turbine_models/custom_models/stateless_llama.py

Lines changed: 16 additions & 12 deletions
@@ -121,18 +121,21 @@ def export_transformer_model(
     streaming_llm=False,
     vmfb_path=None,
     upload_ir=False,
+    mod=None,
+    tokenizer=None,
 ):
-    tokenizer = AutoTokenizer.from_pretrained(
-        hf_model_name,
-        use_fast=False,
-        token=hf_auth_token,
-    )
-
-    mod = AutoModelForCausalLM.from_pretrained(
-        hf_model_name,
-        torch_dtype=torch.float,
-        token=hf_auth_token,
-    )
+    if tokenizer == None:
+        tokenizer = AutoTokenizer.from_pretrained(
+            hf_model_name,
+            use_fast=False,
+            token=hf_auth_token,
+        )
+    if mod == None:
+        mod = AutoModelForCausalLM.from_pretrained(
+            hf_model_name,
+            torch_dtype=torch.float,
+            token=hf_auth_token,
+        )
     schema_json = generate_schema(mod.config.num_hidden_layers)
     state_schema = pytree.treespec_loads(schema_json)
     if streaming_llm:
@@ -165,7 +168,8 @@ def export_transformer_model(
         for name in mod_params:
             mapper["params." + name] = name
         if external_weight_file:
-            safetensors.torch.save_file(mod_params, external_weight_file)
+            if os.path.exists(external_weight_file) == False:
+                safetensors.torch.save_file(mod_params, external_weight_file)
 
     elif external_weights == "gguf":
         tensor_mapper = remap_gguf.TensorNameMap(remap_gguf.MODEL_ARCH.LLAMA, HEADS)
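
The export entry point can likewise reuse a caller-supplied model and tokenizer, and it now skips rewriting an external weight file that already exists. A sketch of the intended call, assuming `model` and `tokenizer` were loaded up front; keyword names other than `mod` and `tokenizer` are taken from the parameters visible in this diff and in the updated test, and other export options are omitted:

from turbine_models.custom_models import stateless_llama

stateless_llama.export_transformer_model(
    hf_model_name="Trelis/Llama-2-7b-chat-hf-function-calling-v2",
    hf_auth_token=None,
    device="llvm-cpu",
    target_triple="host",
    mod=model,            # skips AutoModelForCausalLM.from_pretrained inside the export
    tokenizer=tokenizer,  # skips AutoTokenizer.from_pretrained inside the export
)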

models/turbine_models/model_builder.py

Lines changed: 2 additions & 1 deletion
@@ -30,6 +30,7 @@ def __init__(
         model=None,
         model_type: str = None,
         compile_to_vmfb: bool = None,
+        tokenizer=None,
     ) -> None:
         self.example_input = example_input
         self.hf_id = hf_id
@@ -38,7 +39,7 @@ def __init__(
         self.auto_config = auto_config
         self.hf_auth_token = hf_auth_token
         self.model = model
-        self.tokenizer = None
+        self.tokenizer = tokenizer
         self.upload_ir = upload_ir
         self.model_type = model_type
         self.compile_to_vmfb = compile_to_vmfb
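
A small usage sketch of the new constructor argument, assuming the keyword names shown elsewhere in this commit (`example_input`, `hf_id`, and `auto_model` come from the builder call removed from `llm_runner.py`):

from transformers import AutoModelForCausalLM, AutoTokenizer
from turbine_models.model_builder import HFTransformerBuilder

tokenizer = AutoTokenizer.from_pretrained(
    "Trelis/Llama-2-7b-chat-hf-function-calling-v2", use_fast=False
)

# The builder now stores a caller-provided tokenizer instead of always
# initializing self.tokenizer to None.
builder = HFTransformerBuilder(
    example_input=None,
    hf_id="Trelis/Llama-2-7b-chat-hf-function-calling-v2",
    auto_model=AutoModelForCausalLM,
    tokenizer=tokenizer,
)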

models/turbine_models/tests/stateless_llama_test.py

Lines changed: 60 additions & 13 deletions
@@ -9,6 +9,11 @@
 import os
 import unittest
 import difflib
+from transformers import AutoTokenizer, AutoModelForCausalLM, AutoConfig
+import torch
+from accelerate import init_empty_weights
+from transformers.modeling_utils import load_sharded_checkpoint
+import tempfile
 
 os.environ["TORCH_LOGS"] = "dynamic"
 from shark_turbine.aot import *
@@ -18,18 +23,6 @@
     gen_external_params,
 )
 
-quantization = "unquantized"
-precision = "f32"
-gen_external_params(
-    hf_model_name="Trelis/Llama-2-7b-chat-hf-function-calling-v2",
-    quantization=quantization,
-    hf_auth_token=None,
-    precision=precision,
-)
-DEFAULT_PROMPT = """<s>[INST] <<SYS>>
-Be concise. You are a helpful, respectful and honest assistant. If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information. <</SYS>> hi what are you? [/INST]
-"""
-
 
 def check_output_string(reference, output):
     # Calculate and print diff
@@ -43,7 +36,45 @@ def check_output_string(reference, output):
     assert reference == output, "".join(diff)
 
 
+quantization = "unquantized"
+precision = "f32"
+
+DEFAULT_PROMPT = """<s>[INST] <<SYS>>
+Be concise. You are a helpful, respectful and honest assistant. If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information. <</SYS>> hi what are you? [/INST]
+"""
+
+
 class StatelessLlamaChecks(unittest.TestCase):
+    @classmethod
+    def setUpClass(cls):
+        gen_external_params(
+            hf_model_name="Trelis/Llama-2-7b-chat-hf-function-calling-v2",
+            quantization=quantization,
+            hf_auth_token=None,
+            precision=precision,
+        )
+
+        cls.tokenizer = AutoTokenizer.from_pretrained(
+            "Trelis/Llama-2-7b-chat-hf-function-calling-v2",
+            use_fast=False,
+        )
+
+        # The model is first created on the Meta device (with empty weights) and the state dict
+        # is then loaded inside it (shard by shard in the case of a sharded checkpoint).
+        # This avoids using twice the size of model with creating whole model with random weights,
+        # then loading pretrained weights.
+        cls.mod = AutoModelForCausalLM.from_pretrained(
+            "Trelis/Llama-2-7b-chat-hf-function-calling-v2",
+            torch_dtype=torch.float,
+            low_cpu_mem_usage=True,
+            device_map="auto",
+        )
+
+    @classmethod
+    def tearDownClass(cls):
+        cls.tokenizer = None
+        cls.mod = None
+
     def test_vmfb_comparison(self):
         """
         Test that the vmfb model produces the same output as the torch model
@@ -66,6 +97,8 @@ def test_vmfb_comparison(self):
             device="llvm-cpu",
             target_triple="host",
             upload_ir=upload_ir_var == "upload",
+            mod=self.mod,
+            tokenizer=self.tokenizer,
         )
 
         torch_str_cache_path = (
@@ -77,7 +110,11 @@ def test_vmfb_comparison(self):
             torch_str = f.read()
         else:
             torch_str = llm_runner.run_torch_llm(
-                "Trelis/Llama-2-7b-chat-hf-function-calling-v2", None, DEFAULT_PROMPT
+                "Trelis/Llama-2-7b-chat-hf-function-calling-v2",
+                None,
+                self.DEFAULT_PROMPT,
+                model=self.mod,
+                tokenizer=self.tokenizer,
             )
 
         with open(torch_str_cache_path, "w") as f:
@@ -90,6 +127,7 @@ def test_vmfb_comparison(self):
             "Trelis/Llama-2-7b-chat-hf-function-calling-v2",
             None,
             f"Llama_2_7b_chat_hf_function_calling_v2_{precision}_{quantization}.safetensors",
+            tokenizer=self.tokenizer,
         )
         check_output_string(torch_str, turbine_str)
 
@@ -109,6 +147,8 @@ def test_streaming_vmfb_comparison(self):
             target_triple="host",
             streaming_llm=True,
             vmfb_path="streaming_llama.vmfb",
+            mod=self.mod,
+            tokenizer=self.tokenizer,
        )
 
         torch_str_cache_path = (
@@ -124,6 +164,8 @@ def test_streaming_vmfb_comparison(self):
             None,
             DEFAULT_PROMPT,
             streaming_llm=True,
+            model=self.mod,
+            tokenizer=self.tokenizer,
         )
 
         with open(torch_str_cache_path, "w") as f:
@@ -137,6 +179,7 @@ def test_streaming_vmfb_comparison(self):
             None,
             f"Llama_2_7b_chat_hf_function_calling_v2_{precision}_{quantization}.safetensors",
             streaming_llm=True,
+            tokenizer=self.tokenizer,
         )
         check_output_string(torch_str, turbine_str)
 
@@ -145,12 +188,16 @@ def test_rerotated_torch_comparison(self):
             "Trelis/Llama-2-7b-chat-hf-function-calling-v2",
             None,
             DEFAULT_PROMPT,
+            model=self.mod,
+            tokenizer=self.tokenizer,
         )
         rotated_torch_str = llm_runner.run_torch_llm(
             "Trelis/Llama-2-7b-chat-hf-function-calling-v2",
             None,
             DEFAULT_PROMPT,
             streaming_llm=True,
+            model=self.mod,
+            tokenizer=self.tokenizer,
         )
         check_output_string(torch_str, rotated_torch_str)
