Commit 4f7d965

[turbine-llm] Enable dynamic FX tracing of the paged llama model. (#516)
This was a little rough, but it got through. The key changes involve removing data-dependent views. As noted previously, I believe we still have a bug in the per-step cache management, and we will need to do a detailed per-step comparison of the cache against a reference.
1 parent 1263faf commit 4f7d965
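
For context, a minimal sketch (not part of this commit; the toy module and dimension names are illustrative only) of what "dynamic FX tracing" means here: torch.export traces the module once against symbolic dimensions, so the exported program stays valid for any sequence length instead of specializing on the example input.

import torch

class Toy(torch.nn.Module):
    def forward(self, tokens: torch.Tensor) -> torch.Tensor:
        # Purely shape-polymorphic math traces fine; data-dependent views
        # (slicing by a value read out of a tensor) are what the commit
        # message above says had to be removed from the model.
        return tokens * 2

example = torch.ones(4, 64, dtype=torch.int64)
seq = torch.export.Dim("seq", max=2048)  # symbolic sequence-length dimension
ep = torch.export.export(Toy(), (example,), dynamic_shapes={"tokens": {1: seq}})
print(ep)  # the graph carries a symbolic dim, not the literal 64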

5 files changed: +171 -43 lines changed
Lines changed: 136 additions & 0 deletions
@@ -0,0 +1,136 @@
+# Copyright 2024 Advanced Micro Devices, Inc
+#
+# Licensed under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+"""Inference support for the PagedLLMV1 protocol of models."""
+
+import math
+import sys
+
+import torch
+
+from shark_turbine.aot import (
+    FxProgramsBuilder,
+)
+
+from ..data.gguf import load_gguf_file
+from ..config.llm_configs import LlamaHParams
+
+# TODO: Should be using a base class with the protocol supported.
+from ..models.llama import PagedLlamaModelV1
+
+
+def main(args: list[str]):
+    try:
+        (gguf_path,) = args
+    except IndexError:
+        raise RuntimeError(f"Expected <gguf_path>")
+
+    dataset = load_gguf_file(gguf_path)
+
+    hp = LlamaHParams.from_gguf_props(dataset.properties)
+    model = PagedLlamaModelV1(dataset.root_theta, hp)
+
+    # Unrolling cache updates by batch row makes dynamo sad without an
+    # override. There may be a better way to do this.
+    import torch._dynamo.config as dynamo_config
+
+    dynamo_config.max_loop_unroll_nodes = 0
+
+    fxb = FxProgramsBuilder(model)
+
+    def generate_batch_prefill(bs: int):
+        tokens = torch.empty(bs, 64, dtype=torch.int64)
+        seq_lens = torch.empty(bs, dtype=torch.int64)
+        seq_block_ids = torch.empty(bs, 4, dtype=torch.int64)
+        cache_state = model.cache.allocate(128, torch.float32)
+        block_dim = torch.export.Dim("block", max=2047 // 16)
+        sl_dim = 16 * block_dim
+        page_dim = torch.export.Dim("page")
+        dynamic_shapes = {
+            "tokens": {1: sl_dim},
+            "seq_lens": {},
+            "seq_block_ids": {1: block_dim},
+            "cache_state": [{0: page_dim}],
+        }
+
+        @fxb.export_program(
+            name=f"prefill_bs{bs}",
+            args=(tokens, seq_lens, seq_block_ids, cache_state),
+            dynamic_shapes=dynamic_shapes,
+        )
+        def _(model, tokens, seq_lens, seq_block_ids, cache_state):
+            sl = tokens.shape[1]
+            input_mask = model.input_mask(seq_lens, sl)
+            attention_mask = model.attention_mask(input_mask, dtype=torch.float32)
+            logits = model.prefill(
+                tokens,
+                attention_mask=attention_mask,
+                seq_block_ids=seq_block_ids,
+                cache_state=cache_state,
+            )
+            return logits
+
+    def generate_batch_decode(bs: int):
+        tokens = torch.ones(bs, 1, dtype=torch.int64)
+        seq_lens = torch.ones(bs, dtype=torch.int64)
+        start_positions = torch.ones(bs, dtype=torch.int64)
+        seq_block_ids = torch.zeros(bs, 4, dtype=torch.int64)
+        cache_state = model.cache.allocate(128, torch.float32)
+        block_dim = torch.export.Dim("block", max=2047 // 16)
+        page_dim = torch.export.Dim("page")
+        dynamic_shapes = {
+            "tokens": {},
+            "seq_lens": {},
+            "start_positions": {},
+            "seq_block_ids": {1: block_dim},
+            "cache_state": [{0: page_dim}],
+        }
+
+        @fxb.export_program(
+            name=f"decode_bs{bs}",
+            args=(
+                tokens,
+                seq_lens,
+                start_positions,
+                seq_block_ids,
+                cache_state,
+            ),
+            dynamic_shapes=dynamic_shapes,
+        )
+        def _(
+            model,
+            tokens,
+            seq_lens,
+            start_positions,
+            seq_block_ids,
+            cache_state,
+        ):
+            input_mask = model.input_mask(
+                seq_lens, seq_block_ids.shape[1] * model.cache.block_seq_stride
+            )
+            attention_mask = model.decode_attention_mask(
+                input_mask, dtype=torch.float32
+            )
+            logits = model.decode(
+                tokens,
+                attention_mask=attention_mask,
+                start_positions=start_positions,
+                seq_block_ids=seq_block_ids,
+                read_cache_state=cache_state,
+                write_cache_state=cache_state,
+            )
+            return logits

+    generate_batch_prefill(16)
+    generate_batch_decode(16)
+    print("GENERATED!")
+
+    for name, ep in fxb.programs.items():
+        print(f"EXPORT {name}:\n{ep}")
+
+
+if __name__ == "__main__":
+    main(sys.argv[1:])
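
The `sl_dim = 16 * block_dim` line above is the load-bearing trick: the token sequence length is declared as a dimension derived from the number of block ids (16 being the block sequence stride), so the exporter can relate the two inputs. A hedged, standalone illustration of that derived-dim mechanism (toy module, not the real model):

import torch

block_dim = torch.export.Dim("block", max=2047 // 16)
sl_dim = 16 * block_dim  # derived dim: seq_len is always 16 * num_blocks

class Toy(torch.nn.Module):
    def forward(self, tokens, seq_block_ids):
        return tokens.sum() + seq_block_ids.sum()

ep = torch.export.export(
    Toy(),
    # 64 tokens and 4 block ids satisfy the 16x relationship declared above.
    (torch.zeros(1, 64, dtype=torch.int64), torch.zeros(1, 4, dtype=torch.int64)),
    dynamic_shapes={"tokens": {1: sl_dim}, "seq_block_ids": {1: block_dim}},
)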

llm/turbine_llm/layers/core.py

Lines changed: 4 additions & 3 deletions
@@ -34,9 +34,10 @@ def assert_not_nan(self, *ts: torch.Tensor):
         Must be enabled via a global switch as this kind of checking is not
         accelerator or compilation friendly.
         """
-        for t in ts:
-            if torch.isnan(t).any():
-                raise AssertionError(f"Tensor contains nans! {t}")
+        if debugging.flags.enable_nan_checks:
+            for t in ts:
+                if torch.isnan(t).any():
+                    raise AssertionError(f"Tensor contains nans! {t}")
 
 
 class ThetaLayer(BaseLayer):

llm/turbine_llm/layers/kv_cache.py

Lines changed: 10 additions & 4 deletions
@@ -181,13 +181,19 @@ def write_timestep(
         assert len(cache_partitions) == self.cache_partition_count
         for i in range(bs):
             position = seq_positions[i]
-            page_id = page_ids[i, position // self.block_seq_stride]
+            # TODO: Let's clamp to the allowable range so that we don't need
+            # an assert.
+            page_id = page_ids[i, :].index_select(0, position // self.block_seq_stride)
             page_offset = position % self.block_seq_stride
             for partition_index in range(self.cache_partition_count):
                 cache_partition = cache_partitions[partition_index]
-                page_table[
-                    page_id, transformer_block_index, partition_index, page_offset
-                ] = cache_partition[i, 0]
+                indices = (
+                    page_id,
+                    torch.tensor([transformer_block_index]),
+                    torch.tensor([partition_index]),
+                    page_offset.unsqueeze(0),
+                )
+                page_table.index_put_(indices=indices, values=cache_partition[i, 0])
 
     def write(
         self,
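
The change above swaps Python-scalar indexing (`page_ids[i, position // stride]`, which forces a data-dependent scalar out of the tensor during tracing) for `index_select` and `index_put_`, which keep everything as tensor ops with static ranks. A hedged standalone sketch of the same write pattern, with made-up shapes (not the real KV-cache API):

import torch

# Toy page table: [pages, transformer_blocks, cache_partitions, block_seq_stride, head_dim]
page_table = torch.zeros(8, 2, 2, 16, 4)
block_seq_stride = 16

position = torch.tensor(21)                  # 0-d position tensor (runtime data)
page_ids = torch.tensor([3, 5])              # pages assigned to one batch row
value = torch.ones(4)                        # the new K or V vector to store

# Select the page holding this position with tensor ops only (no Python int).
block_index = (position // block_seq_stride).unsqueeze(0)  # 1-D index tensor
page_id = page_ids.index_select(0, block_index)            # shape [1]
page_offset = position % block_seq_stride

indices = (
    page_id,                                 # which page
    torch.tensor([0]),                       # transformer block index
    torch.tensor([1]),                       # cache partition (e.g. V)
    page_offset.unsqueeze(0),                # offset within the page
)
page_table.index_put_(indices=indices, values=value)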

llm/turbine_llm/models/llama.py

Lines changed: 18 additions & 36 deletions
@@ -95,7 +95,6 @@ def __init__(self, theta: Theta, hp: LlamaHParams):
                     theta("blk", n),
                     block_index=n,
                     cache=self.cache,
-                    embedding=self.attention_embedding,
                     head_count=hp.attention_head_count,
                     head_dim=attn_head_dim,
                     head_count_kv=hp.attention_head_count_kv,
@@ -125,6 +124,7 @@ def prefill(
             self.trace_tensor(f"llama.attn_block.{block_idx}.input", h)
             h = block(
                 h,
+                embedding=self.attention_embedding,
                 start_index=0,
                 attention_mask=attention_mask,
                 write_cache_state=cache_state,
@@ -189,6 +189,7 @@ def decode(
             h = block(
                 h,
                 start_positions=start_positions,
+                embedding=self.attention_embedding,
                 embedding_batch_mask=embedding_batch_mask,
                 attention_mask=attention_mask,
                 read_cache_state=read_cache_state,
@@ -222,7 +223,6 @@
         head_count: int,
         head_dim: int,
         head_count_kv: int,
-        embedding: RotaryEmbeddingLayer,
         rms_epsilon: float,
     ):
         super().__init__(theta)
@@ -242,7 +242,6 @@
 
         self.block_index = block_index
         self.cache = cache
-        self.embedding = embedding
         self.head_count = head_count
         self.head_dim = head_dim
         self.head_count_kv = head_count_kv
@@ -251,6 +250,7 @@ def forward(
         self,
         h: torch.Tensor,
         *,
+        embedding: RotaryEmbeddingLayer,
         # [bs, batch_seq_len // block_seq_stride]
         seq_block_ids: torch.Tensor,
         start_index: Optional[int] = None,
@@ -280,9 +280,9 @@ def forward(
         # Fast path to start_index based embedding lookup if available.
         # Falls back to a slower position based index lookup.
         if start_index is not None:
-            xq, xk = self.embedding.forward(xq=xq, xk=xk, start_index=start_index)
+            xq, xk = embedding.forward(xq=xq, xk=xk, start_index=start_index)
         else:
-            xq, xk = self.embedding.apply_batched_mask(
+            xq, xk = embedding.apply_batched_mask(
                 xq=xq, xk=xk, mask=embedding_batch_mask
             )
 
@@ -321,6 +321,19 @@ def forward(
 
         kv_seq_len = seq_block_ids.shape[1] * self.cache.block_seq_stride
 
+        if write_cache_state:
+            # Write our one updated cache row into the cache.
+            self.cache.write_timestep(
+                write_cache_state,
+                cache_partitions=[
+                    xk_cache_update,
+                    xv_cache_update,
+                ],
+                transformer_block_index=self.block_index,
+                seq_positions=start_positions + 1,
+                page_ids=seq_block_ids,
+            )
+
         # Restore from the cache.
         self.cache.read(
             read_cache_state,
@@ -332,18 +345,6 @@ def forward(
             page_ids=seq_block_ids,
         )
 
-        # self.trace_tensor("DECODE.KV.ACTUAL", xk_temp)
-        # Now restore the newly computed position into the xk/xv view we
-        # are operating on. This will also be done later when updating the
-        # cache, but we separate it here to avoid creating a data
-        # dependency. Since the batch size is static, we do a static loop
-        # in order to simplify indexing and keeps us from needing to
-        # deal with masking.
-        for i in range(bs):
-            row_start_pos = start_positions[i]
-            xk_temp[i, row_start_pos : row_start_pos + 1, :, :] = xk[i, ...]
-            xv_temp[i, row_start_pos : row_start_pos + 1, :, :] = xv[i, ...]
-
         # For computation, we create a subview of the xk/xv tensors to have
         # a sequence length covering the blocked size. This must include
         # the newly added row (the caller is responsible for ensuring that
@@ -352,25 +353,6 @@ def forward(
         xk = xk_temp[:, 0:kv_seq_len, ...]
         xv = xv_temp[:, 0:kv_seq_len, ...]
 
-        if write_cache_state:
-            # Write our one updated cache row. We currently do this apart
-            # from the linearization step because it lets us have aliased
-            # cache states. We may need to revisit this if we can support
-            # a cache write-read in the same sequence.
-            # In that case, this would go prior to the read.
-            # self.trace_tensor("decode.xk_cache_update", xk_cache_update)
-            # self.trace_tensor("decode.xv_cache_update", xv_cache_update)
-            self.cache.write_timestep(
-                write_cache_state,
-                cache_partitions=[
-                    xk_cache_update,
-                    xv_cache_update,
-                ],
-                transformer_block_index=self.block_index,
-                seq_positions=start_positions + 1,
-                page_ids=seq_block_ids,
-            )
-
         # Tranpose into [bs, heads, sl, dim]
         xq = xq.transpose(1, 2)
         keys = xk.transpose(1, 2)
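
Two things happen in this file: the rotary embedding layer is now passed into each attention block's forward() instead of being stored on the block, and the decode path writes the new timestep into the paged cache before reading the whole blocked K/V window back, which removes the per-row data-dependent slice loop deleted above. A hedged toy of that write-then-read ordering (plain tensors standing in for the paged cache, not the real KV-cache API):

import torch

bs, stride, head_dim = 2, 16, 4
cache = torch.zeros(bs, stride, head_dim)        # stand-in for one cache page per row
start_positions = torch.tensor([3, 7])           # current decode position per batch row
new_kv_row = torch.ones(bs, 1, head_dim)         # the freshly computed K (or V) row

# 1) Write the single new timestep using tensor indexing only (no Python-int
#    slices whose extent depends on runtime data).
batch_idx = torch.arange(bs)
cache[batch_idx, start_positions] = new_kv_row[:, 0]

# 2) Read the full, statically shaped window back for attention; the graph
#    never needs a view whose bounds come from start_positions.
keys = cache[:, 0:stride, :]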

llm/turbine_llm/utils/debugging.py

Lines changed: 3 additions & 0 deletions
@@ -26,6 +26,7 @@
 @dataclass
 class DebugFlags:
     enable_tensor_trace: bool = False
+    enable_nan_checks: bool = False
 
     def set(self, part: str):
         m = re.match(SETTING_PART_PATTERN, part)
@@ -38,6 +39,8 @@ def set(self, part: str):
 
         if name == "tensor_trace":
             self.enable_tensor_trace = logical_sense
+        elif name == "enable_nan_checks":
+            self.enable_nan_checks = logical_sense
 
         else:
             logger.warn("Unrecognized %s flag: '%s'", FLAGS_ENV_NAME, name)
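
Usage sketch for the new flag, assuming the package layout implied by the file paths above (the import path is an assumption, not something this commit states): once the flag is set, BaseLayer.assert_not_nan() in layers/core.py actually runs its torch.isnan check.

# Hedged usage sketch; the import path is assumed from the repository layout.
from turbine_llm.utils import debugging

debugging.flags.enable_nan_checks = True  # turn on the NaN assertions
# ... build and run the model; assert_not_nan() now raises on any NaN tensor.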
