
Commit 0c1ae4b

fix attention mask; chunked prefill working now
Signed-off-by: Alex Chi <iskyzh@gmail.com>
1 parent 7139856 · commit 0c1ae4b

6 files changed: +33 −9 lines changed

README.md

Lines changed: 1 addition & 1 deletion

@@ -39,7 +39,7 @@ You may join skyzh's Discord server and study with the tiny-llm community.
 | 2.4 | Flash Attention 2 - CPU | ✅ | 🚧 | 🚧 |
 | 2.5 | Flash Attention 2 - GPU | ✅ | 🚧 | 🚧 |
 | 2.6 | Continuous Batching | ✅ | 🚧 | 🚧 |
-| 2.7 | Chunked Prefill | 🚧 | 🚧 | 🚧 |
+| 2.7 | Chunked Prefill | ✅ | 🚧 | 🚧 |
 | 3.1 | Paged Attention - Part 1 | 🚧 | 🚧 | 🚧 |
 | 3.2 | Paged Attention - Part 2 | 🚧 | 🚧 | 🚧 |
 | 3.3 | MoE (Mixture of Experts) | 🚧 | 🚧 | 🚧 |

batch-main.py

Lines changed: 2 additions & 0 deletions

@@ -20,6 +20,8 @@
 Shanghai[a] is a direct-administered municipality and the most populous urban area in China. The city is located on the Chinese shoreline on the southern estuary of the Yangtze River, with the Huangpu River flowing through it. The population of the city proper is the second largest in the world after Chongqing, with around 24.87 million inhabitants in 2023, while the urban area is the most populous in China, with 29.87 million residents. As of 2022, the Greater Shanghai metropolitan area was estimated to produce a gross metropolitan product (nominal) of nearly 13 trillion RMB ($1.9 trillion).[13] Shanghai is one of the world's major centers for finance, business and economics, research, science and technology, manufacturing, transportation, tourism, and culture. The Port of Shanghai is the world's busiest container port.
 """
 
+shanghai_wikipedia += "Based on the previous information, "
+
 prompts = [
     shanghai_wikipedia + "Where is Shanghai?",
     shanghai_wikipedia + "How much is the population of Shanghai?",

book/src/week2-overview.md

Lines changed: 1 addition & 0 deletions

@@ -20,3 +20,4 @@ https://github.com/ml-explore/mlx/blob/main/mlx/backend/metal/kernels/sdpa_vector
 
 attention mask why
 https://www.shashankshekhar.com/blog/apple-metal-vs-nvidia-cuda
+https://arxiv.org/pdf/2308.16369

src/tiny_llm_ref/attention.py

Lines changed: 2 additions & 2 deletions

@@ -40,8 +40,8 @@ def scaled_dot_product_attention_grouped(
     scores = mx.matmul(query, key.swapaxes(-2, -1)) * factor
     if mask is not None:
         if mask == "causal":
-            mask = 1 - mx.tril(mx.ones((L, S)))
-            mask = mx.where(mask, mx.array(-mx.inf), mx.array(0))
+            mask = mx.tril(mx.ones((L, S)), k=S - L)
+            mask = mx.where(mask, mx.array(0), mx.array(-mx.inf))
             scores = scores + mask
         else:
             mask = mask.reshape(-1, H, n_repeats, mask.shape[-2], mask.shape[-1])
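
Why this fixes chunked prefill: the old construction `1 - mx.tril(mx.ones((L, S)))` anchors the causal diagonal at the top-left corner, which is only correct when the L queries start at key position 0, i.e. when L == S. During chunked prefill, a chunk of L new queries attends to S ≥ L keys, and the first S − L keys belong to the already-cached prefix and must all be visible, so the allowed region's diagonal shifts right by `k = S - L`. A standalone sketch of the corrected construction (only `mlx.core` is assumed; the helper name is illustrative, not part of the repo):

import mlx.core as mx

def causal_mask(L: int, S: int) -> mx.array:
    # Query i in the chunk sits at absolute position (S - L) + i, so it may
    # attend to key positions 0 through (S - L) + i inclusive.
    allowed = mx.tril(mx.ones((L, S)), k=S - L)
    return mx.where(allowed, mx.array(0.0), mx.array(-mx.inf))

# L == S gives the usual square causal mask; L < S (a later prefill chunk)
# additionally exposes all previously cached keys:
print(causal_mask(2, 4))
# [[0, 0, 0, -inf],
#  [0, 0, 0, 0]]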

src/tiny_llm_ref/generate.py

Lines changed: 23 additions & 5 deletions

@@ -47,6 +47,16 @@ def _step(model, y, offset, kv_cache):
     # prefill with the prompt
     tokens = mx.array(tokenizer.encode(prompt, add_special_tokens=False))
     offset = 0
+    prefill_max = 64
+    total_tokens = tokens.size
+    while tokens.size > prefill_max:
+        token, _ = _step(model, tokens[:prefill_max], offset, kv_cache)
+        for i in kv_cache:
+            mx.eval(i.key_values[0])
+            mx.eval(i.key_values[1])
+        offset += prefill_max
+        tokens = tokens[prefill_max:]
+        print(f"Prefill progress: {offset}/{total_tokens}", flush=True)
     detokenizer = tokenizer.detokenizer
     detokenizer.reset()
     # generate/decode
@@ -72,7 +82,7 @@ def _step(model, y, offsets, kv_cache):
 
 class PrefillRequest:
     def __init__(
-        self, model: any, tokenizer: TokenizerWrapper, prompt: str, max_step: int = 64
+        self, model: any, tokenizer: TokenizerWrapper, prompt: str, max_step: int = 16
     ):
         self.prompt = prompt
         self.kv_cache = [TinyKvFullCache() for _ in range(model.num_hidden_layers)]
@@ -92,16 +102,19 @@ def prefill(self):
             [self.offset],
             self.kv_cache,
         )
-        mx.eval(token)
         self.offset += tokens_to_prefill
+        for i in self.kv_cache:
+            mx.eval(i.key_values[0])
+            mx.eval(i.key_values[1])
         if self.offset == self.prefill_tokens.size:
+            mx.eval(token)
             return token, self.kv_cache, self.offset
         else:
             return None
 
 
 def batch_generate(
-    model: any, tokenizer: TokenizerWrapper, prompts: list[str], max_seq_len=64
+    model: any, tokenizer: TokenizerWrapper, prompts: list[str], max_seq_len=512
 ):
     MAX_REQUESTS = 5
     is_idle = [True] * MAX_REQUESTS
@@ -110,7 +123,7 @@ def batch_generate(
     offsets = mx.array([0] * MAX_REQUESTS)
     detokenizers = [None] * MAX_REQUESTS
     kv_cache = [
-        BatchingKvCache(max_active_requests=MAX_REQUESTS, max_seq_len=64)
+        BatchingKvCache(max_active_requests=MAX_REQUESTS, max_seq_len=max_seq_len)
         for _ in range(model.num_hidden_layers)
     ]
     result = []
@@ -166,7 +179,10 @@ def batch_generate(
                 offsets[i] = offset
                 break
             else:
-                print("Still prefilling the request", flush=True)
+                print(
+                    f"Still prefilling the request: {pending_prefill_requests.offset}/{pending_prefill_requests.prefill_tokens.size}",
+                    flush=True,
+                )
 
         if not all(is_idle):
             next_tokens = mx.array(next_tokens)
@@ -194,4 +210,6 @@ def batch_generate(
                     f"(In Progress) {prompt_idx[i]}: " + detokenizers[i].text,
                     flush=True,
                 )
+        else:
+            print("No requests in progress", flush=True)
     return result
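
Two things are going on in this file. First, the single-request path now prefills the prompt in fixed `prefill_max`-sized chunks, advancing `offset` so the RoPE positions and KV cache stay consistent, and it forces evaluation of every layer's cached keys and values after each chunk: MLX builds computation graphs lazily, so without the `mx.eval` calls the unevaluated graph would span the entire prompt. For the same reason, `mx.eval(token)` in `PrefillRequest.prefill` moves inside the completion branch, since the tokens predicted after intermediate chunks are discarded and only the final chunk's prediction needs materializing. A minimal standalone sketch of the loop, assuming a `model_step(chunk, offset, kv_cache)` callable that appends to the per-layer caches and returns the predicted next token (these names are illustrative, not the repo's exact API):

import mlx.core as mx

def chunked_prefill(model_step, tokens: mx.array, kv_cache, chunk_size: int = 64):
    # Feed the prompt through the model chunk by chunk; return the token
    # predicted after the final chunk, which seeds decoding.
    offset = 0
    token = None
    while offset < tokens.size:
        chunk = tokens[offset : offset + chunk_size]
        token = model_step(chunk, offset, kv_cache)
        # MLX evaluates lazily: materialize the new KV entries now so the
        # unevaluated graph does not accumulate across the whole prompt.
        for layer in kv_cache:
            mx.eval(layer.key_values[0], layer.key_values[1])
        offset += chunk.size
        print(f"Prefill progress: {offset}/{tokens.size}", flush=True)
    mx.eval(token)  # only the last chunk's prediction is actually used
    return token

Second, in the batching path `max_step` drops from 64 to 16, so a long prefill yields to the decode steps of other in-flight requests more often, trading some prefill throughput for lower inter-token latency; this is the scheduling argument of the chunked-prefill paper (arXiv 2308.16369) linked from the book notes above.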

src/tiny_llm_ref/qwen2_week2.py

Lines changed: 4 additions & 1 deletion

@@ -62,7 +62,10 @@ def __call__(
         projection_v = quantized_linear(x, self.wv, bias=self.bv).reshape(
             B, L, self.num_kv_heads, self.head_dim
         )
-        offset_slice = [slice(int(i), int(i + L)) for i in offsets]
+        if isinstance(offsets, int):
+            offset_slice = [slice(int(offsets), int(offsets + L))]
+        else:
+            offset_slice = [slice(int(i), int(i + L)) for i in offsets]
         projection_q = self.rope(projection_q, offset=offset_slice)
         projection_k = self.rope(projection_k, offset=offset_slice)
         projection_q = projection_q.transpose(0, 2, 1, 3)
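
The RoPE offset handling now accepts both a plain `int` (the single-request chunked prefill path passes one offset) and a per-request list (the batching path), normalizing both into one position slice per sequence. A sketch of just that normalization, with an illustrative helper name not taken from the repo:

def to_offset_slices(offsets: int | list[int], L: int) -> list[slice]:
    # Each sequence's L new tokens occupy absolute positions
    # [offset, offset + L) for rotary position embeddings.
    if isinstance(offsets, int):
        offsets = [offsets]
    return [slice(int(i), int(i + L)) for i in offsets]

assert to_offset_slices(64, 16) == [slice(64, 80)]
assert to_offset_slices([0, 64], 4) == [slice(0, 4), slice(64, 68)]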
