Commit b342f83

Add last_token_pos in llama_transformer (#11793)
Differential Revision: D76440105
Pull Request resolved: #12239
1 parent: 1decf7a

2 files changed (+3, -1 lines)

examples/models/llama/attention.py

Lines changed: 1 addition & 0 deletions
```diff
@@ -19,6 +19,7 @@ class ForwardOptions(TypedDict, total=False):
     freqs_sin_override: Optional[torch.Tensor]
     in_cache_state: Optional[Any]
     out_cache_state: Optional[Any]
+    last_valid_token_pos: Optional[torch.LongTensor]


 class Attention(nn.Module, ABC):
```
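
For context, ForwardOptions is declared with total=False, so every key, including the new last_valid_token_pos, is optional for callers. A minimal sketch of the resulting type, reproducing only the fields visible in the diff context above (the real class may declare more):

```python
from typing import Any, Optional, TypedDict

import torch

class ForwardOptions(TypedDict, total=False):
    # Only the fields visible in this diff are reproduced here.
    freqs_sin_override: Optional[torch.Tensor]
    in_cache_state: Optional[Any]
    out_cache_state: Optional[Any]
    last_valid_token_pos: Optional[torch.LongTensor]  # added by this commit

# total=False makes every key optional, so callers pass only what they need:
opts: ForwardOptions = {"last_valid_token_pos": torch.tensor(6)}
```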

examples/models/llama/llama_transformer.py

Lines changed: 2 additions & 1 deletion
```diff
@@ -204,7 +204,8 @@ def forward(

         if not self.generate_full_logits:
             # Only the last logit is used for the new generated token
-            h = h[:, -1, :]
+            pos = attn_options.get("last_valid_token_pos", -1)
+            h = h[:, pos, :]

         h = self.norm(h)

```
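
The practical effect: when generate_full_logits is disabled, the model previously always fed the hidden state at position -1 to the output head, which reads a padding position whenever the sequence is padded past the last real token. The new option lets the caller point at the last valid token instead, with -1 preserved as the default. Below is a minimal sketch of how a caller might use it; the model variable and the forward call signature are assumptions not shown in this diff, only the last_valid_token_pos key comes from this commit:

```python
import torch

# Suppose a prompt of 7 real tokens is right-padded to sequence length 10.
# With the old code, h[:, -1, :] would take the hidden state of a pad token;
# passing the index of the last real token (6) selects the intended logits.
tokens = torch.zeros(1, 10, dtype=torch.long)  # hypothetical padded input ids
options = {"last_valid_token_pos": torch.tensor(6)}  # key added by this commit

# Hypothetical call; the actual forward signature is not shown in this diff:
# logits = model.forward(tokens, attn_options=options)
```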