@@ -167,161 +167,6 @@ def __init__(self,
     def lambda_init_fn(self, depth):
         return 0.8 - 0.6 * math.exp(-0.3 * depth)
 
-
-    def split_heads(self, x):
-        # split by num_heads, the stripe pattern is friendly to tensor parallel.
-        x = rearrange(x, "... (H two) D -> ... H two D", two=2)
-        x1 = x[..., 0, :]
-        x2 = x[..., 1, :]
-        return x1.contiguous(), x2.contiguous()
-
-    def split_kv_cache(self, x):
-        # split by num_heads, the stripe pattern is friendly to tensor parallel.
-        if x.numel() == 0:
-            return torch.empty(0), torch.empty(0)
-
-        x1, x2 = x[0], x[1]
-        return x1, x2
-
-    def forward_decode(
-        self,
-        query: torch.Tensor,
-        k_cache: torch.Tensor,
-        v_cache: torch.Tensor,
-        attn_metadata: AttentionMetadata,
-    ):
-        if not attn_metadata.decode_metadata:
-            block_tables_arg = attn_metadata.cross_layer_shared_block_tables
-        else:
-            block_tables_arg = attn_metadata.block_tables
-
-        output = flash_attn_with_kvcache(
-            q=query.unsqueeze(1),
-            k_cache=k_cache,
-            v_cache=v_cache,
-            block_table=block_tables_arg,
-            cache_seqlens=attn_metadata.seq_lens_tensor,
-            softmax_scale=self.attn.impl.scale,
-            causal=True,
-            window_size=self.attn.impl.sliding_window,
-            alibi_slopes=self.attn.impl.alibi_slopes,
-            softcap=self.attn.impl.logits_soft_cap,
-        ).squeeze(1)
-        return output
-
-    def populate_kv_cache(self,
-                          key,
-                          value,
-                          kv_cache,
-                          attn_metadata):
-        if (kv_cache.numel() > 0):
-            if (key is not None) and (value is not None):
-                updated_slot_mapping = attn_metadata.slot_mapping
-                # previous_key_cache_sum = key_cache.sum()
-                # previous_value_cache_sum = value_cache.sum()
-
-                torch.ops._C_cache_ops.reshape_and_cache_flash(
-                    key,
-                    value,
-                    kv_cache[0],
-                    kv_cache[1],
-                    updated_slot_mapping.flatten(),
-                    self.attn.impl.kv_cache_dtype,
-                    self._k_scale,
-                    self._v_scale,
-                )
-                # assert key_cache.sum() - previous_key_cache_sum == key.sum(), "key_cache sum mismatch"
-                # assert value_cache.sum() - previous_value_cache_sum == value.sum(), "value_cache sum mismatch"
-                # if key_cache.sum() - previous_key_cache_sum != key.sum():
-                #     print("key_cache sum mismatch")
-                # if value_cache.sum() - previous_value_cache_sum != value.sum():
-                #     print("value_cache sum mismatch")
-
-    def forward_customized(
-        self,
-        query: torch.Tensor,
-        key: Optional[torch.Tensor],
-        value: Optional[torch.Tensor],
-        k_cache: torch.Tensor,
-        v_cache: torch.Tensor,
-        attn_metadata: AttentionMetadata
-    ) -> torch.Tensor:
-
-        head_size = self.head_dim
-        num_heads = self.num_heads // 2
-        num_kv_heads = self.num_key_value_heads // 2
-
-        query = query.view(-1, num_heads, head_size)
-        if key is not None:
-            assert value is not None
-            key = key.view(-1, num_kv_heads, head_size)
-            value = value.view(-1, num_kv_heads, head_size)
-        else:
-            assert value is None
-
-        num_prefill_tokens = attn_metadata.num_prefill_tokens
-        num_decode_tokens = attn_metadata.num_decode_tokens
-        assert key.shape[0] == num_prefill_tokens + num_decode_tokens, "key shape mismatch"
-        assert value.shape[0] == num_prefill_tokens + num_decode_tokens, "value shape mismatch"
-
-        output = torch.empty_like(query)
-        # Query for decode. KV is not needed because it is already cached.
-        decode_query = query[num_prefill_tokens:]
-        # QKV for prefill.
-        query = query[:num_prefill_tokens]
-        if key is not None and value is not None:
-            key = key[:num_prefill_tokens]
-            value = value[:num_prefill_tokens]
-
-        assert query.shape[0] == num_prefill_tokens, "query shape mismatch"
-        assert decode_query.shape[0] == num_decode_tokens, "decode query shape mismatch"
-
-        if prefill_meta := attn_metadata.prefill_metadata:
-            # Prompt run.
-            if k_cache.numel() == 0 or prefill_meta.block_tables.numel() == 0:
-                # normal attention
-                prefill_output = flash_attn_varlen_func(
-                    q=query,
-                    k=key,
-                    v=value,
-                    cu_seqlens_q=prefill_meta.seq_start_loc,
-                    cu_seqlens_k=prefill_meta.seq_start_loc,
-                    max_seqlen_q=prefill_meta.max_prefill_seq_len,
-                    max_seqlen_k=prefill_meta.max_prefill_seq_len,
-                    softmax_scale=self.attn.impl.scale,
-                    causal=True,
-                    window_size=self.attn.impl.sliding_window,
-                    alibi_slopes=self.attn.impl.alibi_slopes,
-                    softcap=self.attn.impl.logits_soft_cap,
-                )
-                assert prefill_output.shape == output[:num_prefill_tokens].shape
-                output[:num_prefill_tokens] = prefill_output
-            else:
-                raise Exception("prefix caching not supported")
-
-        if decode_meta := attn_metadata.decode_metadata:
-            block_tables_arg = decode_meta.block_tables
-            try:
-                output[num_prefill_tokens:] = flash_attn_with_kvcache(
-                    q=decode_query.unsqueeze(1),
-                    k_cache=k_cache,
-                    v_cache=v_cache,
-                    block_table=block_tables_arg,
-                    cache_seqlens=decode_meta.seq_lens_tensor,
-                    softmax_scale=self.attn.impl.scale,
-                    causal=True,
-                    window_size=self.attn.impl.sliding_window,
-                    alibi_slopes=self.attn.impl.alibi_slopes,
-                    softcap=self.attn.impl.logits_soft_cap,
-                ).squeeze(1)
-            except Exception as e:
-                logger.error(
-                    f"Error in PagedAttention.forward_decode: {str(e)}")
-                raise e
-
-        # Reshape the output tensor.
-        return output.view(-1, num_heads, head_size)
-
     def forward(
         self,
         hidden_states: torch.Tensor,
@@ -333,86 +178,9 @@ def forward(
         if not self.yoco_cross:  # need to generate kv-cache
             qkv = self.Wqkv(hidden_states)
             q, k, v = qkv.split([self.hidden_size, self.num_key_value_heads * self.head_dim, self.num_key_value_heads * self.head_dim], dim=-1)
-            reference_attn_output = self.attn(q, k, v)
-            # # q, k = self.rotary_emb(positions, q, k)
-            # # reshape
-            # q = q.view(-1, self.num_heads, self.head_dim)
-            # k = k.view(-1, self.num_key_value_heads, self.head_dim)
-            # v = v.view(-1, self.num_key_value_heads, self.head_dim)
-
-            # q1, q2 = self.split_heads(q)
-            # k1, k2 = self.split_heads(k)
-            # v1, v2 = self.split_heads(v)
-
-            # # kv_cache shape is (2, 2, num_blocks, block_size * num_kv_heads // 2 * head_size)
-            # # Split by half along the first dimension.
-            # kv_cache1, kv_cache2 = self.split_kv_cache(kv_cache)
-            # assert kv_cache1.is_contiguous(), "kv_cache1 is not contiguous"
-            # assert kv_cache2.is_contiguous(), "kv_cache2 is not contiguous"
-
-            # if kv_cache1.numel() != 0:
-            #     self.populate_kv_cache(k1, v1, kv_cache1, attn_metadata)
-            #     self.populate_kv_cache(k2, v2, kv_cache2, attn_metadata)
-
-            #     key_cache1, value_cache1 = self.split_kv_cache(kv_cache1)
-            #     key_cache2, value_cache2 = self.split_kv_cache(kv_cache2)
-            # else:
-            #     key_cache1, value_cache1 = torch.empty(0), torch.empty(0)
-            #     key_cache2, value_cache2 = torch.empty(0), torch.empty(0)
-            # attn11 = self.forward_customized(q1, k1, v1, key_cache1, value_cache1, attn_metadata)
-            # attn12 = self.forward_customized(q1, k1, v2, key_cache1, value_cache2, attn_metadata)
-            # attn11 = attn11.view(q1.shape)
-            # attn12 = attn12.view(q1.shape)
-            # attn1 = torch.cat([attn11, attn12], dim=-1)
-
-            # attn21 = self.forward_customized(q2, k2, v1, key_cache2, value_cache1, attn_metadata)
-            # attn22 = self.forward_customized(q2, k2, v2, key_cache2, value_cache2, attn_metadata)
-            # attn21 = attn21.view(q2.shape)
-            # attn22 = attn22.view(q2.shape)
-            # attn2 = torch.cat([attn21, attn22], dim=-1)
-
-            # lambda_1 = torch.exp(torch.sum(self.lambda_q1 * self.lambda_k1, dim=-1).float()).type_as(q)
-            # lambda_2 = torch.exp(torch.sum(self.lambda_q2 * self.lambda_k2, dim=-1).float()).type_as(q)
-            # lambda_full = lambda_1 - lambda_2 + self.lambda_init
-
-            # attn = attn1 - lambda_full * attn2
-            # # attn shape (-1, self.num_heads // 2, 2 * self.head_dim)
-            # attn = self.subln(attn)
-            # attn = attn * (1 - self.lambda_init)
-            # # reshape back to 2 * num_head
-            # attn_output = rearrange(attn, "... H (two D) -> ... (H two) D", two=2)
             attn_output = self.attn(q, k, v)
         else:  # re-use the kv cache, full attention
             q = self.Wqkv(hidden_states)
-            # q = q.view(-1, self.num_heads, self.head_dim)
-            # q1, q2 = self.split_heads(q)
-            # # kv_cache shape is (2, num_blocks, block_size * num_kv_heads * head_size)
-            # kv_cache1, kv_cache2 = self.split_kv_cache(kv_cache)
-            # key_cache1, value_cache1 = kv_cache1[0], kv_cache1[1]
-            # key_cache2, value_cache2 = kv_cache2[0], kv_cache2[1]
-
-            # attn11 = self.forward_decode(q1, key_cache1, value_cache1, attn_metadata)
-            # attn12 = self.forward_decode(q1, key_cache1, value_cache2, attn_metadata)
-            # attn11 = attn11.view(q1.shape)
-            # attn12 = attn12.view(q1.shape)
-            # attn1 = torch.cat([attn11, attn12], dim=-1)
-
-            # attn21 = self.forward_decode(q2, key_cache2, value_cache1, attn_metadata)
-            # attn22 = self.forward_decode(q2, key_cache2, value_cache2, attn_metadata)
-            # attn21 = attn21.view(q2.shape)
-            # attn22 = attn22.view(q2.shape)
-            # attn2 = torch.cat([attn21, attn22], dim=-1)
-
-            # lambda_1 = torch.exp(torch.sum(self.lambda_q1 * self.lambda_k1, dim=-1).float()).type_as(q)
-            # lambda_2 = torch.exp(torch.sum(self.lambda_q2 * self.lambda_k2, dim=-1).float()).type_as(q)
-            # lambda_full = lambda_1 - lambda_2 + self.lambda_init
-            # attn = attn1 - lambda_full * attn2
-            # attn = self.subln(attn)
-            # attn = attn * (1 - self.lambda_init)
-            # # reshape back to 2 * num_head
-            # attn_output = rearrange(attn, "... H (two D) -> ... (H two) D", two=2)
-
-
             if self.attn.kv_cache[0].numel() == 0:
                 self.attn.kv_cache = [kv_cache]
             attn_output = self.attn(q, None, None)
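
For reference, below is a minimal, self-contained sketch of the differential-attention recombination that the removed commented-out code describes: compute lambda_full from the lambda_q/lambda_k parameters, take attn1 - lambda_full * attn2, normalize with subln, and scale by (1 - lambda_init). The standalone function names, the toy shapes, and the use of nn.RMSNorm standing in for self.subln are assumptions for illustration only; they are not part of this diff.

```python
import math
import torch
import torch.nn as nn
from einops import rearrange


def lambda_init_fn(depth: int) -> float:
    # Same schedule as lambda_init_fn in the diff above.
    return 0.8 - 0.6 * math.exp(-0.3 * depth)


def differential_combine(attn1: torch.Tensor,
                         attn2: torch.Tensor,
                         lambda_q1: torch.Tensor, lambda_k1: torch.Tensor,
                         lambda_q2: torch.Tensor, lambda_k2: torch.Tensor,
                         lambda_init: float,
                         subln: nn.Module) -> torch.Tensor:
    # attn1/attn2: (num_tokens, num_heads // 2, 2 * head_dim), i.e. each is the
    # concatenation of two half-head attention outputs, as in the commented code.
    lambda_1 = torch.exp(torch.sum(lambda_q1 * lambda_k1, dim=-1).float()).type_as(attn1)
    lambda_2 = torch.exp(torch.sum(lambda_q2 * lambda_k2, dim=-1).float()).type_as(attn1)
    lambda_full = lambda_1 - lambda_2 + lambda_init
    attn = attn1 - lambda_full * attn2
    attn = subln(attn)
    attn = attn * (1 - lambda_init)
    # Reshape back to the full head count: (num_tokens, num_heads, head_dim).
    return rearrange(attn, "... H (two D) -> ... (H two) D", two=2)


if __name__ == "__main__":
    # Toy sizes, not tied to any real checkpoint.
    num_tokens, num_heads, head_dim, depth = 4, 8, 16, 3
    attn1 = torch.randn(num_tokens, num_heads // 2, 2 * head_dim)
    attn2 = torch.randn_like(attn1)
    lambdas = [torch.randn(head_dim) for _ in range(4)]
    # nn.RMSNorm (PyTorch >= 2.4) is assumed here as a stand-in for self.subln.
    subln = nn.RMSNorm(2 * head_dim, elementwise_affine=True)
    out = differential_combine(attn1, attn2, *lambdas, lambda_init_fn(depth), subln)
    print(out.shape)  # torch.Size([4, 8, 16])
```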