import math
from typing import Optional, Tuple
from distutils.version import LooseVersion

import torch
import torch.nn as nn
import torch.nn.functional as F
from transformers.models.llama.modeling_llama import apply_rotary_pos_emb
from transformers.modeling_utils import set_module_tensor_to_device


class LlamaAttentionFused(nn.Module):
    def __init__(self, origin_attention):
        super().__init__()
        self.config = origin_attention.config
        self.hidden_size = origin_attention.hidden_size
        self.num_heads = origin_attention.num_heads
        self.head_dim = origin_attention.head_dim
        self.max_position_embeddings = origin_attention.max_position_embeddings

        # Fuse the separate q/k/v projections into one linear layer whose weight
        # is the row-wise concatenation [Wq; Wk; Wv].
        self.qkv_proj = nn.Linear(
            origin_attention.hidden_size, origin_attention.num_heads * origin_attention.head_dim * 3, bias=False
        )
        fused_weight = torch.cat(
            [
                fc_node.weight.data
                for fc_node in [origin_attention.q_proj, origin_attention.k_proj, origin_attention.v_proj]
            ],
            dim=0,
        )
        set_module_tensor_to_device(
            self.qkv_proj, 'weight', fused_weight.device, value=fused_weight, dtype=fused_weight.dtype
        )
        self.o_proj = origin_attention.o_proj
        self.rotary_emb = origin_attention.rotary_emb

        # Drop the original projections so their weights can be freed.
        origin_attention.q_proj = None
        origin_attention.k_proj = None
        origin_attention.v_proj = None
    def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
        return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous()

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_value: Optional[Tuple[torch.Tensor]] = None,
        output_attentions: bool = False,
        use_cache: bool = False,
    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
        bsz, q_len, _ = hidden_states.size()
        # Use the fused fc output to get the qkv states: a single projection, then split
        # the 3 * num_heads dimension back into query, key and value heads.
        qkv_states = self.qkv_proj(hidden_states).view(bsz, q_len, self.num_heads * 3, self.head_dim).transpose(1, 2)
        query_states, key_states, value_states = torch.chunk(qkv_states, 3, dim=1)

        is_causal = past_key_value is None

        kv_seq_len = key_states.shape[-2]
        if past_key_value is not None:
            kv_seq_len += past_key_value[0].shape[-2]
        cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len)
        query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids)
        # [bsz, nh, t, hd]

        if past_key_value is not None:
            # reuse cached k, v for self-attention
            key_states = torch.cat([past_key_value[0], key_states], dim=2)
            value_states = torch.cat([past_key_value[1], value_states], dim=2)

        past_key_value = (key_states, value_states) if use_cache else None
        # The fused kernels rely on is_causal for masking; attention_mask is only
        # applied in the math fallback below.
        if LooseVersion(torch.__version__) == LooseVersion('1.13.0'):
            # torch 1.13 only exposes the private kernel, which returns (output, weights).
            with torch.backends.cuda.sdp_kernel(enable_math=False):
                attn_output, attn_weights = F._scaled_dot_product_attention(
                    query_states, key_states, value_states, is_causal=is_causal
                )
        elif LooseVersion(torch.__version__) >= LooseVersion('2.0.0'):
            # torch >= 2.0 exposes the public API, which returns only the attention output.
            with torch.backends.cuda.sdp_kernel(enable_math=False):
                attn_output = F.scaled_dot_product_attention(
                    query_states, key_states, value_states, is_causal=is_causal
                )
                attn_weights = None
        else:
            attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim)

            if attn_weights.size() != (bsz, self.num_heads, q_len, kv_seq_len):
                raise ValueError(
                    f"Attention weights should be of size {(bsz, self.num_heads, q_len, kv_seq_len)}, but is"
                    f" {attn_weights.size()}"
                )

            if attention_mask is not None:
                if attention_mask.size() != (bsz, 1, q_len, kv_seq_len):
                    raise ValueError(
                        f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is"
                        f" {attention_mask.size()}"
                    )
                attn_weights = attn_weights + attention_mask
                attn_weights = torch.max(attn_weights, torch.tensor(torch.finfo(attn_weights.dtype).min))

            # upcast attention to fp32
            attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype)
            attn_output = torch.matmul(attn_weights, value_states)

        if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim):
            raise ValueError(
                f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is"
                f" {attn_output.size()}"
            )
        del query_states, key_states, value_states

        attn_output = attn_output.transpose(1, 2)
        attn_output = attn_output.reshape(bsz, q_len, self.hidden_size)

        attn_output = self.o_proj(attn_output)

        if not output_attentions:
            attn_weights = None

        return attn_output, attn_weights, past_key_value
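

# A minimal usage sketch, not part of the original change: replace each decoder layer's
# self-attention with the fused variant. The helper name `fuse_llama_attention` is an
# illustrative assumption; it relies on the Hugging Face Llama layout where every
# LlamaDecoderLayer holds its attention module as `self_attn`.
def fuse_llama_attention(model: nn.Module) -> nn.Module:
    from transformers.models.llama.modeling_llama import LlamaDecoderLayer

    for module in model.modules():
        if isinstance(module, LlamaDecoderLayer):
            # Build the fused attention from the existing weights, then swap it in place.
            module.self_attn = LlamaAttentionFused(module.self_attn)
    return model


# Example (the checkpoint id is only an example and must be available locally or via the Hub):
#     from transformers import AutoModelForCausalLM
#     model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-hf")
#     model = fuse_llama_attention(model)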