@@ -122,7 +122,7 @@ class TensorNameMap:
            "h.{bid}.attn.c_attn",                             # gpt2
            "transformer.h.{bid}.mixer.Wqkv",                  # phi2
            "encoder.layers.{bid}.attn.Wqkv",                  # nomic-bert
-           "model.layers.{bid}.self_attn.qkv_proj"            # phi3
+           "model.layers.{bid}.self_attn.qkv_proj",           # phi3
            "encoder.layers.{bid}.self_attention.query_key_value",  # chatglm
        ),

@@ -134,7 +134,7 @@ class TensorNameMap:
            "transformer.h.{bid}.attn.q_proj",                             # gpt-j
            "model.layers.layers.{bid}.self_attn.q_proj",                  # plamo
            "model.layers.{bid}.attention.wq",                             # internlm2
-           "transformer.decoder_layer.{bid}.multi_head_attention.query"   # Grok
+           "transformer.decoder_layer.{bid}.multi_head_attention.query",  # Grok
        ),

        # Attention key
@@ -145,7 +145,7 @@ class TensorNameMap:
            "transformer.h.{bid}.attn.k_proj",                           # gpt-j
            "model.layers.layers.{bid}.self_attn.k_proj",                # plamo
            "model.layers.{bid}.attention.wk",                           # internlm2
-           "transformer.decoder_layer.{bid}.multi_head_attention.key"   # Grok
+           "transformer.decoder_layer.{bid}.multi_head_attention.key",  # Grok
        ),

        # Attention value
@@ -156,7 +156,7 @@ class TensorNameMap:
            "transformer.h.{bid}.attn.v_proj",                             # gpt-j
            "model.layers.layers.{bid}.self_attn.v_proj",                  # plamo
            "model.layers.{bid}.attention.wv",                             # internlm2
-           "transformer.decoder_layer.{bid}.multi_head_attention.value"   # Grok
+           "transformer.decoder_layer.{bid}.multi_head_attention.value",  # Grok
        ),

        # Attention output
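
All four hunks apply the same one-character fix: a trailing comma that was dropped from an entry in one of the tensor-name tuples. A minimal sketch of why the comma matters follows (my illustration, not code from the repository, and the lookup shown is a simplified stand-in for what TensorNameMap actually does): without the comma, Python's implicit concatenation of adjacent string literals silently merges the entry with the one below it, so the affected architecture's tensor name can never be matched.

```python
# Hypothetical sketch, not the llama.cpp source: adjacent string literals
# with no comma between them are concatenated at parse time.
broken = (
    "model.layers.{bid}.self_attn.qkv_proj"                   # phi3 (missing comma)
    "encoder.layers.{bid}.self_attention.query_key_value",    # chatglm
)
fixed = (
    "model.layers.{bid}.self_attn.qkv_proj",                  # phi3
    "encoder.layers.{bid}.self_attention.query_key_value",    # chatglm
)

print(len(broken))  # 1 -- the two patterns were fused into one unusable string
print(len(fixed))   # 2 -- each architecture keeps its own pattern

# Simplified lookup in the spirit of TensorNameMap: substitute the block
# index into "{bid}" and compare against a tensor name from the checkpoint.
name = "model.layers.0.self_attn.qkv_proj"
assert not any(p.format(bid=0) == name for p in broken)
assert any(p.format(bid=0) == name for p in fixed)
```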