
Commit c5dba06

qk norm before rope arg

Adds a qk_norm_before_rope flag to ModelArgs so attention can apply q/k normalization either before or after the rotary embedding, and enables it in the Qwen3 example configs.

1 parent: fa7ff5d

5 files changed: +13 -4 lines

examples/models/llama/attention.py (+6 -1)

@@ -178,6 +178,7 @@ def __init__(self, args: ModelArgs, layer_id: int, rope: Rope):
         self.dim = args.dim
         self.attention_qkv_bias = args.attention_qkv_bias
         self.use_qk_norm = args.use_qk_norm
+        self.qk_norm_before_rope = args.qk_norm_before_rope

         if self.use_qk_norm:
             q_norm_dim = self.head_dim
@@ -243,7 +244,7 @@ def forward(
         k = k.view(bsz, seqlen, self.n_local_kv_heads, self.head_dim)
         v = v.view(bsz, seqlen, self.n_local_kv_heads, self.head_dim)

-        if self.use_qk_norm:
+        if self.use_qk_norm and self.qk_norm_before_rope:
             q = self.q_norm_fn(q)
             k = self.k_norm_fn(k)
@@ -254,6 +255,10 @@ def forward(
         k = k.transpose(1, 2)
         v = v.transpose(1, 2)

+        if self.use_qk_norm and not self.qk_norm_before_rope:
+            q = self.q_norm_fn(q)
+            k = self.k_norm_fn(k)
+
         if self.use_kv_cache:
             assert input_pos is not None
             k, v = self.kv_cache.update(input_pos, k, v)
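Functionally, the flag only moves where q/k normalization sits relative to the rotary embedding. A minimal sketch of the two orderings follows; rms_norm, apply_rope, and qk_with_norm are illustrative stand-ins for the repo's q_norm_fn/k_norm_fn and Rope, not its actual helpers, and the head transposes are omitted:

```python
import torch


def rms_norm(x: torch.Tensor, weight: torch.Tensor, eps: float = 1e-6) -> torch.Tensor:
    # RMSNorm over the head dimension with a learned elementwise scale,
    # standing in for q_norm_fn / k_norm_fn.
    return weight * x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + eps)


def apply_rope(x: torch.Tensor, theta: float = 10000.0) -> torch.Tensor:
    # Toy GPT-NeoX-style rotary embedding for (bsz, seqlen, n_heads, head_dim):
    # rotate the (i, i + head_dim // 2) dimension pairs by position-dependent angles.
    _, seqlen, _, head_dim = x.shape
    half = head_dim // 2
    inv_freq = theta ** (-torch.arange(half, dtype=x.dtype) / half)
    angles = torch.arange(seqlen, dtype=x.dtype)[:, None] * inv_freq[None, :]
    cos = angles.cos()[None, :, None, :]  # broadcast over batch and heads
    sin = angles.sin()[None, :, None, :]
    x1, x2 = x[..., :half], x[..., half:]
    return torch.cat([x1 * cos - x2 * sin, x1 * sin + x2 * cos], dim=-1)


def qk_with_norm(q, k, w_q, w_k, qk_norm_before_rope: bool):
    # Mirrors the ordering this commit introduces in Attention.forward
    # (use_qk_norm is assumed True).
    if qk_norm_before_rope:
        q, k = rms_norm(q, w_q), rms_norm(k, w_k)
    q, k = apply_rope(q), apply_rope(k)
    if not qk_norm_before_rope:
        q, k = rms_norm(q, w_q), rms_norm(k, w_k)
    return q, k


q = torch.randn(1, 4, 2, 8)  # (bsz, seqlen, n_heads, head_dim)
k = torch.randn(1, 4, 2, 8)
w = torch.linspace(0.5, 1.5, 8)  # non-uniform scale, as a trained norm would have

q_pre, _ = qk_with_norm(q, k, w, w, qk_norm_before_rope=True)
q_post, _ = qk_with_norm(q, k, w, w, qk_norm_before_rope=False)
print((q_pre - q_post).abs().max())  # nonzero: the orderings are not equivalent
```

With a parameter-free RMSNorm the two orderings would actually coincide, because the rotation preserves the per-head RMS; it is the learned elementwise scale that makes the order observable, which is why matching a checkpoint's ordering matters when porting a model.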

examples/models/llama/model_args.py (+1 -0)

@@ -38,6 +38,7 @@ class ModelArgs:
     apply_embedding: bool = True  # Use embedding inside the transformer
     apply_output: bool = True  # Use output layer (unembedding) inside the transformer
     use_qk_norm: bool = False  # apply normalization to q and k in the attention
+    qk_norm_before_rope: bool = False  # if True, apply qk norm before RoPE rather than after
     use_hf_rope: bool = False  # Use HuggingFace's RoPE implementation
     partial_rotary_factor: float = 1.0
     rope_theta: Optional[float] = (
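The new field defaults to False, so existing models keep the post-RoPE behavior. A usage sketch, assuming the remaining ModelArgs fields have defaults and that the import path below matches the repo layout (both are assumptions):

```python
from examples.models.llama.model_args import ModelArgs  # path assumed from this diff

# Qwen3-style attention: normalize q and k first, then apply rotary embeddings.
qwen3_like = ModelArgs(use_qk_norm=True, qk_norm_before_rope=True)

# Default: qk norm (when enabled) still runs after RoPE, as before this commit.
legacy = ModelArgs(use_qk_norm=True)
assert legacy.qk_norm_before_rope is False
```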

examples/models/qwen3/0_6b_config.json (+2 -1)

@@ -12,5 +12,6 @@
   "vocab_size": 151936,
   "use_hf_rope": true,
   "attention_qkv_bias": false,
-  "use_qk_norm": true
+  "use_qk_norm": true,
+  "qk_norm_before_rope": true
 }

examples/models/qwen3/1_7b_config.json (+2 -1)

@@ -12,5 +12,6 @@
   "vocab_size": 151936,
   "use_hf_rope": true,
   "attention_qkv_bias": false,
-  "use_qk_norm": true
+  "use_qk_norm": true,
+  "qk_norm_before_rope": true
 }

examples/models/qwen3/4b_config.json (+2 -1)

@@ -12,5 +12,6 @@
   "vocab_size": 151936,
   "use_hf_rope": true,
   "attention_qkv_bias": false,
-  "use_qk_norm": true
+  "use_qk_norm": true,
+  "qk_norm_before_rope": true
 }
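All three Qwen3 example configs enable the pre-RoPE ordering, matching the Qwen3 architecture, which normalizes the query and key heads before applying rotary embeddings. Since the JSON keys mirror ModelArgs field names, a config can plausibly be loaded as below (a sketch only; the repo's actual loading path may differ):

```python
import json

from examples.models.llama.model_args import ModelArgs  # import path assumed

with open("examples/models/qwen3/0_6b_config.json") as f:
    cfg = json.load(f)

args = ModelArgs(**cfg)  # assumes every JSON key names a ModelArgs field
assert args.use_qk_norm and args.qk_norm_before_rope
```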
