Commit 353401f

Author: Sophie Xhonneux (sophie-xhonneux)

Add forgotten LayerNorm (ecmwf#687)

* Add forgotten LayerNorm
* Apply ruff

Co-authored-by: Sophie Xhonneux <sxhonneux@clariden-ln001.cscs.ch>

1 parent 51aa5d5, commit 353401f

File tree

1 file changed (+2, -0 lines)

src/weathergen/model/attention.py

Lines changed: 2 additions & 0 deletions
@@ -310,6 +310,7 @@ def __init__(
         lnorm = norm if with_qk_lnorm else torch.nn.Identity
         self.lnorm_q = lnorm(self.dim_head_proj, eps=norm_eps)
         self.lnorm_k = lnorm(self.dim_head_proj, eps=norm_eps)
+        self.lnorm_kv = lnorm(dim_embed_kv, eps=norm_eps)
 
         self.dtype = attention_dtype
         assert with_flash, "Only flash attention supported at the moment"
@@ -319,6 +320,7 @@ def forward(self, x_q, x_kv, x_lens=None, x_kv_lens=None, ada_ln_aux=None):
         if self.with_residual:
             x_q_in = x_q
         x_q = x_q if ada_ln_aux is None else self.lnorm_in_q(x_q, ada_ln_aux)
+        x_kv = self.lnorm_kv(x_kv)
 
         ## project onto heads and q,k,v and
         # ensure these are 4D tensors as required for flash attention
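
For context, the two added lines normalize the key/value input before it is projected onto heads. Below is a minimal, runnable sketch of a cross-attention block with the fix in place. Only lnorm_kv, dim_embed_kv, dim_head_proj, the with_qk_lnorm/Identity gating, and norm_eps come from the diff above; the class name CrossAttentionSketch, the projection layers, and the use of PyTorch's scaled_dot_product_attention in place of flash attention are illustrative assumptions, not the WeatherGen implementation.

# Minimal sketch under the assumptions stated above; not the WeatherGen code.
import torch


class CrossAttentionSketch(torch.nn.Module):
    def __init__(self, dim_embed_q, dim_embed_kv, num_heads, with_qk_lnorm=True, norm_eps=1e-5):
        super().__init__()
        assert dim_embed_q % num_heads == 0
        self.num_heads = num_heads
        self.dim_head_proj = dim_embed_q // num_heads

        self.proj_q = torch.nn.Linear(dim_embed_q, dim_embed_q, bias=False)
        self.proj_k = torch.nn.Linear(dim_embed_kv, dim_embed_q, bias=False)
        self.proj_v = torch.nn.Linear(dim_embed_kv, dim_embed_q, bias=False)
        self.proj_out = torch.nn.Linear(dim_embed_q, dim_embed_q, bias=False)

        # Same gating trick as the diff: torch.nn.Identity ignores its arguments.
        lnorm = torch.nn.LayerNorm if with_qk_lnorm else torch.nn.Identity
        self.lnorm_q = lnorm(self.dim_head_proj, eps=norm_eps)
        self.lnorm_k = lnorm(self.dim_head_proj, eps=norm_eps)
        self.lnorm_kv = lnorm(dim_embed_kv, eps=norm_eps)  # the added layer

    def forward(self, x_q, x_kv):
        # The added forward line: normalize the key/value stream before projection.
        x_kv = self.lnorm_kv(x_kv)

        b, n_q, _ = x_q.shape
        n_kv = x_kv.shape[1]
        q = self.proj_q(x_q).view(b, n_q, self.num_heads, self.dim_head_proj)
        k = self.proj_k(x_kv).view(b, n_kv, self.num_heads, self.dim_head_proj)
        v = self.proj_v(x_kv).view(b, n_kv, self.num_heads, self.dim_head_proj)

        # Per-head q/k normalization, as in the context lines of the diff.
        q, k = self.lnorm_q(q), self.lnorm_k(k)

        # Plain scaled dot-product attention stands in for flash attention here.
        out = torch.nn.functional.scaled_dot_product_attention(
            q.transpose(1, 2), k.transpose(1, 2), v.transpose(1, 2)
        ).transpose(1, 2)
        return self.proj_out(out.reshape(b, n_q, -1))


# Quick shape check: queries and keys/values with different embedding widths.
attn = CrossAttentionSketch(dim_embed_q=64, dim_embed_kv=32, num_heads=4)
out = attn(torch.randn(2, 8, 64), torch.randn(2, 16, 32))
print(out.shape)  # torch.Size([2, 8, 64])

As the commit title suggests, the query path was already normalized in the diff's context lines (via lnorm_in_q when ada_ln_aux is given), while x_kv previously entered the k/v projections unnormalized; the added lnorm_kv applies the corresponding normalization to the key/value stream.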
