Commit 764ed95

Raise warning when rms_scaling = True (#21352)
* Fix rms_scaling in LayerNormalization
* Fix numerics UT
* Fix numerics UT (1)
* Fix numerics UT (2)
* Add warning for rms_scaling = True
* Add warning to LayerNormalization layer
* Remove unnecessary comments
1 parent 6fd736c commit 764ed95
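
The commit message mentions adding the warning alongside unit-test fixes. As a hedged sketch only (not part of this diff), the new warning could be exercised in a test roughly like the following; it assumes pytest and a Keras build containing this commit, and relies on `warnings.warn` defaulting to `UserWarning`:

```python
import pytest
import keras


def test_layer_normalization_rms_scaling_warns():
    # Constructing the layer with the deprecated argument should emit a
    # UserWarning whose message names `rms_scaling`.
    with pytest.warns(UserWarning, match="rms_scaling"):
        keras.layers.LayerNormalization(rms_scaling=True)
```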

File tree

keras/src/layers/normalization/layer_normalization.py
keras/src/ops/nn.py

2 files changed (+23, -13 lines)

keras/src/layers/normalization/layer_normalization.py

Lines changed: 12 additions & 8 deletions
@@ -1,3 +1,5 @@
+import warnings
+
 from keras.src import constraints
 from keras.src import initializers
 from keras.src import ops
@@ -82,12 +84,6 @@ class LayerNormalization(Layer):
             When the next layer is linear (also e.g. `nn.relu`), this can be
             disabled since the scaling will be done by the next layer.
             Defaults to `True`.
-        rms_scaling: If True, `center` and `scale` are ignored, and the
-            inputs are scaled by `gamma` and the inverse square root
-            of the square of all inputs. This is an approximate and faster
-            approach that avoids ever computing the mean of the input. Note that
-            this *isn't* equivalent to the computation that the
-            `keras.layers.RMSNormalization` layer performs.
         beta_initializer: Initializer for the beta weight. Defaults to zeros.
         gamma_initializer: Initializer for the gamma weight. Defaults to ones.
         beta_regularizer: Optional regularizer for the beta weight.
@@ -112,7 +108,6 @@ def __init__(
         epsilon=1e-3,
         center=True,
         scale=True,
-        rms_scaling=False,
         beta_initializer="zeros",
         gamma_initializer="ones",
         beta_regularizer=None,
@@ -121,6 +116,15 @@ def __init__(
         gamma_constraint=None,
         **kwargs,
     ):
+        rms_scaling = kwargs.pop("rms_scaling", False)
+        if rms_scaling:
+            warnings.warn(
+                "You passed `rms_scaling=True`, which is deprecated. This "
+                "argument incorrectly scales the input by the variance, not "
+                "the root mean square. To correctly use RMS Normalization, "
+                "please use `keras.layers.RMSNormalization` instead."
+            )
+
         super().__init__(**kwargs)
         if isinstance(axis, (list, tuple)):
             self.axis = list(axis)
@@ -185,7 +189,7 @@ def call(self, inputs):
             self.beta,
             self.axis,
             self.epsilon,
-            self.rms_scaling,
+            rms_scaling=self.rms_scaling,
         )
         return ops.cast(outputs, self.compute_dtype)

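For context, a minimal migration sketch (not part of this commit): the warning above points callers to `keras.layers.RMSNormalization`, and the sketch assumes that layer's default arguments are acceptable. Note the two layers are not numerically equivalent, which is exactly what the warning is about, so outputs will change after migrating.

```python
import keras

# Deprecated: now emits a UserWarning at construction time (see diff above).
old_layer = keras.layers.LayerNormalization(rms_scaling=True)

# Replacement suggested by the warning message. Results differ from the old
# rms_scaling path, which scaled by the variance rather than the root mean
# square of the inputs.
new_layer = keras.layers.RMSNormalization()
```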
keras/src/ops/nn.py

Lines changed: 11 additions & 5 deletions
@@ -2875,7 +2875,7 @@ def call(self, x):
     ]
 )
 def layer_normalization(
-    x, gamma=None, beta=None, axis=-1, epsilon=None, rms_scaling=False
+    x, gamma=None, beta=None, axis=-1, epsilon=None, **kwargs
 ):
     """Layer normalization layer (Ba et al., 2016).

@@ -2889,9 +2889,6 @@ def layer_normalization(
             Default to -1.
         gamma: Optional scaling factor for the normalization.
         beta: Optional add offset for the normalized tensor.
-        rms_scaling:This is an approximate and faster
-            approach that avoids ever computing the mean of the input. Note that
-            this *isn't* equivalent to the computation that rms_normalization
         epsilon: A lower bound value for the norm.
             Defaults to `backend.epsilon()`.

@@ -2902,6 +2899,16 @@ def layer_normalization(
     >>> print(x_norm)
     array([-1.4142135 , -0.70710677, 0., 0.7071067 , 1.4142135 ])
     """
+    rms_scaling = kwargs.pop("rms_scaling", False)
+    if rms_scaling:
+        warnings.warn(
+            "You passed `rms_scaling=True`, which is deprecated. This argument "
+            "incorrectly scales the input by the variance, not the root mean "
+            "square. To correctly use RMS Normalization, please use "
+            "`keras.ops.rms_normalization` / `keras.ops.nn.rms_normalization` "
+            "instead."
+        )
+
     if any_symbolic_tensors((x,)):
         return LayerNorm(
             gamma=gamma,
@@ -2953,7 +2960,6 @@ def _broadcast(v):
         # Calculate the variance along self.axis (layer activations).
         variance = backend.numpy.var(x, axis=axis, keepdims=True)
         inv = backend.math.rsqrt(variance + epsilon)
-
         outputs = x * inv * backend.cast(_broadcast(gamma), x.dtype)
     elif backend.config.backend() == "torch" and is_continuous_axis(axis):
         # when using torch backend,use kernel to improve performance

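A small NumPy sketch (illustration only, not from the diff) of the mismatch the warning describes: the deprecated `rms_scaling=True` path scales by the inverse square root of the variance (the `variance`/`rsqrt` lines in the hunk above), whereas true RMS normalization scales by the inverse square root of the mean of the squared inputs. The two agree only when the inputs have zero mean.

```python
import numpy as np

x = np.array([1.0, 2.0, 3.0, 4.0], dtype="float32")
gamma = 1.0
epsilon = 1e-6

# Deprecated rms_scaling=True path: divide by sqrt of the variance.
variance_scaled = x * gamma / np.sqrt(x.var() + epsilon)

# True RMS normalization: divide by the root mean square of the inputs.
rms_scaled = x * gamma / np.sqrt(np.mean(np.square(x)) + epsilon)

print(variance_scaled)  # approx [0.894, 1.789, 2.683, 3.578]
print(rms_scaled)       # approx [0.365, 0.730, 1.095, 1.461]
```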