
Commit a85e2a5

Log grad_norm (#1339)
Since `grad_norm` is always computed anyway and might provide useful insight into the training dynamics, I don't see a reason not to log it. `grad_norm` could also be logged to the output [here](https://github.com/pytorch/torchtitan/blob/main/torchtitan/components/metrics.py#L400-L408); let me know if I should add that.

Terminal logging demo:

<img width="1192" alt="grad_norm_logging_demo" src="https://github.com/user-attachments/assets/f7c4721a-ec63-48e7-9d13-8ea0f4e26326" />
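For context on "always computed anyway": L2 gradient clipping has to compute the total norm before it can scale anything, so the logged value is a free byproduct. A minimal sketch of the idea (illustrative only, not torchtitan's distributed-aware `dist_utils` implementation; `clip_by_total_norm_` is a hypothetical name):

```python
import torch

def clip_by_total_norm_(grads: list[torch.Tensor], max_norm: float) -> torch.Tensor:
    # The total L2 norm must be computed before any clipping decision can be made...
    total_norm = torch.linalg.vector_norm(
        torch.stack([torch.linalg.vector_norm(g) for g in grads])
    )
    clip_coef = max_norm / (total_norm + 1e-6)
    if clip_coef < 1.0:
        for g in grads:
            g.mul_(clip_coef)  # scale gradients in place so the total norm <= max_norm
    # ...so returning it, as torch.nn.utils.clip_grad_norm_ also does, costs nothing.
    return total_norm
```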
1 parent dc7fd23 · commit a85e2a5

File tree

3 files changed: 12 additions & 2 deletions


torchtitan/components/metrics.py

Lines changed: 3 additions & 0 deletions
```diff
@@ -352,6 +352,7 @@ def log(
         step: int,
         global_avg_loss: float,
         global_max_loss: float,
+        grad_norm: float,
         extra_metrics: dict[str, Any] | None = None,
     ):
         assert self.num_flops_per_token > 0, "num_flops_per_token must be set"
@@ -377,6 +378,7 @@ def log(
         metrics = {
             "loss_metrics/global_avg_loss": global_avg_loss,
             "loss_metrics/global_max_loss": global_max_loss,
+            "grad_norm": grad_norm,
             "throughput(tps)": tps,
             "tflops": tflops,
             "mfu(%)": mfu,
@@ -400,6 +402,7 @@ def log(
         logger.info(
             f"{color.red}step: {step:2} "
             f"{color.green}loss: {global_avg_loss:7.4f} "
+            f"{color.orange}grad_norm: {grad_norm:7.4f} "
             f"{color.yellow}memory: {device_mem_stats.max_reserved_gib:5.2f}GiB"
             f"({device_mem_stats.max_reserved_pct:.2f}%) "
             f"{color.blue}tps: {round(tps):,} "
```

torchtitan/tools/utils.py

Lines changed: 2 additions & 0 deletions
```diff
@@ -135,6 +135,7 @@ class Color:
     cyan = "\033[36m"
     white = "\033[37m"
     reset = "\033[39m"
+    orange = "\033[38;2;180;60;0m"


 @dataclass(frozen=True)
@@ -148,6 +149,7 @@ class NoColor:
     cyan = ""
     white = ""
     reset = ""
+    orange = ""


 def check_if_feature_in_pytorch(
```
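The new `orange` entry is a 24-bit ("truecolor") ANSI escape: `\033[38;2;R;G;Bm` sets the foreground to an arbitrary RGB color (here 180, 60, 0), and the existing `reset` (`\033[39m`) restores the terminal's default foreground. A standalone sketch of how these escapes behave:

```python
ORANGE = "\033[38;2;180;60;0m"  # truecolor foreground escape: RGB(180, 60, 0)
RESET = "\033[39m"              # reset foreground to the terminal default

# Renders the text in orange on truecolor-capable terminals; others may ignore it.
print(f"{ORANGE}grad_norm:  1.2346{RESET}")
```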

torchtitan/train.py

Lines changed: 7 additions & 2 deletions
```diff
@@ -431,7 +431,7 @@ def train_step(
             loss = self.forward_backward_step(input_dict, labels)
             accumulated_losses.append(loss.detach())

-        dist_utils.clip_grad_norm_(
+        grad_norm = dist_utils.clip_grad_norm_(
             [p for m in self.model_parts for p in m.parameters()],
             self.job_config.training.max_norm,
             foreach=True,
@@ -463,7 +463,12 @@ def train_step(
         else:
             global_avg_loss = global_max_loss = loss.detach().item()

-        self.metrics_processor.log(self.step, global_avg_loss, global_max_loss)
+        self.metrics_processor.log(
+            self.step,
+            global_avg_loss,
+            global_max_loss,
+            grad_norm.item(),
+        )

     @record
     def train(self):
```
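For reference, the stock `torch.nn.utils.clip_grad_norm_` returns the total norm as a 0-dim tensor, which is presumably why the call site above needs `.item()` to turn `grad_norm` into a plain float before logging (assuming torchtitan's `dist_utils.clip_grad_norm_` follows the same convention). A minimal sketch with the plain PyTorch API:

```python
import torch
import torch.nn as nn

model = nn.Linear(4, 2)
model(torch.randn(8, 4)).sum().backward()

# Clips gradients in place and returns the total (pre-clip) L2 norm as a 0-dim tensor.
total_norm = nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0, foreach=True)
print(total_norm.item())  # .item() converts the tensor to a Python float for logging
```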
