
Commit e7c0cae

Redefine lr_scheduler behavior (#1284)
## Context

The current warmup-stable-decay lr_scheduler behavior is not intuitive. For example, in `debug_model.toml`, the configuration is:

```
[lr_scheduler]
warmup_steps = 2  # lr scheduler warm up, normally 20% of the train steps
decay_ratio = 0.8  # lr scheduler decay ratio, 80% of the train steps
decay_type = "linear"
lr_min = 0.0
```

So we expect warmup_steps = 2, decay_steps = 8, total_steps = 10, and we get the current learning-rate curve (blue line). There are two issues:

1. The maximum learning rate never reaches 1 (the expected max is 1, since we are computing an adjustment ratio here).
2. Intuitively, the user would expect the learning rate to increase by 0.5 per step during warmup (assuming max_lr = 1) and to decrease by 1/8 per step during decay. In the blue line, however, the lr increases by 1/3 and decreases by 1/9, which is counter-intuitive.

Thus we propose a standard lr_scheduler behavior (the red line) that aligns with the user's intuition and with the meaning of the parameter names.

![learning_rate_schedule](https://github.com/user-attachments/assets/1c2be9e0-6043-4310-b09f-9b06a024abf9)

## Standard lr_scheduler behavior

- Warmup stage: LR increases by 1/{warmup_steps} per step.
- Stable stage: length of stable stage = total_train_steps + 1 - warmup_steps - decay_steps. We manually add one step to the stable stage (a fake step 11), which prevents the step-10 learning rate from dropping to 0 when decay is enabled.
- Decay stage: LR decreases by 1/{decay_steps} per step.
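Below is a minimal, self-contained sketch of the proposed per-step adjustment factor. The function name `wsd_adjustment` and the standalone form are illustrative only (not the torchtitan implementation), and it assumes linear decay with lr_min = 0:

```python
# Hypothetical sketch of the proposed warmup-stable-decay adjustment factor.
# `wsd_adjustment` is an illustrative name, not part of torchtitan.
def wsd_adjustment(step: int, warmup_steps: int, stable_steps: int, decay_steps: int) -> float:
    """Multiplier applied to the base LR at a 0-indexed `step` (linear decay, lr_min = 0)."""
    if step < warmup_steps:
        # warmup: 1/warmup_steps, 2/warmup_steps, ..., 1.0
        return (step + 1) / warmup_steps
    if step < warmup_steps + stable_steps:
        # stable: hold at the max LR
        return 1.0
    # decay: drop by 1/decay_steps per step
    k = step + 1 - (warmup_steps + stable_steps)
    return max(0.0, 1.0 - k / decay_steps)


# debug_model.toml example: training_steps = 10, warmup_steps = 2, decay_ratio = 0.8
# => decay_steps = 8, stable_steps = 10 + 1 - 2 - 8 = 1 (the virtual extra step),
# giving factors 0.5, 1.0, 1.0, 7/8, 6/8, ..., 1/8 over steps 0..9 (the red line).
```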
1 parent 31a5411 commit e7c0cae

File tree: 2 files changed, +296 −3 lines changed


tests/unit_tests/test_lr_scheduler.py

Lines changed: 280 additions & 0 deletions
@@ -0,0 +1,280 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

import unittest
from unittest.mock import MagicMock

import torch
from torch.optim import Adam

from torchtitan.components.lr_scheduler import build_lr_schedulers
from torchtitan.components.optimizer import OptimizersContainer


class TestLRScheduler(unittest.TestCase):
    def setUp(self):
        # Create a simple model with parameters
        self.model = torch.nn.Linear(10, 10)
        # Create an optimizer
        self.optimizer = Adam(self.model.parameters(), lr=0.1)
        # Create an optimizer container
        self.optimizer_container = MagicMock(spec=OptimizersContainer)
        self.optimizer_container.__iter__.return_value = iter([self.optimizer])
        self.optimizer_container.__len__.return_value = 1

    def create_job_config(
        self,
        training_steps=10,
        warmup_steps=None,
        decay_ratio=None,
        decay_type=None,
        lr_min=None,
    ):
        # Create a job config with the specified parameters
        from torchtitan.config_manager import ConfigManager

        args = [
            "--training.steps",
            str(training_steps),
        ]

        args += (
            ["--lr_scheduler.warmup_steps", str(warmup_steps)]
            if warmup_steps is not None
            else []
        )
        args += (
            ["--lr_scheduler.decay_ratio", str(decay_ratio)]
            if decay_ratio is not None
            else []
        )
        args += (
            ["--lr_scheduler.decay_type", decay_type] if decay_type is not None else []
        )
        args += ["--lr_scheduler.lr_min", str(lr_min)] if lr_min is not None else []

        config_manager = ConfigManager()
        # Create base config with parameters passed directly
        config = config_manager.parse_args(args)

        return config
    def test_linear_warmup_decay(self):
        """Test the linear warmup followed by linear decay schedule."""
        # Create a job config with 10 steps, 2 warmup steps, and linear decay
        config = self.create_job_config(
            training_steps=10,
            warmup_steps=2,
            decay_ratio=None,  # Use default decay: start decay immediately
            decay_type=None,
            lr_min=None,
        )

        # Build the lr scheduler
        lr_scheduler = build_lr_schedulers(self.optimizer_container, config)

        # Expected adjustment factors for each step
        expected_factors = [
            0.5,  # Step 0: 50% of max LR (warmup)
            1.0,  # Step 1: 100% of max LR (warmup complete)
            1.0,  # Step 2: manually added stable step, prevents LR from dropping to 0 at the last step
            7.0 / 8.0,  # Step 3: 7/8 of max LR
            6.0 / 8.0,  # Step 4: 3/4 of max LR
            5.0 / 8.0,  # Step 5: 5/8 of max LR
            4.0 / 8.0,  # Step 6: 1/2 of max LR
            3.0 / 8.0,  # Step 7: 3/8 of max LR
            2.0 / 8.0,  # Step 8: 1/4 of max LR
            1.0 / 8.0,  # Step 9: 1/8 of max LR
        ]

        # Check the learning rate at each step
        for i, factor in enumerate(expected_factors):
            # The LambdaLR multiplies the base lr by the factor
            expected_lr = 0.1 * factor
            self.assertAlmostEqual(
                self.optimizer.param_groups[0]["lr"],
                expected_lr,
                places=6,
                msg=f"Step {i}: Expected LR {expected_lr}, got {self.optimizer.param_groups[0]['lr']}",
            )
            lr_scheduler.step()
    def test_warmup_stable_decay(self):
        """Test warmup followed by stable phase and then decay."""
        # Create a job config with 10 steps, 2 warmup steps, 3 stable steps, and 5 decay steps
        config = self.create_job_config(
            training_steps=10,
            warmup_steps=2,
            decay_ratio=0.5,  # 50% of steps for decay
            decay_type="linear",
            lr_min=0.0,
        )

        # Build the lr scheduler
        lr_scheduler = build_lr_schedulers(self.optimizer_container, config)

        # Expected adjustment factors for each step
        expected_factors = [
            0.5,  # Step 0: 50% of max LR (warmup)
            1.0,  # Step 1: 100% of max LR (warmup complete)
            1.0,  # Step 2: Stable phase
            1.0,  # Step 3: Stable phase
            1.0,  # Step 4: Stable phase
            1.0,  # Step 5: manually added stable step, prevents LR from dropping to 0 at the last step
            0.8,  # Step 6: Linear decay starts (80% of max LR)
            0.6,  # Step 7: 60% of max LR
            0.4,  # Step 8: 40% of max LR
            0.2,  # Step 9: 20% of max LR
        ]

        # Check the learning rate at each step
        for i, factor in enumerate(expected_factors):
            expected_lr = 0.1 * factor
            self.assertAlmostEqual(
                self.optimizer.param_groups[0]["lr"],
                expected_lr,
                places=6,
                msg=f"Step {i}: Expected LR {expected_lr}, got {self.optimizer.param_groups[0]['lr']}",
            )
            lr_scheduler.step()
    def test_min_lr(self):
        """Test that the learning rate doesn't go below the minimum."""
        # Create a job config with a minimum learning rate
        config = self.create_job_config(
            training_steps=10,
            warmup_steps=2,
            decay_ratio=None,
            decay_type="linear",
            lr_min=0.2,  # 20% of base LR as minimum
        )

        # Build the lr scheduler
        lr_scheduler = build_lr_schedulers(self.optimizer_container, config)

        # Step through all steps
        for _ in range(10):
            lr_scheduler.step()

        # After all steps, LR should be at minimum (0.1 * 0.2 = 0.02)
        self.assertAlmostEqual(self.optimizer.param_groups[0]["lr"], 0.02, places=6)
    def test_warmup_exceeds_training(self):
        """Test when warmup steps exceed training steps."""
        # Create a job config where warmup steps > training steps
        config = self.create_job_config(
            training_steps=5,
            warmup_steps=10,  # More than training steps
            decay_ratio=None,
            decay_type="linear",
            lr_min=0.0,
        )

        # Build the lr scheduler - should adjust warmup steps
        lr_scheduler = build_lr_schedulers(self.optimizer_container, config)

        # Expected adjustment factors for each step (warmup is clamped to 5 steps)
        expected_factors = [
            0.2,  # Step 0: 20% of max LR (warmup)
            0.4,  # Step 1: 40% of max LR (warmup)
            0.6,  # Step 2: 60% of max LR (warmup)
            0.8,  # Step 3: 80% of max LR (warmup)
            1.0,  # Step 4: 100% of max LR (warmup complete)
        ]

        # Check the learning rate at each step
        for i, factor in enumerate(expected_factors):
            expected_lr = 0.1 * factor
            self.assertAlmostEqual(
                self.optimizer.param_groups[0]["lr"],
                expected_lr,
                places=6,
                msg=f"Step {i}: Expected LR {expected_lr}, got {self.optimizer.param_groups[0]['lr']}",
            )
            lr_scheduler.step()
    def test_warmup_stable_only(self):
        """Test warmup followed by stable phase only, with no decay phase."""
        # Create a job config with 10 steps, 2 warmup steps, and no decay phase
        config = self.create_job_config(
            training_steps=10,
            warmup_steps=2,
            decay_ratio=0.0,  # 0% of steps for decay (no decay)
            decay_type="linear",
            lr_min=0.0,
        )

        # Build the lr scheduler
        lr_scheduler = build_lr_schedulers(self.optimizer_container, config)

        # Expected adjustment factors for each step
        expected_factors = [
            0.5,  # Step 0: 50% of max LR (warmup)
            1.0,  # Step 1: 100% of max LR (warmup complete)
            1.0,  # Step 2: Stable phase (includes the manually added stable step)
            1.0,  # Step 3: Stable phase
            1.0,  # Step 4: Stable phase
            1.0,  # Step 5: Stable phase
            1.0,  # Step 6: Stable phase
            1.0,  # Step 7: Stable phase
            1.0,  # Step 8: Stable phase
            1.0,  # Step 9: Stable phase
        ]

        # Check the learning rate at each step
        for i, factor in enumerate(expected_factors):
            expected_lr = 0.1 * factor
            self.assertAlmostEqual(
                self.optimizer.param_groups[0]["lr"],
                expected_lr,
                places=6,
                msg=f"Step {i}: Expected LR {expected_lr}, got {self.optimizer.param_groups[0]['lr']}",
            )
            lr_scheduler.step()
    def test_warmup_plus_decay_exceeds_training(self):
        """Test when warmup + decay steps exceed training steps."""
        # Create a job config where warmup + decay steps > training steps
        # Expected behavior: warmup steps = 5, decay steps = 5
        config = self.create_job_config(
            training_steps=10,
            warmup_steps=5,
            decay_ratio=0.8,  # 80% of steps for decay (8 steps)
            decay_type="linear",
            lr_min=0.0,
        )

        # Build the lr scheduler - should adjust decay steps
        lr_scheduler = build_lr_schedulers(self.optimizer_container, config)

        # Expected adjustment factors for each step
        expected_factors = [
            0.2,  # Step 0: 20% of max LR (warmup)
            0.4,  # Step 1: 40% of max LR (warmup)
            0.6,  # Step 2: 60% of max LR (warmup)
            0.8,  # Step 3: 80% of max LR (warmup)
            1.0,  # Step 4: 100% of max LR (warmup complete)
            1.0,  # Step 5: manually added stable step, prevents LR from dropping to 0 at the last step
            0.8,  # Step 6: Linear decay starts (80% of max LR)
            0.6,  # Step 7: 60% of max LR
            0.4,  # Step 8: 40% of max LR
            0.2,  # Step 9: 20% of max LR
        ]

        # Check the learning rate at each step
        for i, factor in enumerate(expected_factors):
            expected_lr = 0.1 * factor
            self.assertAlmostEqual(
                self.optimizer.param_groups[0]["lr"],
                expected_lr,
                places=6,
                msg=f"Step {i}: Expected LR {expected_lr}, got {self.optimizer.param_groups[0]['lr']}",
            )
            lr_scheduler.step()


if __name__ == "__main__":
    unittest.main()

torchtitan/components/lr_scheduler.py

Lines changed: 16 additions & 3 deletions
@@ -102,6 +102,14 @@ def build_lr_schedulers(
     """
     training_steps = job_config.training.steps
     warmup_steps = int(job_config.lr_scheduler.warmup_steps)
+
+    if warmup_steps > training_steps:
+        logger.warning(
+            f"Warmup steps ({warmup_steps}) exceed total training steps ({training_steps}). "
+            f"Adjusting warmup steps to {training_steps}."
+        )
+        warmup_steps = training_steps
+
     if job_config.lr_scheduler.decay_ratio is not None:
         decay_steps = round(training_steps * job_config.lr_scheduler.decay_ratio)
         if warmup_steps + decay_steps > training_steps:
@@ -113,7 +121,8 @@ def build_lr_schedulers(
             decay_steps = training_steps - warmup_steps
     else:
         decay_steps = training_steps - warmup_steps
-    stable_steps = training_steps - warmup_steps - decay_steps
+    # Add a virtual last step to prevent the learning rate from dropping to 0
+    stable_steps = training_steps + 1 - warmup_steps - decay_steps
     lr_decay_type = job_config.lr_scheduler.decay_type
     lr_min = job_config.lr_scheduler.lr_min
 
@@ -146,13 +155,17 @@ def linear_warmup_stable_decay(
             # linear warmup
             # 0-indexed step, hence + 1 adjustments
             current_step += 1
-            curr_adjustment = float(current_step / (warmup_steps + 1))
+            assert (
+                warmup_steps != 0
+            ), "warmup_steps must not be zero to reach this branch"
+            curr_adjustment = float(current_step / warmup_steps)
         elif current_step < warmup_stable_steps:
             curr_adjustment = 1.0
         else:
             # 0-indexed step, hence + 1 adjustments
             current_step += 1
-            progress = float(current_step - warmup_stable_steps) / (decay_steps + 1)
+            assert decay_steps != 0, "decay_steps must not be zero to reach this branch"
+            progress = float(current_step - warmup_stable_steps) / decay_steps
 
             if lr_decay_type == "linear":
                 curr_adjustment = 1 - progress
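As a quick sanity check of the updated schedule, one can plug equivalent math into plain `torch.optim.lr_scheduler.LambdaLR`. This is a standalone sketch that mirrors the arithmetic in the diff above rather than calling the torchtitan builder; the helper `factor` is hypothetical and assumes linear decay with lr_min = 0:

```python
import torch
from torch.optim import Adam
from torch.optim.lr_scheduler import LambdaLR

# Same settings as test_warmup_stable_decay: 10 steps, 2 warmup, decay_ratio = 0.5
training_steps, warmup_steps, decay_ratio = 10, 2, 0.5
decay_steps = round(training_steps * decay_ratio)                # 5
stable_steps = training_steps + 1 - warmup_steps - decay_steps   # 4 (includes the virtual step)

def factor(step: int) -> float:
    # Mirrors the post-change warmup/stable/decay math (linear decay, lr_min = 0).
    if step < warmup_steps:
        return (step + 1) / warmup_steps
    if step < warmup_steps + stable_steps:
        return 1.0
    return 1.0 - (step + 1 - warmup_steps - stable_steps) / decay_steps

opt = Adam([torch.nn.Parameter(torch.zeros(1))], lr=0.1)
sched = LambdaLR(opt, lr_lambda=factor)

lrs = []
for _ in range(training_steps):
    lrs.append(round(opt.param_groups[0]["lr"], 6))
    opt.step()
    sched.step()

print(lrs)  # [0.05, 0.1, 0.1, 0.1, 0.1, 0.1, 0.08, 0.06, 0.04, 0.02]
```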
