#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
+import torch
+
from bitsandbytes.optim.optimizer import Optimizer2State

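+# galore_torch is an optional dependency; GaLoreAdamW8bit raises a RuntimeError
+# at construction time if it is missing.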
+_galore_available = False
+try:
+    from galore_torch.galore_projector import GaLoreProjector
+
+    _galore_available = True
+except ImportError:
+    pass
+

class AdamW(Optimizer2State):
    def __init__(
@@ -127,6 +137,133 @@ def __init__(
        )


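+# 8-bit AdamW with GaLore gradient low-rank projection. Param groups that carry the
+# GaLore hyperparameters ("rank", "update_proj_gap", "scale", "proj_type") are updated
+# in the projected low-rank subspace; all other groups get the regular 8-bit update.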
+class GaLoreAdamW8bit(Optimizer2State):
+    def __init__(
+        self,
+        params,
+        lr=1e-3,
+        betas=(0.9, 0.999),
+        eps=1e-8,
+        weight_decay=1e-2,
+        amsgrad=False,
+        optim_bits=8,
+        args=None,
+        min_8bit_size=4096,
+        percentile_clipping=100,
+        block_wise=True,
+        is_paged=False,
+    ):
+        if not _galore_available:
+            raise RuntimeError("The galore_torch package must be installed to use GaLoreAdamW8bit.")
+        super().__init__(
+            "adam",
+            params,
+            lr,
+            betas,
+            eps,
+            weight_decay,
+            optim_bits,
+            args,
+            min_8bit_size,
+            percentile_clipping,
+            block_wise,
+            is_paged=is_paged,
+        )
+
+    @torch.no_grad()
+    def step(self, closure=None):
+        """Performs a single optimization step.
+
+        Arguments:
+            closure (callable, optional): A closure that reevaluates the model
+                and returns the loss.
+        """
+        loss = None
+        if closure is not None:
+            with torch.enable_grad():
+                loss = closure()
+
+        overflows = []
+
+        if not self.initialized:
+            self.check_overrides()
+            self.to_gpu()  # needed for fairseq pure fp16 training
+            self.initialized = True
+
+        # if self.is_paged: self.page_mng.prefetch_all()
+        for gindex, group in enumerate(self.param_groups):
+            for pindex, p in enumerate(group["params"]):
+                if p.grad is None:
+                    continue
+                state = self.state[p]
+
+                if "step" not in state:
+                    state["step"] = 0
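+
+                # When the param group carries GaLore settings, the gradient is projected
+                # into a low-rank subspace, the 8-bit Adam step is computed there, and the
+                # resulting update is projected back before being applied to the parameter.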
+                # GaLore Projection
+                if "rank" in group:
+                    if "projector" not in state:
+                        state["projector"] = GaLoreProjector(
+                            group["rank"],
+                            update_proj_gap=group["update_proj_gap"],
+                            scale=group["scale"],
+                            proj_type=group["proj_type"],
+                        )
+
+                    grad = state["projector"].project(p.grad, state["step"])
+
+                    # suboptimal implementation
+                    # p.saved_data = p.data.clone()
+                    # p.data = grad.clone().to(p.data.dtype).to(p.data.device)
+                    # p.data.zero_()
+                    # p.grad = grad
+                    lor_update = torch.zeros_like(
+                        grad, dtype=p.data.dtype, device=p.data.device, requires_grad=grad.requires_grad
+                    )
+                    lor_update.grad = grad
+
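+                # NOTE: lor_update has the projected (low-rank) shape and carries the
+                # projected gradient in .grad. update_step() below receives it via
+                # return_updates, so the Adam update is written into lor_update instead
+                # of being applied to p directly, and can then be projected back to full rank.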
+                if "state1" not in state:
+                    self.init_state(group, p, gindex, pindex)
+
+                self.prefetch_state(p)
+
+                if "rank" in group:
+                    self.update_step(group, p, gindex, pindex, return_updates=lor_update)
+
+                    # GaLore Projection Back
+                    p.data.add_(state["projector"].project_back(lor_update))
+
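+                    # Decoupled weight decay is applied to the full-rank parameter after the
+                    # projected-back update, matching AdamW semantics.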
+                    if "weight_decay" in group and group["weight_decay"] > 0:
+                        p.data.add_(p.data, alpha=-group["lr"] * group["weight_decay"])
+                else:
+                    self.update_step(group, p, gindex, pindex)
+
+                torch.cuda.synchronize()
+
+        if self.is_paged:
+            # all paged operations are asynchronous; we need to sync to make sure
+            # all tensors are in the right state
+            torch.cuda.synchronize()
+
+        return loss
+
+
class AdamW32bit(Optimizer2State):
    def __init__(
        self,
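
A minimal usage sketch, assuming galore_torch is installed, a CUDA device is available, and GaLoreAdamW8bit is re-exported from bitsandbytes.optim (the import path is an assumption). The GaLore keys mirror what step() reads from the param group ("rank", "update_proj_gap", "scale", "proj_type"); the concrete values and the toy model are illustrative placeholders.

import torch
from bitsandbytes.optim import GaLoreAdamW8bit  # import path assumed; adjust if not re-exported

model = torch.nn.Linear(1024, 1024).cuda()

# GaLore projection operates on 2D weight matrices; biases and other tensors
# go in a plain group that takes the regular 8-bit AdamW update.
galore_params = [p for p in model.parameters() if p.ndim == 2]
other_params = [p for p in model.parameters() if p.ndim != 2]

optimizer = GaLoreAdamW8bit(
    [
        # Keys match what step() reads from the param group; values are illustrative.
        {"params": galore_params, "rank": 128, "update_proj_gap": 200, "scale": 0.25, "proj_type": "std"},
        {"params": other_params},
    ],
    lr=1e-3,
)

loss = model(torch.randn(4, 1024, device="cuda")).pow(2).mean()
loss.backward()
optimizer.step()      # project grads to rank 128, run the 8-bit update, project back
optimizer.zero_grad()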
|