bitsandbytes-foundation
diff --git a/‎bitsandbytes/functional.py
Lines changed: 20 additions & 9 deletions b/‎bitsandbytes/functional.py
Lines changed: 20 additions & 9 deletions
diff --git a/‎bitsandbytes/optim/optimizer.py
Lines changed: 13 additions & 3 deletions b/‎bitsandbytes/optim/optimizer.py
Lines changed: 13 additions & 3 deletions
@@ -1555,9 +1555,9 @@ def dequantize_no_absmax(A: Tensor, code: Tensor, out: Optional[torch.Tensor] =
 
 def optimizer_update_32bit(
     optimizer_name: str,
-    g: Tensor,
-    p: Tensor,
-    state1: Tensor,
+    g: torch.Tensor,
+    p: torch.Tensor,
+    state1: torch.Tensor,
     beta1: float,
     eps: float,
     step: int,
@@ -1571,6 +1571,7 @@ def optimizer_update_32bit(
     unorm_vec: Optional[torch.Tensor] = None,
     max_unorm: float = 0.0,
     skip_zeros=False,
+    return_updates: Optional[torch.Tensor] = None,
 ) -> None:
     """
     Performs an inplace optimizer update with one or two optimizer states.
@@ -1613,6 +1614,8 @@ def optimizer_update_32bit(
         The maximum update norm relative to the weight norm.
     skip_zeros : bool
         Whether to skip zero-valued gradients or not (default: False).
+    return_updates: Optional[torch.Tensor]
+        When provided, updates are written to this tensor and not applied directly to `p`. (default: None)
     """
 
     param_norm = 0.0
@@ -1636,6 +1639,7 @@ def optimizer_update_32bit(
     optim_func(
         get_ptr(g),
         get_ptr(p),
+        get_ptr(return_updates),
         get_ptr(state1),
         get_ptr(state2),
         get_ptr(unorm_vec),
@@ -1658,25 +1662,26 @@ def optimizer_update_32bit(
 
 def optimizer_update_8bit(
     optimizer_name: str,
-    g: Tensor,
-    p: Tensor,
-    state1: Tensor,
+    g: torch.Tensor,
+    p: torch.Tensor,
+    state1: torch.Tensor,
     state2: Optional[torch.Tensor],
     beta1: float,
     beta2: float,
     eps: float,
     step: int,
     lr: float,
-    qmap1: Tensor,
+    qmap1: torch.Tensor,
     qmap2: Optional[torch.Tensor],
-    max1: Tensor,
+    max1: torch.Tensor,
     max2: Optional[torch.Tensor],
-    new_max1: Tensor,
+    new_max1: torch.Tensor,
     new_max2: Optional[torch.Tensor],
     weight_decay: float = 0.0,
     gnorm_scale: float = 1.0,
     unorm_vec: Optional[torch.Tensor] = None,
     max_unorm: float = 0.0,
+    return_updates: Optional[torch.Tensor] = None,
 ) -> None:
     """
     Performs an inplace Adam update.
@@ -1726,6 +1731,8 @@ def optimizer_update_8bit(
         The tensor for the update norm.
     max_unorm : float
         The maximum update norm relative to the weight norm.
+    return_updates: Optional[torch.Tensor]
+        When provided, updates are written to this tensor and not applied directly to `p`. (default: None)
     """
 
     param_norm = 0.0
@@ -1738,6 +1745,7 @@ def optimizer_update_8bit(
         str2optimizer8bit[optimizer_name][0](
             get_ptr(p),
             get_ptr(g),
+            get_ptr(return_updates),
             get_ptr(state1),
             get_ptr(state2),
             get_ptr(unorm_vec),
@@ -1762,6 +1770,7 @@ def optimizer_update_8bit(
         str2optimizer8bit[optimizer_name][1](
             get_ptr(p),
             get_ptr(g),
+            get_ptr(return_updates),
             get_ptr(state1),
             get_ptr(state2),
             get_ptr(unorm_vec),
@@ -1809,6 +1818,7 @@ def optimizer_update_8bit_blockwise(
     weight_decay: float = 0.0,
     gnorm_scale: float = 1.0,
     skip_zeros=False,
+    return_updates: Optional[torch.Tensor] = None,
 ) -> None:
     optim_func = None
     prev_device = pre_call(g.device)
@@ -1835,6 +1845,7 @@ def optimizer_update_8bit_blockwise(
     optim_func(
         get_ptr(p),
         get_ptr(g),
+        get_ptr(return_updates),
         get_ptr(state1),
         get_ptr(state2),
         ct.c_float(beta1),
 
@@ -5,7 +5,7 @@
 from collections import abc as container_abcs, defaultdict
 from copy import deepcopy
 from itertools import chain
-from typing import Optional
+from typing import Any, Dict, Optional
 
 import torch
 
@@ -320,7 +320,7 @@ def get_config(self, gindex, pindex, group):
     def init_state(self, group, p, gindex, pindex):
         raise NotImplementedError("init_state method needs to be overridden")
 
-    def update_step(self, group, p, gindex, pindex):
+    def update_step(self, group, p, gindex, pindex, return_updates=None):
         raise NotImplementedError("The update_step method needs to be overridden")
 
     def get_state_buffer(self, p, dtype=torch.float32):
@@ -494,7 +494,14 @@ def init_state(self, group, p, gindex, pindex):
             state["unorm_vec"] = torch.zeros((1,), device=p.device)
 
     @torch.no_grad()
-    def update_step(self, group, p, gindex, pindex):
+    def update_step(
+        self,
+        group: Dict[str, Any],
+        p: torch.Tensor,
+        gindex: int,
+        pindex: int,
+        return_updates: Optional[torch.Tensor] = None,
+    ):
         # avoid update error from non-contiguous memory layout
         p.data = p.data.contiguous()
         p.grad = p.grad.contiguous()
@@ -536,6 +543,7 @@ def update_step(self, group, p, gindex, pindex):
                 state["unorm_vec"] if config["max_unorm"] > 0.0 else None,
                 max_unorm=config["max_unorm"],
                 skip_zeros=config["skip_zeros"],
+                return_updates=return_updates,
             )
 
         elif state["state1"].dtype == torch.uint8 and not config["block_wise"]:
@@ -560,6 +568,7 @@ def update_step(self, group, p, gindex, pindex):
                 gnorm_scale=gnorm_scale,
                 unorm_vec=state["unorm_vec"] if config["max_unorm"] > 0.0 else None,
                 max_unorm=config["max_unorm"],
+                return_updates=return_updates,
             )
 
             # swap maxes
@@ -586,6 +595,7 @@ def update_step(self, group, p, gindex, pindex):
                 config["weight_decay"],
                 gnorm_scale=gnorm_scale,
                 skip_zeros=config["skip_zeros"],
+                return_updates=return_updates,
             )