
Commit ed6ec9c

Promote Low Bit Optim out of prototype (#1864)
* Remove prototype profiler
* Promote low bit optim out of prototype
* change name
* update
* move test file
* push
1 parent f64d5a1 commit ed6ec9c

File tree

15 files changed: +1614 additions, −51 deletions

README.md

Lines changed: 3 additions & 3 deletions
@@ -115,13 +115,13 @@ swap_linear_with_semi_sparse_linear(model, {"seq.0": SemiSparseLinear})
 ADAM takes 2x as much memory as the model params so we can quantize the optimizer state to either 8 or 4 bit effectively reducing the optimizer VRAM requirements by 2x or 4x respectively over an fp16 baseline

 ```python
-from torchao.prototype.low_bit_optim import AdamW8bit, AdamW4bit, AdamWFp8
+from torchao.optim import AdamW8bit, AdamW4bit, AdamWFp8
 optim = AdamW8bit(model.parameters()) # replace with Adam4bit and AdamFp8 for the 4 / fp8 versions
 ```

-In practice, we are a tiny bit slower than expertly written kernels but the implementations for these optimizers were written in a **few hundred lines of PyTorch code** and compiled so please use them or copy-paste them for your quantized optimizers. Benchmarks [here](https://github.com/pytorch/ao/tree/main/torchao/prototype/low_bit_optim)
+In practice, we are a tiny bit slower than expertly written kernels but the implementations for these optimizers were written in a **few hundred lines of PyTorch code** and compiled so please use them or copy-paste them for your quantized optimizers. Benchmarks [here](https://github.com/pytorch/ao/tree/main/torchao/optim)

-We also have support for [single GPU CPU offloading](https://github.com/pytorch/ao/tree/main/torchao/prototype/low_bit_optim#optimizer-cpu-offload) where both the gradients (same size as weights) and the optimizers will be efficiently sent to the CPU. This alone can **reduce your VRAM requirements by 60%**
+We also have support for [single GPU CPU offloading](https://github.com/pytorch/ao/tree/main/torchao/optim#optimizer-cpu-offload) where both the gradients (same size as weights) and the optimizers will be efficiently sent to the CPU. This alone can **reduce your VRAM requirements by 60%**

 ```python
 optim = CPUOffloadOptimizer(model.parameters(), torch.optim.AdamW, fused=True)
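
For context, a minimal end-to-end sketch of the promoted API under the new `torchao.optim` import path. The toy model, learning rate, and CUDA device are illustrative assumptions, not part of the diff:

```python
import torch
import torch.nn as nn
from torchao.optim import AdamW8bit  # new import path after this commit

# Illustrative toy model; the 8-bit optimizer state roughly halves optimizer
# VRAM compared to an fp16 baseline, as described in the README.
model = nn.Sequential(nn.Linear(1024, 1024), nn.ReLU(), nn.Linear(1024, 1024)).cuda()
optimizer = AdamW8bit(model.parameters(), lr=1e-3)

for _ in range(10):
    x = torch.randn(8, 1024, device="cuda")
    loss = model(x).sum()
    loss.backward()
    optimizer.step()
    optimizer.zero_grad()
```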

benchmarks/benchmark_low_bit_adam.py

Lines changed: 6 additions & 8 deletions
@@ -34,7 +34,7 @@
 from torchvision.transforms import v2
 from tqdm import tqdm

-from torchao.prototype import low_bit_optim
+from torchao import optim
 from torchao.utils import get_available_devices

 _DEVICE = get_available_devices()[-1]
@@ -43,9 +43,9 @@
 OPTIM_MAP = dict(
     AdamW=partial(torch.optim.AdamW, fused=True),
     AdamW8bitBnb=bnb.optim.AdamW8bit,
-    AdamW8bitAo=low_bit_optim.AdamW8bit,
-    AdamWFp8Ao=low_bit_optim.AdamWFp8,
-    AdamW4bitAo=low_bit_optim.AdamW4bit,
+    AdamW8bitAo=optim.AdamW8bit,
+    AdamWFp8Ao=optim.AdamWFp8,
+    AdamW4bitAo=optim.AdamW4bit,
 )

 try:
@@ -249,12 +249,10 @@ def evaluate_model(model, args):
 optim_cls = OPTIM_MAP[args.optim]

 if args.optim_cpu_offload == "ao":
-    optim_cls = partial(
-        low_bit_optim.CPUOffloadOptimizer, optimizer_class=optim_cls
-    )
+    optim_cls = partial(optim.CPUOffloadOptimizer, optimizer_class=optim_cls)
 elif args.optim_cpu_offload == "ao_offload_grads":
     optim_cls = partial(
-        low_bit_optim.CPUOffloadOptimizer,
+        optim.CPUOffloadOptimizer,
         optimizer_class=optim_cls,
         offload_gradients=True,
     )
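
The `partial` wrapping above keeps the benchmark's call site unchanged regardless of offload mode. A hedged sketch of that composition under the new module path; the toy model and optimizer choice are illustrative, and CPU offload assumes the parameters live on GPU:

```python
from functools import partial

import torch
import torch.nn as nn
from torchao import optim

# Pick a base optimizer constructor, then optionally wrap it so optimizer state
# (and optionally gradients) is kept on CPU.
optim_cls = partial(torch.optim.AdamW, fused=True)
optim_cls = partial(optim.CPUOffloadOptimizer, optimizer_class=optim_cls)

model = nn.Linear(64, 64).cuda()  # illustrative model; params stay on GPU
optimizer = optim_cls(model.parameters())  # same call site as an unwrapped optimizer
```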

benchmarks/quantized_training/pretrain_llama2.py

Lines changed: 6 additions & 7 deletions
@@ -22,14 +22,13 @@
 from torch.utils.checkpoint import checkpoint
 from tqdm import tqdm

-from torchao import quantize_
+from torchao import optim, quantize_
 from torchao._models.llama.model import (
     ModelArgs,
     RMSNorm,
     Transformer,
     transformer_configs,
 )
-from torchao.prototype import low_bit_optim
 from torchao.prototype.quantized_training import (
     bitnet_training,
     int8_mixed_precision_training,
@@ -190,10 +189,10 @@ def insert_rmsnorm(module: torch.nn.Module):
 print(f"No. of buffers: {sum(p.numel() for p in model.buffers()):,}")
 torch.cuda.reset_peak_memory_stats() # don't count memory occupied by unquantized weights

-# only use optimizers from torchao.prototype.low_bit_optim to support quantized training
+# only use optimizers from torchao.optim to support quantized training
 if args.optim == "AdamW":
     args.optim = "_AdamW"
-optim = getattr(low_bit_optim, args.optim)(
+optimizer = getattr(optim, args.optim)(
     model.parameters(),
     lr=args.lr,
     weight_decay=args.weight_decay,
@@ -228,15 +227,15 @@ def insert_rmsnorm(module: torch.nn.Module):
 if step % args.log_interval == 0:
     log_dict = dict(
         loss=loss.item(),
-        lr=optim.param_groups[0]["lr"],
+        lr=optimizer.param_groups[0]["lr"],
         max_memory_allocated=torch.cuda.max_memory_allocated() / 1e9,
         max_memory_reserved=torch.cuda.max_memory_reserved() / 1e9,
     )
     run.log(log_dict, step=step)
     pbar.set_postfix(loss=log_dict["loss"])

-optim.step()
-optim.zero_grad()
+optimizer.step()
+optimizer.zero_grad()

 step += 1
 pbar.update()
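
Beyond the import change, the local variable is renamed from `optim` to `optimizer` because the script now imports the `optim` module itself. A tiny sketch of the shadowing this avoids; the model and hyperparameters are hypothetical:

```python
import torch.nn as nn
from torchao import optim

model = nn.Linear(8, 8)

# With the rename, the module name stays bound to the module:
optimizer = getattr(optim, "_AdamW")(model.parameters(), lr=1e-4)

# Had the local been called `optim`, the assignment would rebind that name to the
# optimizer instance, and any later getattr(optim, ...) lookup would break.
```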

test/prototype/test_low_bit_optim.py renamed to test/test_low_bit_optim.py

Lines changed: 23 additions & 29 deletions
@@ -17,15 +17,15 @@
 )

 from packaging.version import Version
-from torchao.prototype import low_bit_optim
-from torchao.prototype.low_bit_optim.quant_utils import (
+from torchao import optim
+from torchao.optim.quant_utils import (
     _fp32_to_bf16_sr,
     quantize_4bit_with_qmap,
     quantize_8bit_with_qmap,
 )
-from torchao.prototype.low_bit_optim.subclass_4bit import OptimState4bit
-from torchao.prototype.low_bit_optim.subclass_8bit import OptimState8bit
-from torchao.prototype.low_bit_optim.subclass_fp8 import OptimStateFp8
+from torchao.optim.subclass_4bit import OptimState4bit
+from torchao.optim.subclass_8bit import OptimState8bit
+from torchao.optim.subclass_fp8 import OptimStateFp8
 from torchao.testing.utils import skip_if_rocm
 from torchao.utils import (
     TORCH_VERSION_AT_LEAST_2_4,
@@ -125,29 +125,29 @@ def test_optim_smoke(self, optim_name, dtype, device):

 model = nn.Sequential(nn.Linear(32, 256), nn.ReLU(), nn.Linear(256, 32))
 model.to(device=device, dtype=dtype)
-optim = getattr(low_bit_optim, optim_name)(model.parameters())
+optimizer = getattr(optim, optim_name)(model.parameters())

 x = torch.randn(4, 32, device=device, dtype=dtype)
 loss = model(x).sum()
 loss.backward()
-optim.step()
-optim.zero_grad()
+optimizer.step()
+optimizer.zero_grad()

 # test serialization. also test the case CUDA optim loads CPU state dict
 with tempfile.NamedTemporaryFile() as f:
-    torch.save(optim.state_dict(), f.name)
+    torch.save(optimizer.state_dict(), f.name)
     state_dict = torch.load(f.name, map_location="cpu")

 model2 = copy.deepcopy(model)
-optim2 = getattr(low_bit_optim, optim_name)(model2.parameters())
+optim2 = getattr(optim, optim_name)(model2.parameters())
 optim2.load_state_dict(state_dict)

 for _ in range(2):
     x = torch.randn(4, 32, device=device, dtype=dtype)

     model(x).sum().backward()
-    optim.step()
-    optim.zero_grad()
+    optimizer.step()
+    optimizer.zero_grad()

     model2(x).sum().backward()
     optim2.step()
@@ -201,9 +201,7 @@ def test_optim_8bit_correctness(self, optim_name):
 block_size = 256 if Version(bnb.__version__) >= Version("0.44.0") else 2048

 optim1 = getattr(bnb.optim, optim_name)(model1.parameters())
-optim2 = getattr(low_bit_optim, optim_name)(
-    model2.parameters(), block_size=block_size
-)
+optim2 = getattr(optim, optim_name)(model2.parameters(), block_size=block_size)

 for _ in range(2):
     x = torch.randn(4, 32, device=device)
@@ -240,7 +238,7 @@ def test_optim_4bit_correctness(self, optim_name):
     optim1 = lpmm.optim.AdamW(model1.parameters())
 else:
     raise ValueError(f"Unsupported {optim_name} optimizer for lpmm")
-optim2 = getattr(low_bit_optim, optim_name)(model2.parameters())
+optim2 = getattr(optim, optim_name)(model2.parameters())

 for _ in range(2):
     x = torch.randn(4, 32, device=device)
@@ -286,7 +284,7 @@ def test_optim_cpu_offload_correctness(self, offload_grad, grad_accum):
 model2 = copy.deepcopy(model1)

 optim1 = torch.optim.AdamW(model1.parameters())
-optim2 = low_bit_optim.CPUOffloadOptimizer(
+optim2 = optim.CPUOffloadOptimizer(
     model2.parameters(),
     torch.optim.AdamW,
     offload_gradients=offload_grad,
@@ -335,9 +333,7 @@ def test_optim_cpu_offload_save_load(self):
     nn.Linear(32, 1024, bias=True), nn.ReLU(), nn.Linear(1024, 128, bias=True)
 )
 model1.to(device)
-optim1 = low_bit_optim.CPUOffloadOptimizer(
-    model1.parameters(), torch.optim.AdamW
-)
+optim1 = optim.CPUOffloadOptimizer(model1.parameters(), torch.optim.AdamW)

 for _ in range(2):
     x = torch.randn(4, 32, device=device)
@@ -352,9 +348,7 @@ def test_optim_cpu_offload_save_load(self):

 # resume training
 model2 = copy.deepcopy(model1)
-optim2 = low_bit_optim.CPUOffloadOptimizer(
-    model2.parameters(), torch.optim.AdamW
-)
+optim2 = optim.CPUOffloadOptimizer(model2.parameters(), torch.optim.AdamW)
 optim2.load_state_dict(state_dict)

 for _ in range(2):
@@ -381,7 +375,7 @@ def test_optim_bf16_stochastic_round_correctness(self):
 # small LR so that weight update is small
 # when bf16_stochastic_round=False, the test will fail after 1 iteration
 optim1 = torch.optim.AdamW(model1.parameters(), lr=1e-5)
-optim2 = low_bit_optim._AdamW(
+optim2 = optim._AdamW(
     model2.parameters(),
     lr=1e-5,
     bf16_stochastic_round=True,
@@ -424,9 +418,9 @@ def world_size(self) -> int:
 @skip_if_lt_x_gpu(_FSDP_WORLD_SIZE)
 @skip_if_rocm("ROCm enablement in progress")
 def test_fsdp2(self):
-    optim_classes = [low_bit_optim.AdamW8bit, low_bit_optim.AdamW4bit]
+    optim_classes = [optim.AdamW8bit, optim.AdamW4bit]
     if torch.cuda.get_device_capability() >= (8, 9):
-        optim_classes.append(low_bit_optim.AdamWFp8)
+        optim_classes.append(optim.AdamWFp8)

     self.run_subtests(
         {"optim_cls": optim_classes},
@@ -545,13 +539,13 @@ def test_uneven_shard(self):

 # currently all of our low-bit Adam/AdamW share the same implementation.
 # thus, we only need to test for 1 optimizer class.
-optim = low_bit_optim.AdamW8bit(model.parameters())
+optimizer = optim.AdamW8bit(model.parameters())

 for _ in range(2):
     inputs = torch.randn(2, in_dim, device="cuda")
     model(inputs).sum().backward()
-    optim.step()
-    optim.zero_grad()
+    optimizer.step()
+    optimizer.zero_grad()


 instantiate_parametrized_tests(TestQuantize)
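
The renamed test file keeps the same coverage. For reference, a condensed sketch of the CPU-offload save/load pattern it exercises, written against the new import path; it assumes a CUDA device, since `CPUOffloadOptimizer` keeps parameters on GPU while holding optimizer state on CPU:

```python
import copy

import torch
import torch.nn as nn
from torchao import optim

model1 = nn.Sequential(nn.Linear(32, 1024), nn.ReLU(), nn.Linear(1024, 128)).cuda()
optim1 = optim.CPUOffloadOptimizer(model1.parameters(), torch.optim.AdamW)

for _ in range(2):
    model1(torch.randn(4, 32, device="cuda")).sum().backward()
    optim1.step()
    optim1.zero_grad()

state_dict = optim1.state_dict()

# resume training: a fresh wrapper reloads the saved optimizer state
model2 = copy.deepcopy(model1)
optim2 = optim.CPUOffloadOptimizer(model2.parameters(), torch.optim.AdamW)
optim2.load_state_dict(state_dict)
```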

torchao/__init__.py

Lines changed: 2 additions & 1 deletion
@@ -48,11 +48,12 @@
     quantize_,
 )

-from . import dtypes, testing
+from . import dtypes, optim, testing

 __all__ = [
     "dtypes",
     "autoquant",
+    "optim",
     "quantize_",
     "testing",
     "ops",
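
With `optim` imported in `torchao/__init__.py` and listed in `__all__`, the optimizers are reachable straight from the top-level package. A small sketch, assuming a torchao build that contains this commit:

```python
import torchao

# Promoted out of prototype: no more torchao.prototype.low_bit_optim prefix.
print(torchao.optim.AdamW8bit)
print("optim" in torchao.__all__)  # True after this change
```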