
Commit 95822df

Merge branch 'kylesayrs/sequential-onloading' into kylesayrs/deepseek-v3
2 parents: b83474f + 96476fe

File tree

3 files changed (+3, -34 lines)


src/llmcompressor/entrypoints/oneshot.py

Lines changed: 3 additions & 3 deletions

@@ -3,7 +3,6 @@
 from typing import Optional

 import torch
-from accelerate.hooks import remove_hook_from_module
 from compressed_tensors.utils import offloaded_dispatch
 from loguru import logger
 from torch.utils.data import DataLoader
@@ -128,8 +127,9 @@ def __init__(

         # offload to cpu if possible
         if "cuda" in str(model_args.oneshot_device) and torch.cuda.is_available():
-            remove_hook_from_module(model_args.model, recurse=True)
-            offloaded_dispatch(model_args.model, model_args.oneshot_device)
+            offloaded_dispatch(
+                model_args.model, execution_device=model_args.oneshot_device
+            )
         else:
             logger.warning("CUDA is not available! Compressing model on CPU instead")
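The net effect in oneshot.py is that the explicit accelerate hook teardown disappears and the dispatch call switches to an execution_device keyword. A minimal sketch of the new call pattern, assuming offloaded_dispatch behaves like accelerate-style offloading (weights kept off-device and onloaded per module during forward); the model checkpoint name is a placeholder, not from this PR:

    import torch
    from compressed_tensors.utils import offloaded_dispatch
    from transformers import AutoModelForCausalLM

    # placeholder checkpoint; any torch.nn.Module can be dispatched the same way
    model = AutoModelForCausalLM.from_pretrained("facebook/opt-125m")

    oneshot_device = "cuda:0"
    if "cuda" in oneshot_device and torch.cuda.is_available():
        # one call replaces the old remove_hook_from_module + offloaded_dispatch pair
        offloaded_dispatch(model, execution_device=torch.device(oneshot_device))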

src/llmcompressor/entrypoints/utils.py

Lines changed: 0 additions & 4 deletions

@@ -3,7 +3,6 @@
 from pathlib import PosixPath
 from typing import Optional, Tuple

-from accelerate.hooks import remove_hook_from_module
 from loguru import logger
 from torch.nn import Module
 from transformers import (
@@ -106,9 +105,6 @@ def post_process(
             "Ex. `oneshot(..., output_dir=...)`"
         )

-    # Remove any existing hooks (maybe added by oneshot sequential onloading)
-    remove_hook_from_module(model_args.model, recurse=True)
-
     # Reset the one-time-use session upon completion
     if recipe_args is not None and recipe_args.clear_sparse_session:
        reset_session()
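For reference, the cleanup deleted here used accelerate's hook-removal helper. A toy sketch of that pattern (illustrative model, not the PR's code):

    import torch.nn as nn
    from accelerate.hooks import remove_hook_from_module

    model = nn.Sequential(nn.Linear(8, 8), nn.ReLU(), nn.Linear(8, 4))
    # ... offloading hooks may have been attached to submodules by a dispatch call ...
    remove_hook_from_module(model, recurse=True)  # recurse=True strips hooks from every submodule

With the dispatch path apparently managing its own hooks after the oneshot.py change above, this post-processing teardown becomes redundant, which is presumably why it was dropped.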

src/llmcompressor/transformers/compression/helpers.py

Lines changed: 0 additions & 27 deletions

@@ -104,33 +104,6 @@ def infer_sparsity_structure_from_model(model: torch.nn.Module) -> Optional[str]
     return None


-def quantization_memory_requirement(model: torch.nn.Module) -> int:
-    """
-    Determines the max number of bytes needed to store quantization scale and zp data
-
-    :param model: model to calculate requirements for
-    :return: number of bytes required to reserve for quantization
-    """
-
-    total_elements = 0
-    for _, module in model.named_modules():
-        if isinstance(module, Linear):
-            for param in module.parameters():
-                # assume the max of group 128 and static scale/zp
-                # TODO: base this on the recipe instead of assuming max
-
-                # potentially just bias term
-                max_quant_shape = param.shape[0] // 128
-
-                if len(param.size()) > 1:  # weights
-                    max_quant_shape *= param.shape[1]
-
-                total_elements += max_quant_shape * 4
-
-    bytes_ratio = 32 // 16  # assuming float16
-    return total_elements * bytes_ratio
-
-
 def infer_sparse_targets_and_ignores(
     model: torch.nn.Module,
     sparsity_structure: str,
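For context on the removed heuristic: it assumed group-128 quantization on every Linear weight, reserving four metadata elements (scale/zero-point) per group at float16 width. A standalone re-implementation with a worked number, kept only for illustration (the layer shape is an assumed example, not from the PR):

    import torch
    from torch.nn import Linear

    def estimate_quant_metadata_bytes(model: torch.nn.Module) -> int:
        """Mirrors the deleted quantization_memory_requirement heuristic."""
        total_elements = 0
        for module in model.modules():
            if isinstance(module, Linear):
                for param in module.parameters():
                    # one group of scale/zp metadata per 128 rows along dim 0
                    max_quant_shape = param.shape[0] // 128
                    if param.dim() > 1:  # weight matrix: (out_features, in_features)
                        max_quant_shape *= param.shape[1]
                    total_elements += max_quant_shape * 4
        return total_elements * (32 // 16)  # 2 bytes per element, assuming float16

    layer = Linear(4096, 4096, bias=False)
    # (4096 // 128) groups * 4096 cols * 4 elements * 2 bytes = 1,048,576 bytes (~1 MiB)
    print(estimate_quant_metadata_bytes(layer))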
