
Commit 95822df

Merge branch 'kylesayrs/sequential-onloading' into kylesayrs/deepseek-v3
2 parents: b83474f + 96476fe

File tree

3 files changed (+3, -34 lines)


src/llmcompressor/entrypoints/oneshot.py

Lines changed: 3 additions & 3 deletions

@@ -3,7 +3,6 @@
 from typing import Optional

 import torch
-from accelerate.hooks import remove_hook_from_module
 from compressed_tensors.utils import offloaded_dispatch
 from loguru import logger
 from torch.utils.data import DataLoader
@@ -128,8 +127,9 @@ def __init__(

         # offload to cpu if possible
         if "cuda" in str(model_args.oneshot_device) and torch.cuda.is_available():
-            remove_hook_from_module(model_args.model, recurse=True)
-            offloaded_dispatch(model_args.model, model_args.oneshot_device)
+            offloaded_dispatch(
+                model_args.model, execution_device=model_args.oneshot_device
+            )
         else:
             logger.warning("CUDA is not available! Compressing model on CPU instead")
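The net effect in oneshot.py is that the explicit accelerate hook teardown disappears and the dispatch call switches to an execution_device keyword. A minimal sketch of the new call pattern, assuming offloaded_dispatch behaves like accelerate-style offloading (weights kept off-device and onloaded per module during forward); the model checkpoint name is a placeholder, not from this PR:

    import torch
    from compressed_tensors.utils import offloaded_dispatch
    from transformers import AutoModelForCausalLM

    # placeholder checkpoint; any torch.nn.Module can be dispatched the same way
    model = AutoModelForCausalLM.from_pretrained("facebook/opt-125m")

    oneshot_device = "cuda:0"
    if "cuda" in oneshot_device and torch.cuda.is_available():
        # one call replaces the old remove_hook_from_module + offloaded_dispatch pair
        offloaded_dispatch(model, execution_device=torch.device(oneshot_device))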

src/llmcompressor/entrypoints/utils.py

Lines changed: 0 additions & 4 deletions

@@ -3,7 +3,6 @@
 from pathlib import PosixPath
 from typing import Optional, Tuple

-from accelerate.hooks import remove_hook_from_module
 from loguru import logger
 from torch.nn import Module
 from transformers import (
@@ -106,9 +105,6 @@ def post_process(
             "Ex. `oneshot(..., output_dir=...)`"
         )

-    # Remove any existing hooks (maybe added by oneshot sequential onloading)
-    remove_hook_from_module(model_args.model, recurse=True)
-
     # Reset the one-time-use session upon completion
     if recipe_args is not None and recipe_args.clear_sparse_session:
        reset_session()
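For reference, the cleanup deleted here used accelerate's hook-removal helper. A toy sketch of that pattern (illustrative model, not the PR's code):

    import torch.nn as nn
    from accelerate.hooks import remove_hook_from_module

    model = nn.Sequential(nn.Linear(8, 8), nn.ReLU(), nn.Linear(8, 4))
    # ... offloading hooks may have been attached to submodules by a dispatch call ...
    remove_hook_from_module(model, recurse=True)  # recurse=True strips hooks from every submodule

With the dispatch path apparently managing its own hooks after the oneshot.py change above, this post-processing teardown becomes redundant, which is presumably why it was dropped.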

src/llmcompressor/transformers/compression/helpers.py

Lines changed: 0 additions & 27 deletions

@@ -104,33 +104,6 @@ def infer_sparsity_structure_from_model(model: torch.nn.Module) -> Optional[str]
     return None


-def quantization_memory_requirement(model: torch.nn.Module) -> int:
-    """
-    Determines the max number of bytes needed to store quantization scale and zp data
-
-    :param model: model to calculate requirements for
-    :return: number of bytes required to reserve for quantization
-    """
-
-    total_elements = 0
-    for _, module in model.named_modules():
-        if isinstance(module, Linear):
-            for param in module.parameters():
-                # assume the max of group 128 and static scale/zp
-                # TODO: base this on the recipe instead of assuming max
-
-                # potentially just bias term
-                max_quant_shape = param.shape[0] // 128
-
-                if len(param.size()) > 1:  # weights
-                    max_quant_shape *= param.shape[1]
-
-                total_elements += max_quant_shape * 4
-
-    bytes_ratio = 32 // 16  # assuming float16
-    return total_elements * bytes_ratio
-
-
 def infer_sparse_targets_and_ignores(
     model: torch.nn.Module,
     sparsity_structure: str,
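For context on the removed heuristic: it assumed group-128 quantization on every Linear weight, reserving four metadata elements (scale/zero-point) per group at float16 width. A standalone re-implementation with a worked number, kept only for illustration (the layer shape is an assumed example, not from the PR):

    import torch
    from torch.nn import Linear

    def estimate_quant_metadata_bytes(model: torch.nn.Module) -> int:
        """Mirrors the deleted quantization_memory_requirement heuristic."""
        total_elements = 0
        for module in model.modules():
            if isinstance(module, Linear):
                for param in module.parameters():
                    # one group of scale/zp metadata per 128 rows along dim 0
                    max_quant_shape = param.shape[0] // 128
                    if param.dim() > 1:  # weight matrix: (out_features, in_features)
                        max_quant_shape *= param.shape[1]
                    total_elements += max_quant_shape * 4
        return total_elements * (32 // 16)  # 2 bytes per element, assuming float16

    layer = Linear(4096, 4096, bias=False)
    # (4096 // 128) groups * 4096 cols * 4 elements * 2 bytes = 1,048,576 bytes (~1 MiB)
    print(estimate_quant_metadata_bytes(layer))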
