Commit 421bd61
# Use model compression pathways (#1419)
## Purpose ##

* Use the in-memory model compression pathway in order to reduce memory requirements when saving models
* These changes, along with [postprocessing changes](https://github.com/vllm-project/llm-compressor/blob/main/src/llmcompressor/entrypoints/utils.py#L102), move users towards a pattern where they are aware of the status of the model (frozen/compressed) and call `save_pretrained` manually (see the sketch below)

## Prerequisites ##

* #1449

## Changes ##

* Modify `save_pretrained_wrapper` to use `compress_model(model)` rather than `compress(state_dict)`
* Modify `save_pretrained_wrapper` so that the state dict is only retrieved if not skipping compression stats
* Modify `save_pretrained_wrapper` to save dictionary and python files, even if there is no explicit compressor
* Modify `save_checkpoint` (used by training) to decompress after the checkpoint is saved

## Example/Testing Changes ##

As far as I can tell, the table below lists all of the instances where a model undergoes saving that is not immediately followed by script exit:

File Path | Solution
-- | --
examples/trl_mixin/ex_trl_constant.py <br> test_oneshot_and_finetune.py <br> tests/llmcompressor/transformers/obcq/test_obcq_completion.py | Decompress in between stages
examples/quantization_2of4_sparse_w4a16/llama7b_sparse_w4a16.py <br> test_oneshot_and_finetune_with_tokenizer.py | Do not save in between stages, to avoid a compressed intermediate state
test_oneshot_then_finetune.py | No work is required, as the model is decompressed upon loading from disk
test_compress_tensor_utils.py | Fix test to use `dispatch_model` (which is actually used by transformers) rather than `cpu_offload`

## Testing ##

State Dict | In Memory
-- | --
![previous](https://github.com/user-attachments/assets/f661a9a9-f546-4196-bb7c-58e48409d86d) | ![now](https://github.com/user-attachments/assets/b5edb8f9-1bfb-4474-83c4-48d1942f7c53)

<details><summary>oneshot_save.py</summary>

```python
import torch
from transformers import AutoModelForCausalLM

from llmcompressor import oneshot
from llmcompressor.modifiers.quantization import QuantizationModifier
from pttp import TensorProfiler

# MODEL_ID = "DeepSeek-V3_local_bf16"
MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct"

with TensorProfiler() as prof:
    prof.mark_event("Load model")
    model = AutoModelForCausalLM.from_pretrained(MODEL_ID, torch_dtype=torch.bfloat16)

    prof.mark_event("Oneshot")
    oneshot(
        model=model,
        recipe=QuantizationModifier(targets="Linear", scheme="W4A16"),
        trust_remote_code_model=True,
    )

    prof.mark_event("Save model")
    model.save_pretrained(
        "sav_testing", save_compressed=True, skip_compression_stats=True
    )

prof.save_memory_timeline("save_timeline.png")
```

</details>

* Nightly: https://github.com/neuralmagic/llm-compressor-testing/actions/runs/15453075963

---------

Signed-off-by: Kyle Sayers <kylesayrs@gmail.com>
Co-authored-by: Brian Dellabetta <brian-dellabetta@users.noreply.github.com>
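To make the intended pattern concrete, here is a minimal sketch of the compress/save/decompress flow between stages. It is illustrative, not code from this commit: the model id and recipe are placeholders, while `oneshot`, `save_pretrained(..., save_compressed=True)`, `get_model_compressor`, and `decompress_model` are the entrypoints touched by the diffs below.

```python
# Minimal sketch of the compress -> save -> decompress pattern between stages.
# Model id and recipe are illustrative placeholders; the helper calls mirror
# the diffs below.
import torch
from transformers import AutoModelForCausalLM

from llmcompressor import oneshot
from llmcompressor.modifiers.quantization import QuantizationModifier
from llmcompressor.transformers.sparsification.compressed_tensors_utils import (
    get_model_compressor,
)

model = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Meta-Llama-3-8B-Instruct", torch_dtype=torch.bfloat16
)

# stage 1: oneshot quantization
oneshot(
    model=model,
    recipe=QuantizationModifier(targets="Linear", scheme="W4A16"),
)

# saving compresses the model in memory before serialization,
# rather than materializing a second, compressed state dict
model.save_pretrained("stage1_output", save_compressed=True)

# the in-memory model is now compressed; decompress it before
# running another stage (training or oneshot)
compressor = get_model_compressor(model=model, save_compressed=True)
if compressor is not None:
    compressor.decompress_model(model)
```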
1 parent b3e728a commit 421bd61

File tree: 6 files changed, +54 −45 lines

examples/quantization_2of4_sparse_w4a16/llama7b_sparse_w4a16.py

Lines changed: 0 additions & 2 deletions
```diff
@@ -68,15 +68,13 @@
     model=model,
     **oneshot_kwargs,
     stage="sparsity_stage",
-    output_dir=output_dir,
 )

 # Sparse finetune
 finetune_applied_model = train(
     model=oneshot_applied_model,
     **oneshot_kwargs,
     **training_kwargs,
-    output_dir=output_dir,
     stage="finetuning_stage",
 )
```

src/llmcompressor/pytorch/model_load/helpers.py

Lines changed: 14 additions & 0 deletions
```diff
@@ -41,6 +41,10 @@ def save_checkpoint(
     :param save_safetensors: save model checkpoint using safetensors file type
     :param save_compressed: save model checkpoint using compressed-tensors format
     """
+    from llmcompressor.transformers.sparsification.compressed_tensors_utils import (
+        get_model_compressor,  # avoid circular import
+    )
+
     # saving the model also saves the recipe
     model.save_pretrained(
         save_path,
@@ -51,6 +55,16 @@
     if processor is not None:
         processor.save_pretrained(save_path)

+    # saving the model modifies the model structure
+    # as this is only a checkpoint, decompress model to enable future training/oneshot
+    compressor = get_model_compressor(
+        model=model,
+        save_compressed=save_compressed,
+        skip_sparsity_compression_stats=skip_sparsity_compression_stats,
+    )
+    if compressor is not None:
+        compressor.decompress_model(model)
+

 def fallback_to_cpu(device: str) -> str:
     """
```

src/llmcompressor/transformers/sparsification/compressed_tensors_utils.py

Lines changed: 15 additions & 36 deletions
```diff
@@ -2,7 +2,7 @@
 import re
 import weakref
 from functools import wraps
-from typing import Dict, Optional
+from typing import Optional

 import torch
 import transformers
@@ -91,45 +91,27 @@ def save_pretrained_wrapper(
         # https://github.com/huggingface/transformers/pull/30488
         transformers.modeling_utils.dtype_byte_size = new_dtype_byte_size

-        # state_dict gets passed in as a kwarg for FSDP models
-        state_dict = kwargs.pop("state_dict", None)
-        if state_dict is None:
-            logger.info("Fetching state_dict - this may take some time")
-            state_dict = get_state_dict_offloaded_model(model)
-
-        logger.info("Fetching compressor")
+        # compress model using compressor
         compressor = get_model_compressor(
             model=model,
             sparsity_config=sparsity_config,
             quantization_format=quantization_format,
             save_compressed=save_compressed,
             skip_sparsity_compression_stats=skip_sparsity_compression_stats,
-            state_dict=state_dict,
             disable_sparse_compression=disable_sparse_compression,
         )
+        if compressor is not None:
+            compressor.compress_model(model)
+
+        # save (compressed) model structure
+        original_save_pretrained.__get__(model, model_class)(
+            save_directory,
+            safe_serialization=safe_serialization,
+            **kwargs,
+        )

-        if compressor is None:
-            # model is not compressed or quantized, save as normal
-            original_save_pretrained_func = original_save_pretrained.__get__(
-                model, model_class
-            )
-            original_save_pretrained_func(
-                save_directory, state_dict=state_dict, **kwargs
-            )
-            return
-
-        # make sure we're on the main process when saving
-        if state_dict is not None and len(state_dict) > 0:
-            compressed_state_dict = compressor.compress(
-                model, state_dict, show_progress=True
-            )
-            logger.info("Saving compressed model to disk")
-            original_save_pretrained.__get__(model, model_class)(
-                save_directory,
-                state_dict=compressed_state_dict,
-                safe_serialization=safe_serialization,
-                **kwargs,
-            )
+        # update config to reflect compression
+        if compressor is not None:
             compressor.update_config(save_directory)

         # update existing recipe
@@ -197,7 +179,6 @@ def get_model_compressor(
     quantization_format: Optional[str] = None,
     save_compressed: bool = True,
     skip_sparsity_compression_stats: bool = True,
-    state_dict: Optional[Dict] = None,
     disable_sparse_compression: bool = False,
 ):
     """
@@ -211,12 +192,8 @@
     :param save_compressed: boolean representing to save in a compressed
         format
     :param skip_sparsity_compression_stats: bool allowing compression stats on std out
-    :param state_dict: state_dict of the model
     :param disable_sparse_compression: bool to skip sparse compression
     """
-    # find offloaded state dict if none is provided
-    if state_dict is None:
-        state_dict = get_state_dict_offloaded_model(model)

     if sparsity_config is None:
         """
@@ -244,6 +221,8 @@
         )
         sparsity_config = None
     else:
+        state_dict = get_state_dict_offloaded_model(model)
+
         sparsity_config = SparsityConfigMetadata.from_pretrained(
             model,
             state_dict=state_dict,
```
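Unrolled, the new save path in `save_pretrained_wrapper` amounts to the sketch below. This is illustrative only: the real wrapper invokes the original, unwrapped `save_pretrained` via `original_save_pretrained.__get__`, whereas this standalone helper calls the public method.

```python
from llmcompressor.transformers.sparsification.compressed_tensors_utils import (
    get_model_compressor,
)


def save_compressed_sketch(model, save_directory: str) -> None:
    """Illustrative unrolling of the patched save path (not the wrapper itself)."""
    compressor = get_model_compressor(model=model, save_compressed=True)

    if compressor is not None:
        # compress weights in place, module by module, instead of building a
        # second, fully compressed state dict next to the original weights
        compressor.compress_model(model)

    # serialize model structure and weights, whether or not they were compressed
    model.save_pretrained(save_directory)

    if compressor is not None:
        # record the compression format in the saved config
        compressor.update_config(save_directory)
```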

tests/llmcompressor/transformers/finetune/test_oneshot_and_finetune.py

Lines changed: 12 additions & 5 deletions
```diff
@@ -7,6 +7,9 @@
 from parameterized import parameterized_class
 from transformers import AutoConfig

+from llmcompressor.transformers.sparsification.compressed_tensors_utils import (
+    get_model_compressor,
+)
 from tests.testing_utils import parse_params, requires_gpu

 CONFIGS_DIRECTORY = "tests/llmcompressor/transformers/finetune/finetune_oneshot_configs"
@@ -34,17 +37,21 @@ def _test_oneshot_and_finetune(self):
             output_dir=self.output,
         )

-        train_args = dict(
-            num_train_epochs=self.num_train_epochs,
-            precision="bfloat16",
-            bf16=True,
-        )
         oneshot_model = oneshot(
             model=self.model,
             **oneshot_args,
             stage="test_oneshot_stage",
         )

+        compressor = get_model_compressor(model=oneshot_model, save_compressed=True)
+        if compressor is not None:
+            compressor.decompress_model(oneshot_model)
+
+        train_args = dict(
+            num_train_epochs=self.num_train_epochs,
+            precision="bfloat16",
+            bf16=True,
+        )
         train(
             model=oneshot_model,
             **oneshot_args,
```

tests/llmcompressor/transformers/finetune/test_oneshot_and_finetune_with_tokenizer.py

Lines changed: 1 addition & 1 deletion
```diff
@@ -55,7 +55,6 @@ def test_oneshot_and_finetune_with_tokenizer(self):
             concatenate_data=concatenate_data,
             splits=splits,
             tokenizer=tokenizer,
-            output_dir=self.output,
         )

         oneshot_model = oneshot(
@@ -70,6 +69,7 @@
             max_steps=max_steps,
             stage="test_train_stage",
             **model_and_data_kwargs,
+            output_dir=self.output,
         )

         input_ids = tokenizer("Hello my name is", return_tensors="pt").input_ids.to(
```

tests/llmcompressor/transformers/obcq/test_obcq_completion.py

Lines changed: 12 additions & 1 deletion
```diff
@@ -35,7 +35,7 @@ def labeled_dataloader(self, dataset_name, model_name):
         dataset_manager = TextGenerationDataset.load_from_registry(
             dataset_args.dataset,
             dataset_args=dataset_args,
-            split="train",
+            split=f"train[:{self.num_samples}]",
             processor=tokenizer,
         )
         calib_dataset = dataset_manager()
@@ -51,10 +51,14 @@ def _test_oneshot_completion(self, model_name: str = None):
         from llmcompressor import oneshot
         from llmcompressor.pytorch.model_load.helpers import get_session_model
         from llmcompressor.pytorch.utils import tensors_to_device
+        from llmcompressor.transformers.sparsification.compressed_tensors_utils import (
+            get_model_compressor,  # avoid circular import
+        )

         oneshot(
             model=self.model,
             dataset=self.dataset,
+            splits={"calibration": f"train[:{self.num_samples}]"},
             oneshot_device=self.device,
             recipe=self.recipe,
             max_seq_length=512,
@@ -65,6 +69,13 @@
         )

         first_tiny_model = get_session_model()
+        compressor = get_model_compressor(
+            model=first_tiny_model,
+            save_compressed=True,
+            skip_sparsity_compression_stats=False,
+        )
+        if compressor is not None:
+            compressor.decompress_model(first_tiny_model)

         dataset = "open_platypus"
```