Commit 50bb656
[Pipelines] infer model device with optional override (#1572)
## Purpose ##
* Fix support for DeepSeek-V2.5
* Add more robust inference of model devices when calibrating

## Prerequisites ##
* neuralmagic/compressed-tensors#363

## Background ##
Normally, starting model inputs on the CPU is not an issue for the sequential pipeline, since the pipeline offloads models, and offloaded models automatically place inputs on the proper devices. However, the DeepSeek-V2.5 model is an exception: this model [performs an add operation](https://huggingface.co/deepseek-ai/DeepSeek-V2.5/blob/main/modeling_deepseek.py#L886) between a module output (`attn_weights`) and a model input (`attention_mask`) before the model input has a chance to be placed on the proper device.

## Changes ##
* Use `model_device` when deciding the onload device for model inputs

## Testing ##
* Ran the DeepSeek-V2.5 example to completion
* TODO: run nightly to confirm other models work with the new input device placement

---------

Signed-off-by: Kyle Sayers <kylesayrs@gmail.com>
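To make the background concrete, here is a minimal sketch (not from this repo; the shapes and device are illustrative) of the failure mode and the fix: when a module output already lives on the accelerator but a raw model input is still on the CPU, the add raises a device-mismatch error, while onloading the input to the model's execution device first avoids it.

```python
import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# module output produced on the model's execution device
attn_weights = torch.randn(1, 8, 16, 16, device=device)

# raw model input still sitting on the CPU
attention_mask = torch.zeros(1, 1, 16, 16)

# attn_weights + attention_mask  # RuntimeError on CUDA: cuda vs cpu tensors

# the fix: place inputs on the model's execution device before the forward pass
attention_mask = attention_mask.to(device)
out = attn_weights + attention_mask  # both tensors now share a device
```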
1 parent 6800f81 commit 50bb656

File tree

4 files changed (+9 lines, -5 lines)

src/llmcompressor/args/dataset_arguments.py (1 addition, 0 deletions)

@@ -171,6 +171,7 @@ class DatasetArguments(CustomDatasetArguments):
             "will execute code present on the Hub on your local machine."
         },
     )
+    # --- pipeline arguments --- #
     pipeline: Optional[str] = field(
         default="independent",
         metadata={

src/llmcompressor/pipelines/layer_sequential/helpers.py (2 additions, 1 deletion)

@@ -44,6 +44,7 @@ def capture_first_layer_intermediates(
     model: Module,
     first_layer: Module,
     dataloader: DataLoader,
+    model_device: torch.device = torch.device("cpu"),
     mask_padding: bool = True,
 ) -> IntermediatesCache:
     """
@@ -68,7 +69,7 @@ def capture_first_layer_intermediates(
     desc = "Preparing intermediates cache"
     for batch_index, batch in enumerate(tqdm.tqdm(dataloader, desc=desc)):
         batch = apply_pad_mask_to_batch(batch) if mask_padding else batch
-        batch = tensors_to_device(batch, torch.device("cpu"))
+        batch = tensors_to_device(batch, model_device)

         try:
             model(**batch)
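For reference, `tensors_to_device` moves every tensor in a batch onto the target device. A minimal sketch of the behavior the loop above relies on (a simplified stand-in assuming a flat dict batch; the real utility handles nested structures as well):

```python
from typing import Any, Dict

import torch


def tensors_to_device_sketch(
    batch: Dict[str, Any], device: torch.device
) -> Dict[str, Any]:
    # Move tensor values to the target device; leave everything else untouched.
    return {
        key: value.to(device) if isinstance(value, torch.Tensor) else value
        for key, value in batch.items()
    }


batch = {"input_ids": torch.tensor([[1, 2, 3]]), "num_samples": 1}
batch = tensors_to_device_sketch(batch, torch.device("cpu"))
```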

src/llmcompressor/pipelines/layer_sequential/pipeline.py (3 additions, 2 deletions)

@@ -2,7 +2,7 @@

 import torch
 import tqdm
-from compressed_tensors.utils import disable_offloading
+from compressed_tensors.utils import disable_offloading, get_execution_device
 from torch.utils.data.dataloader import DataLoader

 from llmcompressor.core import LifecycleCallbacks, active_session
@@ -60,6 +60,7 @@ def __call__(

         # prepare model for sequential onloading
         dispatch_for_sequential(model)
+        model_device = get_execution_device(model)

         # find layers
         modifiers = session.get_modifiers()
@@ -71,7 +72,7 @@ def __call__(
         with calibration_forward_context(model), DisableQuantization(model):
             # prepare intermediates cache
             intermediates: IntermediatesCache = capture_first_layer_intermediates(
-                model, layers[0], dataloader
+                model, layers[0], dataloader, model_device
             )

             num_layers = len(layers)
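`get_execution_device` comes from compressed-tensors (extended in the prerequisite PR above). Conceptually it answers "where does this model actually run?", which can differ from where parameters are stored when modules are offloaded. A simplified stand-in for intuition only (the real helper also inspects accelerate offload hooks):

```python
import torch
from torch import nn


def execution_device_sketch(model: nn.Module) -> torch.device:
    # Naive inference: use the device of the first parameter found,
    # falling back to the CPU for parameterless models.
    for param in model.parameters():
        return param.device
    return torch.device("cpu")


model = nn.Linear(8, 8)
print(execution_device_sketch(model))  # cpu for a freshly built module
```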

src/llmcompressor/pipelines/sequential/pipeline.py (3 additions, 2 deletions)

@@ -1,7 +1,7 @@
 from typing import TYPE_CHECKING

 import torch
-from compressed_tensors.utils import disable_offloading
+from compressed_tensors.utils import disable_offloading, get_execution_device
 from torch.utils.data.dataloader import DataLoader
 from tqdm import tqdm

@@ -54,6 +54,7 @@ def __call__(

         # prepare model for sequential onloading
         dispatch_for_sequential(model)
+        model_device = get_execution_device(model)

         # prepare to trace subgraphs
         modifiers = session.get_modifiers()
@@ -69,7 +70,7 @@ def __call__(

         with calibration_forward_context(model), DisableQuantization(model):
             # prepare intermediates cache
-            activations = IntermediatesCache.from_dataloader(dataloader)
+            activations = IntermediatesCache.from_dataloader(dataloader, model_device)

             for subgraph_index, subgraph in enumerate(subgraphs):
                 # prepare tqdm description texts
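Here `IntermediatesCache.from_dataloader` additionally receives the model device, so cached inputs can be handed to the model on the right device. A toy stand-in showing the idea (heavily simplified and hypothetical in its details; the real cache in llmcompressor also offloads values between uses and handles non-tensor entries):

```python
from typing import Dict, Iterable, List

import torch


class ToyIntermediatesCache:
    def __init__(
        self,
        batches: List[Dict[str, torch.Tensor]],
        model_device: torch.device,
    ):
        self.batches = batches
        self.model_device = model_device

    @classmethod
    def from_dataloader(
        cls, dataloader: Iterable, model_device: torch.device = torch.device("cpu")
    ) -> "ToyIntermediatesCache":
        # Snapshot each batch so every subgraph can replay the same inputs.
        return cls([dict(batch) for batch in dataloader], model_device)

    def fetch(self, batch_index: int) -> Dict[str, torch.Tensor]:
        # Onload cached tensors to the model device at fetch time.
        return {
            key: value.to(self.model_device)
            for key, value in self.batches[batch_index].items()
        }
```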
