
Commit 941deac

Merge remote-tracking branch 'origin' into kylesayrs/deepseek-v3

2 parents: 0dc2381 + 50bb656

File tree

4 files changed: 9 additions & 5 deletions


src/llmcompressor/args/dataset_arguments.py

Lines changed: 1 addition & 0 deletions

@@ -171,6 +171,7 @@ class DatasetArguments(CustomDatasetArguments):
             "will execute code present on the Hub on your local machine."
         },
     )
+    # --- pipeline arguments --- #
     pipeline: Optional[str] = field(
         default="independent",
         metadata={
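The field being sectioned off here follows the standard `dataclasses.field(default=..., metadata=...)` pattern used by the args module. A minimal self-contained sketch of that pattern (the class name and help text below are illustrative, not the library's):

```python
from dataclasses import dataclass, field
from typing import Optional


@dataclass
class PipelineArgsSketch:  # illustrative stand-in for DatasetArguments
    # --- pipeline arguments --- #
    pipeline: Optional[str] = field(
        default="independent",
        metadata={"help": "illustrative help text for the pipeline choice"},
    )


args = PipelineArgsSketch()
print(args.pipeline)  # -> "independent"
```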

src/llmcompressor/pipelines/layer_sequential/helpers.py

Lines changed: 2 additions & 1 deletion

@@ -44,6 +44,7 @@ def capture_first_layer_intermediates(
     model: Module,
     first_layer: Module,
     dataloader: DataLoader,
+    model_device: torch.device = torch.device("cpu"),
     mask_padding: bool = True,
 ) -> IntermediatesCache:
     """
@@ -68,7 +69,7 @@ def capture_first_layer_intermediates(
     desc = "Preparing intermediates cache"
     for batch_index, batch in enumerate(tqdm.tqdm(dataloader, desc=desc)):
         batch = apply_pad_mask_to_batch(batch) if mask_padding else batch
-        batch = tensors_to_device(batch, torch.device("cpu"))
+        batch = tensors_to_device(batch, model_device)
 
         try:
             model(**batch)
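The behavioral change: calibration batches are now moved to a caller-supplied `model_device` (defaulting to CPU for backward compatibility) instead of being pinned to CPU unconditionally. A minimal sketch of the `tensors_to_device` semantics this relies on (the helper below is a hypothetical stand-in, not the library function):

```python
import torch


def tensors_to_device_sketch(batch: dict, device: torch.device) -> dict:
    # Move every tensor value in the batch to `device`; pass through the rest.
    return {
        key: val.to(device) if isinstance(val, torch.Tensor) else val
        for key, val in batch.items()
    }


batch = {"input_ids": torch.zeros(1, 8, dtype=torch.long), "meta": "kept-as-is"}
moved = tensors_to_device_sketch(batch, torch.device("cpu"))  # or the model's device
print(moved["input_ids"].device)  # -> cpu
```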

src/llmcompressor/pipelines/layer_sequential/pipeline.py

Lines changed: 3 additions & 2 deletions

@@ -2,7 +2,7 @@
 
 import torch
 import tqdm
-from compressed_tensors.utils import disable_offloading
+from compressed_tensors.utils import disable_offloading, get_execution_device
 from torch.utils.data.dataloader import DataLoader
 
 from llmcompressor.core import LifecycleCallbacks, active_session
@@ -60,6 +60,7 @@ def __call__(
 
     # prepare model for sequential onloading
     dispatch_for_sequential(model)
+    model_device = get_execution_device(model)
 
     # find layers
     modifiers = session.get_modifiers()
@@ -71,7 +72,7 @@ def __call__(
     with calibration_forward_context(model), DisableQuantization(model):
         # prepare intermediates cache
         intermediates: IntermediatesCache = capture_first_layer_intermediates(
-            model, layers[0], dataloader
+            model, layers[0], dataloader, model_device
         )
 
     num_layers = len(layers)
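`get_execution_device` comes from `compressed_tensors.utils` and is queried once, right after `dispatch_for_sequential`, so the result reflects the dispatched placement. A rough stand-in for what the pipeline relies on (the function below is a simplified sketch, not the real implementation, which also handles accelerate-offloaded modules):

```python
import torch
from torch.nn import Module


def get_execution_device_sketch(model: Module) -> torch.device:
    # Simplified: report the device of the first parameter,
    # falling back to CPU for a parameter-less model.
    for param in model.parameters():
        return param.device
    return torch.device("cpu")


model = torch.nn.Linear(4, 4)
model_device = get_execution_device_sketch(model)  # cpu here; e.g. cuda:0 after dispatch
```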

src/llmcompressor/pipelines/sequential/pipeline.py

Lines changed: 3 additions & 2 deletions

@@ -1,7 +1,7 @@
 from typing import TYPE_CHECKING
 
 import torch
-from compressed_tensors.utils import disable_offloading
+from compressed_tensors.utils import disable_offloading, get_execution_device
 from torch.utils.data.dataloader import DataLoader
 from tqdm import tqdm
 
@@ -54,6 +54,7 @@ def __call__(
 
     # prepare model for sequential onloading
     dispatch_for_sequential(model)
+    model_device = get_execution_device(model)
 
     # prepare to trace subgraphs
     modifiers = session.get_modifiers()
@@ -69,7 +70,7 @@ def __call__(
 
     with calibration_forward_context(model), DisableQuantization(model):
         # prepare intermediates cache
-        activations = IntermediatesCache.from_dataloader(dataloader)
+        activations = IntermediatesCache.from_dataloader(dataloader, model_device)
 
     for subgraph_index, subgraph in enumerate(subgraphs):
         # prepare tqdm description texts
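`IntermediatesCache.from_dataloader` now receives the model's execution device as well, so cached batches can live on that device rather than defaulting to CPU. A minimal illustrative cache under that assumption (class and loader below are hypothetical, shown only to fix the calling pattern):

```python
import torch
from torch.utils.data import DataLoader, TensorDataset


class IntermediatesCacheSketch:
    # Hypothetical stand-in: eagerly moves each batch to `device` at build time.
    def __init__(self, batches):
        self.batches = batches

    @classmethod
    def from_dataloader(cls, dataloader: DataLoader, device: torch.device):
        return cls([tuple(t.to(device) for t in batch) for batch in dataloader])


loader = DataLoader(TensorDataset(torch.randn(8, 4)), batch_size=2)
cache = IntermediatesCacheSketch.from_dataloader(loader, torch.device("cpu"))
print(len(cache.batches))  # -> 4
```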
