
Commit f6010ce

[Bugfix] Fix multigpu dispatch_for_generation (#1567)
## Purpose ##

* Fix `test_oneshot_and_finetune_with_tokenizer.py` when running with multiple CUDA devices.
* This test had two failures. The first was introduced when sequential onloading added `dispatch_for_generation`, which is also used to dispatch for training; that function did not account for no-split modules when computing the device map.
* The second failure is a pre-existing issue where `HFTrainer.compute_loss` does not account for multi-GPU models. It will be fixed in the next transformers release by huggingface/transformers#38029.

## Changes ##

* Pass no-split modules when computing a device map for generation (and training).
* Load the model on CPU in the test (since this is now the default flow as of sequential onloading landing).

## Testing ##

* Ran `test_oneshot_and_finetune_with_tokenizer` with two GPUs to completion (with upstream transformers).

---------

Signed-off-by: Kyle Sayers <kylesayrs@gmail.com>
1 parent 16ada3a commit f6010ce
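For context, a minimal sketch of the flow this commit assumes (the model name is a placeholder, not taken from the test): the model is loaded on CPU without a `device_map`, then `dispatch_for_generation` builds a balanced multi-GPU device map just before generation or training needs it.

```python
from transformers import AutoModelForCausalLM

from llmcompressor.utils.dev import dispatch_for_generation

# Load on CPU (no device_map); with sequential onloading this is the default flow.
# "Qwen/Qwen2.5-0.5B" is only an illustrative model id.
model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen2.5-0.5B", torch_dtype="auto")

# Dispatch across all visible GPUs with a balanced device map that keeps each
# no-split module (e.g. a decoder layer) on a single device.
model = dispatch_for_generation(model)
```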

File tree

3 files changed (+11 / -4 lines)

* src/llmcompressor/transformers/finetune/session_mixin.py
* src/llmcompressor/utils/dev.py
* tests/llmcompressor/transformers/finetune/test_oneshot_and_finetune_with_tokenizer.py

src/llmcompressor/transformers/finetune/session_mixin.py

Lines changed: 2 additions & 1 deletion
```diff
@@ -270,7 +270,7 @@ def compute_loss(
         model: Module,
         inputs: Dict[str, Any],
         return_outputs: bool = False,
-        num_items_in_batch: Optional[int] = None,
+        num_items_in_batch: Optional[torch.Tensor] = None,
     ) -> Union[torch.Tensor, Tuple[torch.Tensor, Any]]:
         """
         Override for the compute_loss to factor trigger callbacks and filter columns
@@ -279,6 +279,7 @@ def compute_loss(
         :param inputs: the inputs to pass through the model for calculating the loss
         :param return_outputs: True to return the outputs with the loss,
             False otherwise
+        :param num_items_in_batch: the number of items which contribute to loss
         :return: the resulting loss if not return_outputs, otherwise a tuple
             containing the loss and the model's outputs
         """
```

src/llmcompressor/utils/dev.py

Lines changed: 8 additions & 2 deletions
```diff
@@ -126,11 +126,17 @@ def dispatch_for_generation(model: PreTrainedModel) -> PreTrainedModel:
     """
     remove_dispatch(model)
 
+    no_split_module_classes = model._get_no_split_modules("auto")
     max_memory = get_balanced_memory(
         model,
         dtype=model.dtype,
-        no_split_module_classes=model._get_no_split_modules("auto"),
+        no_split_module_classes=no_split_module_classes,
+    )
+    device_map = infer_auto_device_map(
+        model,
+        dtype=model.dtype,
+        max_memory=max_memory,
+        no_split_module_classes=no_split_module_classes,
     )
-    device_map = infer_auto_device_map(model, dtype=model.dtype, max_memory=max_memory)
 
     return dispatch_model(model, device_map=device_map)
```
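As an illustration of why `no_split_module_classes` must also be passed to `infer_auto_device_map` (the map below is hypothetical, for a two-GPU setup): each no-split block, such as a decoder layer, should land on exactly one device; without the argument, a layer's submodules can be spread across devices.

```python
# Hypothetical device map produced with no_split_module_classes supplied:
# every decoder layer is assigned to a single GPU, never split across two.
device_map = {
    "model.embed_tokens": 0,
    "model.layers.0": 0,
    "model.layers.1": 1,
    "model.norm": 1,
    "lm_head": 1,
}
```

`dispatch_model` then places each submodule on its assigned device and adds the hooks that move activations between devices during the forward pass.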

tests/llmcompressor/transformers/finetune/test_oneshot_and_finetune_with_tokenizer.py

Lines changed: 1 addition & 1 deletion
```diff
@@ -36,7 +36,7 @@ def test_oneshot_and_finetune_with_tokenizer(self):
             self.model,
         )
         model_loaded = AutoModelForCausalLM.from_pretrained(
-            self.model, device_map="cuda:0", torch_dtype="auto"
+            self.model, torch_dtype="auto"
         )
 
         dataset_loaded = load_dataset(
```
