
Commit c4b6365

kylesayrs and dsikka authored
[Tests] Start oneshot tests on CPU (#1555)
## Purpose ##

* Speed up tests by reducing device movement

## Background ##

As of #1263, the model is dispatched to different device maps depending on which pipelines are used. If the model starts on anything but the CPU, these dispatches and undispatches create device movement. Starting on the CPU ensures that no device movement occurs when offloaded dispatches happen.

Signed-off-by: Kyle Sayers <kylesayrs@gmail.com>
Co-authored-by: Dipika Sikka <dipikasikka1@gmail.com>
1 parent 3262d85 commit c4b6365
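The change is mechanical: each touched test drops `device_map="auto"` from its `from_pretrained` call so the model loads on the CPU. A minimal before/after sketch of the pattern, using one of the checkpoints from the diffs below:

```python
from transformers import AutoModelForCausalLM

# Before: device_map="auto" eagerly places weights on any available
# accelerator, which the calibration pipeline may then immediately
# redispatch, causing avoidable device movement.
# model = AutoModelForCausalLM.from_pretrained(
#     "nm-testing/llama2.c-stories15M", device_map="auto", torch_dtype="auto"
# )

# After: with device_map omitted, the model starts on the CPU, so offloaded
# dispatches performed later by the pipelines move no weights up front.
model = AutoModelForCausalLM.from_pretrained(
    "nm-testing/llama2.c-stories15M", torch_dtype="auto"
)
```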

File tree

3 files changed: +3 -10 lines changed


tests/llmcompressor/recipe/test_recipe_parsing.py

Lines changed: 0 additions & 1 deletion
@@ -18,7 +18,6 @@ def setup_model_and_config(tmp_path):
     """
     model = AutoModelForCausalLM.from_pretrained(
         "Xenova/llama2.c-stories110M",
-        device_map="auto",
         torch_dtype="auto",
     )

tests/llmcompressor/transformers/finetune/test_oneshot_then_finetune.py

Lines changed: 3 additions & 7 deletions
@@ -21,7 +21,7 @@ def setUp(self):
     def test_oneshot_sparsification_then_finetune(self):
         recipe_str = "tests/llmcompressor/transformers/obcq/recipes/test_tiny2.yaml"
         model = AutoModelForCausalLM.from_pretrained(
-            "nm-testing/llama2.c-stories15M", device_map="auto", torch_dtype="auto"
+            "nm-testing/llama2.c-stories15M", torch_dtype="auto"
         )
         dataset = "open_platypus"
         concatenate_data = False
@@ -47,12 +47,11 @@ def test_oneshot_sparsification_then_finetune(self):
         # Explictly decompress the model for training using quantization_config
         model = AutoModelForCausalLM.from_pretrained(
             self.output / "oneshot_out",
-            device_map="auto",
             torch_dtype="auto",
             quantization_config=self.quantization_config,
         )
         distill_teacher = AutoModelForCausalLM.from_pretrained(
-            "nm-testing/llama2.c-stories15M", device_map="auto", torch_dtype="auto"
+            "nm-testing/llama2.c-stories15M", torch_dtype="auto"
         )
         dataset = "open_platypus"
         concatenate_data = False
@@ -88,7 +87,6 @@ def test_oneshot_sparsification_then_finetune(self):
         # Explictly decompress the model for training using quantization_config
         model = AutoModelForCausalLM.from_pretrained(
             output_dir,
-            device_map="auto",
             torch_dtype="auto",
             quantization_config=self.quantization_config,
         )
@@ -112,7 +110,7 @@ def test_oneshot_quantization_then_finetune(self):
         )

         model = AutoModelForCausalLM.from_pretrained(
-            "TinyLlama/TinyLlama-1.1B-Chat-v1.0", device_map="auto", torch_dtype="auto"
+            "TinyLlama/TinyLlama-1.1B-Chat-v1.0", torch_dtype="auto"
         )
         dataset = "open_platypus"
         concatenate_data = False
@@ -136,7 +134,6 @@ def test_oneshot_quantization_then_finetune(self):
         quantization_config = CompressedTensorsConfig(run_compressed=False)
         model = AutoModelForCausalLM.from_pretrained(
             output_dir,
-            device_map="auto",
             torch_dtype="auto",
             quantization_config=quantization_config,
         )
@@ -159,7 +156,6 @@ def test_oneshot_quantization_then_finetune(self):
         # test reloading checkpoint and final model
         model = AutoModelForCausalLM.from_pretrained(
             output_dir,
-            device_map="auto",
             torch_dtype="auto",
             quantization_config=quantization_config,
         )

tests/llmcompressor/transformers/obcq/test_consecutive_runs.py

Lines changed: 0 additions & 2 deletions
@@ -44,7 +44,6 @@ def _test_consecutive_runs(

         first_model = AutoModelForCausalLM.from_pretrained(
             self.output_first,
-            device_map="auto",
             torch_dtype="auto",
             quantization_config=self.quantization_config,
         )
@@ -74,7 +73,6 @@ def _test_consecutive_runs(
         second_model = AutoModelForCausalLM.from_pretrained(
             self.output_second,
             quantization_config=self.quantization_config,
-            device_map="auto",
             torch_dtype="auto",
         )

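The pattern is identical across all three files: every `from_pretrained` call in the touched tests now omits `device_map`. A quick check, hypothetical and not part of this commit, that loading without `device_map` leaves every parameter on the CPU:

```python
import torch
from transformers import AutoModelForCausalLM

# transformers places weights on the CPU when no device_map is given, so the
# oneshot tests now begin with an entirely CPU-resident model.
model = AutoModelForCausalLM.from_pretrained(
    "nm-testing/llama2.c-stories15M", torch_dtype="auto"
)
assert all(p.device.type == "cpu" for p in model.parameters())
```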