
[Performance] Parallelize modifier compression #1558


Draft: wants to merge 41 commits into base: main
Commits (41)
1aea4dd
wip: alignment context
kylesayrs Jun 3, 2025
6705bf4
touchups based on remaining steps
brian-dellabetta Jun 5, 2025
cf1f87d
implement oneshot_device, pipeline warnings
kylesayrs Jun 6, 2025
97c8d30
simplify example
kylesayrs Jun 6, 2025
ecfe15d
move offloading outside of preprocess, which is shared with train
kylesayrs Jun 6, 2025
6f86244
cleanup
kylesayrs Jun 6, 2025
929f678
update examples, remove offload devicemap utils
kylesayrs Jun 6, 2025
0348243
Merge remote-tracking branch 'origin' into kylesayrs/sequential-onloa…
kylesayrs Jun 6, 2025
a275f53
update examples to load before generating
kylesayrs Jun 6, 2025
9d6c227
remove hooks
kylesayrs Jun 10, 2025
fab6fe1
Merge remote-tracking branch 'origin' into kylesayrs/sequential-onloa…
kylesayrs Jun 10, 2025
6fdcdb1
Merge remote-tracking branch 'origin' into kylesayrs/sequential-onloa…
kylesayrs Jun 10, 2025
8351ac9
name change
kylesayrs Jun 10, 2025
ad71c5b
cleanup and nits
kylesayrs Jun 12, 2025
819df1c
rename function
kylesayrs Jun 12, 2025
6d942cc
Merge remote-tracking branch 'origin' into kylesayrs/sequential-onloa…
kylesayrs Jun 12, 2025
7dd71b9
add dispatch utility
kylesayrs Jun 12, 2025
8ba0f2c
apply style
kylesayrs Jun 12, 2025
fbf2a6d
update examples
kylesayrs Jun 13, 2025
91b349b
update examples 2
kylesayrs Jun 13, 2025
8e58e35
remove fallback_to_cpu, use ct utils
kylesayrs Jun 13, 2025
96631d1
remove hook from module within utils function
kylesayrs Jun 15, 2025
96476fe
remove unused util
kylesayrs Jun 15, 2025
2d87993
Merge remote-tracking branch 'origin' into kylesayrs/sequential-onloa…
kylesayrs Jun 16, 2025
cb965c9
docstring
kylesayrs Jun 16, 2025
8769b85
remove big model example tests
kylesayrs Jun 16, 2025
a389d14
big modeling example readme
kylesayrs Jun 16, 2025
b336fa2
deprecate sequential_targets on modifiers
kylesayrs Jun 16, 2025
34ef394
update examples
kylesayrs Jun 16, 2025
58fe929
fix deprecation warning
kylesayrs Jun 16, 2025
54ef06a
fix layer sequential pipeline
kylesayrs Jun 16, 2025
4bb86e5
remove unused import
kylesayrs Jun 16, 2025
b2367ce
dispatch in pipelines
kylesayrs Jun 16, 2025
06bb661
add train dispatch
kylesayrs Jun 16, 2025
a64a777
use remove_dispatch
kylesayrs Jun 16, 2025
8f71004
fix example
kylesayrs Jun 16, 2025
7d7b00d
remove device arg from e2e
kylesayrs Jun 16, 2025
501056e
simplify pipeline inference logic, add comment
kylesayrs Jun 16, 2025
74aa7c9
update examples imports
kylesayrs Jun 16, 2025
e4487e2
fix call
kylesayrs Jun 16, 2025
f134e56
wip: run compression in parallel
kylesayrs Jun 16, 2025
6 changes: 1 addition & 5 deletions examples/awq/README.md
@@ -18,11 +18,7 @@ recipe = [
 To use your own model, start with an existing example and change the `model_id` to match your own model stub.
 ```python
 model_id = "path/to/your/model"
-model = AutoModelForCausalLM.from_pretrained(
-    model_id,
-    device_map="auto",
-    torch_dtype="auto",
-)
+model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype="auto")
 ```

 ## Adding Mappings ##
4 changes: 1 addition & 3 deletions examples/awq/llama_example.py
@@ -7,9 +7,7 @@
 # Select model and load it.
 MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct"

-model = AutoModelForCausalLM.from_pretrained(
-    MODEL_ID, device_map="auto", torch_dtype="auto"
-)
+model = AutoModelForCausalLM.from_pretrained(MODEL_ID, torch_dtype="auto")
 tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)

 # Select calibration dataset.
6 changes: 3 additions & 3 deletions examples/awq/qwen3_moe_example.py
@@ -3,13 +3,12 @@

 from llmcompressor import oneshot
 from llmcompressor.modifiers.awq import AWQModifier
+from llmcompressor.utils import dispatch_for_generation

 # Select model and load it.
 MODEL_ID = "Qwen/Qwen3-30B-A3B"

-model = AutoModelForCausalLM.from_pretrained(
-    MODEL_ID, device_map="auto", torch_dtype="auto"
-)
+model = AutoModelForCausalLM.from_pretrained(MODEL_ID, torch_dtype="auto")
 tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)

 # Select calibration dataset.
@@ -71,6 +70,7 @@ def tokenize(sample):
 # Confirm generations of the quantized model look sane.
 print("\n\n")
 print("========== SAMPLE GENERATION ==============")
+dispatch_for_generation(model)
 input_ids = tokenizer("Hello my name is", return_tensors="pt").input_ids.to("cuda")
 output = model.generate(input_ids, max_new_tokens=100)
 print(tokenizer.decode(output[0]))
95 changes: 0 additions & 95 deletions examples/big_models_with_accelerate/README.md

This file was deleted.

26 changes: 0 additions & 26 deletions examples/big_models_with_accelerate/cpu_offloading_fp8.py

This file was deleted.

81 changes: 0 additions & 81 deletions examples/big_models_with_accelerate/mult_gpus_int8_device_map.py

This file was deleted.

78 changes: 0 additions & 78 deletions examples/big_models_with_accelerate/multi_gpu_int8.py

This file was deleted.

12 changes: 12 additions & 0 deletions examples/big_models_with_sequential_onloading/README.md
@@ -0,0 +1,12 @@
## Big Modeling with Sequential Onloading ##
### What is Sequential Onloading? ###
Sequential onloading is a memory-efficient approach for compressing large language models (LLMs) using only a single GPU. Instead of loading the entire model into memory—which can easily require hundreds of gigabytes—this method loads and compresses one layer at a time. The outputs are offloaded before the next layer is processed, dramatically reducing peak memory usage while maintaining high compression fidelity.

<p align="center">
<img src="assets/sequential_onloading.png"/>
</p>

For more information, see the [RedHat AI blog post](https://developers.redhat.com/articles/2025/05/09/llm-compressor-optimize-llms-low-latency-deployments#generalizing_to_multimodal_and_moe_architectures) or the [LLM Compressor Office Hours Recording](https://www.youtube.com/watch?v=GrhuqQDmBk8).
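
To make the idea concrete, here is a minimal, hypothetical sketch of layer-by-layer onloading. This is not LLM Compressor's actual pipeline; `compress_sequentially` and `compress_layer` are stand-ins for whatever per-layer calibration and compression is applied.

```python
# Conceptual sketch only -- not LLM Compressor's implementation.
import torch


def compress_sequentially(layers, calibration_inputs, compress_layer):
    """Compress a model one layer at a time, keeping only one layer on the GPU."""
    activations = calibration_inputs
    for layer in layers:
        layer.to("cuda")  # onload a single layer
        compress_layer(layer, activations)  # calibrate and compress it in isolation
        with torch.no_grad():
            # cache this layer's outputs as inputs for the next layer
            activations = [layer(x) for x in activations]
        layer.to("cpu")  # offload before the next layer is processed
        torch.cuda.empty_cache()
```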

### Using Sequential Onloading ###
Sequential onloading is enabled by default within LLM Compressor. To disable sequential onloading, add the `pipeline="basic"` argument to the LLM Compressor `oneshot` function call.
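
As a rough illustration of that toggle (the model, dataset, and recipe below are borrowed from other LLM Compressor examples and are not part of this change), a `oneshot` call with the sequential pipeline disabled might look like:

```python
from llmcompressor import oneshot
from llmcompressor.modifiers.quantization import GPTQModifier

# Illustrative values; substitute your own model, dataset, and recipe.
oneshot(
    model="meta-llama/Meta-Llama-3-8B-Instruct",
    dataset="open_platypus",
    recipe=GPTQModifier(targets="Linear", scheme="W4A16", ignore=["lm_head"]),
    pipeline="basic",  # disable the default sequential onloading pipeline
)
```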
6 changes: 1 addition & 5 deletions examples/multimodal_audio/README.md
@@ -21,11 +21,7 @@ This directory contains example scripts for quantizing a variety of audio langua
 To use your own multimodal model, start with an existing example and change the `model_id` to match your own model stub.
 ```python3
 model_id = "path/to/your/model"
-model = AutoModelForCausalLM.from_pretrained(
-    model_id,
-    device_map="auto",
-    torch_dtype="auto",
-)
+model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype="auto")
 ```

 ## Customizing GPTQModifier Parameters ##
9 changes: 3 additions & 6 deletions examples/multimodal_audio/whisper_example.py
@@ -4,15 +4,12 @@

 from llmcompressor import oneshot
 from llmcompressor.modifiers.quantization import GPTQModifier
+from llmcompressor.utils import dispatch_for_generation

 # Select model and load it.
 MODEL_ID = "openai/whisper-large-v3"

-model = WhisperForConditionalGeneration.from_pretrained(
-    MODEL_ID,
-    device_map="auto",
-    torch_dtype="auto",
-)
+model = WhisperForConditionalGeneration.from_pretrained(MODEL_ID, torch_dtype="auto")
 model.config.forced_decoder_ids = None
 processor = WhisperProcessor.from_pretrained(MODEL_ID)

@@ -91,13 +88,13 @@ def data_collator(batch):
 # Confirm generations of the quantized model look sane.
 print("\n\n")
 print("========== SAMPLE GENERATION ==============")
+dispatch_for_generation(model)
 sample_features = next(iter(ds))["input_features"]
 sample_decoder_ids = [processor.tokenizer.prefix_tokens]
 sample_input = {
     "input_features": torch.tensor(sample_features).to(model.device),
     "decoder_input_ids": torch.tensor(sample_decoder_ids).to(model.device),
 }
-
 output = model.generate(**sample_input, language="en")
 print(processor.batch_decode(output, skip_special_tokens=True))
 print("==========================================\n\n")