diff --git a/examples/awq/README.md b/examples/awq/README.md
index 0a837d6f3..fd4cb4b62 100644
--- a/examples/awq/README.md
+++ b/examples/awq/README.md
@@ -18,11 +18,7 @@ recipe = [
To use your own model, start with an existing example and change the `model_id` to match your own model stub.
```python
model_id = "path/to/your/model"
-model = AutoModelForCausalLM.from_pretrained(
- model_id,
- device_map="auto",
- torch_dtype="auto",
-)
+model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype="auto")
```
## Adding Mappings ##
diff --git a/examples/awq/llama_example.py b/examples/awq/llama_example.py
index 024e08cd7..b8b45c4f4 100644
--- a/examples/awq/llama_example.py
+++ b/examples/awq/llama_example.py
@@ -7,9 +7,7 @@
# Select model and load it.
MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct"
-model = AutoModelForCausalLM.from_pretrained(
- MODEL_ID, device_map="auto", torch_dtype="auto"
-)
+model = AutoModelForCausalLM.from_pretrained(MODEL_ID, torch_dtype="auto")
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
# Select calibration dataset.
diff --git a/examples/awq/qwen3_moe_example.py b/examples/awq/qwen3_moe_example.py
index 56bba8fe3..5fdc231c9 100644
--- a/examples/awq/qwen3_moe_example.py
+++ b/examples/awq/qwen3_moe_example.py
@@ -3,13 +3,12 @@
from llmcompressor import oneshot
from llmcompressor.modifiers.awq import AWQModifier
+from llmcompressor.utils import dispatch_for_generation
# Select model and load it.
MODEL_ID = "Qwen/Qwen3-30B-A3B"
-model = AutoModelForCausalLM.from_pretrained(
- MODEL_ID, device_map="auto", torch_dtype="auto"
-)
+model = AutoModelForCausalLM.from_pretrained(MODEL_ID, torch_dtype="auto")
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
# Select calibration dataset.
@@ -71,6 +70,7 @@ def tokenize(sample):
# Confirm generations of the quantized model look sane.
print("\n\n")
print("========== SAMPLE GENERATION ==============")
+dispatch_for_generation(model)
input_ids = tokenizer("Hello my name is", return_tensors="pt").input_ids.to("cuda")
output = model.generate(input_ids, max_new_tokens=100)
print(tokenizer.decode(output[0]))
diff --git a/examples/big_models_with_accelerate/README.md b/examples/big_models_with_accelerate/README.md
deleted file mode 100644
index 801f46a2f..000000000
--- a/examples/big_models_with_accelerate/README.md
+++ /dev/null
@@ -1,95 +0,0 @@
-# Quantizing Big Models with HF Accelerate
-
-`llmcompressor` integrates with `accelerate` to support quantizing large models such as Llama 70B and 405B, or quantizing any model with limited GPU resources.
-
-## Overview
-
-[`accelerate`]((https://huggingface.co/docs/accelerate/en/index)) is a highly useful library in the Hugging Face ecosystem that supports for working with large models, including:
-- Offloading parameters to CPU
-- Sharding models across multiple GPUs with pipeline-parallelism
-
-
-### Using `device_map`
-
-To enable `accelerate` features with `llmcompressor`, simple insert `device_map` in `from_pretrained` during model load.
-
-```python
-from transformers import AutoModelForCausalLM
-MODEL_ID = "meta-llama/Meta-Llama-3-70B-Instruct"
-
-# device_map="auto" triggers usage of accelerate
-# if > 1 GPU, the model will be sharded across the GPUs
-# if not enough GPU memory to fit the model, parameters are offloaded to the CPU
-model = AutoModelForCausalLM.from_pretrained(
- MODEL_ID, device_map="auto", torch_dtype="auto")
-```
-
-`llmcompressor` is designed to respect the `device_map`, so calls to `oneshot`
-will work properly out of the box for basic quantization with `QuantizationModifier`,
-even for CPU offloaded models.
-
-To enable CPU offloading for second-order quantization methods such as GPTQ, we need to
-allocate additional memory upfront when computing the device map. Not doing so risks
-potentially going out-of-memory.
-
-```python
-from llmcompressor.transformers.compression.helpers import calculate_offload_device_map
-from transformers import AutoModelForCausalLM
-MODEL_ID = "meta-llama/Meta-Llama-3-70B-Instruct"
-
-# Load model, reserving memory in the device map for sequential GPTQ (adjust num_gpus as needed)
-device_map = calculate_offload_device_map(MODEL_ID, reserve_for_hessians=True, num_gpus=1)
-model = AutoModelForCausalLM.from_pretrained(
- MODEL_ID,
- device_map=device_map,
- torch_dtype="auto",
-)
-```
-
-### Practical Advice
-
-When working with `accelerate`, it is important to keep in mind that CPU offloading and naive pipeline-parallelism will slow down forward passes through the model. As a result, we need to take care to ensure that the quantization methods used fit well with the offloading scheme as methods that require many forward passes though the model will be slowed down. If more gpu memory is not available, consider reducing the precision of the loaded model to a lower-width dtype such as `torch.bfloat16`.
-
-## Examples
-
-We will show working examples for each use case:
-- **CPU Offloading**: Quantize `Llama-70B` to `FP8` using `PTQ` with a single GPU
-- **Multi-GPU**: Quantize `Llama-70B` to `INT8` using `GPTQ` and `SmoothQuant` with 2 GPUs
-
-### Installation
-
-Install `llmcompressor`:
-
-```bash
-pip install llmcompressor
-```
-
-### CPU Offloading: `FP8` Quantization with `PTQ`
-
-CPU offloading is slow. As a result, we recommend using this feature only with data-free quantization methods. For example, when quantizing a model to `fp8`, we typically use simple `PTQ` to statically quantize the weights and use dynamic quantization for the activations. These methods do not require calibration data.
-
-- `cpu_offloading_fp8.py` demonstrates quantizing the weights and activations of `Llama-70B` to `fp8` on a single GPU:
-
-```bash
-export CUDA_VISIBLE_DEVICES=0
-python cpu_offloading_fp8.py
-```
-
-The resulting model `./Meta-Llama-3-70B-Instruct-FP8-Dynamic` is ready to run with `vllm`!
-
-### Multi-GPU: `INT8` Quantization with `GPTQ`
-
-For quantization methods that require calibration data (e.g. `GPTQ`), CPU offloading is too slow. For these methods, `llmcompressor` can use `accelerate` multi-GPU to quantize models that are larger than a single GPU. For example, when quantizing a model to `int8`, we typically use `GPTQ` to statically quantize the weights, which requires calibration data.
-
-- `multi_gpu_int8.py` demonstrates quantizing the weights and activations of `Llama-70B` to `int8` on 2 A100s:
-
-```python
-export CUDA_VISIBLE_DEVICES=0,1
-python multi_gpu_int8.py
-```
-
-The resulting model `./Meta-Llama-3-70B-Instruct-INT8-Dynamic` is quantized and ready to run with `vllm`!
-
-## Questions or Feature Request?
-
-Please open up an issue on `vllm-project/llm-compressor`
diff --git a/examples/big_models_with_accelerate/cpu_offloading_fp8.py b/examples/big_models_with_accelerate/cpu_offloading_fp8.py
deleted file mode 100644
index ded5ff8d6..000000000
--- a/examples/big_models_with_accelerate/cpu_offloading_fp8.py
+++ /dev/null
@@ -1,26 +0,0 @@
-from transformers import AutoModelForCausalLM, AutoTokenizer
-
-from llmcompressor import oneshot
-from llmcompressor.modifiers.quantization import QuantizationModifier
-
-MODEL_ID = "meta-llama/Meta-Llama-3-70B-Instruct"
-OUTPUT_DIR = MODEL_ID.rstrip("/").split("/")[-1] + "-FP8-Dynamic"
-
-# Load model
-# Note: device_map="auto" will offload to CPU if not enough space on GPU.
-model = AutoModelForCausalLM.from_pretrained(
- MODEL_ID, device_map="auto", torch_dtype="auto", trust_remote_code=True
-)
-
-# Configure the quantization scheme and algorithm (PTQ + FP8_DYNAMIC).
-recipe = QuantizationModifier(
- targets="Linear", scheme="FP8_DYNAMIC", ignore=["lm_head"]
-)
-
-# Apply quantization and save in `compressed-tensors` format.
-oneshot(
- model=model,
- recipe=recipe,
- tokenizer=AutoTokenizer.from_pretrained(MODEL_ID),
- output_dir=OUTPUT_DIR,
-)
diff --git a/examples/big_models_with_accelerate/mult_gpus_int8_device_map.py b/examples/big_models_with_accelerate/mult_gpus_int8_device_map.py
deleted file mode 100644
index d98051d21..000000000
--- a/examples/big_models_with_accelerate/mult_gpus_int8_device_map.py
+++ /dev/null
@@ -1,81 +0,0 @@
-import torch
-from datasets import load_dataset
-from transformers import AutoModelForCausalLM, AutoTokenizer
-
-from llmcompressor import oneshot
-from llmcompressor.modifiers.quantization import GPTQModifier
-from llmcompressor.modifiers.smoothquant import SmoothQuantModifier
-from llmcompressor.transformers.compression.helpers import calculate_offload_device_map
-
-MODEL_ID = "meta-llama/Meta-Llama-3-70B-Instruct"
-
-# adjust based off number of desired GPUs
-# reserve_for_hessians=True reserves memory which is required by
-# GPTQModifier and SparseGPTModifier
-device_map = calculate_offload_device_map(
- MODEL_ID, num_gpus=1, reserve_for_hessians=True, torch_dtype=torch.bfloat16
-)
-
-model = AutoModelForCausalLM.from_pretrained(
- MODEL_ID, device_map=device_map, torch_dtype=torch.bfloat16
-)
-tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
-
-# Select calibration dataset.
-DATASET_ID = "HuggingFaceH4/ultrachat_200k"
-DATASET_SPLIT = "train_sft"
-NUM_CALIBRATION_SAMPLES = 512
-MAX_SEQUENCE_LENGTH = 2048
-
-
-# Load dataset and preprocess.
-ds = load_dataset(DATASET_ID, split=f"{DATASET_SPLIT}[:{NUM_CALIBRATION_SAMPLES}]")
-ds = ds.shuffle(seed=42)
-
-
-def preprocess(example):
- return {
- "text": tokenizer.apply_chat_template(
- example["messages"],
- tokenize=False,
- )
- }
-
-
-ds = ds.map(preprocess)
-
-
-# Tokenize inputs.
-def tokenize(sample):
- return tokenizer(
- sample["text"],
- padding=False,
- max_length=MAX_SEQUENCE_LENGTH,
- truncation=True,
- add_special_tokens=False,
- )
-
-
-ds = ds.map(tokenize, remove_columns=ds.column_names)
-
-# define a llmcompressor recipe for W8A8 quantization
-recipe = [
- SmoothQuantModifier(smoothing_strength=0.8),
- GPTQModifier(
- targets="Linear",
- scheme="W8A8",
- ignore=["lm_head"],
- ),
-]
-
-SAVE_DIR = MODEL_ID.rstrip("/").split("/")[-1] + "-INT8"
-
-oneshot(
- model=model,
- dataset=ds,
- recipe=recipe,
- max_seq_length=MAX_SEQUENCE_LENGTH,
- num_calibration_samples=NUM_CALIBRATION_SAMPLES,
- save_compressed=True,
- output_dir=SAVE_DIR,
-)
diff --git a/examples/big_models_with_accelerate/multi_gpu_int8.py b/examples/big_models_with_accelerate/multi_gpu_int8.py
deleted file mode 100644
index 9c1679eab..000000000
--- a/examples/big_models_with_accelerate/multi_gpu_int8.py
+++ /dev/null
@@ -1,78 +0,0 @@
-from datasets import load_dataset
-from transformers import AutoModelForCausalLM, AutoTokenizer
-
-from llmcompressor import oneshot
-from llmcompressor.modifiers.quantization import GPTQModifier
-
-MODEL_ID = "meta-llama/Meta-Llama-3-70B-Instruct"
-SAVE_DIR = MODEL_ID.rstrip("/").split("/")[-1] + "-W8A8-Dynamic"
-
-# 1) Load model (device_map="auto" with shard the model over multiple GPUs!).
-model = AutoModelForCausalLM.from_pretrained(
- MODEL_ID,
- device_map="auto",
- torch_dtype="auto",
- trust_remote_code=True,
-)
-tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
-
-# 2) Prepare calibration dataset (in this case, we use ultrachat).
-DATASET_ID = "HuggingFaceH4/ultrachat_200k"
-DATASET_SPLIT = "train_sft"
-
-# Select number of samples. 512 samples is a good place to start.
-# Increasing the number of samples can improve accuracy.
-NUM_CALIBRATION_SAMPLES = 512
-MAX_SEQUENCE_LENGTH = 1024
-
-# Load dataset and preprocess.
-ds = load_dataset(DATASET_ID, split=f"{DATASET_SPLIT}[:{NUM_CALIBRATION_SAMPLES}]")
-ds = ds.shuffle(seed=42)
-
-
-def preprocess(example):
- return {
- "text": tokenizer.apply_chat_template(
- example["messages"],
- tokenize=False,
- )
- }
-
-
-ds = ds.map(preprocess)
-
-
-# Tokenize inputs.
-def tokenize(sample):
- return tokenizer(
- sample["text"],
- padding=False,
- max_length=MAX_SEQUENCE_LENGTH,
- truncation=True,
- add_special_tokens=False,
- )
-
-
-ds = ds.map(tokenize, remove_columns=ds.column_names)
-
-# 3) Configure algorithms. In this case, we:
-# * quantize the weights to int8 with GPTQ (static per channel)
-# * quantize the activations to int8 (dynamic per token)
-recipe = [
- GPTQModifier(
- targets="Linear", scheme="W8A8", ignore=["lm_head"], dampening_frac=0.1
- ),
-]
-
-# 4) Apply algorithms and save in `compressed-tensors` format.
-# if you encounter GPU out-of-memory issues, consider using an explicit
-# device map (see multi_gpus_int8_device_map.py)
-oneshot(
- model=model,
- tokenizer=tokenizer,
- dataset=ds,
- recipe=recipe,
- max_seq_length=MAX_SEQUENCE_LENGTH,
- num_calibration_samples=NUM_CALIBRATION_SAMPLES,
- output_dir=SAVE_DIR,
-)
diff --git a/examples/big_models_with_sequential_onloading/README.md b/examples/big_models_with_sequential_onloading/README.md
new file mode 100644
index 000000000..f10e1e394
--- /dev/null
+++ b/examples/big_models_with_sequential_onloading/README.md
@@ -0,0 +1,12 @@
+## Big Modeling with Sequential Onloading ##
+### What is Sequential Onloading? ###
+Sequential onloading is a memory-efficient approach for compressing large language models (LLMs) using only a single GPU. Instead of loading the entire model into memory—which can easily require hundreds of gigabytes—this method loads and compresses one layer at a time. The outputs are offloaded before the next layer is processed, dramatically reducing peak memory usage while maintaining high compression fidelity.
+
+![Sequential Onloading](assets/sequential_onloading.png)
+
+For more information, see the [RedHat AI blog post](https://developers.redhat.com/articles/2025/05/09/llm-compressor-optimize-llms-low-latency-deployments#generalizing_to_multimodal_and_moe_architectures) or the [LLM Compressor Office Hours Recording](https://www.youtube.com/watch?v=GrhuqQDmBk8).
+
+### Using Sequential Onloading ###
+Sequential onloading is enabled by default in LLM Compressor. To disable it, add the `pipeline="basic"` argument to the `oneshot` function call.
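+
+For example, a minimal data-free FP8 quantization run with sequential onloading disabled might look like the sketch below (the model and recipe here are illustrative, not tied to a specific script in this directory):
+
+```python
+from transformers import AutoModelForCausalLM, AutoTokenizer
+
+from llmcompressor import oneshot
+from llmcompressor.modifiers.quantization import QuantizationModifier
+
+MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct"
+model = AutoModelForCausalLM.from_pretrained(MODEL_ID, torch_dtype="auto")
+tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
+
+# Data-free FP8 recipe: weights are quantized statically, activations dynamically.
+recipe = QuantizationModifier(targets="Linear", scheme="FP8_DYNAMIC", ignore=["lm_head"])
+
+# pipeline="basic" disables the default sequential onloading pipeline, so the
+# model is processed in a single pass rather than one layer at a time.
+oneshot(model=model, recipe=recipe, tokenizer=tokenizer, pipeline="basic")
+```
+
+The default sequential pipeline is the memory-efficient choice; `pipeline="basic"` processes the whole model at once and therefore needs enough memory to hold it.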
diff --git a/examples/big_models_with_sequential_onloading/assets/sequential_onloading.png b/examples/big_models_with_sequential_onloading/assets/sequential_onloading.png
new file mode 100644
index 000000000..a499cb66a
Binary files /dev/null and b/examples/big_models_with_sequential_onloading/assets/sequential_onloading.png differ
diff --git a/examples/multimodal_audio/README.md b/examples/multimodal_audio/README.md
index d3d0631f9..e7ecca950 100644
--- a/examples/multimodal_audio/README.md
+++ b/examples/multimodal_audio/README.md
@@ -21,11 +21,7 @@ This directory contains example scripts for quantizing a variety of audio langua
To use your own multimodal model, start with an existing example and change the `model_id` to match your own model stub.
```python3
model_id = "path/to/your/model"
-model = AutoModelForCausalLM.from_pretrained(
- model_id,
- device_map="auto",
- torch_dtype="auto",
-)
+model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype="auto")
```
## Customizing GPTQModifier Parameters ##
diff --git a/examples/multimodal_audio/whisper_example.py b/examples/multimodal_audio/whisper_example.py
index 5aad2ed96..9c2e494a8 100644
--- a/examples/multimodal_audio/whisper_example.py
+++ b/examples/multimodal_audio/whisper_example.py
@@ -4,15 +4,12 @@
from llmcompressor import oneshot
from llmcompressor.modifiers.quantization import GPTQModifier
+from llmcompressor.utils import dispatch_for_generation
# Select model and load it.
MODEL_ID = "openai/whisper-large-v3"
-model = WhisperForConditionalGeneration.from_pretrained(
- MODEL_ID,
- device_map="auto",
- torch_dtype="auto",
-)
+model = WhisperForConditionalGeneration.from_pretrained(MODEL_ID, torch_dtype="auto")
model.config.forced_decoder_ids = None
processor = WhisperProcessor.from_pretrained(MODEL_ID)
@@ -91,13 +88,13 @@ def data_collator(batch):
# Confirm generations of the quantized model look sane.
print("\n\n")
print("========== SAMPLE GENERATION ==============")
+dispatch_for_generation(model)
sample_features = next(iter(ds))["input_features"]
sample_decoder_ids = [processor.tokenizer.prefix_tokens]
sample_input = {
"input_features": torch.tensor(sample_features).to(model.device),
"decoder_input_ids": torch.tensor(sample_decoder_ids).to(model.device),
}
-
output = model.generate(**sample_input, language="en")
print(processor.batch_decode(output, skip_special_tokens=True))
print("==========================================\n\n")
diff --git a/examples/multimodal_vision/README.md b/examples/multimodal_vision/README.md
index 9d6d12295..c0d0808b4 100644
--- a/examples/multimodal_vision/README.md
+++ b/examples/multimodal_vision/README.md
@@ -25,11 +25,7 @@ This directory contains example scripts for quantizing a variety of vision-langu
To use your own multimodal model, start with an existing example and change the `model_id` to match your own model stub.
```python3
model_id = "path/to/your/model"
-model = AutoModelForCausalLM.from_pretrained(
- model_id,
- device_map="auto",
- torch_dtype="auto",
-)
+model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype="auto")
```
## Customizing GPTQModifier Parameters ##
diff --git a/examples/multimodal_vision/gemma3_example.py b/examples/multimodal_vision/gemma3_example.py
index 0ac78069b..5437ba36c 100644
--- a/examples/multimodal_vision/gemma3_example.py
+++ b/examples/multimodal_vision/gemma3_example.py
@@ -5,12 +5,11 @@
from llmcompressor import oneshot
from llmcompressor.modifiers.quantization import GPTQModifier
+from llmcompressor.utils import dispatch_for_generation
# Load model.
model_id = "google/gemma-3-4b-it"
-model = Gemma3ForConditionalGeneration.from_pretrained(
- model_id, device_map="auto", torch_dtype="auto"
-)
+model = Gemma3ForConditionalGeneration.from_pretrained(model_id, torch_dtype="auto")
processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=True)
# Oneshot arguments
@@ -54,6 +53,7 @@ def data_collator(batch):
# Confirm generations of the quantized model look sane.
print("========== SAMPLE GENERATION ==============")
+dispatch_for_generation(model)
messages = [
{
"role": "user",
diff --git a/examples/multimodal_vision/idefics3_example.py b/examples/multimodal_vision/idefics3_example.py
index a73213b21..1225349c4 100644
--- a/examples/multimodal_vision/idefics3_example.py
+++ b/examples/multimodal_vision/idefics3_example.py
@@ -6,12 +6,11 @@
from llmcompressor import oneshot
from llmcompressor.modifiers.quantization import GPTQModifier
+from llmcompressor.utils import dispatch_for_generation
# Load model.
model_id = "HuggingFaceM4/Idefics3-8B-Llama3" # or "HuggingFaceTB/SmolVLM-Instruct"
-model = Idefics3ForConditionalGeneration.from_pretrained(
- model_id, device_map="auto", torch_dtype="auto"
-)
+model = Idefics3ForConditionalGeneration.from_pretrained(model_id, torch_dtype="auto")
processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=True)
# Oneshot arguments
@@ -32,7 +31,6 @@ def data_collator(batch):
GPTQModifier(
targets="Linear",
scheme="W4A16",
- sequential_targets=["LlamaDecoderLayer"],
ignore=["re:.*lm_head", "re:model.vision_model.*", "re:model.connector.*"],
),
]
@@ -92,10 +90,12 @@ def tokenize(sample):
num_calibration_samples=NUM_CALIBRATION_SAMPLES,
trust_remote_code_model=True,
data_collator=data_collator,
+ sequential_targets=["LlamaDecoderLayer"],
)
# Confirm generations of the quantized model look sane.
print("========== SAMPLE GENERATION ==============")
+dispatch_for_generation(model)
messages = [
{
"role": "user",
diff --git a/examples/multimodal_vision/llava_example.py b/examples/multimodal_vision/llava_example.py
index d611ff110..0a17d8c50 100644
--- a/examples/multimodal_vision/llava_example.py
+++ b/examples/multimodal_vision/llava_example.py
@@ -5,12 +5,11 @@
from llmcompressor import oneshot
from llmcompressor.modifiers.quantization import GPTQModifier
+from llmcompressor.utils import dispatch_for_generation
# Load model.
model_id = "llava-hf/llava-1.5-7b-hf"
-model = LlavaForConditionalGeneration.from_pretrained(
- model_id, device_map="auto", torch_dtype="auto"
-)
+model = LlavaForConditionalGeneration.from_pretrained(model_id, torch_dtype="auto")
processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=True)
# Oneshot arguments
@@ -31,7 +30,6 @@ def data_collator(batch):
GPTQModifier(
targets="Linear",
scheme="W4A16",
- sequential_targets=["LlamaDecoderLayer"],
ignore=["re:.*lm_head", "re:vision_tower.*", "re:multi_modal_projector.*"],
),
]
@@ -47,10 +45,12 @@ def data_collator(batch):
num_calibration_samples=NUM_CALIBRATION_SAMPLES,
trust_remote_code_model=True,
data_collator=data_collator,
+ sequential_targets=["LlamaDecoderLayer"],
)
# Confirm generations of the quantized model look sane.
print("========== SAMPLE GENERATION ==============")
+dispatch_for_generation(model)
messages = [
{
"role": "user",
diff --git a/examples/multimodal_vision/mistral3_example.py b/examples/multimodal_vision/mistral3_example.py
index 331ec0d95..e70ee43ec 100644
--- a/examples/multimodal_vision/mistral3_example.py
+++ b/examples/multimodal_vision/mistral3_example.py
@@ -8,12 +8,11 @@
from llmcompressor import oneshot
from llmcompressor.modifiers.quantization import GPTQModifier
+from llmcompressor.utils import dispatch_for_generation
# Load model.
model_id = "mistralai/Mistral-Small-3.1-24B-Instruct-2503"
-model = Mistral3ForConditionalGeneration.from_pretrained(
- model_id, device_map="auto", torch_dtype="auto"
-)
+model = Mistral3ForConditionalGeneration.from_pretrained(model_id, torch_dtype="auto")
processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=True)
# Use a custom calibration chat template, rather than the overly-verbose default
@@ -44,7 +43,6 @@ def data_collator(batch):
GPTQModifier(
targets="Linear",
scheme="W4A16",
- sequential_targets=["MistralDecoderLayer"],
ignore=["re:.*lm_head", "re:vision_tower.*", "re:multi_modal_projector.*"],
),
]
@@ -60,10 +58,12 @@ def data_collator(batch):
num_calibration_samples=NUM_CALIBRATION_SAMPLES,
trust_remote_code_model=True,
data_collator=data_collator,
+ sequential_targets=["MistralDecoderLayer"],
)
# Confirm generations of the quantized model look sane.
print("========== SAMPLE GENERATION ==============")
+dispatch_for_generation(model)
messages = [
{
"role": "user",
diff --git a/examples/multimodal_vision/mllama_example.py b/examples/multimodal_vision/mllama_example.py
index e4f71c64e..6672aff2e 100644
--- a/examples/multimodal_vision/mllama_example.py
+++ b/examples/multimodal_vision/mllama_example.py
@@ -5,12 +5,11 @@
from llmcompressor import oneshot
from llmcompressor.modifiers.quantization import GPTQModifier
+from llmcompressor.utils import dispatch_for_generation
# Load model.
model_id = "meta-llama/Llama-3.2-11B-Vision-Instruct"
-model = MllamaForConditionalGeneration.from_pretrained(
- model_id, device_map="auto", torch_dtype="auto"
-)
+model = MllamaForConditionalGeneration.from_pretrained(model_id, torch_dtype="auto")
processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=True)
# Oneshot arguments
@@ -31,7 +30,6 @@ def data_collator(batch):
GPTQModifier(
targets="Linear",
scheme="W4A16",
- sequential_targets=["MllamaSelfAttentionDecoderLayer"],
ignore=["re:.*lm_head", "re:multi_modal_projector.*", "re:vision_model.*"],
),
]
@@ -47,10 +45,12 @@ def data_collator(batch):
num_calibration_samples=NUM_CALIBRATION_SAMPLES,
trust_remote_code_model=True,
data_collator=data_collator,
+ sequential_targets=["MllamaSelfAttentionDecoderLayer"],
)
# Confirm generations of the quantized model look sane.
print("========== SAMPLE GENERATION ==============")
+dispatch_for_generation(model)
messages = [
{
"role": "user",
diff --git a/examples/multimodal_vision/phi3_vision_example.py b/examples/multimodal_vision/phi3_vision_example.py
index d6615fc86..fa4b0feab 100644
--- a/examples/multimodal_vision/phi3_vision_example.py
+++ b/examples/multimodal_vision/phi3_vision_example.py
@@ -7,12 +7,12 @@
from llmcompressor import oneshot
from llmcompressor.modifiers.quantization import GPTQModifier
+from llmcompressor.utils import dispatch_for_generation
# Load model.
model_id = "microsoft/Phi-3-vision-128k-instruct"
model = AutoModelForCausalLM.from_pretrained(
model_id,
- device_map="auto",
torch_dtype="auto",
trust_remote_code=True,
_attn_implementation="eager",
@@ -75,7 +75,6 @@ def data_collator(batch):
recipe = GPTQModifier(
targets="Linear",
scheme="W4A16",
- sequential_targets=["Phi3DecoderLayer"],
ignore=["lm_head", "re:model.vision_embed_tokens.*"],
)
@@ -88,10 +87,12 @@ def data_collator(batch):
num_calibration_samples=NUM_CALIBRATION_SAMPLES,
trust_remote_code_model=True,
data_collator=data_collator,
+ sequential_targets=["Phi3DecoderLayer"],
)
# Confirm generations of the quantized model look sane.
print("========== SAMPLE GENERATION ==============")
+dispatch_for_generation(model)
input_ids = processor(text="Hello my name is", return_tensors="pt").input_ids.to("cuda")
output = model.generate(input_ids, max_new_tokens=20)
print(processor.decode(output[0]))
diff --git a/examples/multimodal_vision/pixtral_example.py b/examples/multimodal_vision/pixtral_example.py
index a62b223a5..a0ed50ef4 100644
--- a/examples/multimodal_vision/pixtral_example.py
+++ b/examples/multimodal_vision/pixtral_example.py
@@ -5,12 +5,11 @@
from llmcompressor import oneshot
from llmcompressor.modifiers.quantization import GPTQModifier
+from llmcompressor.utils import dispatch_for_generation
# Load model.
model_id = "mgoin/pixtral-12b"
-model = LlavaForConditionalGeneration.from_pretrained(
- model_id, device_map="auto", torch_dtype="auto"
-)
+model = LlavaForConditionalGeneration.from_pretrained(model_id, torch_dtype="auto")
processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=True)
# Oneshot arguments
@@ -37,7 +36,6 @@ def data_collator(batch):
GPTQModifier(
targets="Linear",
scheme="W4A16",
- sequential_targets=["MistralDecoderLayer"],
ignore=["re:.*lm_head", "re:vision_tower.*", "re:multi_modal_projector.*"],
),
]
@@ -53,10 +51,12 @@ def data_collator(batch):
num_calibration_samples=NUM_CALIBRATION_SAMPLES,
trust_remote_code_model=True,
data_collator=data_collator,
+ sequential_targets=["MistralDecoderLayer"],
)
# Confirm generations of the quantized model look sane.
print("========== SAMPLE GENERATION ==============")
+dispatch_for_generation(model)
messages = [
{
"role": "user",
diff --git a/examples/multimodal_vision/qwen2_vl_example.py b/examples/multimodal_vision/qwen2_vl_example.py
index 14686d6ab..8cccf768e 100644
--- a/examples/multimodal_vision/qwen2_vl_example.py
+++ b/examples/multimodal_vision/qwen2_vl_example.py
@@ -8,14 +8,11 @@
from llmcompressor import oneshot
from llmcompressor.modifiers.quantization import GPTQModifier
+from llmcompressor.utils import dispatch_for_generation
# Load model.
model_id = "Qwen/Qwen2-VL-2B-Instruct"
-model = Qwen2VLForConditionalGeneration.from_pretrained(
- model_id,
- device_map="auto",
- torch_dtype="auto",
-)
+model = Qwen2VLForConditionalGeneration.from_pretrained(model_id, torch_dtype="auto")
processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=True)
# Oneshot arguments
@@ -82,7 +79,6 @@ def data_collator(batch):
GPTQModifier(
targets="Linear",
scheme="W4A16",
- sequential_targets=["Qwen2VLDecoderLayer"],
ignore=["lm_head", "re:visual.*", "re:model.visual.*"],
),
]
@@ -97,10 +93,12 @@ def data_collator(batch):
num_calibration_samples=NUM_CALIBRATION_SAMPLES,
trust_remote_code_model=True,
data_collator=data_collator,
+ sequential_targets=["Qwen2VLDecoderLayer"],
)
# Confirm generations of the quantized model look sane.
print("========== SAMPLE GENERATION ==============")
+dispatch_for_generation(model)
messages = [
{
"role": "user",
diff --git a/examples/multimodal_vision/qwen_2_5_vl_example.py b/examples/multimodal_vision/qwen_2_5_vl_example.py
index d5aacfe6a..10a0edeec 100644
--- a/examples/multimodal_vision/qwen_2_5_vl_example.py
+++ b/examples/multimodal_vision/qwen_2_5_vl_example.py
@@ -8,14 +8,11 @@
from llmcompressor.modifiers.quantization import GPTQModifier
from llmcompressor.transformers import oneshot
+from llmcompressor.utils import dispatch_for_generation
# Load model.
model_id = "Qwen/Qwen2.5-VL-7B-Instruct"
-model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
- model_id,
- device_map="auto",
- torch_dtype="auto",
-)
+model = Qwen2_5_VLForConditionalGeneration.from_pretrained(model_id, torch_dtype="auto")
processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=True)
# Oneshot arguments
@@ -76,7 +73,6 @@ def data_collator(batch):
GPTQModifier(
targets="Linear",
scheme="W4A16",
- sequential_targets=["Qwen2_5_VLDecoderLayer"],
ignore=["lm_head", "re:visual.*", "re:model.visual.*"],
),
]
@@ -91,10 +87,12 @@ def data_collator(batch):
num_calibration_samples=NUM_CALIBRATION_SAMPLES,
trust_remote_code_model=True,
data_collator=data_collator,
+ sequential_targets=["Qwen2_5_VLDecoderLayer"],
)
# Confirm generations of the quantized model look sane.
print("========== SAMPLE GENERATION ==============")
+dispatch_for_generation(model)
messages = [
{
"role": "user",
diff --git a/examples/quantization_2of4_sparse_w4a16/README.md b/examples/quantization_2of4_sparse_w4a16/README.md
index c011ecf1d..51e04dd98 100644
--- a/examples/quantization_2of4_sparse_w4a16/README.md
+++ b/examples/quantization_2of4_sparse_w4a16/README.md
@@ -49,9 +49,7 @@ import torch
from transformers import AutoModelForCausalLM
model_stub = "neuralmagic/Llama-2-7b-ultrachat200k"
-model = AutoModelForCausalLM.from_pretrained(
- model_stub, torch_dtype=torch.bfloat16, device_map="auto"
-)
+model = AutoModelForCausalLM.from_pretrained(model_stub, torch_dtype=torch.bfloat16)
dataset = "ultrachat-200k"
splits = {"calibration": "train_gen[:5%]", "train": "train_gen"}
diff --git a/examples/quantization_2of4_sparse_w4a16/llama7b_sparse_w4a16.py b/examples/quantization_2of4_sparse_w4a16/llama7b_sparse_w4a16.py
index 419ddc267..0b83d7384 100644
--- a/examples/quantization_2of4_sparse_w4a16/llama7b_sparse_w4a16.py
+++ b/examples/quantization_2of4_sparse_w4a16/llama7b_sparse_w4a16.py
@@ -3,12 +3,11 @@
from transformers import AutoModelForCausalLM, AutoTokenizer
from llmcompressor import oneshot, train
+from llmcompressor.utils import dispatch_for_generation
# load the model in as bfloat16 to save on memory and compute
model_stub = "neuralmagic/Llama-2-7b-ultrachat200k"
-model = AutoModelForCausalLM.from_pretrained(
- model_stub, torch_dtype=torch.bfloat16, device_map="auto"
-)
+model = AutoModelForCausalLM.from_pretrained(model_stub, torch_dtype=torch.bfloat16)
tokenizer = AutoTokenizer.from_pretrained(model_stub)
# uses LLM Compressor's built-in preprocessing for ultra chat
@@ -71,6 +70,7 @@
)
# Sparse finetune
+dispatch_for_generation(model)
finetune_applied_model = train(
model=oneshot_applied_model,
**oneshot_kwargs,
@@ -79,6 +79,7 @@
)
# Oneshot quantization
+model.to("cpu")
quantized_model = oneshot(
model=finetune_applied_model,
**oneshot_kwargs,
@@ -90,8 +91,8 @@
tokenizer.save_pretrained(f"{output_dir}/quantization_stage")
logger.info(
    "llmcompressor does not currently support running "
    "compressed models in the marlin24 format. "
    "The model produced from this example can be "
    "run on vLLM with dtype=torch.float16."
)
diff --git a/examples/quantization_kv_cache/README.md b/examples/quantization_kv_cache/README.md
index 1cfef1433..62da49c88 100644
--- a/examples/quantization_kv_cache/README.md
+++ b/examples/quantization_kv_cache/README.md
@@ -39,11 +39,7 @@ Load the model using `AutoModelForCausalLM`:
from transformers import AutoModelForCausalLM, AutoTokenizer
MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct"
-model = AutoModelForCausalLM.from_pretrained(
- MODEL_ID,
- device_map="auto",
- torch_dtype="auto",
-)
+model = AutoModelForCausalLM.from_pretrained(MODEL_ID, torch_dtype="auto")
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
```
diff --git a/examples/quantization_kv_cache/gemma2_fp8_kv_example.py b/examples/quantization_kv_cache/gemma2_fp8_kv_example.py
index 1291dc4fa..f753d71dd 100644
--- a/examples/quantization_kv_cache/gemma2_fp8_kv_example.py
+++ b/examples/quantization_kv_cache/gemma2_fp8_kv_example.py
@@ -2,14 +2,11 @@
from transformers import AutoModelForCausalLM, AutoTokenizer
from llmcompressor import oneshot
+from llmcompressor.utils import dispatch_for_generation
# Select model and load it.
MODEL_ID = "google/gemma-2-9b-it"
-model = AutoModelForCausalLM.from_pretrained(
- MODEL_ID,
- device_map="auto",
- torch_dtype="auto",
-)
+model = AutoModelForCausalLM.from_pretrained(MODEL_ID, torch_dtype="auto")
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
# Select calibration dataset.
@@ -91,6 +88,7 @@ def process_and_tokenize(example):
# Consider either downgrading your transformers version to a previous version
# or use vLLM for sample generation.
print("\n\n")
+dispatch_for_generation(model)
print("========== SAMPLE GENERATION ==============")
input_ids = tokenizer("Hello my name is", return_tensors="pt").input_ids.to("cuda")
output = model.generate(input_ids, max_new_tokens=100)
diff --git a/examples/quantization_kv_cache/llama3_fp8_kv_example.py b/examples/quantization_kv_cache/llama3_fp8_kv_example.py
index eda448d0d..339c353fa 100644
--- a/examples/quantization_kv_cache/llama3_fp8_kv_example.py
+++ b/examples/quantization_kv_cache/llama3_fp8_kv_example.py
@@ -3,14 +3,11 @@
from transformers import AutoModelForCausalLM, AutoTokenizer
from llmcompressor import oneshot
+from llmcompressor.utils import dispatch_for_generation
# Select model and load it.
MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct"
-model = AutoModelForCausalLM.from_pretrained(
- MODEL_ID,
- device_map="auto",
- torch_dtype="auto",
-)
+model = AutoModelForCausalLM.from_pretrained(MODEL_ID, torch_dtype="auto")
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
# Select calibration dataset.
@@ -90,6 +87,7 @@ def process_and_tokenize(example):
# Confirm generations of the quantized model look sane.
print("\n\n")
print("========== SAMPLE GENERATION ==============")
+dispatch_for_generation(model)
input_ids = tokenizer("Hello my name is", return_tensors="pt").input_ids.to("cuda")
output = model.generate(input_ids, max_new_tokens=100)
print(tokenizer.decode(output[0]))
diff --git a/examples/quantization_kv_cache/phi3.5_fp8_kv_example.py b/examples/quantization_kv_cache/phi3.5_fp8_kv_example.py
index 31e394e8a..0d16e1b22 100644
--- a/examples/quantization_kv_cache/phi3.5_fp8_kv_example.py
+++ b/examples/quantization_kv_cache/phi3.5_fp8_kv_example.py
@@ -2,16 +2,13 @@
from transformers import AutoModelForCausalLM, AutoTokenizer
from llmcompressor import oneshot
+from llmcompressor.utils import dispatch_for_generation
# Select model and load it.
# Phi-3.5 is a special case for KV cache quantization because it has
# fused QKV linear layers.
MODEL_ID = "microsoft/Phi-3.5-mini-instruct"
-model = AutoModelForCausalLM.from_pretrained(
- MODEL_ID,
- device_map="auto",
- torch_dtype="auto",
-)
+model = AutoModelForCausalLM.from_pretrained(MODEL_ID, torch_dtype="auto")
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
# Select calibration dataset.
@@ -90,6 +87,7 @@ def process_and_tokenize(example):
# Confirm generations of the quantized model look sane.
print("\n\n")
print("========== SAMPLE GENERATION ==============")
+dispatch_for_generation(model)
input_ids = tokenizer("Hello my name is", return_tensors="pt").input_ids.to("cuda")
output = model.generate(input_ids, max_new_tokens=100)
print(tokenizer.decode(output[0]))
diff --git a/examples/quantization_w4a16/README.md b/examples/quantization_w4a16/README.md
index 49762893e..3e29c29c2 100644
--- a/examples/quantization_w4a16/README.md
+++ b/examples/quantization_w4a16/README.md
@@ -40,9 +40,7 @@ Load the model using `AutoModelForCausalLM` for handling quantized saving and lo
from transformers import AutoTokenizer, AutoModelForCausalLM
MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct"
-model = AutoModelForCausalLM.from_pretrained(
- MODEL_ID, device_map="auto", torch_dtype="auto",
-)
+model = AutoModelForCausalLM.from_pretrained(MODEL_ID, torch_dtype="auto")
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
```
diff --git a/examples/quantization_w4a16/llama3_example.py b/examples/quantization_w4a16/llama3_example.py
index 73b41e520..89c9d353e 100644
--- a/examples/quantization_w4a16/llama3_example.py
+++ b/examples/quantization_w4a16/llama3_example.py
@@ -3,16 +3,12 @@
from llmcompressor.modifiers.quantization import GPTQModifier
from llmcompressor.transformers import oneshot
+from llmcompressor.utils import dispatch_for_generation
# Select model and load it.
-MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct"
-
-model = AutoModelForCausalLM.from_pretrained(
- MODEL_ID,
- device_map="auto",
- torch_dtype="auto",
-)
-tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
+model_id = "meta-llama/Meta-Llama-3-8B-Instruct"
+model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype="auto")
+tokenizer = AutoTokenizer.from_pretrained(model_id)
# Select calibration dataset.
DATASET_ID = "HuggingFaceH4/ultrachat_200k"
@@ -69,12 +65,14 @@ def tokenize(sample):
# Confirm generations of the quantized model look sane.
print("\n\n")
print("========== SAMPLE GENERATION ==============")
-input_ids = tokenizer("Hello my name is", return_tensors="pt").input_ids.to("cuda")
-output = model.generate(input_ids, max_new_tokens=100)
+dispatch_for_generation(model)
+sample = tokenizer("Hello my name is", return_tensors="pt")
+sample = {key: value.to("cuda") for key, value in sample.items()}
+output = model.generate(**sample, max_new_tokens=100)
print(tokenizer.decode(output[0]))
print("==========================================\n\n")
# Save to disk compressed.
-SAVE_DIR = MODEL_ID.rstrip("/").split("/")[-1] + "-W4A16-G128"
+SAVE_DIR = model_id.rstrip("/").split("/")[-1] + "-W4A16-G128"
model.save_pretrained(SAVE_DIR, save_compressed=True)
tokenizer.save_pretrained(SAVE_DIR)
diff --git a/examples/quantization_w4a16_fp4/llama3_example.py b/examples/quantization_w4a16_fp4/llama3_example.py
index e393af767..d35de8d30 100644
--- a/examples/quantization_w4a16_fp4/llama3_example.py
+++ b/examples/quantization_w4a16_fp4/llama3_example.py
@@ -2,13 +2,12 @@
from llmcompressor import oneshot
from llmcompressor.modifiers.quantization import QuantizationModifier
+from llmcompressor.utils import dispatch_for_generation
MODEL_ID = "meta-llama/Llama-3.1-8B-Instruct"
# Load model.
-model = AutoModelForCausalLM.from_pretrained(
- MODEL_ID, device_map="auto", torch_dtype="auto"
-)
+model = AutoModelForCausalLM.from_pretrained(MODEL_ID, torch_dtype="auto")
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
# Configure the quantization algorithm and scheme.
@@ -21,6 +20,7 @@
print("\n\n")
print("========== SAMPLE GENERATION ==============")
+dispatch_for_generation(model)
input_ids = tokenizer("Hello my name is", return_tensors="pt").input_ids.to("cuda")
output = model.generate(input_ids, max_new_tokens=100)
print(tokenizer.decode(output[0]))
diff --git a/examples/quantization_w4a4_fp4/llama3_example.py b/examples/quantization_w4a4_fp4/llama3_example.py
index 8b9706955..95d01657b 100644
--- a/examples/quantization_w4a4_fp4/llama3_example.py
+++ b/examples/quantization_w4a4_fp4/llama3_example.py
@@ -3,13 +3,12 @@
from llmcompressor import oneshot
from llmcompressor.modifiers.quantization import QuantizationModifier
+from llmcompressor.utils import dispatch_for_generation
MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct"
# Load model.
-model = AutoModelForCausalLM.from_pretrained(
- MODEL_ID, device_map="auto", torch_dtype="auto"
-)
+model = AutoModelForCausalLM.from_pretrained(MODEL_ID, torch_dtype="auto")
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
@@ -69,6 +68,7 @@ def tokenize(sample):
print("\n\n")
print("========== SAMPLE GENERATION ==============")
+dispatch_for_generation(model)
input_ids = tokenizer("Hello my name is", return_tensors="pt").input_ids.to("cuda")
output = model.generate(input_ids, max_new_tokens=100)
print(tokenizer.decode(output[0]))
diff --git a/examples/quantization_w8a8_fp8/README.md b/examples/quantization_w8a8_fp8/README.md
index d9a4eec5e..99fa65dbc 100644
--- a/examples/quantization_w8a8_fp8/README.md
+++ b/examples/quantization_w8a8_fp8/README.md
@@ -38,8 +38,7 @@ from transformers import AutoTokenizer, AutoModelForCausalLM
MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct"
-model = AutoModelForCausalLM.from_pretrained(
- MODEL_ID, device_map="auto", torch_dtype="auto")
+model = AutoModelForCausalLM.from_pretrained(MODEL_ID, torch_dtype="auto")
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
```
diff --git a/examples/quantization_w8a8_fp8/gemma2_example.py b/examples/quantization_w8a8_fp8/gemma2_example.py
index c1819450b..1b56512b4 100644
--- a/examples/quantization_w8a8_fp8/gemma2_example.py
+++ b/examples/quantization_w8a8_fp8/gemma2_example.py
@@ -2,13 +2,12 @@
from llmcompressor import oneshot
from llmcompressor.modifiers.quantization import QuantizationModifier
+from llmcompressor.utils import dispatch_for_generation
MODEL_ID = "google/gemma-2-27b-it"
# 1) Load model.
-model = AutoModelForCausalLM.from_pretrained(
- MODEL_ID, device_map="auto", torch_dtype="auto"
-)
+model = AutoModelForCausalLM.from_pretrained(MODEL_ID, torch_dtype="auto")
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
# 2) Configure the quantization algorithm and scheme.
@@ -31,6 +30,7 @@
# Consider either downgrading your transformers version to a previous version
# or use vLLM for sample generation.
print("========== SAMPLE GENERATION ==============")
+dispatch_for_generation(model)
input_ids = tokenizer("Hello my name is", return_tensors="pt").input_ids.to("cuda")
output = model.generate(input_ids, max_new_tokens=20)
print(tokenizer.decode(output[0]))
diff --git a/examples/quantization_w8a8_fp8/llama3.2_vision_example.py b/examples/quantization_w8a8_fp8/llama3.2_vision_example.py
index 57ff6034e..6a1454cd0 100644
--- a/examples/quantization_w8a8_fp8/llama3.2_vision_example.py
+++ b/examples/quantization_w8a8_fp8/llama3.2_vision_example.py
@@ -2,13 +2,12 @@
from llmcompressor import oneshot
from llmcompressor.modifiers.quantization import QuantizationModifier
+from llmcompressor.utils import dispatch_for_generation
MODEL_ID = "meta-llama/Llama-3.2-11B-Vision-Instruct"
# Load model.
-model = MllamaForConditionalGeneration.from_pretrained(
- MODEL_ID, device_map="auto", torch_dtype="auto"
-)
+model = MllamaForConditionalGeneration.from_pretrained(MODEL_ID, torch_dtype="auto")
processor = AutoProcessor.from_pretrained(MODEL_ID)
# Configure the quantization algorithm and scheme.
@@ -26,6 +25,7 @@
# Confirm generations of the quantized model look sane.
print("========== SAMPLE GENERATION ==============")
+dispatch_for_generation(model)
input_ids = processor(text="Hello my name is", return_tensors="pt").input_ids.to("cuda")
output = model.generate(input_ids, max_new_tokens=20)
print(processor.decode(output[0]))
diff --git a/examples/quantization_w8a8_fp8/llama3_example.py b/examples/quantization_w8a8_fp8/llama3_example.py
index ca067541f..39c196752 100644
--- a/examples/quantization_w8a8_fp8/llama3_example.py
+++ b/examples/quantization_w8a8_fp8/llama3_example.py
@@ -2,13 +2,12 @@
from llmcompressor import oneshot
from llmcompressor.modifiers.quantization import QuantizationModifier
+from llmcompressor.utils import dispatch_for_generation
MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct"
# Load model.
-model = AutoModelForCausalLM.from_pretrained(
- MODEL_ID, device_map="auto", torch_dtype="auto"
-)
+model = AutoModelForCausalLM.from_pretrained(MODEL_ID, torch_dtype="auto")
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
# Configure the quantization algorithm and scheme.
@@ -24,6 +23,7 @@
# Confirm generations of the quantized model look sane.
print("========== SAMPLE GENERATION ==============")
+dispatch_for_generation(model)
input_ids = tokenizer("Hello my name is", return_tensors="pt").input_ids.to("cuda")
output = model.generate(input_ids, max_new_tokens=20)
print(tokenizer.decode(output[0]))
diff --git a/examples/quantization_w8a8_fp8/llava1.5_example.py b/examples/quantization_w8a8_fp8/llava1.5_example.py
index 2a5a8c746..a03188a61 100644
--- a/examples/quantization_w8a8_fp8/llava1.5_example.py
+++ b/examples/quantization_w8a8_fp8/llava1.5_example.py
@@ -2,13 +2,12 @@
from llmcompressor import oneshot
from llmcompressor.modifiers.quantization import QuantizationModifier
+from llmcompressor.utils import dispatch_for_generation
MODEL_ID = "llava-hf/llava-1.5-7b-hf"
# Load model.
-model = LlavaForConditionalGeneration.from_pretrained(
- MODEL_ID, device_map="auto", torch_dtype="auto"
-)
+model = LlavaForConditionalGeneration.from_pretrained(MODEL_ID, torch_dtype="auto")
processor = AutoProcessor.from_pretrained(MODEL_ID)
# Configure the quantization algorithm and scheme.
@@ -26,6 +25,7 @@
# Confirm generations of the quantized model look sane.
print("========== SAMPLE GENERATION ==============")
+dispatch_for_generation(model)
input_ids = processor(text="Hello my name is", return_tensors="pt").input_ids.to("cuda")
output = model.generate(input_ids, max_new_tokens=20)
print(processor.decode(output[0]))
diff --git a/examples/quantization_w8a8_fp8/qwen2vl_example.py b/examples/quantization_w8a8_fp8/qwen2vl_example.py
index 720a368b0..ebadbe973 100644
--- a/examples/quantization_w8a8_fp8/qwen2vl_example.py
+++ b/examples/quantization_w8a8_fp8/qwen2vl_example.py
@@ -2,13 +2,12 @@
from llmcompressor import oneshot
from llmcompressor.modifiers.quantization import QuantizationModifier
+from llmcompressor.utils import dispatch_for_generation
MODEL_ID = "Qwen/Qwen2-VL-7B-Instruct"
# Load model.
-model = Qwen2VLForConditionalGeneration.from_pretrained(
- MODEL_ID, device_map="auto", torch_dtype="auto"
-)
+model = Qwen2VLForConditionalGeneration.from_pretrained(MODEL_ID, torch_dtype="auto")
processor = AutoProcessor.from_pretrained(MODEL_ID)
# Configure the quantization algorithm and scheme.
@@ -26,6 +25,7 @@
# Confirm generations of the quantized model look sane.
print("========== SAMPLE GENERATION ==============")
+dispatch_for_generation(model)
input_ids = processor(text="Hello my name is", return_tensors="pt").input_ids.to("cuda")
output = model.generate(input_ids, max_new_tokens=20)
print(processor.decode(output[0]))
diff --git a/examples/quantization_w8a8_fp8/whisper_example.py b/examples/quantization_w8a8_fp8/whisper_example.py
index f7309fcaf..2cbbebe7d 100644
--- a/examples/quantization_w8a8_fp8/whisper_example.py
+++ b/examples/quantization_w8a8_fp8/whisper_example.py
@@ -3,13 +3,12 @@
from llmcompressor import oneshot
from llmcompressor.modifiers.quantization import QuantizationModifier
+from llmcompressor.utils import dispatch_for_generation
MODEL_ID = "openai/whisper-large-v2"
# Load model.
-model = WhisperForConditionalGeneration.from_pretrained(
- MODEL_ID, device_map="auto", torch_dtype="auto"
-)
+model = WhisperForConditionalGeneration.from_pretrained(MODEL_ID, torch_dtype="auto")
model.config.forced_decoder_ids = None
processor = AutoProcessor.from_pretrained(MODEL_ID)
processor.tokenizer.set_prefix_tokens(language="en", task="transcribe")
@@ -27,6 +26,7 @@
# Confirm generations of the quantized model look sane.
print("========== SAMPLE GENERATION ==============")
+dispatch_for_generation(model)
ds = load_dataset(
"hf-internal-testing/librispeech_asr_dummy", "clean", split="validation[:1]"
)
diff --git a/examples/quantization_w8a8_int8/README.md b/examples/quantization_w8a8_int8/README.md
index 07b1471bc..807113118 100644
--- a/examples/quantization_w8a8_int8/README.md
+++ b/examples/quantization_w8a8_int8/README.md
@@ -38,9 +38,7 @@ Load the model using `AutoModelForCausalLM` for handling quantized saving and lo
from transformers import AutoTokenizer, AutoModelForCausalLM
MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct"
-model = AutoModelForCausalLM.from_pretrained(
- MODEL_ID, device_map="auto", torch_dtype="auto",
-)
+model = AutoModelForCausalLM.from_pretrained(MODEL_ID, torch_dtype="auto")
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
```
diff --git a/examples/quantization_w8a8_int8/gemma2_example.py b/examples/quantization_w8a8_int8/gemma2_example.py
index 876dd768c..d332532b0 100644
--- a/examples/quantization_w8a8_int8/gemma2_example.py
+++ b/examples/quantization_w8a8_int8/gemma2_example.py
@@ -3,14 +3,11 @@
from llmcompressor import oneshot
from llmcompressor.modifiers.quantization import GPTQModifier
+from llmcompressor.utils import dispatch_for_generation
# 1) Select model and load it.
MODEL_ID = "google/gemma-2-2b-it"
-model = AutoModelForCausalLM.from_pretrained(
- MODEL_ID,
- device_map="auto",
- torch_dtype="auto",
-)
+model = AutoModelForCausalLM.from_pretrained(MODEL_ID, torch_dtype="auto")
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
# 2) Prepare calibration dataset.
@@ -71,6 +68,7 @@ def tokenize(sample):
# Consider either downgrading your transformers version to a previous version
# or use vLLM for sample generation.
print("========== SAMPLE GENERATION ==============")
+dispatch_for_generation(model)
input_ids = tokenizer("Hello my name is", return_tensors="pt").input_ids.to("cuda")
output = model.generate(input_ids, max_new_tokens=20)
print(tokenizer.decode(output[0]))
diff --git a/examples/quantization_w8a8_int8/llama3_example.py b/examples/quantization_w8a8_int8/llama3_example.py
index 70db0d047..feab87455 100644
--- a/examples/quantization_w8a8_int8/llama3_example.py
+++ b/examples/quantization_w8a8_int8/llama3_example.py
@@ -4,14 +4,11 @@
from llmcompressor import oneshot
from llmcompressor.modifiers.quantization import GPTQModifier
from llmcompressor.modifiers.smoothquant import SmoothQuantModifier
+from llmcompressor.utils import dispatch_for_generation
# Select model and load it.
MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct"
-model = AutoModelForCausalLM.from_pretrained(
- MODEL_ID,
- device_map="auto",
- torch_dtype="auto",
-)
+model = AutoModelForCausalLM.from_pretrained(MODEL_ID, torch_dtype="auto")
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
# Select calibration dataset.
@@ -74,6 +71,7 @@ def tokenize(sample):
# Confirm generations of the quantized model look sane.
print("\n\n")
print("========== SAMPLE GENERATION ==============")
+dispatch_for_generation(model)
input_ids = tokenizer("Hello my name is", return_tensors="pt").input_ids.to("cuda")
output = model.generate(input_ids, max_new_tokens=100)
print(tokenizer.decode(output[0]))
diff --git a/examples/quantizing_moe/deepseek_moe_w4a16.py b/examples/quantizing_moe/deepseek_moe_w4a16.py
index 2e7cfbd84..9880e9248 100644
--- a/examples/quantizing_moe/deepseek_moe_w4a16.py
+++ b/examples/quantizing_moe/deepseek_moe_w4a16.py
@@ -4,7 +4,7 @@
from transformers import AutoModelForCausalLM, AutoTokenizer, __version__
from llmcompressor import oneshot
-from llmcompressor.transformers.compression.helpers import calculate_offload_device_map
+from llmcompressor.utils import dispatch_for_generation
# NOTE: transformers 4.49.0 has an attribute error with DeepSeek.
# Please consider either downgrading your transformers version to a
@@ -13,18 +13,8 @@
# select a Mixture of Experts model for quantization
MODEL_ID = "deepseek-ai/DeepSeek-V2.5"
-# adjust based off number of desired GPUs
-# if not enough memory is available, some layers will automatically be offlaoded to cpu
-device_map = calculate_offload_device_map(
- MODEL_ID,
- reserve_for_hessians=True,
- num_gpus=2,
- torch_dtype=torch.bfloat16,
- trust_remote_code=True,
-)
-
model = AutoModelForCausalLM.from_pretrained(
- MODEL_ID, device_map=device_map, torch_dtype=torch.bfloat16, trust_remote_code=True
+ MODEL_ID, torch_dtype=torch.bfloat16, trust_remote_code=True
)
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
@@ -84,6 +74,7 @@ def tokenize(sample):
# Generation is broken for deepseek models when using the latest transformers package
if Version(__version__) < Version("4.48"):
print("========== SAMPLE GENERATION ==============")
+ dispatch_for_generation(model)
input_ids = tokenizer("Hello my name is", return_tensors="pt").input_ids.to("cuda")
output = model.generate(input_ids, max_new_tokens=20)
print(tokenizer.decode(output[0]))
diff --git a/examples/quantizing_moe/deepseek_moe_w8a8_fp8.py b/examples/quantizing_moe/deepseek_moe_w8a8_fp8.py
index 0fddcc23b..0bc9c24df 100644
--- a/examples/quantizing_moe/deepseek_moe_w8a8_fp8.py
+++ b/examples/quantizing_moe/deepseek_moe_w8a8_fp8.py
@@ -4,6 +4,7 @@
from llmcompressor import oneshot
from llmcompressor.modifiers.quantization import QuantizationModifier
+from llmcompressor.utils import dispatch_for_generation
# NOTE: transformers 4.49.0 has an attribute error with DeepSeek.
# Please consider either downgrading your transformers version to a
@@ -13,7 +14,7 @@
MODEL_ID = "deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct"
model = AutoModelForCausalLM.from_pretrained(
- MODEL_ID, device_map="auto", torch_dtype="auto", trust_remote_code=True
+ MODEL_ID, torch_dtype="auto", trust_remote_code=True
)
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
@@ -79,6 +80,7 @@ def tokenize(sample):
# Generation is broken for deepseek models when using the latest transformers package
if Version(__version__) < Version("4.48"):
print("========== SAMPLE GENERATION ==============")
+ dispatch_for_generation(model)
SAMPLE_INPUT = ["I love quantization because"]
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
inputs = tokenizer(SAMPLE_INPUT, return_tensors="pt", padding=True).to(model.device)
diff --git a/examples/quantizing_moe/deepseek_moe_w8a8_int8.py b/examples/quantizing_moe/deepseek_moe_w8a8_int8.py
index 84397bbd6..3ec506c34 100644
--- a/examples/quantizing_moe/deepseek_moe_w8a8_int8.py
+++ b/examples/quantizing_moe/deepseek_moe_w8a8_int8.py
@@ -5,7 +5,7 @@
from llmcompressor import oneshot
from llmcompressor.modifiers.quantization import GPTQModifier
-from llmcompressor.transformers.compression.helpers import calculate_offload_device_map
+from llmcompressor.utils import dispatch_for_generation
# NOTE: transformers 4.49.0 has an attribute error with DeepSeek.
# Please consider either downgrading your transformers version to a
@@ -14,18 +14,8 @@
# select a Mixture of Experts model for quantization
MODEL_ID = "deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct"
-# adjust based off number of desired GPUs
-# if not enough memory is available, some layers will automatically be offlaoded to cpu
-device_map = calculate_offload_device_map(
- MODEL_ID,
- reserve_for_hessians=True,
- num_gpus=2,
- torch_dtype=torch.bfloat16,
- trust_remote_code=True,
-)
-
model = AutoModelForCausalLM.from_pretrained(
- MODEL_ID, device_map=device_map, torch_dtype=torch.bfloat16, trust_remote_code=True
+ MODEL_ID, torch_dtype=torch.bfloat16, trust_remote_code=True
)
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
@@ -91,6 +81,7 @@ def tokenize(sample):
# Generation is broken for deepseek models when using the latest transformers package
if Version(__version__) < Version("4.48"):
print("========== SAMPLE GENERATION ==============")
+ dispatch_for_generation(model)
SAMPLE_INPUT = ["I love quantization because"]
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
inputs = tokenizer(SAMPLE_INPUT, return_tensors="pt", padding=True).to(model.device)
diff --git a/examples/quantizing_moe/mixtral_moe_w8a8_fp8.py b/examples/quantizing_moe/mixtral_moe_w8a8_fp8.py
index 2fcdff371..a17bf873d 100644
--- a/examples/quantizing_moe/mixtral_moe_w8a8_fp8.py
+++ b/examples/quantizing_moe/mixtral_moe_w8a8_fp8.py
@@ -5,19 +5,11 @@
from llmcompressor import oneshot
from llmcompressor.modifiers.quantization import QuantizationModifier
-from llmcompressor.transformers.compression.helpers import calculate_offload_device_map
+from llmcompressor.utils import dispatch_for_generation
MODEL_ID = "mistralai/Mixtral-8x7B-Instruct-v0.1"
-NUM_GPUS = 2
-# Adjust based off number of desired GPUs
-device_map = calculate_offload_device_map(
- MODEL_ID, reserve_for_hessians=True, num_gpus=NUM_GPUS, torch_dtype="auto"
-)
-
-model = AutoModelForCausalLM.from_pretrained(
- MODEL_ID, device_map=device_map, torch_dtype="auto"
-)
+model = AutoModelForCausalLM.from_pretrained(MODEL_ID, torch_dtype="auto")
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
@@ -49,6 +41,7 @@
# Generation is broken for deepseek models when using the latest transformers package
if Version(__version__) < Version("4.48"):
print("========== SAMPLE GENERATION ==============")
+ dispatch_for_generation(model)
input_ids = tokenizer("Hello my name is", return_tensors="pt").input_ids.to("cuda")
output = model.generate(input_ids, max_new_tokens=20)
print(tokenizer.decode(output[0]))
diff --git a/examples/quantizing_moe/qwen_moe_w4a16.py b/examples/quantizing_moe/qwen_moe_w4a16.py
index 40f576ff2..40a78a9b7 100644
--- a/examples/quantizing_moe/qwen_moe_w4a16.py
+++ b/examples/quantizing_moe/qwen_moe_w4a16.py
@@ -4,23 +4,13 @@
from llmcompressor.modifiers.quantization import GPTQModifier
from llmcompressor.transformers import oneshot
-from llmcompressor.transformers.compression.helpers import calculate_offload_device_map
+from llmcompressor.utils import dispatch_for_generation
# select a Mixture of Experts model for quantization
MODEL_ID = "Qwen/Qwen1.5-MoE-A2.7B-Chat"
-# adjust based off number of desired GPUs
-# if not enough memory is available, some layers will automatically be offloaded to cpu
-device_map = calculate_offload_device_map(
- MODEL_ID,
- reserve_for_hessians=True,
- num_gpus=2,
- torch_dtype=torch.bfloat16,
- trust_remote_code=True,
-)
-
model = AutoModelForCausalLM.from_pretrained(
- MODEL_ID, device_map=device_map, torch_dtype=torch.bfloat16, trust_remote_code=True
+ MODEL_ID, torch_dtype=torch.bfloat16, trust_remote_code=True
)
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
@@ -82,6 +72,7 @@ def tokenize(sample):
# Confirm generations of the quantized model look sane.
print("========== SAMPLE GENERATION ==============")
+dispatch_for_generation(model)
input_ids = tokenizer("Hello my name is", return_tensors="pt").input_ids.to("cuda")
output = model.generate(input_ids, max_new_tokens=20)
print(tokenizer.decode(output[0]))
diff --git a/examples/sparse_2of4_quantization_fp8/llama3_8b_2of4.py b/examples/sparse_2of4_quantization_fp8/llama3_8b_2of4.py
index dd5f4e34c..590b74611 100644
--- a/examples/sparse_2of4_quantization_fp8/llama3_8b_2of4.py
+++ b/examples/sparse_2of4_quantization_fp8/llama3_8b_2of4.py
@@ -6,6 +6,7 @@
from llmcompressor import oneshot
from llmcompressor.modifiers.obcq import SparseGPTModifier
from llmcompressor.modifiers.quantization import QuantizationModifier
+from llmcompressor.utils import dispatch_for_generation
# Configuration
MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct"
@@ -77,9 +78,7 @@ def get_recipe(fp8_enabled):
args = parse_args()
# Load model and tokenizer
-model = AutoModelForCausalLM.from_pretrained(
- MODEL_ID, device_map="auto", torch_dtype="auto"
-)
+model = AutoModelForCausalLM.from_pretrained(MODEL_ID, torch_dtype="auto")
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
# Load and preprocess dataset
@@ -103,6 +102,7 @@ def get_recipe(fp8_enabled):
# Validate the compressed model
print("\n========== SAMPLE GENERATION ==============")
+dispatch_for_generation(model)
input_ids = tokenizer("Hello my name is", return_tensors="pt").input_ids.to("cuda")
output = model.generate(input_ids, max_new_tokens=100)
print(tokenizer.decode(output[0]))
diff --git a/examples/trl_mixin/ex_trl_constant.py b/examples/trl_mixin/ex_trl_constant.py
index c26e9f41d..ff8a370c9 100644
--- a/examples/trl_mixin/ex_trl_constant.py
+++ b/examples/trl_mixin/ex_trl_constant.py
@@ -7,9 +7,7 @@
model_path = "neuralmagic/Llama-2-7b-pruned50-retrained"
output_dir = "./output_trl_sft_test_7b_gsm8k_sft_data"
-model = AutoModelForCausalLM.from_pretrained(
- model_path, torch_dtype="auto", device_map="auto"
-)
+model = AutoModelForCausalLM.from_pretrained(model_path, torch_dtype="auto")
tokenizer = AutoTokenizer.from_pretrained(model_path)
tokenizer.pad_token = tokenizer.eos_token
diff --git a/src/llmcompressor/args/dataset_arguments.py b/src/llmcompressor/args/dataset_arguments.py
index f60c9560e..949933f97 100644
--- a/src/llmcompressor/args/dataset_arguments.py
+++ b/src/llmcompressor/args/dataset_arguments.py
@@ -186,3 +186,13 @@ class DatasetArguments(CustomDatasetArguments):
"{module}.{method_name} or {function_name}"
},
)
+ sequential_targets: Optional[List[str]] = field(
+ default=None,
+ metadata={
+ "help": "List of layer targets for the sequential pipeline. "
+ "This is typically a single DecoderLayer. "
+ "Not specifying this argument will cause the sequential pipeline to "
+ "default to using the `no_split_params` specified by the HF model "
+ "definition"
+ },
+ )
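
The new `sequential_targets` dataset argument is the user-facing replacement for passing targets through modifiers (see the deprecation warning added in `sequential/helpers.py` further down). A minimal sketch of how a caller might override the `no_split_params` default; the model stub, dataset, and recipe here are illustrative assumptions, not part of this change:

```python
from transformers import AutoModelForCausalLM
from llmcompressor import oneshot
from llmcompressor.modifiers.quantization import GPTQModifier

model = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Meta-Llama-3-8B-Instruct", torch_dtype="auto"
)
recipe = GPTQModifier(targets="Linear", scheme="W4A16", ignore=["lm_head"])

oneshot(
    model=model,
    dataset="open_platypus",
    recipe=recipe,
    max_seq_length=2048,
    num_calibration_samples=512,
    # new: pin the sequential pipeline to a specific decoder layer class
    # instead of relying on the model's `no_split_params`
    sequential_targets=["LlamaDecoderLayer"],
)
```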
diff --git a/src/llmcompressor/args/model_arguments.py b/src/llmcompressor/args/model_arguments.py
index 870f6d772..ea3c3936a 100644
--- a/src/llmcompressor/args/model_arguments.py
+++ b/src/llmcompressor/args/model_arguments.py
@@ -81,8 +81,11 @@ class ModelArguments:
metadata={"help": "Whether to compress sparse models during save"},
)
oneshot_device: Optional[str] = field(
- default="cuda:0",
- metadata={"help": "Device to run oneshot calibration on"},
+ default="cuda",
+ metadata={
+ "help": "This argument is deprecated and nonfunctional "
+ "and will be removed in future release"
+ },
)
model_revision: str = field(
default="main",
diff --git a/src/llmcompressor/entrypoints/README.md b/src/llmcompressor/entrypoints/README.md
index 5e907b802..f023d3c02 100644
--- a/src/llmcompressor/entrypoints/README.md
+++ b/src/llmcompressor/entrypoints/README.md
@@ -29,9 +29,7 @@ from llmcompressor.modifiers.quantization import QuantizationModifier
MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct"
# Load the model
-model = AutoModelForCausalLM.from_pretrained(
- MODEL_ID, device_map="auto", torch_dtype="auto"
-)
+model = AutoModelForCausalLM.from_pretrained(MODEL_ID, torch_dtype="auto")
# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
@@ -114,7 +112,6 @@ output_dir = "./oneshot_model"
# The model to train
model = AutoModelForCausalLM.from_pretrained(
output_dir,
- device_map="auto",
quantization_config=CompressedTensorsConfig(run_compressed=False),
)
@@ -148,7 +145,6 @@ Comparisons are defined in `/src/llmcompressor/modifiers/distillation/utils/pyto
# Define the teacher model
distill_teacher = AutoModelForCausalLM.from_pretrained(
"meta-llama/Meta-Llama-3-8B-Instruct",
- device_map="auto",
)
# Define the recipe, use knowledge distillation modifier and target the `model.layers` using a regex with
@@ -204,9 +200,7 @@ MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct"
oneshot_output_dir = "./oneshot_model"
# Load the model
-model = AutoModelForCausalLM.from_pretrained(
- MODEL_ID, device_map="auto", torch_dtype="auto"
-)
+model = AutoModelForCausalLM.from_pretrained(MODEL_ID, torch_dtype="auto")
# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
@@ -226,7 +220,6 @@ from llmcompressor import create_session, train
# Student model
model = AutoModelForCausalLM.from_pretrained(
oneshot_output_dir,
- device_map="auto",
quantization_config=CompressedTensorsConfig(run_compressed=False),
)
@@ -241,7 +234,6 @@ num_calibration_samples = 8 # The number of workers processing datasets in para
# Define teacher model
distill_teacher = AutoModelForCausalLM.from_pretrained(
"meta-llama/Meta-Llama-3-8B-Instruct",
- device_map="auto",
)
# Define the recipe, use knowledge distillation modifier and target the `model.layers` using a regex with
diff --git a/src/llmcompressor/entrypoints/oneshot.py b/src/llmcompressor/entrypoints/oneshot.py
index bedca7392..54a36abfe 100644
--- a/src/llmcompressor/entrypoints/oneshot.py
+++ b/src/llmcompressor/entrypoints/oneshot.py
@@ -10,7 +10,7 @@
from llmcompressor.core.session_functions import active_session
from llmcompressor.datasets import get_calibration_dataloader
from llmcompressor.entrypoints.utils import post_process, pre_process
-from llmcompressor.pipelines.registry import CalibrationPipeline
+from llmcompressor.pipelines import CalibrationPipeline
__all__ = ["Oneshot", "oneshot"]
diff --git a/src/llmcompressor/entrypoints/train.py b/src/llmcompressor/entrypoints/train.py
index 4b5d8b73b..0bfb26e53 100644
--- a/src/llmcompressor/entrypoints/train.py
+++ b/src/llmcompressor/entrypoints/train.py
@@ -8,6 +8,7 @@
from llmcompressor.core.session_functions import active_session
from llmcompressor.datasets.utils import get_processed_dataset
from llmcompressor.transformers.finetune.trainer import Trainer
+from llmcompressor.utils.dev import dispatch_for_generation
from .utils import post_process, pre_process
@@ -63,6 +64,7 @@ def train(**kwargs) -> PreTrainedModel:
)
pre_process(model_args)
+ dispatch_for_generation(model_args.model) # training uses the same dispatch as generation
processed_dataset = get_processed_dataset(
dataset_args=dataset_args,
diff --git a/src/llmcompressor/entrypoints/utils.py b/src/llmcompressor/entrypoints/utils.py
index 0186628f0..418725d47 100644
--- a/src/llmcompressor/entrypoints/utils.py
+++ b/src/llmcompressor/entrypoints/utils.py
@@ -3,6 +3,7 @@
from pathlib import PosixPath
from typing import Optional, Tuple
+from compressed_tensors.utils import remove_dispatch
from loguru import logger
from torch.nn import Module
from transformers import (
@@ -16,7 +17,7 @@
from llmcompressor.args import ModelArguments, RecipeArguments, TrainingArguments
from llmcompressor.core import reset_session
-from llmcompressor.pytorch.model_load.helpers import fallback_to_cpu, parse_dtype
+from llmcompressor.pytorch.model_load.helpers import parse_dtype
from llmcompressor.transformers.sparsification.compressed_tensors_utils import (
modify_save_pretrained,
patch_tied_tensors_bug,
@@ -84,6 +85,10 @@ def post_process(
Raises:
ValueError: If saving fails due to an invalid `output_dir` or other issues.
"""
+ # remove any existing dispatches
+ if model_args is not None and model_args.model is not None:
+ remove_dispatch(model_args.model)
+
if model_args is not None and output_dir is not None:
if recipe_args is not None and getattr(recipe_args, "stage", None) is not None:
output_dir = os.path.join(output_dir, recipe_args.stage)
@@ -193,20 +198,12 @@ def initialize_model_from_path(
else model_args.model_name_or_path
)
- # Fallback to CPU if GPU requested and not available
- model_args.oneshot_device = fallback_to_cpu(model_args.oneshot_device)
-
- device_map = model_args.oneshot_device
- if training_args is not None and training_args.do_train:
- device_map = "auto"
-
model_kwargs = {
"config": config,
"cache_dir": model_args.cache_dir,
"revision": model_args.model_revision,
"use_auth_token": True if model_args.use_auth_token else None,
"torch_dtype": parse_dtype(model_args.precision),
- "device_map": device_map,
"trust_remote_code": model_args.trust_remote_code_model,
}
@@ -216,10 +213,7 @@ def initialize_model_from_path(
run_compressed=False
)
- model = AutoModelForCausalLM.from_pretrained(
- model_path,
- **model_kwargs,
- )
+ model = AutoModelForCausalLM.from_pretrained(model_path, **model_kwargs)
if "sequence_length" in model_kwargs:
model.seqlen = model_kwargs["sequence_length"]
diff --git a/src/llmcompressor/modifiers/awq/base.py b/src/llmcompressor/modifiers/awq/base.py
index 1bec18e2a..51e8cf8b9 100644
--- a/src/llmcompressor/modifiers/awq/base.py
+++ b/src/llmcompressor/modifiers/awq/base.py
@@ -34,7 +34,6 @@
__all__ = ["AWQModifier"]
-# TODO (Brian INFERENG-531) Add support for offloaded models
class AWQModifier(Modifier, QuantizationMixin):
"""
Implements the AWQ (Activation-Weighted Quantization) algorithm,
diff --git a/src/llmcompressor/modifiers/quantization/gptq/base.py b/src/llmcompressor/modifiers/quantization/gptq/base.py
index 7ae61f3e2..fb8baf0a7 100644
--- a/src/llmcompressor/modifiers/quantization/gptq/base.py
+++ b/src/llmcompressor/modifiers/quantization/gptq/base.py
@@ -1,5 +1,6 @@
import contextlib
import warnings
+from concurrent.futures import ThreadPoolExecutor, as_completed
from typing import Dict, List, Optional, Tuple, Union
import torch
@@ -22,12 +23,12 @@
from llmcompressor.modifiers import Modifier
from llmcompressor.modifiers.quantization.gptq.gptq_quantize import (
accumulate_hessian,
+ initialize_linalg,
make_empty_hessian,
quantize_weight,
)
from llmcompressor.modifiers.quantization.quantization import QuantizationMixin
from llmcompressor.sentinel import Sentinel
-from llmcompressor.utils.metric_logging import CompressionLogger
__all__ = ["GPTQModifier"]
@@ -252,34 +253,54 @@ def compress_modules(self):
"""
Quantize modules which have been calibrated
"""
- for module in list(self._num_samples.keys()):
- name = self._module_names[module]
- num_samples = self._num_samples[module]
- quant_args = getattr_chain(module, "quantization_scheme.weights")
-
- logger.info(f"Quantizing {name} using {num_samples} samples")
- with torch.no_grad(), align_module_device(
- module
- ), self._maybe_onload_hessian(module), CompressionLogger(
- module
- ) as comp_logger:
- loss, quantized_weight, scale, zero_point, g_idx = quantize_weight(
- module=module,
- quant_args=quant_args,
- hessians_dict=self._hessians,
- blocksize=self.block_size,
- percdamp=self.dampening_frac,
- )
- comp_logger.set_loss(loss)
+ import time
+
+ start_time = time.time()
+
+ futures = []
+ with ThreadPoolExecutor() as executor:
+ for module in list(self._num_samples.keys()):
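+ # pre-load torch.linalg on this module's execution device from the main thread;
+ # lazy loading inside worker threads can raise assertion errors (see initialize_linalg)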
+ initialize_linalg(get_execution_device(module))
+ future = executor.submit(self._compress_module, module)
+ futures.append(future)
+
+ for future in as_completed(futures, timeout=300): # 300s overall timeout
+ name, num_samples, loss = future.result()
+ logger.info(f"Quantized {name}")
+ logger.info(f" num_samples={num_samples}")
+ logger.info(f" loss={loss:.2f}")
+
+ logger.info(
+ f"Quantized {len(futures)} modules in {time.time() - start_time: .1f}s"
+ )
+
+ def _compress_module(self, module: torch.nn.Module) -> Tuple[str, int, float]:
+ name = self._module_names[module]
+ num_samples = self._num_samples[module]
+ quant_args = getattr_chain(module, "quantization_scheme.weights")
+
+ with torch.no_grad(), align_module_device(module), self._maybe_onload_hessian(
+ module
+ ):
+ logger.info(f"Quantizing {name}...")
+ loss, quantized_weight, scale, zero_point, g_idx = quantize_weight(
+ module=module,
+ quant_args=quant_args,
+ hessians_dict=self._hessians,
+ blocksize=self.block_size,
+ percdamp=self.dampening_frac,
+ )
+
+ update_offload_parameter(module, "weight", quantized_weight)
+ update_offload_parameter(module, "weight_scale", scale)
+ update_offload_parameter(module, "weight_zero_point", zero_point)
+ if g_idx is not None:
+ update_offload_parameter(module, "weight_g_idx", g_idx)
- update_offload_parameter(module, "weight", quantized_weight)
- update_offload_parameter(module, "weight_scale", scale)
- update_offload_parameter(module, "weight_zero_point", zero_point)
- if g_idx is not None:
- update_offload_parameter(module, "weight_g_idx", g_idx)
+ # self._hessians[module] already deleted by quantize_weight
+ del self._num_samples[module]
- # self._hessians[module] already deleted by quantize_weight
- del self._num_samples[module]
+ return name, num_samples, loss
def on_end(self, state: State, event: Event, **kwargs):
"""
diff --git a/src/llmcompressor/modifiers/quantization/gptq/gptq_quantize.py b/src/llmcompressor/modifiers/quantization/gptq/gptq_quantize.py
index 4392ed8cf..b3fc63fab 100644
--- a/src/llmcompressor/modifiers/quantization/gptq/gptq_quantize.py
+++ b/src/llmcompressor/modifiers/quantization/gptq/gptq_quantize.py
@@ -17,8 +17,25 @@
from llmcompressor.pytorch.utils.helpers import tensor_sparsity
GPTQ_PRECISION = torch.float32
-
-__all__ = ["make_empty_hessian", "accumulate_hessian", "quantize_weight"]
+INITIALIZED_DEVICES = set()
+
+__all__ = [
+ "initialize_linalg",
+ "make_empty_hessian",
+ "accumulate_hessian",
+ "quantize_weight",
+]
+
+
+def initialize_linalg(device: torch.device):
+ # pre-load torch.linalg module to avoid loading the module in threads,
+ # which can cause lazy loading assertion errors
+ # https://github.com/pytorch/pytorch/blob/main/aten/src/ATen/native/cuda/LinearAlgebraStubs.cpp#L50 # noqa: E501
+ # https://github.com/pytorch/ignite/issues/3004
+ if device not in INITIALIZED_DEVICES:
+ _input = torch.ones((1, 1), device=device)
+ _ = torch.cholesky_inverse(torch.linalg.cholesky(_input))
+ INITIALIZED_DEVICES.add(device)
def make_empty_hessian(
diff --git a/src/llmcompressor/pipelines/basic/pipeline.py b/src/llmcompressor/pipelines/basic/pipeline.py
index 15b94786a..dfb99172e 100644
--- a/src/llmcompressor/pipelines/basic/pipeline.py
+++ b/src/llmcompressor/pipelines/basic/pipeline.py
@@ -9,6 +9,7 @@
from llmcompressor.modifiers.utils.pytorch_helpers import apply_pad_mask_to_batch
from llmcompressor.pipelines.registry import CalibrationPipeline
from llmcompressor.pytorch.utils.helpers import tensors_to_device
+from llmcompressor.utils.dev import dispatch_for_generation
from llmcompressor.utils.helpers import calibration_forward_context
if TYPE_CHECKING:
@@ -37,6 +38,7 @@ def __call__(
:param dataloader: loads data for calibration
:param dataset_args: dataset arguments relevant to pipelines
"""
+ dispatch_for_generation(model) # basic dispatch is identical to generation
model_device = get_execution_device(model)
LifecycleCallbacks.calibration_epoch_start()
diff --git a/src/llmcompressor/pipelines/layer_sequential/pipeline.py b/src/llmcompressor/pipelines/layer_sequential/pipeline.py
index 9cb2f3708..d8ad73a10 100644
--- a/src/llmcompressor/pipelines/layer_sequential/pipeline.py
+++ b/src/llmcompressor/pipelines/layer_sequential/pipeline.py
@@ -2,6 +2,7 @@
import torch
import tqdm
+from compressed_tensors.utils import disable_offloading
from torch.utils.data.dataloader import DataLoader
from llmcompressor.core import LifecycleCallbacks, active_session
@@ -14,7 +15,10 @@
to_next_layer_kwargs,
)
from llmcompressor.pipelines.registry import CalibrationPipeline
-from llmcompressor.pipelines.sequential.helpers import get_targets_from_modifiers
+from llmcompressor.pipelines.sequential.helpers import (
+ dispatch_for_sequential,
+ get_sequential_targets,
+)
from llmcompressor.utils.helpers import DisableQuantization, calibration_forward_context
if TYPE_CHECKING:
@@ -54,9 +58,12 @@ def __call__(
"""
session = active_session()
+ # prepare model for sequential onloading
+ dispatch_for_sequential(model)
+
# find layers
modifiers = session.get_modifiers()
- sequential_targets, _ = get_targets_from_modifiers(modifiers, model)
+ sequential_targets = get_sequential_targets(modifiers, model, dataset_args)
layers = match_modules(model, sequential_targets)
LifecycleCallbacks.calibration_epoch_start()
@@ -73,29 +80,34 @@ def __call__(
calib_desc = f"({layer_index + 1}/{num_layers}): Calibrating"
prop_desc = f"({layer_index + 1}/{num_layers}): Propagating"
- # do a preliminary pass to trigger modifier hooks
- for batch_idx in tqdm.tqdm(range(len(dataloader)), desc=calib_desc):
- inputs = intermediates.fetch(batch_idx)
- layer(**inputs)
-
- LifecycleCallbacks.sequential_epoch_end()
-
- # this pass does not trigger modifier hooks
- # and is only used for capturing outputs from newly compressed modules
- with HooksMixin.disable_hooks():
- for batch_idx in tqdm.tqdm(range(len(dataloader)), desc=prop_desc):
+ # reduce memory movement by keeping modules onloaded
+ with disable_offloading():
+ # do a preliminary pass to trigger modifier hooks
+ for batch_idx in tqdm.tqdm(range(len(dataloader)), desc=calib_desc):
inputs = intermediates.fetch(batch_idx)
- output = layer(**inputs)
-
- if layer_index < num_layers - 1:
- next_layer = layers[layer_index + 1]
- output = to_next_layer_kwargs(output, next_layer)
- output = maybe_inject_pos_embeddings(
- output, next_layer, inputs
- )
-
- intermediates.delete(batch_idx)
- intermediates.update(batch_idx, output)
+ layer(**inputs)
+
+ LifecycleCallbacks.sequential_epoch_end()
+
+ # this pass does not trigger modifier hooks
+ # and is only used for capturing outputs from
+ # newly compressed modules
+ with HooksMixin.disable_hooks():
+ for batch_idx in tqdm.tqdm(
+ range(len(dataloader)), desc=prop_desc
+ ):
+ inputs = intermediates.fetch(batch_idx)
+ output = layer(**inputs)
+
+ if layer_index < num_layers - 1:
+ next_layer = layers[layer_index + 1]
+ output = to_next_layer_kwargs(output, next_layer)
+ output = maybe_inject_pos_embeddings(
+ output, next_layer, inputs
+ )
+
+ intermediates.delete(batch_idx)
+ intermediates.update(batch_idx, output)
# redundant, finish any remaining compression
LifecycleCallbacks.calibration_epoch_end()
diff --git a/src/llmcompressor/pipelines/registry.py b/src/llmcompressor/pipelines/registry.py
index 77d6e79ab..2c1a54cf5 100644
--- a/src/llmcompressor/pipelines/registry.py
+++ b/src/llmcompressor/pipelines/registry.py
@@ -7,18 +7,13 @@
from torch.utils.data.dataloader import DataLoader
from llmcompressor.modifiers import Modifier
-from llmcompressor.modifiers.awq import AWQModifier
-from llmcompressor.modifiers.obcq.sgpt_base import SparsityModifierBase
-from llmcompressor.modifiers.quantization import GPTQModifier, QuantizationMixin
-from llmcompressor.modifiers.smoothquant import SmoothQuantModifier
+from llmcompressor.modifiers.quantization import QuantizationModifier
if TYPE_CHECKING:
from llmcompressor.args.dataset_arguments import DatasetArguments
__all__ = ["CalibrationPipeline"]
-SEQUENTIAL_MODIFIERS = (AWQModifier, GPTQModifier, SparsityModifierBase)
-
class CalibrationPipeline(ABC, RegistryMixin):
@staticmethod
@@ -43,7 +38,7 @@ def from_modifiers(
:return: CalibrationPipeline instance to be called with data (if not datafree)
"""
user = standardize_lookup_name(user) if user else None
- inferred = standardize_lookup_name(cls._validate_infer_pipeline(modifiers))
+ inferred = standardize_lookup_name(cls._infer_pipeline(modifiers))
independent = standardize_lookup_name("independent")
if user == independent:
@@ -59,35 +54,11 @@ def from_modifiers(
return cls.load_from_registry(pipeline)
@staticmethod
- def _validate_infer_pipeline(modifiers: List[Modifier]) -> str:
- if any(isinstance(modifier, SEQUENTIAL_MODIFIERS) for modifier in modifiers):
- return "sequential"
-
- active_qmods = _get_active_quant_modifiers(modifiers)
- if len(active_qmods) > 1:
- raise ValueError(
- f"Recipe contains more than one active quantization config "
- f"({active_qmods}). These configs may be conflicting, Please modify "
- "your recipe to use at most one quantization config"
- )
-
- if len(active_qmods) == 1:
- quant_modifier = active_qmods[0]
- config = quant_modifier.resolve_quantization_config()
- if config.requires_calibration_data():
- return "basic"
- else:
+ def _infer_pipeline(modifiers: List[Modifier]) -> str:
+ # calibration can only be skipped when the sole modifier is a data-free QuantizationModifier
+ if len(modifiers) == 1 and isinstance(modifiers[0], QuantizationModifier):
+ config = modifiers[0].resolve_quantization_config()
+ if not config.requires_calibration_data():
return "datafree"
- if any(isinstance(modifier, SmoothQuantModifier) for modifier in modifiers):
- return "basic"
-
- return "datafree"
-
-
-def _get_active_quant_modifiers(modifiers: List[Modifier]) -> List[QuantizationMixin]:
- return [
- modifier
- for modifier in modifiers
- if isinstance(modifier, QuantizationMixin) and modifier.has_config()
- ]
+ return "sequential"
diff --git a/src/llmcompressor/pipelines/sequential/__init__.py b/src/llmcompressor/pipelines/sequential/__init__.py
index d96ee6987..7c726f6c4 100644
--- a/src/llmcompressor/pipelines/sequential/__init__.py
+++ b/src/llmcompressor/pipelines/sequential/__init__.py
@@ -1,3 +1,2 @@
# flake8: noqa
-from .helpers import get_targets_from_modifiers
from .pipeline import *
diff --git a/src/llmcompressor/pipelines/sequential/helpers.py b/src/llmcompressor/pipelines/sequential/helpers.py
index b7937a2fc..4f562818a 100644
--- a/src/llmcompressor/pipelines/sequential/helpers.py
+++ b/src/llmcompressor/pipelines/sequential/helpers.py
@@ -2,11 +2,15 @@
import inspect
from collections import deque
from dataclasses import dataclass
-from typing import Any, Dict, List, Optional, Set
+from typing import TYPE_CHECKING, Any, Dict, List, Optional, Set
import torch
-from compressed_tensors import has_offloaded_params
from compressed_tensors.quantization import find_name_or_class_matches
+from compressed_tensors.utils import (
+ has_offloaded_params,
+ offloaded_dispatch,
+ remove_dispatch,
+)
from loguru import logger
from torch.fx import Graph, GraphModule, Node
from torch.fx.graph import PythonCode
@@ -23,7 +27,15 @@
from .ast_helpers import autowrap_forwards
-__all__ = ["trace_subgraphs", "Subgraph", "get_targets_from_modifiers"]
+if TYPE_CHECKING:
+ from llmcompressor.args.dataset_arguments import DatasetArguments
+
+__all__ = [
+ "trace_subgraphs",
+ "Subgraph",
+ "get_sequential_targets",
+ "dispatch_for_sequential",
+]
@dataclass
@@ -416,44 +428,59 @@ def match_modules(model: Module, target_names: List[str]) -> Set[Module]:
)
-def get_targets_from_modifiers(
- modifiers: List[Modifier], model: PreTrainedModel
+def get_sequential_targets(
+ modifiers: List[Modifier], model: PreTrainedModel, args: "DatasetArguments"
) -> List[str]:
"""
- Infer sequential targets from modifiers list
+ Infer sequential targets from modifiers list and dataset args
:param model: model being calibrated
:param modifiers: list of modifiers being applied during calibration
+ :param args: dataset arguments passed by the user
:return: list of sequential targets
"""
- # avoid circular import
- from llmcompressor.pipelines.registry import SEQUENTIAL_MODIFIERS
-
- sequential_modifiers = [
- modifier for modifier in modifiers if isinstance(modifier, SEQUENTIAL_MODIFIERS)
+ modifier_targets = [
+ (modifier, modifier.sequential_targets)
+ for modifier in modifiers
+ if getattr(modifier, "sequential_targets", None) is not None
]
- if len(sequential_modifiers) >= 2:
- types = [type(modifier) for modifier in sequential_modifiers]
+ # deprecation warning
+ if len(modifier_targets) >= 1:
logger.warning(
+ "Passing sequential targets through modifiers is deprecated, "
+ "please use `oneshot(sequential_targets=...)`"
+ )
+
+ # cannot infer from multiple modifiers
+ if len(modifier_targets) >= 2:
+ types = [type(modifier) for modifier, _ in modifier_targets]
+ raise ValueError(
"Cannot infer sequential targets from multiple sequential modifiers "
- f"({types}). Defaulting to {types[0]}"
+ f"({types})"
)
- elif len(sequential_modifiers) <= 0:
- types = [type(modifier) for modifier in modifiers]
- raise ValueError(f"Cannot infer sequential targets from list of {types}")
- modifier = sequential_modifiers[0]
+ # resolve single modifier
+ if len(modifier_targets) == 1:
+ if args.sequential_targets is not None:
+ raise ValueError(
+ f"Got sequential targets from both {type(modifier_targets[0][0])} "
+ "and dataset arguments `sequential_targets`"
+ )
+
+ sequential_targets = modifier_targets[0][1]
- # infer sequential targets
- if modifier.sequential_targets is None:
- sequential_targets = get_no_split_params(model)
- elif isinstance(modifier.sequential_targets, str):
- sequential_targets = [modifier.sequential_targets]
+ # if no modifiers, use data args
else:
- sequential_targets = modifier.sequential_targets
+ sequential_targets = args.sequential_targets # may be `None`
- return sequential_targets
+ # validate and infer
+ if sequential_targets is None:
+ return get_no_split_params(model)
+ elif isinstance(sequential_targets, str):
+ return [sequential_targets]
+ else:
+ return sequential_targets
def add_line_numbers(text: str) -> str:
@@ -485,3 +512,22 @@ def is_ancestor(module: Module) -> bool:
is_ancestor(model)
return ancestors
+
+
+def dispatch_for_sequential(model: PreTrainedModel) -> PreTrainedModel:
+ """
+ Dispatch a model for calibration with a sequential pipeline.
+ The model is offloaded to the CPU and dispatched to the first CUDA device, if available.
+ Removes any existing dispatch hooks.
+
+ :param model: model to dispatch
+ :return: dispatched model
+ """
+ remove_dispatch(model)
+
+ if torch.cuda.is_available():
+ offloaded_dispatch(model, execution_device=torch.device("cuda:0"))
+ else:
+ logger.warning("CUDA is not available! Compressing model on CPU instead")
+
+ return model
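
As the pipeline changes below show, this helper replaces the old user-supplied `device_map` for calibration: the model is loaded without a device map and offload-dispatched just before tracing. A condensed sketch of that flow, with an illustrative model stub (the helper is internal, so treat this as orientation rather than a public API example):

```python
from transformers import AutoModelForCausalLM
from llmcompressor.pipelines.sequential.helpers import dispatch_for_sequential

# load without device_map; weights remain on CPU until calibration
model = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Meta-Llama-3-8B-Instruct", torch_dtype="auto"
)

# offload to CPU and use cuda:0 as the execution device, falling back to CPU
dispatch_for_sequential(model)
```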
diff --git a/src/llmcompressor/pipelines/sequential/pipeline.py b/src/llmcompressor/pipelines/sequential/pipeline.py
index 22c47d894..a2a714565 100644
--- a/src/llmcompressor/pipelines/sequential/pipeline.py
+++ b/src/llmcompressor/pipelines/sequential/pipeline.py
@@ -1,16 +1,17 @@
from typing import TYPE_CHECKING
import torch
-import tqdm
-from compressed_tensors.utils import get_execution_device
+from compressed_tensors.utils import disable_offloading, get_execution_device
from torch.utils.data.dataloader import DataLoader
+from tqdm import tqdm
from llmcompressor.core import LifecycleCallbacks, active_session
from llmcompressor.modifiers.utils.hooks import HooksMixin
from llmcompressor.pipelines.cache import IntermediatesCache
from llmcompressor.pipelines.registry import CalibrationPipeline
from llmcompressor.pipelines.sequential.helpers import (
- get_targets_from_modifiers,
+ dispatch_for_sequential,
+ get_sequential_targets,
trace_subgraphs,
)
from llmcompressor.utils.helpers import DisableQuantization, calibration_forward_context
@@ -51,45 +52,50 @@ def __call__(
"""
session = active_session()
+ # prepare model for sequential onloading
+ dispatch_for_sequential(model)
+
# prepare to trace subgraphs
modifiers = session.get_modifiers()
- sequential_targets = get_targets_from_modifiers(modifiers, model)
+ sequential_targets = get_sequential_targets(modifiers, model, dataset_args)
ignore = dataset_args.tracing_ignore
# trace subgraphs
sample_input = next(iter(dataloader))
subgraphs = trace_subgraphs(model, sample_input, sequential_targets, ignore)
+ num_subgraphs = len(subgraphs)
LifecycleCallbacks.calibration_epoch_start()
with calibration_forward_context(model), DisableQuantization(model):
# prepare intermediates cache
model_device = get_execution_device(model)
- intermediates = IntermediatesCache.from_dataloader(dataloader, model_device)
+ activations = IntermediatesCache.from_dataloader(dataloader, model_device)
- num_subgraphs = len(subgraphs)
for subgraph_index, subgraph in enumerate(subgraphs):
# prepare tqdm description texts
calib_desc = f"({subgraph_index + 1}/{num_subgraphs}): Calibrating"
prop_desc = f"({subgraph_index + 1}/{num_subgraphs}): Propagating"
- # do a preliminary pass to trigger modifier hooks
- for batch_idx in tqdm.tqdm(range(len(dataloader)), desc=calib_desc):
- inputs = intermediates.fetch(batch_idx, subgraph.input_names)
- subgraph.forward(model, **inputs)
-
- LifecycleCallbacks.sequential_epoch_end()
-
- # this pass does not trigger modifier hooks
- # and is only used for capturing outputs from newly compressed modules
- with HooksMixin.disable_hooks():
- for batch_idx in tqdm.tqdm(range(len(dataloader)), desc=prop_desc):
- inputs = intermediates.fetch(batch_idx, subgraph.input_names)
- output = subgraph.forward(model, **inputs)
-
- if subgraph_index < num_subgraphs - 1:
- intermediates.update(batch_idx, output)
- intermediates.delete(batch_idx, subgraph.consumed_names)
+ # reduce memory movement by keeping modules onloaded
+ with disable_offloading():
+ # do a preliminary pass to trigger modifier hooks
+ for batch_idx in tqdm(range(len(dataloader)), desc=calib_desc):
+ inputs = activations.fetch(batch_idx, subgraph.input_names)
+ subgraph.forward(model, **inputs)
+
+ LifecycleCallbacks.sequential_epoch_end()
+
+ # this pass does not trigger modifier hooks
+ # and is only used for capturing outputs of newly compressed modules
+ with HooksMixin.disable_hooks():
+ for batch_idx in tqdm(range(len(dataloader)), desc=prop_desc):
+ inputs = activations.fetch(batch_idx, subgraph.input_names)
+ output = subgraph.forward(model, **inputs)
+
+ if subgraph_index < num_subgraphs - 1:
+ activations.update(batch_idx, output)
+ activations.delete(batch_idx, subgraph.consumed_names)
# redundant, finish any remaining compression
LifecycleCallbacks.calibration_epoch_end()
diff --git a/src/llmcompressor/pytorch/model_load/helpers.py b/src/llmcompressor/pytorch/model_load/helpers.py
index 0ffbd053e..de4b061ec 100644
--- a/src/llmcompressor/pytorch/model_load/helpers.py
+++ b/src/llmcompressor/pytorch/model_load/helpers.py
@@ -15,7 +15,6 @@
__all__ = [
"copy_python_files_from_model_cache",
- "fallback_to_cpu",
"parse_dtype",
"get_session_model",
"get_completed_stages",
@@ -71,22 +70,6 @@ def save_checkpoint(
compressor.decompress_model(model)
-def fallback_to_cpu(device: str) -> str:
- """
- Takes in a device string and forces it to cpu if cuda is not available
-
- :param device: device id to check
- :return: device modified for CUDA status
- """
- if "cuda" in device and not torch.cuda.is_available():
- logger.warning(
- f"Requested {device} but CUDA is not available, falling back to CPU"
- )
- return "cpu"
-
- return device
-
-
def parse_dtype(dtype_arg: Union[str, torch.dtype]) -> torch.dtype:
"""
:param dtype_arg: dtype or string to parse
diff --git a/src/llmcompressor/transformers/compression/helpers.py b/src/llmcompressor/transformers/compression/helpers.py
index 179d5bc11..4dd004454 100644
--- a/src/llmcompressor/transformers/compression/helpers.py
+++ b/src/llmcompressor/transformers/compression/helpers.py
@@ -1,27 +1,19 @@
from collections import defaultdict
-from typing import Dict, List, Optional, Tuple, Type, Union
+from typing import Dict, List, Optional, Tuple
-import psutil
import torch
-from accelerate import infer_auto_device_map, init_empty_weights
from accelerate.accelerator import get_state_dict_offloaded_model
from compressed_tensors.quantization.utils import iter_named_leaf_modules, module_type
from compressed_tensors.utils import align_module_device
-from torch.nn.modules import Linear
from tqdm import tqdm
-from transformers import AutoModelForCausalLM
from llmcompressor.pytorch.utils import get_linear_layers
from llmcompressor.pytorch.utils.helpers import tensor_sparsity
-from llmcompressor.utils.pytorch import get_layers, get_no_split_params
__ALL__ = [
"tensor_follows_mask_structure",
"infer_sparsity_structure_from_stage_modifiers",
"infer_sparsity_structure_from_model",
- "hessian_memory_requirements",
- "custom_offload_device_map",
- "calculate_offload_device_map",
"infer_sparse_targets_and_ignores",
"is_sparse_compression_target",
]
@@ -111,156 +103,6 @@ def infer_sparsity_structure_from_model(model: torch.nn.Module) -> Optional[str]
return None
-def hessian_memory_requirements(model: torch.nn.Module) -> int:
- """
- Determines the number of bytes needed to store Hessian data for a single
- transformer layer in model. This is used for reserving memory for GPTQ
- quantization
-
- :param model: model to calculate requirements for
- :return: number of bytes required to reserve for GPTQ on a single layer
- """
- transformer_layers = get_layers(get_no_split_params(model), model)
- total_hessian_elems = {}
- max_column_size = {}
- for no_split_name, no_split_layer in transformer_layers.items():
- total_hessian_elems[no_split_name] = 0
- max_column_size[no_split_name] = 0
- for _name, module in no_split_layer.named_modules():
- if isinstance(module, Linear) and hasattr(module, "weight"):
- column_size = module.weight.shape[1]
- total_hessian_elems[no_split_name] += column_size * column_size
- if column_size > max_column_size[no_split_name]:
- # max extra memory for inverse calculation
- max_column_size[no_split_name] = column_size
-
- max_total_hessian_elems = max(total_hessian_elems.values())
- overall_max_column_size = max(max_column_size.values())
- bytes_per_weight = 32 // 8 # hessians are float32
- inverse_reserved = overall_max_column_size * overall_max_column_size
- return (max_total_hessian_elems + inverse_reserved) * bytes_per_weight
-
-
-def quantization_memory_requirement(model: torch.nn.Module) -> int:
- """
- Determines the max number of bytes needed to store quantization scale and zp data
-
- :param model: model to calculate requirements for
- :return: number of bytes required to reserve for quantization
- """
-
- total_elements = 0
- for _, module in model.named_modules():
- if isinstance(module, Linear):
- for param in module.parameters():
- # assume the max of group 128 and static scale/zp
- # TODO: base this on the recipe instead instead of assuming max
-
- # potentially just bias term
- max_quant_shape = param.shape[0] // 128
-
- if len(param.size()) > 1: # weights
- max_quant_shape *= param.shape[1]
-
- total_elements += max_quant_shape * 4
-
- bytes_ratio = 32 // 16 # assuming float16
- return total_elements * bytes_ratio
-
-
-def custom_offload_device_map(
- model_stub: str,
- max_memory_per_gpu: Union[str, int],
- num_gpus: int = 1,
- model_cls: Type = AutoModelForCausalLM,
- **model_kwargs,
-) -> Dict[Union[int, str], Union[int, str]]:
- """
- Calculates the optimal gpu mappings for model_stub stored as torch_dtype, where
- each GPU is restricted to allocating a specific amount of memory.
-
- :param model_stub: local path or HF stub to calculate mapping for
- :param max_memory_per_gpu: Max memory to allocate on each GPU, as either a string
- such as "10GB" or an integer number of bytes
- :param num_gpus: number of gpus to utilize
- :param model_cls: model class to use when initializing model structure,
- default is AutoModelForCausalLM
- :param model_kwargs: keyword arguments to pass to model initializer
- :return: memory mapping for layers of model_stub to be passed to from_pretrained()
- """
- max_cpu_memory = psutil.virtual_memory().available
- memory_limits = {device: max_memory_per_gpu for device in range(num_gpus)}
- memory_limits["cpu"] = max_cpu_memory
-
- device_map = {}
- with init_empty_weights():
- dummy_model = model_cls.from_pretrained(model_stub, **model_kwargs)
- device_map = infer_auto_device_map(
- dummy_model,
- max_memory=memory_limits,
- no_split_module_classes=dummy_model._no_split_modules,
- )
- del dummy_model
-
- return device_map
-
-
-def calculate_offload_device_map(
- model_stub: str,
- reserve_for_hessians=False,
- num_gpus: int = 1,
- torch_dtype: torch.dtype = torch.float16,
- model_cls: Type = AutoModelForCausalLM,
- **model_kwargs,
-) -> Dict[Union[int, str], Union[int, str]]:
- """
- Calculates the optimal gpu mappings for model_stub stored as torch_dtype. Takes
- into account extra memory required for quantization and (optionally) GPTQ hessians
-
- :param model_stub: local path or HF stub to calculate mapping for
- :param reserve_for_hessians: whether to reserve memory for GPTQ
- :param num_gpus: number of gpus to utilize
- :param model_cls: model class to use when initializing model structure,
- default is AutoModelForCausalLM
- :param model_kwargs: keyword arguments to pass to model initializer
- :return: memory mapping for layers of model_stub to be passed to from_pretrained()
- """
- max_cpu_memory = psutil.virtual_memory().available
- max_gpu_memory = torch.cuda.mem_get_info(0)[0]
- available_gpus = torch.cuda.device_count()
- if available_gpus < num_gpus:
- raise ValueError(
- f"Requested {num_gpus} GPUs but only {available_gpus} are available."
- )
- max_gpu_memory = [max_gpu_memory] * num_gpus
-
- device_map = {}
- with init_empty_weights():
- dummy_model = model_cls.from_pretrained(
- model_stub, torch_dtype=torch_dtype, **model_kwargs
- )
-
- reserved_memory = 0
- if reserve_for_hessians:
- reserved_memory = hessian_memory_requirements(dummy_model)
- reserved_memory += quantization_memory_requirement(dummy_model)
-
- memory_limits = {
- idx: (max_memory - reserved_memory)
- for idx, max_memory in enumerate(max_gpu_memory)
- }
- memory_limits["cpu"] = max_cpu_memory
-
- device_map = infer_auto_device_map(
- dummy_model,
- max_memory=memory_limits,
- no_split_module_classes=dummy_model._no_split_modules,
- )
- del dummy_model
-
- return device_map
-
-
def infer_sparse_targets_and_ignores(
model: torch.nn.Module,
sparsity_structure: str,
diff --git a/src/llmcompressor/utils/__init__.py b/src/llmcompressor/utils/__init__.py
index 98d5e1c65..c4fb71cdc 100644
--- a/src/llmcompressor/utils/__init__.py
+++ b/src/llmcompressor/utils/__init__.py
@@ -4,4 +4,5 @@
# flake8: noqa
+from .dev import *
from .helpers import *
diff --git a/src/llmcompressor/utils/dev.py b/src/llmcompressor/utils/dev.py
index 4af08448b..57ce74fb1 100644
--- a/src/llmcompressor/utils/dev.py
+++ b/src/llmcompressor/utils/dev.py
@@ -5,6 +5,9 @@
from typing import Type
import torch
+from accelerate import dispatch_model, infer_auto_device_map
+from accelerate.utils import get_balanced_memory
+from compressed_tensors.utils import remove_dispatch
from huggingface_hub import snapshot_download
from safetensors.torch import save_file
from transformers import AutoModelForCausalLM, PreTrainedModel
@@ -13,7 +16,11 @@
from llmcompressor.utils.helpers import patch_attr
-__all__ = ["skip_weights_download", "patch_transformers_logger_level"]
+__all__ = [
+ "skip_weights_download",
+ "patch_transformers_logger_level",
+ "dispatch_for_generation",
+]
@contextlib.contextmanager
@@ -106,3 +113,24 @@ def patch_transformers_logger_level(level: int = logging.ERROR):
transformers_logger.setLevel(level=level)
yield
transformers_logger.setLevel(level=restore_log_level)
+
+
+def dispatch_for_generation(model: PreTrainedModel) -> PreTrainedModel:
+ """
+ Dispatch a model for autoregressive generation. Modules are dispatched evenly
+ across available devices and kept onloaded if possible. Removes any HF hooks
+ that may have existed previously.
+
+ :param model: model to dispatch
+ :return: model which is dispatched
+ """
+ remove_dispatch(model)
+
+ max_memory = get_balanced_memory(
+ model,
+ dtype=model.dtype,
+ no_split_module_classes=model._get_no_split_modules("auto"),
+ )
+ device_map = infer_auto_device_map(model, dtype=model.dtype, max_memory=max_memory)
+
+ return dispatch_model(model, device_map=device_map)
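
`dispatch_for_generation` is the helper the updated example scripts call right before sampling. The resulting usage pattern mirrors the examples above (`model` and `tokenizer` are assumed to come from the preceding calibration step):

```python
from llmcompressor.utils import dispatch_for_generation

# after oneshot(...) completes, balance the model evenly across available devices
dispatch_for_generation(model)
input_ids = tokenizer("Hello my name is", return_tensors="pt").input_ids.to(model.device)
output = model.generate(input_ids, max_new_tokens=100)
print(tokenizer.decode(output[0]))
```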
diff --git a/tests/e2e/e2e_utils.py b/tests/e2e/e2e_utils.py
index 2325b7a34..853d2318b 100644
--- a/tests/e2e/e2e_utils.py
+++ b/tests/e2e/e2e_utils.py
@@ -14,28 +14,21 @@
def _load_model_and_processor(
model: str,
model_class: str,
- device: str,
):
pretrained_model_class = getattr(transformers, model_class)
- loaded_model = pretrained_model_class.from_pretrained(
- model, device_map=device, torch_dtype="auto"
- )
+ loaded_model = pretrained_model_class.from_pretrained(model, torch_dtype="auto")
processor = AutoProcessor.from_pretrained(model)
return loaded_model, processor
@log_time
-def _run_oneshot(device: str, **oneshot_kwargs):
- oneshot(
- **oneshot_kwargs,
- oneshot_device=device,
- )
+def _run_oneshot(**oneshot_kwargs):
+ oneshot(**oneshot_kwargs)
def run_oneshot_for_e2e_testing(
model: str,
model_class: str,
- device: str,
num_calibration_samples: int,
max_seq_length: int,
dataset_id: str,
@@ -49,7 +42,7 @@ def run_oneshot_for_e2e_testing(
oneshot_kwargs = {}
loaded_model, processor = _load_model_and_processor(
- model=model, model_class=model_class, device=device
+ model=model, model_class=model_class
)
if dataset_id:
@@ -86,6 +79,6 @@ def data_collator(batch):
# Apply quantization.
logger.info("ONESHOT KWARGS", oneshot_kwargs)
- _run_oneshot(device=device, **oneshot_kwargs)
+ _run_oneshot(**oneshot_kwargs)
return oneshot_kwargs["model"], processor
diff --git a/tests/e2e/vLLM/test_vllm.py b/tests/e2e/vLLM/test_vllm.py
index 64d8204e5..89ddb5219 100644
--- a/tests/e2e/vLLM/test_vllm.py
+++ b/tests/e2e/vLLM/test_vllm.py
@@ -88,7 +88,6 @@ def set_up(self, test_data_file: str):
logger.info("========== RUNNING ==============")
logger.info(self.save_dir)
- self.device = "cuda:0"
self.prompts = [
"The capital of France is",
"The president of the US is",
@@ -105,7 +104,6 @@ def test_vllm(self, test_data_file: str):
oneshot_model, tokenizer = run_oneshot_for_e2e_testing(
model=self.model,
model_class=self.model_class,
- device=self.device,
num_calibration_samples=self.num_calibration_samples,
max_seq_length=self.max_seq_length,
scheme=self.scheme,
diff --git a/tests/examples/test_big_models_with_accelerate.py b/tests/examples/test_big_models_with_accelerate.py
deleted file mode 100644
index 019017bdd..000000000
--- a/tests/examples/test_big_models_with_accelerate.py
+++ /dev/null
@@ -1,74 +0,0 @@
-from pathlib import Path
-
-import pytest
-
-from tests.examples.utils import (
- ReadMe,
- copy_and_run_script,
- gen_cmd_fail_message,
- requires_gpu_count,
-)
-
-
-@pytest.fixture
-def example_dir() -> str:
- return "examples/big_models_with_accelerate"
-
-
-@pytest.mark.example
-class TestBigModelsWithAccelerate:
- """
- Tests for examples in the "big_models_with_accelerate" example folder.
- """
-
- def test_readme_has_install_command(self, example_dir: str):
- """
- Test that the README has a valid install command.
- """
- readme_path = Path.cwd() / example_dir / "README.md"
- readme = ReadMe(readme_path)
-
- code = readme.get_code_block_content(position=1, lang="shell")
- assert "pip install" in code
-
- assert code.startswith("pip install llmcompressor")
-
- @pytest.mark.parametrize(
- ("script_filename", "visible_gpus"),
- [
- pytest.param("cpu_offloading_fp8.py", "0", id="cpu_offloading"),
- pytest.param(
- "multi_gpu_int8.py",
- "",
- id="multi_gpu_int8",
- marks=[
- requires_gpu_count(2),
- pytest.mark.multi_gpu,
- ],
- ),
- pytest.param(
- "mult_gpus_int8_device_map.py",
- "0",
- id="mult_gpus_int8_device_map",
- ),
- ],
- )
- @requires_gpu_count(1)
- def test_example_scripts(
- self,
- example_dir: str,
- visible_gpus: str,
- script_filename: str,
- tmp_path: Path,
- monkeypatch: pytest.MonkeyPatch,
- ):
- """
- Test for the example scripts in the folder.
- """
-
- if visible_gpus:
- monkeypatch.setenv("CUDA_VISIBLE_DEVICES", visible_gpus)
-
- command, result = copy_and_run_script(tmp_path, example_dir, script_filename)
-
- assert result.returncode == 0, gen_cmd_fail_message(command, result)
diff --git a/tests/llmcompressor/transformers/tracing/test_models.py b/tests/llmcompressor/transformers/tracing/test_models.py
index 327f3d55d..135928902 100644
--- a/tests/llmcompressor/transformers/tracing/test_models.py
+++ b/tests/llmcompressor/transformers/tracing/test_models.py
@@ -136,7 +136,6 @@ def test_model_trace(model_id, model_class, targets, modality, backends):
modality=modality,
trust_remote_code=True,
skip_weights=True,
- device_map="cpu",
)
target_modules = get_target_modules(model, targets)
diff --git a/tests/lmeval/test_lmeval.py b/tests/lmeval/test_lmeval.py
index d4d6e6056..51aa50665 100644
--- a/tests/lmeval/test_lmeval.py
+++ b/tests/lmeval/test_lmeval.py
@@ -90,7 +90,6 @@ def set_up(self, test_data_file: str):
logger.info("========== RUNNING ==============")
logger.info(self.scheme)
- self.device = "cuda:0"
self.num_calibration_samples = 512
self.max_seq_length = 2048
@@ -103,7 +102,6 @@ def test_lm_eval(self, test_data_file: str):
oneshot_model, processor = run_oneshot_for_e2e_testing(
model=self.model,
model_class=self.model_class,
- device=self.device,
num_calibration_samples=self.num_calibration_samples,
max_seq_length=self.max_seq_length,
scheme=self.scheme,