From b30eade3ed2b505eb59950609c9cbc6e728addc0 Mon Sep 17 00:00:00 2001 From: Kyle Sayers Date: Thu, 19 Jun 2025 10:55:47 -0400 Subject: [PATCH 01/22] deepseekv3 Signed-off-by: Kyle Sayers --- examples/quantizing_moe/deepseekv3_example.py | 85 +++++++++++++++++++ src/llmcompressor/entrypoints/oneshot.py | 10 +++ src/llmcompressor/modeling/__init__.py | 3 + src/llmcompressor/modeling/deepseek_v3.py | 48 +++++++++++ src/llmcompressor/modeling/prepare.py | 22 +++++ src/llmcompressor/utils/module.py | 27 ++++++ 6 files changed, 195 insertions(+) create mode 100644 examples/quantizing_moe/deepseekv3_example.py create mode 100644 src/llmcompressor/modeling/__init__.py create mode 100644 src/llmcompressor/modeling/deepseek_v3.py create mode 100644 src/llmcompressor/modeling/prepare.py create mode 100644 src/llmcompressor/utils/module.py diff --git a/examples/quantizing_moe/deepseekv3_example.py b/examples/quantizing_moe/deepseekv3_example.py new file mode 100644 index 000000000..ecec45a19 --- /dev/null +++ b/examples/quantizing_moe/deepseekv3_example.py @@ -0,0 +1,85 @@ +from datasets import load_dataset +from transformers import AutoModelForCausalLM, AutoTokenizer + +from llmcompressor.modeling import prepare_for_quantization +from llmcompressor.modifiers.quantization import GPTQModifier +from llmcompressor.transformers import oneshot + +# Select model and load it. +model_id = "RedHatAI/DeepSeek-V3-BF16" +model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype="auto") +tokenizer = AutoTokenizer.from_pretrained(model_id) +model = prepare_for_quantization(model) + +# Select calibration dataset. +DATASET_ID = "HuggingFaceH4/ultrachat_200k" +DATASET_SPLIT = "train_sft" + +# Select number of samples. 512 samples is a good place to start. +# Increasing the number of samples can improve accuracy. +NUM_CALIBRATION_SAMPLES = 512 +MAX_SEQUENCE_LENGTH = 2048 + +# Load dataset and preprocess. +ds = load_dataset(DATASET_ID, split=f"{DATASET_SPLIT}[:{NUM_CALIBRATION_SAMPLES}]") +ds = ds.shuffle(seed=42) + + +def preprocess(example): + return { + "text": tokenizer.apply_chat_template( + example["messages"], + tokenize=False, + ) + } + + +ds = ds.map(preprocess) + + +# Tokenize inputs. +def tokenize(sample): + return tokenizer( + sample["text"], + padding=False, + max_length=MAX_SEQUENCE_LENGTH, + truncation=True, + add_special_tokens=False, + ) + + +ds = ds.map(tokenize, remove_columns=ds.column_names) + +# Configure the quantization algorithm to run. +# * quantize the weights to 4 bit with GPTQ with a group size 128 +recipe = GPTQModifier( + targets="Linear", + scheme="W4A16", + ignore=["lm_head"], + sequential_targets=["DeepseekV3Attention", "DeepseekV3MLP"], +) + +# Apply algorithms. +oneshot( + model=model, + dataset=ds, + recipe=recipe, + max_seq_length=MAX_SEQUENCE_LENGTH, + num_calibration_samples=NUM_CALIBRATION_SAMPLES, +) + +# Save to disk compressed. +SAVE_DIR = model_id.split("/")[-1] + "-W4A16-G128" +model.save_pretrained(SAVE_DIR, save_compressed=True) +tokenizer.save_pretrained(SAVE_DIR) + +# Load model after saving +model = AutoModelForCausalLM.from_pretrained(SAVE_DIR, device_map="auto") + +# Confirm generations of the quantized model look sane. 
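+# The model was reloaded above from the compressed checkpoint with
+# device_map="auto", so the inputs below are moved to CUDA before generating.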
+print("\n\n") +print("========== SAMPLE GENERATION ==============") +input_ids = tokenizer("Hello my name is", return_tensors="pt").input_ids.to("cuda") +output = model.generate(input_ids, max_new_tokens=100) +print(tokenizer.decode(output[0])) +print("==========================================\n\n") diff --git a/src/llmcompressor/entrypoints/oneshot.py b/src/llmcompressor/entrypoints/oneshot.py index 945c71943..fe5624cd8 100644 --- a/src/llmcompressor/entrypoints/oneshot.py +++ b/src/llmcompressor/entrypoints/oneshot.py @@ -2,6 +2,8 @@ from datetime import datetime from typing import TYPE_CHECKING, List, Optional, Union +import torch +from compressed_tensors.utils import offloaded_dispatch from loguru import logger from torch.utils.data import DataLoader from transformers import PreTrainedModel, PreTrainedTokenizerBase, ProcessorMixin @@ -127,6 +129,14 @@ def __init__( # initialize the model and processor pre_process(model_args) + # offload to cpu if possible + if "cuda" in str(model_args.oneshot_device) and torch.cuda.is_available(): + offloaded_dispatch( + model_args.model, execution_device=model_args.oneshot_device + ) + else: + logger.warning("CUDA is not available! Compressing model on CPU instead") + # Set instance attributes self.model = self.model_args.model self.processor = self.model_args.processor diff --git a/src/llmcompressor/modeling/__init__.py b/src/llmcompressor/modeling/__init__.py new file mode 100644 index 000000000..e2c22ed1f --- /dev/null +++ b/src/llmcompressor/modeling/__init__.py @@ -0,0 +1,3 @@ +# flake8: noqa + +from .prepare import * diff --git a/src/llmcompressor/modeling/deepseek_v3.py b/src/llmcompressor/modeling/deepseek_v3.py new file mode 100644 index 000000000..4b885ff64 --- /dev/null +++ b/src/llmcompressor/modeling/deepseek_v3.py @@ -0,0 +1,48 @@ +import torch +from transformers.models.deepseek_v3.modeling_deepseek_v3 import DeepseekV3MoE + + +class DeepseekV3MoECalibrate(torch.nn.Module): + def __init__(self, config, experts, gate, shared_experts): + super().__init__() + self.config = config + self.experts = experts + self.gate = gate + self.shared_experts = shared_experts + + def forward(self, hidden_states): + residuals = hidden_states + orig_shape = hidden_states.shape + topk_indices, topk_weights = self.gate(hidden_states) + hidden_states = hidden_states.view(-1, hidden_states.shape[-1]) + + # Begin MoE + final_hidden_states = torch.zeros_like(hidden_states, dtype=topk_weights.dtype) + expert_mask = torch.nn.functional.one_hot( + topk_indices, num_classes=len(self.experts) + ) + expert_mask = expert_mask.permute(2, 0, 1) + + for expert_idx in range(len(self.experts)): + expert = self.experts[expert_idx] + mask = expert_mask[expert_idx] + token_indices, weight_indices = torch.where(mask) + + expert_weights = topk_weights[token_indices, weight_indices] + expert_input = hidden_states[token_indices] + expert_output = expert(expert_input) + weighted_output = expert_output * expert_weights.unsqueeze(-1) + + if token_indices.numel() > 0: + final_hidden_states.index_add_(0, token_indices, weighted_output) + # End MoE + + hidden_states = final_hidden_states.type(hidden_states.dtype).view(*orig_shape) + hidden_states = hidden_states + self.shared_experts(residuals) + return hidden_states + + +def replace(module: DeepseekV3MoE) -> DeepseekV3MoECalibrate: + return DeepseekV3MoECalibrate( + module.config, module.experts, module.gate, module.shared_experts + ) diff --git a/src/llmcompressor/modeling/prepare.py b/src/llmcompressor/modeling/prepare.py new file 
mode 100644 index 000000000..a8dedf8ee --- /dev/null +++ b/src/llmcompressor/modeling/prepare.py @@ -0,0 +1,22 @@ +import torch +from transformers import PreTrainedModel +from transformers.models.deepseek_v3.modeling_deepseek_v3 import DeepseekV3MoE + +from llmcompressor.modeling.deepseek_v3 import replace as replace_DeepseekV3MoE +from llmcompressor.utils.module import module_bfs + +__all__ = ["prepare_for_quantization"] + +replacements = { + DeepseekV3MoE: replace_DeepseekV3MoE, +} + + +def prepare_for_quantization(model: PreTrainedModel) -> PreTrainedModel: + def replace(module: torch.nn.Module) -> torch.nn.Module: + if module.__class__ in replacements: + return replacements[module.__class__](module) + else: + return module + + return module_bfs(model, replace, progress=True) diff --git a/src/llmcompressor/utils/module.py b/src/llmcompressor/utils/module.py new file mode 100644 index 000000000..a02aa8b4a --- /dev/null +++ b/src/llmcompressor/utils/module.py @@ -0,0 +1,27 @@ +from typing import Callable, Union + +import torch +import tqdm + +__all__ = ["module_bfs"] + + +def module_bfs( + module: torch.nn.Module, + func: Callable[[torch.nn.Module], torch.nn.Module], + pre: bool = True, + progress: Union[bool, tqdm.tqdm] = False, +) -> torch.nn.Module: + if progress is True: + total = len(list(module.modules())) + progress = tqdm.tqdm(total=total) + if pre: + module = func(module) + for name, child in list(module.named_children()): + module.add_module(name, module_bfs(child, func, pre, progress)) + if not pre: + module = func(module) + if isinstance(progress, tqdm.tqdm): + progress.update(1) + + return module From a957f2f2c98b3b5e3efa8fea5339dd1502682fe3 Mon Sep 17 00:00:00 2001 From: Kyle Sayers Date: Thu, 19 Jun 2025 10:56:53 -0400 Subject: [PATCH 02/22] remove dreg Signed-off-by: Kyle Sayers --- src/llmcompressor/entrypoints/oneshot.py | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/src/llmcompressor/entrypoints/oneshot.py b/src/llmcompressor/entrypoints/oneshot.py index fe5624cd8..945c71943 100644 --- a/src/llmcompressor/entrypoints/oneshot.py +++ b/src/llmcompressor/entrypoints/oneshot.py @@ -2,8 +2,6 @@ from datetime import datetime from typing import TYPE_CHECKING, List, Optional, Union -import torch -from compressed_tensors.utils import offloaded_dispatch from loguru import logger from torch.utils.data import DataLoader from transformers import PreTrainedModel, PreTrainedTokenizerBase, ProcessorMixin @@ -129,14 +127,6 @@ def __init__( # initialize the model and processor pre_process(model_args) - # offload to cpu if possible - if "cuda" in str(model_args.oneshot_device) and torch.cuda.is_available(): - offloaded_dispatch( - model_args.model, execution_device=model_args.oneshot_device - ) - else: - logger.warning("CUDA is not available! 
Compressing model on CPU instead") - # Set instance attributes self.model = self.model_args.model self.processor = self.model_args.processor From 2fd2a25569114ce8059bccfff2dc077790b38d0b Mon Sep 17 00:00:00 2001 From: Kyle Sayers Date: Thu, 19 Jun 2025 11:03:33 -0400 Subject: [PATCH 03/22] reformat example Signed-off-by: Kyle Sayers --- examples/quantizing_moe/deepseekv3_example.py | 20 +++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/examples/quantizing_moe/deepseekv3_example.py b/examples/quantizing_moe/deepseekv3_example.py index ecec45a19..b34a9faa7 100644 --- a/examples/quantizing_moe/deepseekv3_example.py +++ b/examples/quantizing_moe/deepseekv3_example.py @@ -4,6 +4,7 @@ from llmcompressor.modeling import prepare_for_quantization from llmcompressor.modifiers.quantization import GPTQModifier from llmcompressor.transformers import oneshot +from llmcompressor.utils import dispatch_for_generation # Select model and load it. model_id = "RedHatAI/DeepSeek-V3-BF16" @@ -68,18 +69,17 @@ def tokenize(sample): num_calibration_samples=NUM_CALIBRATION_SAMPLES, ) -# Save to disk compressed. -SAVE_DIR = model_id.split("/")[-1] + "-W4A16-G128" -model.save_pretrained(SAVE_DIR, save_compressed=True) -tokenizer.save_pretrained(SAVE_DIR) - -# Load model after saving -model = AutoModelForCausalLM.from_pretrained(SAVE_DIR, device_map="auto") - # Confirm generations of the quantized model look sane. print("\n\n") print("========== SAMPLE GENERATION ==============") -input_ids = tokenizer("Hello my name is", return_tensors="pt").input_ids.to("cuda") -output = model.generate(input_ids, max_new_tokens=100) +dispatch_for_generation(model) +sample = tokenizer("Hello my name is", return_tensors="pt") +sample = {key: value.to("cuda") for key, value in sample.items()} +output = model.generate(**sample, max_new_tokens=100) print(tokenizer.decode(output[0])) print("==========================================\n\n") + +# Save to disk compressed. 
+SAVE_DIR = model_id.rstrip("/").split("/")[-1] + "-W4A16-G128" +model.save_pretrained(SAVE_DIR, save_compressed=True) +tokenizer.save_pretrained(SAVE_DIR) From b8b217c7bfeff3992ac167db5fbdcaf1dc208dee Mon Sep 17 00:00:00 2001 From: Kyle Sayers Date: Thu, 19 Jun 2025 11:24:23 -0400 Subject: [PATCH 04/22] wip: clean up moe examples Signed-off-by: Kyle Sayers --- examples/quantizing_moe/deepseek_moe_w4a16.py | 125 ------------------ .../quantizing_moe/deepseek_moe_w8a8_fp8.py | 99 -------------- .../quantizing_moe/deepseek_recipe_w4a16.yaml | 8 -- ...e_w8a8_int8.py => deepseekv2_5_example.py} | 29 ++-- examples/quantizing_moe/deepseekv3_example.py | 13 +- .../quantizing_moe/mixtral_moe_w8a8_fp8.py | 96 +++++++++----- examples/quantizing_moe/qwen_moe_w4a16.py | 7 +- 7 files changed, 85 insertions(+), 292 deletions(-) delete mode 100644 examples/quantizing_moe/deepseek_moe_w4a16.py delete mode 100644 examples/quantizing_moe/deepseek_moe_w8a8_fp8.py delete mode 100644 examples/quantizing_moe/deepseek_recipe_w4a16.yaml rename examples/quantizing_moe/{deepseek_moe_w8a8_int8.py => deepseekv2_5_example.py} (76%) diff --git a/examples/quantizing_moe/deepseek_moe_w4a16.py b/examples/quantizing_moe/deepseek_moe_w4a16.py deleted file mode 100644 index 9880e9248..000000000 --- a/examples/quantizing_moe/deepseek_moe_w4a16.py +++ /dev/null @@ -1,125 +0,0 @@ -import torch -from datasets import load_dataset -from packaging.version import Version -from transformers import AutoModelForCausalLM, AutoTokenizer, __version__ - -from llmcompressor import oneshot -from llmcompressor.utils import dispatch_for_generation - -# NOTE: transformers 4.49.0 has an attribute error with DeepSeek. -# Please consider either downgrading your transformers version to a -# previous version or upgrading to a version where this bug is fixed - -# select a Mixture of Experts model for quantization -MODEL_ID = "deepseek-ai/DeepSeek-V2.5" - -model = AutoModelForCausalLM.from_pretrained( - MODEL_ID, torch_dtype=torch.bfloat16, trust_remote_code=True -) -tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) - -# Select calibration dataset. -DATASET_ID = "HuggingFaceH4/ultrachat_200k" -DATASET_SPLIT = "train_sft" -NUM_CALIBRATION_SAMPLES = 512 -MAX_SEQUENCE_LENGTH = 2048 - - -# Load dataset and preprocess. -ds = load_dataset(DATASET_ID, split=f"{DATASET_SPLIT}[:{NUM_CALIBRATION_SAMPLES}]") -ds = ds.shuffle(seed=42) - - -def preprocess(example): - return { - "text": tokenizer.apply_chat_template( - example["messages"], - tokenize=False, - ) - } - - -ds = ds.map(preprocess) - - -# Tokenize inputs. -def tokenize(sample): - return tokenizer( - sample["text"], - padding=False, - max_length=MAX_SEQUENCE_LENGTH, - truncation=True, - add_special_tokens=False, - ) - - -ds = ds.map(tokenize, remove_columns=ds.column_names) - -# define a llmcompressor recipe for W416 quantization -# since the MoE gate layers are sensitive to quantization, we add them to the ignore -# list so they remain at full precision -recipe = "deepseek_recipe_w4a16.yaml" - -oneshot( - model=model, - dataset=ds, - recipe=recipe, - max_seq_length=MAX_SEQUENCE_LENGTH, - num_calibration_samples=NUM_CALIBRATION_SAMPLES, - save_compressed=True, - trust_remote_code_model=True, -) - -# Confirm generations of the quantized model look sane. 
-# Generation is broken for deepseek models when using the latest transformers package -if Version(__version__) < Version("4.48"): - print("========== SAMPLE GENERATION ==============") - dispatch_for_generation(model) - input_ids = tokenizer("Hello my name is", return_tensors="pt").input_ids.to("cuda") - output = model.generate(input_ids, max_new_tokens=20) - print(tokenizer.decode(output[0])) - print("==========================================") -else: - print( - "WARNING: cannot perform sample generation of " - "deepseek models with transformers >= 4.48" - ) - -# Save to disk in compressed-tensors format. -SAVE_DIR = MODEL_ID.rstrip("/").split("/")[-1] + "-W4A16" -model.save_pretrained(SAVE_DIR, save_compressed=True) -tokenizer.save_pretrained(SAVE_DIR) - - -# Run the model on vLLM -try: - from vllm import LLM, SamplingParams - - vllm_installed = True -except ImportError: - vllm_installed = False - -if vllm_installed: - print("vLLM installed, running using vLLM") - sampling_params = SamplingParams(temperature=0.80, top_p=0.95) - llm = LLM( - model=SAVE_DIR, - tensor_parallel_size=2, - trust_remote_code=True, - max_model_len=1042, - dtype=torch.half, - ) - prompts = [ - "The capital of France is", - "The president of the US is", - "My name is", - ] - - outputs = llm.generate(prompts, sampling_params) - print("================= vLLM GENERATION ======================") - for output in outputs: - assert output - prompt = output.prompt - generated_text = output.outputs[0].text - print("PROMPT", prompt) - print("GENERATED TEXT", generated_text) diff --git a/examples/quantizing_moe/deepseek_moe_w8a8_fp8.py b/examples/quantizing_moe/deepseek_moe_w8a8_fp8.py deleted file mode 100644 index 0bc9c24df..000000000 --- a/examples/quantizing_moe/deepseek_moe_w8a8_fp8.py +++ /dev/null @@ -1,99 +0,0 @@ -from datasets import load_dataset -from packaging.version import Version -from transformers import AutoModelForCausalLM, AutoTokenizer, __version__ - -from llmcompressor import oneshot -from llmcompressor.modifiers.quantization import QuantizationModifier -from llmcompressor.utils import dispatch_for_generation - -# NOTE: transformers 4.49.0 has an attribute error with DeepSeek. -# Please consider either downgrading your transformers version to a -# previous version or upgrading to a version where this bug is fixed - -# select a Mixture of Experts model for quantization -MODEL_ID = "deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct" - -model = AutoModelForCausalLM.from_pretrained( - MODEL_ID, torch_dtype="auto", trust_remote_code=True -) -tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) - -# Select calibration dataset. -# its recommended to use more calibration samples for MoE models so each expert is hit -DATASET_ID = "HuggingFaceH4/ultrachat_200k" -DATASET_SPLIT = "train_sft" -NUM_CALIBRATION_SAMPLES = 2048 -MAX_SEQUENCE_LENGTH = 2048 - - -# Load dataset and preprocess. -ds = load_dataset(DATASET_ID, split=f"{DATASET_SPLIT}[:{NUM_CALIBRATION_SAMPLES}]") -ds = ds.shuffle(seed=42) - - -def preprocess(example): - return { - "text": tokenizer.apply_chat_template( - example["messages"], - tokenize=False, - ) - } - - -ds = ds.map(preprocess) - - -# Tokenize inputs. 
-def tokenize(sample): - return tokenizer( - sample["text"], - padding=False, - max_length=MAX_SEQUENCE_LENGTH, - truncation=True, - add_special_tokens=False, - ) - - -ds = ds.map(tokenize, remove_columns=ds.column_names) - -# define a llmcompressor recipe for FP8 W8A8 quantization -# since the MoE gate layers are sensitive to quantization, we add them to the ignore -# list so they remain at full precision -recipe = [ - QuantizationModifier( - targets="Linear", - scheme="FP8", - ignore=["lm_head", "re:.*mlp.gate$"], - ), -] - -oneshot( - model=model, - dataset=ds, - recipe=recipe, - max_seq_length=MAX_SEQUENCE_LENGTH, - num_calibration_samples=NUM_CALIBRATION_SAMPLES, - trust_remote_code_model=True, -) - -# Confirm generations of the quantized model look sane. -# Generation is broken for deepseek models when using the latest transformers package -if Version(__version__) < Version("4.48"): - print("========== SAMPLE GENERATION ==============") - dispatch_for_generation(model) - SAMPLE_INPUT = ["I love quantization because"] - tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) - inputs = tokenizer(SAMPLE_INPUT, return_tensors="pt", padding=True).to(model.device) - output = model.generate(**inputs, max_length=50) - text_output = tokenizer.batch_decode(output) - print(text_output) -else: - print( - "WARNING: cannot perform sample generation of " - "deepseek models with transformers >= 4.48" - ) - -# Save to disk in compressed-tensors format. -SAVE_DIR = MODEL_ID.rstrip("/").split("/")[-1] + "-FP8" -model.save_pretrained(SAVE_DIR, save_compressed=True) -tokenizer.save_pretrained(SAVE_DIR) diff --git a/examples/quantizing_moe/deepseek_recipe_w4a16.yaml b/examples/quantizing_moe/deepseek_recipe_w4a16.yaml deleted file mode 100644 index 23f276e2f..000000000 --- a/examples/quantizing_moe/deepseek_recipe_w4a16.yaml +++ /dev/null @@ -1,8 +0,0 @@ -quant_stage: - quant_modifiers: - GPTQModifier: - ignore: [lm_head, "re:.*mlp.gate$"] - config_groups: - group_0: - weights: {num_bits: 4, type: int, symmetric: true, strategy: channel, dynamic: false} - targets: [Linear] diff --git a/examples/quantizing_moe/deepseek_moe_w8a8_int8.py b/examples/quantizing_moe/deepseekv2_5_example.py similarity index 76% rename from examples/quantizing_moe/deepseek_moe_w8a8_int8.py rename to examples/quantizing_moe/deepseekv2_5_example.py index 3ec506c34..c2b3b0305 100644 --- a/examples/quantizing_moe/deepseek_moe_w8a8_int8.py +++ b/examples/quantizing_moe/deepseekv2_5_example.py @@ -12,7 +12,7 @@ # previous version or upgrading to a version where this bug is fixed # select a Mixture of Experts model for quantization -MODEL_ID = "deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct" +MODEL_ID = "deepseek-ai/DeepSeek-V2.5" model = AutoModelForCausalLM.from_pretrained( MODEL_ID, torch_dtype=torch.bfloat16, trust_remote_code=True @@ -20,10 +20,9 @@ tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) # Select calibration dataset. -# its recommended to use more calibration samples for MoE models so each expert is hit DATASET_ID = "HuggingFaceH4/ultrachat_200k" DATASET_SPLIT = "train_sft" -NUM_CALIBRATION_SAMPLES = 2048 +NUM_CALIBRATION_SAMPLES = 512 MAX_SEQUENCE_LENGTH = 2048 @@ -57,16 +56,12 @@ def tokenize(sample): ds = ds.map(tokenize, remove_columns=ds.column_names) -# define a llmcompressor recipe for INT8 W8A8 quantization +# Configure the quantization algorithm to run. 
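+# The W4A16 preset quantizes weights to 4 bits (group size 128 by default in
+# compressed-tensors) while activations stay in 16-bit precision.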
# since the MoE gate layers are sensitive to quantization, we add them to the ignore # list so they remain at full precision -recipe = [ - GPTQModifier( - targets="Linear", - scheme="W8A8", - ignore=["lm_head", "re:.*mlp.gate$"], - ), -] +recipe = GPTQModifier( + targets="Linear", scheme="W4A16", ignore=["lm_head", "re:.*mlp.gate$"] +) oneshot( model=model, @@ -82,12 +77,10 @@ def tokenize(sample): if Version(__version__) < Version("4.48"): print("========== SAMPLE GENERATION ==============") dispatch_for_generation(model) - SAMPLE_INPUT = ["I love quantization because"] - tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) - inputs = tokenizer(SAMPLE_INPUT, return_tensors="pt", padding=True).to(model.device) - output = model.generate(**inputs, max_length=50) - text_output = tokenizer.batch_decode(output) - print(text_output) + sample = tokenizer("Hello my name is", return_tensors="pt") + sample = {key: value.to("cuda") for key, value in sample.items()} + output = model.generate(**sample, max_new_tokens=100) + print(tokenizer.decode(output[0])) print("==========================================") else: print( @@ -96,6 +89,6 @@ def tokenize(sample): ) # Save to disk in compressed-tensors format. -SAVE_DIR = MODEL_ID.rstrip("/").split("/")[-1] + "-W8A8" +SAVE_DIR = MODEL_ID.rstrip("/").split("/")[-1] + "-W4A16" model.save_pretrained(SAVE_DIR, save_compressed=True) tokenizer.save_pretrained(SAVE_DIR) diff --git a/examples/quantizing_moe/deepseekv3_example.py b/examples/quantizing_moe/deepseekv3_example.py index b34a9faa7..1b4c334ff 100644 --- a/examples/quantizing_moe/deepseekv3_example.py +++ b/examples/quantizing_moe/deepseekv3_example.py @@ -7,6 +7,8 @@ from llmcompressor.utils import dispatch_for_generation # Select model and load it. +# For DeepSeekv3, we require a full precision model in order to properly calibrate +# `DeepSeek-V3-BF16` is a DeepSeek-V3 FP8 model which has been converted to BF16 model_id = "RedHatAI/DeepSeek-V3-BF16" model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype="auto") tokenizer = AutoTokenizer.from_pretrained(model_id) @@ -52,21 +54,22 @@ def tokenize(sample): ds = ds.map(tokenize, remove_columns=ds.column_names) # Configure the quantization algorithm to run. -# * quantize the weights to 4 bit with GPTQ with a group size 128 +# since the MoE gate layers are sensitive to quantization, we add them to the ignore +# list so they remain at full precision recipe = GPTQModifier( - targets="Linear", - scheme="W4A16", - ignore=["lm_head"], - sequential_targets=["DeepseekV3Attention", "DeepseekV3MLP"], + targets="Linear", scheme="W4A16", ignore=["lm_head", "re:.*mlp.gate$"] ) # Apply algorithms. +# due to the large size of DeepSeekV3, we specify sequential targets such that +# only one MLP is loaded into GPU memory at a time oneshot( model=model, dataset=ds, recipe=recipe, max_seq_length=MAX_SEQUENCE_LENGTH, num_calibration_samples=NUM_CALIBRATION_SAMPLES, + sequential_targets=["DeepseekV3Attention", "DeepseekV3MLP"], ) # Confirm generations of the quantized model look sane. 
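# A rough sketch (adapted from the removed deepseek_moe_w4a16.py example) of how
# the compressed checkpoint written by this example could be served with vLLM.
# The parallelism and context-length settings below are illustrative assumptions,
# not part of this patch, and should be adjusted to the available hardware.
from vllm import LLM, SamplingParams

SAVE_DIR = "DeepSeek-V3-BF16-W4A16-G128"  # path written by deepseekv3_example.py

sampling_params = SamplingParams(temperature=0.80, top_p=0.95)
llm = LLM(
    model=SAVE_DIR,
    tensor_parallel_size=8,  # assumption: set to the number of available GPUs
    trust_remote_code=True,
    max_model_len=2048,  # assumption: kept small to limit KV-cache memory
)

outputs = llm.generate(["The capital of France is"], sampling_params)
for output in outputs:
    print("PROMPT", output.prompt)
    print("GENERATED TEXT", output.outputs[0].text)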
diff --git a/examples/quantizing_moe/mixtral_moe_w8a8_fp8.py b/examples/quantizing_moe/mixtral_moe_w8a8_fp8.py index a17bf873d..5021c7947 100644 --- a/examples/quantizing_moe/mixtral_moe_w8a8_fp8.py +++ b/examples/quantizing_moe/mixtral_moe_w8a8_fp8.py @@ -1,56 +1,84 @@ -from typing import List - -from packaging.version import Version -from transformers import AutoModelForCausalLM, AutoTokenizer, __version__ +import torch +from datasets import load_dataset +from transformers import AutoModelForCausalLM, AutoTokenizer from llmcompressor import oneshot from llmcompressor.modifiers.quantization import QuantizationModifier from llmcompressor.utils import dispatch_for_generation +# select a Mixture of Experts model for quantization MODEL_ID = "mistralai/Mixtral-8x7B-Instruct-v0.1" -model = AutoModelForCausalLM.from_pretrained(MODEL_ID, torch_dtype="auto") +model = AutoModelForCausalLM.from_pretrained( + MODEL_ID, torch_dtype=torch.bfloat16, trust_remote_code=True +) tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) - -# Dataset config parameters -DATASET_ID = "open_platypus" -DATASET_SPLIT = "train" -MAX_SEQ_LENGTH = 2048 +# Select calibration dataset. +DATASET_ID = "HuggingFaceH4/ultrachat_200k" +DATASET_SPLIT = "train_sft" NUM_CALIBRATION_SAMPLES = 512 +MAX_SEQUENCE_LENGTH = 2048 + + +# Load dataset and preprocess. +ds = load_dataset(DATASET_ID, split=f"{DATASET_SPLIT}[:{NUM_CALIBRATION_SAMPLES}]") +ds = ds.shuffle(seed=42) + + +def preprocess(example): + return { + "text": tokenizer.apply_chat_template( + example["messages"], + tokenize=False, + ) + } + -# Recipe -layers_to_ignore: List[str] = [ - "lm_head", - "re:.*block_sparse_moe.gate", # does not quantize well -] -recipe = QuantizationModifier(scheme="FP8", targets="Linear", ignore=layers_to_ignore) +ds = ds.map(preprocess) +# Tokenize inputs. +def tokenize(sample): + return tokenizer( + sample["text"], + padding=False, + max_length=MAX_SEQUENCE_LENGTH, + truncation=True, + add_special_tokens=False, + ) + + +ds = ds.map(tokenize, remove_columns=ds.column_names) + +# Configure the quantization algorithm to run. +# since the MoE gate layers are sensitive to quantization, we add them to the ignore +# list so they remain at full precision +recipe = QuantizationModifier( + scheme="FP8", + targets="Linear", + ignore=[ + "lm_head", + "re:.*block_sparse_moe.gate", # does not quantize well + ], +) + oneshot( model=model, - tokenizer=tokenizer, - dataset=DATASET_ID, - splits={"calibration": f"{DATASET_SPLIT}[:{NUM_CALIBRATION_SAMPLES}]"}, + dataset=ds, recipe=recipe, - max_seq_length=MAX_SEQ_LENGTH, + max_seq_length=MAX_SEQUENCE_LENGTH, num_calibration_samples=NUM_CALIBRATION_SAMPLES, + trust_remote_code_model=True, ) -# Confirm generations of the quantized model look sane. 
-# Generation is broken for deepseek models when using the latest transformers package -if Version(__version__) < Version("4.48"): - print("========== SAMPLE GENERATION ==============") - dispatch_for_generation(model) - input_ids = tokenizer("Hello my name is", return_tensors="pt").input_ids.to("cuda") - output = model.generate(input_ids, max_new_tokens=20) - print(tokenizer.decode(output[0])) - print("==========================================") -else: - print( - "WARNING: cannot perform sample generation of " - "deepseek models with transformers >= 4.48" - ) +print("========== SAMPLE GENERATION ==============") +dispatch_for_generation(model) +sample = tokenizer("Hello my name is", return_tensors="pt") +sample = {key: value.to("cuda") for key, value in sample.items()} +output = model.generate(**sample, max_new_tokens=100) +print(tokenizer.decode(output[0])) +print("==========================================") # Save to disk in compressed-tensors format. SAVE_DIR = MODEL_ID.rstrip("/").split("/")[-1] + "-FP8" diff --git a/examples/quantizing_moe/qwen_moe_w4a16.py b/examples/quantizing_moe/qwen_moe_w4a16.py index 40a78a9b7..2531e6528 100644 --- a/examples/quantizing_moe/qwen_moe_w4a16.py +++ b/examples/quantizing_moe/qwen_moe_w4a16.py @@ -73,12 +73,13 @@ def tokenize(sample): # Confirm generations of the quantized model look sane. print("========== SAMPLE GENERATION ==============") dispatch_for_generation(model) -input_ids = tokenizer("Hello my name is", return_tensors="pt").input_ids.to("cuda") -output = model.generate(input_ids, max_new_tokens=20) +sample = tokenizer("Hello my name is", return_tensors="pt") +sample = {key: value.to("cuda") for key, value in sample.items()} +output = model.generate(**sample, max_new_tokens=100) print(tokenizer.decode(output[0])) print("==========================================") # Save to disk in compressed-tensors format. -SAVE_DIR = MODEL_ID.rstrip("/").split("/")[-1] + "-quantized.w4a16" +SAVE_DIR = MODEL_ID.rstrip("/").split("/")[-1] + "-W4A16-G128" model.save_pretrained(SAVE_DIR, save_compressed=True) tokenizer.save_pretrained(SAVE_DIR) From 43bc91df08aa5c14e9cd7653fd3a65d52fe50c52 Mon Sep 17 00:00:00 2001 From: Kyle Sayers Date: Thu, 19 Jun 2025 17:21:19 -0400 Subject: [PATCH 05/22] remove deepseek2.5 for now Signed-off-by: Kyle Sayers --- .../quantizing_moe/deepseekv2_5_example.py | 94 ------------------- 1 file changed, 94 deletions(-) delete mode 100644 examples/quantizing_moe/deepseekv2_5_example.py diff --git a/examples/quantizing_moe/deepseekv2_5_example.py b/examples/quantizing_moe/deepseekv2_5_example.py deleted file mode 100644 index c2b3b0305..000000000 --- a/examples/quantizing_moe/deepseekv2_5_example.py +++ /dev/null @@ -1,94 +0,0 @@ -import torch -from datasets import load_dataset -from packaging.version import Version -from transformers import AutoModelForCausalLM, AutoTokenizer, __version__ - -from llmcompressor import oneshot -from llmcompressor.modifiers.quantization import GPTQModifier -from llmcompressor.utils import dispatch_for_generation - -# NOTE: transformers 4.49.0 has an attribute error with DeepSeek. 
-# Please consider either downgrading your transformers version to a -# previous version or upgrading to a version where this bug is fixed - -# select a Mixture of Experts model for quantization -MODEL_ID = "deepseek-ai/DeepSeek-V2.5" - -model = AutoModelForCausalLM.from_pretrained( - MODEL_ID, torch_dtype=torch.bfloat16, trust_remote_code=True -) -tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) - -# Select calibration dataset. -DATASET_ID = "HuggingFaceH4/ultrachat_200k" -DATASET_SPLIT = "train_sft" -NUM_CALIBRATION_SAMPLES = 512 -MAX_SEQUENCE_LENGTH = 2048 - - -# Load dataset and preprocess. -ds = load_dataset(DATASET_ID, split=f"{DATASET_SPLIT}[:{NUM_CALIBRATION_SAMPLES}]") -ds = ds.shuffle(seed=42) - - -def preprocess(example): - return { - "text": tokenizer.apply_chat_template( - example["messages"], - tokenize=False, - ) - } - - -ds = ds.map(preprocess) - - -# Tokenize inputs. -def tokenize(sample): - return tokenizer( - sample["text"], - padding=False, - max_length=MAX_SEQUENCE_LENGTH, - truncation=True, - add_special_tokens=False, - ) - - -ds = ds.map(tokenize, remove_columns=ds.column_names) - -# Configure the quantization algorithm to run. -# since the MoE gate layers are sensitive to quantization, we add them to the ignore -# list so they remain at full precision -recipe = GPTQModifier( - targets="Linear", scheme="W4A16", ignore=["lm_head", "re:.*mlp.gate$"] -) - -oneshot( - model=model, - dataset=ds, - recipe=recipe, - max_seq_length=MAX_SEQUENCE_LENGTH, - num_calibration_samples=NUM_CALIBRATION_SAMPLES, - trust_remote_code_model=True, -) - -# Confirm generations of the quantized model look sane. -# Generation is broken for deepseek models when using the latest transformers package -if Version(__version__) < Version("4.48"): - print("========== SAMPLE GENERATION ==============") - dispatch_for_generation(model) - sample = tokenizer("Hello my name is", return_tensors="pt") - sample = {key: value.to("cuda") for key, value in sample.items()} - output = model.generate(**sample, max_new_tokens=100) - print(tokenizer.decode(output[0])) - print("==========================================") -else: - print( - "WARNING: cannot perform sample generation of " - "deepseek models with transformers >= 4.48" - ) - -# Save to disk in compressed-tensors format. -SAVE_DIR = MODEL_ID.rstrip("/").split("/")[-1] + "-W4A16" -model.save_pretrained(SAVE_DIR, save_compressed=True) -tokenizer.save_pretrained(SAVE_DIR) From 7d8ed369ae6abcdf7e1b8604c61a9d770f9b560f Mon Sep 17 00:00:00 2001 From: Kyle Sayers Date: Thu, 19 Jun 2025 17:38:29 -0400 Subject: [PATCH 06/22] update readme Signed-off-by: Kyle Sayers --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 8f27ff9c6..66bb0a117 100644 --- a/README.md +++ b/README.md @@ -16,6 +16,7 @@ Big updates have landed in LLM Compressor! Check out these exciting new features: +* **DeepSeekV3 and Sequential Onloading Support** As of llm-compressor>=0.6.0, you can now quantize DeepSeekV3 and other large models on a single GPU. Models are broken into disjoint layers which are then onloaded to the GPU one layer at a time. For more information on sequential onloading, see [Big Modeling with Sequential Onloading](examples/big_models_with_sequential_onloading/README.md) as well as the [DeepSeekV3 Example](examples/quantizing_moe/deepseekv3_example.py). * **Preliminary FP4 Quantization Support:** Quantize weights and activations to FP4 and seamlessly run the compressed model in vLLM. 
Model weights and activations are quantized following the NVFP4 [configuration](https://github.com/neuralmagic/compressed-tensors/blob/f5dbfc336b9c9c361b9fe7ae085d5cb0673e56eb/src/compressed_tensors/quantization/quant_scheme.py#L104). See examples of [weight-only quantization](examples/quantization_w4a16_fp4/llama3_example.py) and [fp4 activation support](examples/quantization_w4a4_fp4/llama3_example.py). Support is currently preliminary and additional support will be added for MoEs. * **Axolotl Sparse Finetuning Integration:** Seamlessly finetune sparse LLMs with our Axolotl integration. Learn how to create [fast sparse open-source models with Axolotl and LLM Compressor](https://developers.redhat.com/articles/2025/06/17/axolotl-meets-llm-compressor-fast-sparse-open). See also the [Axolotl integration docs](https://docs.axolotl.ai/docs/custom_integrations.html#llmcompressor). * **AutoAWQ Integration:** Perform low-bit weight-only quantization efficiently using AutoAWQ, now part of LLM Compressor. *Note: This integration should be considered experimental for now. Enhanced support, including for MoE models and improved handling of larger models via layer sequential pipelining, is planned for upcoming releases.* [See the details](https://github.com/vllm-project/llm-compressor/pull/1177). From b7273a90f859f84103dc7bde04ac9a8c4f2611cd Mon Sep 17 00:00:00 2001 From: Kyle Sayers Date: Thu, 19 Jun 2025 18:23:25 -0400 Subject: [PATCH 07/22] infer model device with optional override Signed-off-by: Kyle Sayers --- .../quantizing_moe/deepseekv2_5_example.py | 94 +++++++++++++++++++ src/llmcompressor/args/dataset_arguments.py | 8 ++ src/llmcompressor/pipelines/basic/pipeline.py | 6 +- .../pipelines/layer_sequential/helpers.py | 3 +- .../pipelines/layer_sequential/pipeline.py | 6 +- .../pipelines/sequential/pipeline.py | 6 +- src/llmcompressor/utils/dev.py | 28 +++++- 7 files changed, 145 insertions(+), 6 deletions(-) create mode 100644 examples/quantizing_moe/deepseekv2_5_example.py diff --git a/examples/quantizing_moe/deepseekv2_5_example.py b/examples/quantizing_moe/deepseekv2_5_example.py new file mode 100644 index 000000000..c2b3b0305 --- /dev/null +++ b/examples/quantizing_moe/deepseekv2_5_example.py @@ -0,0 +1,94 @@ +import torch +from datasets import load_dataset +from packaging.version import Version +from transformers import AutoModelForCausalLM, AutoTokenizer, __version__ + +from llmcompressor import oneshot +from llmcompressor.modifiers.quantization import GPTQModifier +from llmcompressor.utils import dispatch_for_generation + +# NOTE: transformers 4.49.0 has an attribute error with DeepSeek. +# Please consider either downgrading your transformers version to a +# previous version or upgrading to a version where this bug is fixed + +# select a Mixture of Experts model for quantization +MODEL_ID = "deepseek-ai/DeepSeek-V2.5" + +model = AutoModelForCausalLM.from_pretrained( + MODEL_ID, torch_dtype=torch.bfloat16, trust_remote_code=True +) +tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) + +# Select calibration dataset. +DATASET_ID = "HuggingFaceH4/ultrachat_200k" +DATASET_SPLIT = "train_sft" +NUM_CALIBRATION_SAMPLES = 512 +MAX_SEQUENCE_LENGTH = 2048 + + +# Load dataset and preprocess. +ds = load_dataset(DATASET_ID, split=f"{DATASET_SPLIT}[:{NUM_CALIBRATION_SAMPLES}]") +ds = ds.shuffle(seed=42) + + +def preprocess(example): + return { + "text": tokenizer.apply_chat_template( + example["messages"], + tokenize=False, + ) + } + + +ds = ds.map(preprocess) + + +# Tokenize inputs. 
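+# The chat-template text produced above is tokenized without padding and truncated
+# to MAX_SEQUENCE_LENGTH; add_special_tokens=False is used since the chat template
+# already adds the special tokens.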
+def tokenize(sample): + return tokenizer( + sample["text"], + padding=False, + max_length=MAX_SEQUENCE_LENGTH, + truncation=True, + add_special_tokens=False, + ) + + +ds = ds.map(tokenize, remove_columns=ds.column_names) + +# Configure the quantization algorithm to run. +# since the MoE gate layers are sensitive to quantization, we add them to the ignore +# list so they remain at full precision +recipe = GPTQModifier( + targets="Linear", scheme="W4A16", ignore=["lm_head", "re:.*mlp.gate$"] +) + +oneshot( + model=model, + dataset=ds, + recipe=recipe, + max_seq_length=MAX_SEQUENCE_LENGTH, + num_calibration_samples=NUM_CALIBRATION_SAMPLES, + trust_remote_code_model=True, +) + +# Confirm generations of the quantized model look sane. +# Generation is broken for deepseek models when using the latest transformers package +if Version(__version__) < Version("4.48"): + print("========== SAMPLE GENERATION ==============") + dispatch_for_generation(model) + sample = tokenizer("Hello my name is", return_tensors="pt") + sample = {key: value.to("cuda") for key, value in sample.items()} + output = model.generate(**sample, max_new_tokens=100) + print(tokenizer.decode(output[0])) + print("==========================================") +else: + print( + "WARNING: cannot perform sample generation of " + "deepseek models with transformers >= 4.48" + ) + +# Save to disk in compressed-tensors format. +SAVE_DIR = MODEL_ID.rstrip("/").split("/")[-1] + "-W4A16" +model.save_pretrained(SAVE_DIR, save_compressed=True) +tokenizer.save_pretrained(SAVE_DIR) diff --git a/src/llmcompressor/args/dataset_arguments.py b/src/llmcompressor/args/dataset_arguments.py index 949933f97..f5e107b10 100644 --- a/src/llmcompressor/args/dataset_arguments.py +++ b/src/llmcompressor/args/dataset_arguments.py @@ -171,6 +171,7 @@ class DatasetArguments(CustomDatasetArguments): "will execute code present on the Hub on your local machine." }, ) + # --- pipeline arguments --- # pipeline: Optional[str] = field( default="independent", metadata={ @@ -196,3 +197,10 @@ class DatasetArguments(CustomDatasetArguments): "definition" }, ) + model_input_device: Optional[str] = field( + default=None, + metadata={ + "help": "Device to put model inputs on for calibration." 
+ "If none is specified, the model input device is inferred from the model" + }, + ) diff --git a/src/llmcompressor/pipelines/basic/pipeline.py b/src/llmcompressor/pipelines/basic/pipeline.py index 605358ae9..ef192755e 100644 --- a/src/llmcompressor/pipelines/basic/pipeline.py +++ b/src/llmcompressor/pipelines/basic/pipeline.py @@ -2,7 +2,6 @@ import torch import tqdm -from compressed_tensors.utils import get_execution_device from torch.utils.data.dataloader import DataLoader from llmcompressor.core import LifecycleCallbacks @@ -10,6 +9,7 @@ from llmcompressor.pipelines.registry import CalibrationPipeline from llmcompressor.pytorch.utils.helpers import tensors_to_device from llmcompressor.utils import calibration_forward_context, dispatch_for_generation +from llmcompressor.utils.dev import infer_model_device if TYPE_CHECKING: from llmcompressor.args.dataset_arguments import DatasetArguments @@ -38,7 +38,9 @@ def __call__( :param dataset_args: dataset arguments relevant to pipelines """ dispatch_for_generation(model) # basic dispatch is identical to generation - model_device = get_execution_device(model) + model_device = dataset_args.model_input_device + if model_device is None: + model_device = infer_model_device(model) LifecycleCallbacks.calibration_epoch_start() diff --git a/src/llmcompressor/pipelines/layer_sequential/helpers.py b/src/llmcompressor/pipelines/layer_sequential/helpers.py index f9d828d14..c8b9fcbd3 100644 --- a/src/llmcompressor/pipelines/layer_sequential/helpers.py +++ b/src/llmcompressor/pipelines/layer_sequential/helpers.py @@ -44,6 +44,7 @@ def capture_first_layer_intermediates( model: Module, first_layer: Module, dataloader: DataLoader, + model_device: torch.device = torch.device("cpu"), mask_padding: bool = True, ) -> IntermediatesCache: """ @@ -68,7 +69,7 @@ def capture_first_layer_intermediates( desc = "Preparing intermediates cache" for batch_index, batch in enumerate(tqdm.tqdm(dataloader, desc=desc)): batch = apply_pad_mask_to_batch(batch) if mask_padding else batch - batch = tensors_to_device(batch, torch.device("cpu")) + batch = tensors_to_device(batch, model_device) try: model(**batch) diff --git a/src/llmcompressor/pipelines/layer_sequential/pipeline.py b/src/llmcompressor/pipelines/layer_sequential/pipeline.py index d8ad73a10..95e4a2fea 100644 --- a/src/llmcompressor/pipelines/layer_sequential/pipeline.py +++ b/src/llmcompressor/pipelines/layer_sequential/pipeline.py @@ -19,6 +19,7 @@ dispatch_for_sequential, get_sequential_targets, ) +from llmcompressor.utils.dev import infer_model_device from llmcompressor.utils.helpers import DisableQuantization, calibration_forward_context if TYPE_CHECKING: @@ -60,6 +61,9 @@ def __call__( # prepare model for sequential onloading dispatch_for_sequential(model) + model_device = dataset_args.model_input_device + if model_device is None: + model_device = infer_model_device(model) # find layers modifiers = session.get_modifiers() @@ -71,7 +75,7 @@ def __call__( with calibration_forward_context(model), DisableQuantization(model): # prepare intermediates cache intermediates: IntermediatesCache = capture_first_layer_intermediates( - model, layers[0], dataloader + model, layers[0], dataloader, model_device ) num_layers = len(layers) diff --git a/src/llmcompressor/pipelines/sequential/pipeline.py b/src/llmcompressor/pipelines/sequential/pipeline.py index 8cefeb0cf..d8ae6661a 100644 --- a/src/llmcompressor/pipelines/sequential/pipeline.py +++ b/src/llmcompressor/pipelines/sequential/pipeline.py @@ -14,6 +14,7 @@ 
get_sequential_targets, trace_subgraphs, ) +from llmcompressor.utils.dev import infer_model_device from llmcompressor.utils.helpers import DisableQuantization, calibration_forward_context if TYPE_CHECKING: @@ -54,6 +55,9 @@ def __call__( # prepare model for sequential onloading dispatch_for_sequential(model) + model_device = dataset_args.model_input_device + if model_device is None: + model_device = infer_model_device(model) # prepare to trace subgraphs modifiers = session.get_modifiers() @@ -69,7 +73,7 @@ def __call__( with calibration_forward_context(model), DisableQuantization(model): # prepare intermediates cache - activations = IntermediatesCache.from_dataloader(dataloader) + activations = IntermediatesCache.from_dataloader(dataloader, model_device) for subgraph_index, subgraph in enumerate(subgraphs): # prepare tqdm description texts diff --git a/src/llmcompressor/utils/dev.py b/src/llmcompressor/utils/dev.py index f0feb6c04..b44e76739 100644 --- a/src/llmcompressor/utils/dev.py +++ b/src/llmcompressor/utils/dev.py @@ -7,7 +7,7 @@ import torch from accelerate import dispatch_model, infer_auto_device_map from accelerate.utils import get_balanced_memory -from compressed_tensors.utils import remove_dispatch +from compressed_tensors.utils import has_offloaded_params, remove_dispatch from huggingface_hub import snapshot_download from safetensors.torch import save_file from transformers import AutoModelForCausalLM, PreTrainedModel @@ -20,6 +20,7 @@ "skip_weights_download", "patch_transformers_logger_level", "dispatch_for_generation", + "infer_model_device", ] @@ -140,3 +141,28 @@ def dispatch_for_generation(model: PreTrainedModel) -> PreTrainedModel: ) return dispatch_model(model, device_map=device_map) + + +def infer_model_device(model: PreTrainedModel) -> torch.device: + """ + Gets the model's execution device (the device that model inputs should be on) + using non-guaranteed but reasonable assumptions about module and parameter order. 
+ + If a model is offloaded, assume that modules execute in the same order + that they are returned by torch.nn.Module.modules() + + If a model is not offloaded, assume that parameters are used in the same order + that they are used + + :param model: model whose execution device is being inferred + :return: device which model inputs should be put on + """ + for module in model.modules(): + if has_offloaded_params(module): + return module._hf_hook.execution_device + + first_param = next(module.parameters(), None) + if first_param is None: + return torch.device("cpu") + + return first_param.device From afebe2e953364232ab745cd0fc95ee0ca7d9b999 Mon Sep 17 00:00:00 2001 From: Kyle Sayers Date: Fri, 20 Jun 2025 10:43:23 -0400 Subject: [PATCH 08/22] handle nullable dataset_args Signed-off-by: Kyle Sayers --- src/llmcompressor/pipelines/basic/pipeline.py | 2 +- src/llmcompressor/pipelines/layer_sequential/pipeline.py | 2 +- src/llmcompressor/pipelines/sequential/pipeline.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/llmcompressor/pipelines/basic/pipeline.py b/src/llmcompressor/pipelines/basic/pipeline.py index ef192755e..87f463bfe 100644 --- a/src/llmcompressor/pipelines/basic/pipeline.py +++ b/src/llmcompressor/pipelines/basic/pipeline.py @@ -38,7 +38,7 @@ def __call__( :param dataset_args: dataset arguments relevant to pipelines """ dispatch_for_generation(model) # basic dispatch is identical to generation - model_device = dataset_args.model_input_device + model_device = getattr(dataset_args, "model_input_device") if model_device is None: model_device = infer_model_device(model) diff --git a/src/llmcompressor/pipelines/layer_sequential/pipeline.py b/src/llmcompressor/pipelines/layer_sequential/pipeline.py index 95e4a2fea..089f7dc8c 100644 --- a/src/llmcompressor/pipelines/layer_sequential/pipeline.py +++ b/src/llmcompressor/pipelines/layer_sequential/pipeline.py @@ -61,7 +61,7 @@ def __call__( # prepare model for sequential onloading dispatch_for_sequential(model) - model_device = dataset_args.model_input_device + model_device = getattr(dataset_args, "model_input_device") if model_device is None: model_device = infer_model_device(model) diff --git a/src/llmcompressor/pipelines/sequential/pipeline.py b/src/llmcompressor/pipelines/sequential/pipeline.py index d8ae6661a..c9ce9cd30 100644 --- a/src/llmcompressor/pipelines/sequential/pipeline.py +++ b/src/llmcompressor/pipelines/sequential/pipeline.py @@ -55,7 +55,7 @@ def __call__( # prepare model for sequential onloading dispatch_for_sequential(model) - model_device = dataset_args.model_input_device + model_device = getattr(dataset_args, "model_input_device") if model_device is None: model_device = infer_model_device(model) From ab3aa3e85ebcb8d4a5a7e8e2bd645c93616baf03 Mon Sep 17 00:00:00 2001 From: Kyle Sayers Date: Fri, 20 Jun 2025 10:46:54 -0400 Subject: [PATCH 09/22] update docstrings, comments Signed-off-by: Kyle Sayers --- src/llmcompressor/args/dataset_arguments.py | 2 +- src/llmcompressor/utils/dev.py | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/llmcompressor/args/dataset_arguments.py b/src/llmcompressor/args/dataset_arguments.py index f5e107b10..677d09daa 100644 --- a/src/llmcompressor/args/dataset_arguments.py +++ b/src/llmcompressor/args/dataset_arguments.py @@ -200,7 +200,7 @@ class DatasetArguments(CustomDatasetArguments): model_input_device: Optional[str] = field( default=None, metadata={ - "help": "Device to put model inputs on for calibration." 
+ "help": "Device to put model inputs on for calibration. " "If none is specified, the model input device is inferred from the model" }, ) diff --git a/src/llmcompressor/utils/dev.py b/src/llmcompressor/utils/dev.py index b44e76739..558ef816e 100644 --- a/src/llmcompressor/utils/dev.py +++ b/src/llmcompressor/utils/dev.py @@ -149,10 +149,10 @@ def infer_model_device(model: PreTrainedModel) -> torch.device: using non-guaranteed but reasonable assumptions about module and parameter order. If a model is offloaded, assume that modules execute in the same order - that they are returned by torch.nn.Module.modules() + that they are returned by `model.modules()` If a model is not offloaded, assume that parameters are used in the same order - that they are used + that they are returned by `model.parameters()` :param model: model whose execution device is being inferred :return: device which model inputs should be put on @@ -161,7 +161,7 @@ def infer_model_device(model: PreTrainedModel) -> torch.device: if has_offloaded_params(module): return module._hf_hook.execution_device - first_param = next(module.parameters(), None) + first_param = next(model.parameters(), None) if first_param is None: return torch.device("cpu") From e9e30c3e6580ba171ba40158ab57746fe601ae90 Mon Sep 17 00:00:00 2001 From: Kyle Sayers Date: Fri, 20 Jun 2025 13:37:55 -0400 Subject: [PATCH 10/22] rename files, update examples tests Signed-off-by: Kyle Sayers --- .../{mixtral_moe_w8a8_fp8.py => mixtral_example.py} | 4 ++-- .../{qwen_moe_w4a16.py => qwen_example.py} | 0 tests/examples/test_quantizing_moe.py | 11 ++++------- 3 files changed, 6 insertions(+), 9 deletions(-) rename examples/quantizing_moe/{mixtral_moe_w8a8_fp8.py => mixtral_example.py} (96%) rename examples/quantizing_moe/{qwen_moe_w4a16.py => qwen_example.py} (100%) diff --git a/examples/quantizing_moe/mixtral_moe_w8a8_fp8.py b/examples/quantizing_moe/mixtral_example.py similarity index 96% rename from examples/quantizing_moe/mixtral_moe_w8a8_fp8.py rename to examples/quantizing_moe/mixtral_example.py index 5021c7947..49b08c722 100644 --- a/examples/quantizing_moe/mixtral_moe_w8a8_fp8.py +++ b/examples/quantizing_moe/mixtral_example.py @@ -55,7 +55,7 @@ def tokenize(sample): # since the MoE gate layers are sensitive to quantization, we add them to the ignore # list so they remain at full precision recipe = QuantizationModifier( - scheme="FP8", + scheme="W4A16", targets="Linear", ignore=[ "lm_head", @@ -81,6 +81,6 @@ def tokenize(sample): print("==========================================") # Save to disk in compressed-tensors format. 
-SAVE_DIR = MODEL_ID.rstrip("/").split("/")[-1] + "-FP8" +SAVE_DIR = MODEL_ID.rstrip("/").split("/")[-1] + "-W4A16-G128" model.save_pretrained(SAVE_DIR, save_compressed=True) tokenizer.save_pretrained(SAVE_DIR) diff --git a/examples/quantizing_moe/qwen_moe_w4a16.py b/examples/quantizing_moe/qwen_example.py similarity index 100% rename from examples/quantizing_moe/qwen_moe_w4a16.py rename to examples/quantizing_moe/qwen_example.py diff --git a/tests/examples/test_quantizing_moe.py b/tests/examples/test_quantizing_moe.py index 49686d25c..50e86c2c8 100644 --- a/tests/examples/test_quantizing_moe.py +++ b/tests/examples/test_quantizing_moe.py @@ -44,14 +44,11 @@ def test_doc_example_command(self, example_dir: str, tmp_path: Path): "script_filename", [ pytest.param( - "deepseek_moe_w4a16.py", - marks=[ - pytest.mark.multi_gpu, - pytest.mark.skip(reason="exceptionally long run time"), - ], + "deepseekv3_example.py", + marks=pytest.mark.skip(reason="exceptionally long run time"), ), - pytest.param("deepseek_moe_w8a8_fp8.py"), - pytest.param("deepseek_moe_w8a8_int8.py", marks=pytest.mark.multi_gpu), + pytest.param("mixtral_example.py"), + pytest.param("qwen_example.py"), ], ) def test_deepseek_example_script( From 6bf5acbc9c06b7e440012f7b0b09576ffbe3a05f Mon Sep 17 00:00:00 2001 From: Kyle Sayers Date: Fri, 20 Jun 2025 13:42:12 -0400 Subject: [PATCH 11/22] rebase on main Signed-off-by: Kyle Sayers --- examples/quantizing_moe/deepseek_moe_w4a16.py | 125 ++++++++++++++++++ .../quantizing_moe/deepseek_moe_w8a8_fp8.py | 99 ++++++++++++++ .../quantizing_moe/deepseek_moe_w8a8_int8.py | 101 ++++++++++++++ .../quantizing_moe/deepseek_recipe_w4a16.yaml | 8 ++ .../quantizing_moe/mixtral_moe_w8a8_fp8.py | 96 +++++--------- examples/quantizing_moe/qwen_moe_w4a16.py | 7 +- src/llmcompressor/modeling/__init__.py | 3 - src/llmcompressor/modeling/deepseek_v3.py | 48 ------- src/llmcompressor/modeling/prepare.py | 22 --- src/llmcompressor/utils/module.py | 27 ---- 10 files changed, 370 insertions(+), 166 deletions(-) create mode 100644 examples/quantizing_moe/deepseek_moe_w4a16.py create mode 100644 examples/quantizing_moe/deepseek_moe_w8a8_fp8.py create mode 100644 examples/quantizing_moe/deepseek_moe_w8a8_int8.py create mode 100644 examples/quantizing_moe/deepseek_recipe_w4a16.yaml delete mode 100644 src/llmcompressor/modeling/__init__.py delete mode 100644 src/llmcompressor/modeling/deepseek_v3.py delete mode 100644 src/llmcompressor/modeling/prepare.py delete mode 100644 src/llmcompressor/utils/module.py diff --git a/examples/quantizing_moe/deepseek_moe_w4a16.py b/examples/quantizing_moe/deepseek_moe_w4a16.py new file mode 100644 index 000000000..9880e9248 --- /dev/null +++ b/examples/quantizing_moe/deepseek_moe_w4a16.py @@ -0,0 +1,125 @@ +import torch +from datasets import load_dataset +from packaging.version import Version +from transformers import AutoModelForCausalLM, AutoTokenizer, __version__ + +from llmcompressor import oneshot +from llmcompressor.utils import dispatch_for_generation + +# NOTE: transformers 4.49.0 has an attribute error with DeepSeek. +# Please consider either downgrading your transformers version to a +# previous version or upgrading to a version where this bug is fixed + +# select a Mixture of Experts model for quantization +MODEL_ID = "deepseek-ai/DeepSeek-V2.5" + +model = AutoModelForCausalLM.from_pretrained( + MODEL_ID, torch_dtype=torch.bfloat16, trust_remote_code=True +) +tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) + +# Select calibration dataset. 
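+# As with the other MoE examples, using more calibration samples helps ensure that
+# every expert receives calibration data; 512 samples is a reasonable starting point.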
+DATASET_ID = "HuggingFaceH4/ultrachat_200k" +DATASET_SPLIT = "train_sft" +NUM_CALIBRATION_SAMPLES = 512 +MAX_SEQUENCE_LENGTH = 2048 + + +# Load dataset and preprocess. +ds = load_dataset(DATASET_ID, split=f"{DATASET_SPLIT}[:{NUM_CALIBRATION_SAMPLES}]") +ds = ds.shuffle(seed=42) + + +def preprocess(example): + return { + "text": tokenizer.apply_chat_template( + example["messages"], + tokenize=False, + ) + } + + +ds = ds.map(preprocess) + + +# Tokenize inputs. +def tokenize(sample): + return tokenizer( + sample["text"], + padding=False, + max_length=MAX_SEQUENCE_LENGTH, + truncation=True, + add_special_tokens=False, + ) + + +ds = ds.map(tokenize, remove_columns=ds.column_names) + +# define a llmcompressor recipe for W416 quantization +# since the MoE gate layers are sensitive to quantization, we add them to the ignore +# list so they remain at full precision +recipe = "deepseek_recipe_w4a16.yaml" + +oneshot( + model=model, + dataset=ds, + recipe=recipe, + max_seq_length=MAX_SEQUENCE_LENGTH, + num_calibration_samples=NUM_CALIBRATION_SAMPLES, + save_compressed=True, + trust_remote_code_model=True, +) + +# Confirm generations of the quantized model look sane. +# Generation is broken for deepseek models when using the latest transformers package +if Version(__version__) < Version("4.48"): + print("========== SAMPLE GENERATION ==============") + dispatch_for_generation(model) + input_ids = tokenizer("Hello my name is", return_tensors="pt").input_ids.to("cuda") + output = model.generate(input_ids, max_new_tokens=20) + print(tokenizer.decode(output[0])) + print("==========================================") +else: + print( + "WARNING: cannot perform sample generation of " + "deepseek models with transformers >= 4.48" + ) + +# Save to disk in compressed-tensors format. +SAVE_DIR = MODEL_ID.rstrip("/").split("/")[-1] + "-W4A16" +model.save_pretrained(SAVE_DIR, save_compressed=True) +tokenizer.save_pretrained(SAVE_DIR) + + +# Run the model on vLLM +try: + from vllm import LLM, SamplingParams + + vllm_installed = True +except ImportError: + vllm_installed = False + +if vllm_installed: + print("vLLM installed, running using vLLM") + sampling_params = SamplingParams(temperature=0.80, top_p=0.95) + llm = LLM( + model=SAVE_DIR, + tensor_parallel_size=2, + trust_remote_code=True, + max_model_len=1042, + dtype=torch.half, + ) + prompts = [ + "The capital of France is", + "The president of the US is", + "My name is", + ] + + outputs = llm.generate(prompts, sampling_params) + print("================= vLLM GENERATION ======================") + for output in outputs: + assert output + prompt = output.prompt + generated_text = output.outputs[0].text + print("PROMPT", prompt) + print("GENERATED TEXT", generated_text) diff --git a/examples/quantizing_moe/deepseek_moe_w8a8_fp8.py b/examples/quantizing_moe/deepseek_moe_w8a8_fp8.py new file mode 100644 index 000000000..0bc9c24df --- /dev/null +++ b/examples/quantizing_moe/deepseek_moe_w8a8_fp8.py @@ -0,0 +1,99 @@ +from datasets import load_dataset +from packaging.version import Version +from transformers import AutoModelForCausalLM, AutoTokenizer, __version__ + +from llmcompressor import oneshot +from llmcompressor.modifiers.quantization import QuantizationModifier +from llmcompressor.utils import dispatch_for_generation + +# NOTE: transformers 4.49.0 has an attribute error with DeepSeek. 
+# Please consider either downgrading your transformers version to a +# previous version or upgrading to a version where this bug is fixed + +# select a Mixture of Experts model for quantization +MODEL_ID = "deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct" + +model = AutoModelForCausalLM.from_pretrained( + MODEL_ID, torch_dtype="auto", trust_remote_code=True +) +tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) + +# Select calibration dataset. +# its recommended to use more calibration samples for MoE models so each expert is hit +DATASET_ID = "HuggingFaceH4/ultrachat_200k" +DATASET_SPLIT = "train_sft" +NUM_CALIBRATION_SAMPLES = 2048 +MAX_SEQUENCE_LENGTH = 2048 + + +# Load dataset and preprocess. +ds = load_dataset(DATASET_ID, split=f"{DATASET_SPLIT}[:{NUM_CALIBRATION_SAMPLES}]") +ds = ds.shuffle(seed=42) + + +def preprocess(example): + return { + "text": tokenizer.apply_chat_template( + example["messages"], + tokenize=False, + ) + } + + +ds = ds.map(preprocess) + + +# Tokenize inputs. +def tokenize(sample): + return tokenizer( + sample["text"], + padding=False, + max_length=MAX_SEQUENCE_LENGTH, + truncation=True, + add_special_tokens=False, + ) + + +ds = ds.map(tokenize, remove_columns=ds.column_names) + +# define a llmcompressor recipe for FP8 W8A8 quantization +# since the MoE gate layers are sensitive to quantization, we add them to the ignore +# list so they remain at full precision +recipe = [ + QuantizationModifier( + targets="Linear", + scheme="FP8", + ignore=["lm_head", "re:.*mlp.gate$"], + ), +] + +oneshot( + model=model, + dataset=ds, + recipe=recipe, + max_seq_length=MAX_SEQUENCE_LENGTH, + num_calibration_samples=NUM_CALIBRATION_SAMPLES, + trust_remote_code_model=True, +) + +# Confirm generations of the quantized model look sane. +# Generation is broken for deepseek models when using the latest transformers package +if Version(__version__) < Version("4.48"): + print("========== SAMPLE GENERATION ==============") + dispatch_for_generation(model) + SAMPLE_INPUT = ["I love quantization because"] + tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) + inputs = tokenizer(SAMPLE_INPUT, return_tensors="pt", padding=True).to(model.device) + output = model.generate(**inputs, max_length=50) + text_output = tokenizer.batch_decode(output) + print(text_output) +else: + print( + "WARNING: cannot perform sample generation of " + "deepseek models with transformers >= 4.48" + ) + +# Save to disk in compressed-tensors format. +SAVE_DIR = MODEL_ID.rstrip("/").split("/")[-1] + "-FP8" +model.save_pretrained(SAVE_DIR, save_compressed=True) +tokenizer.save_pretrained(SAVE_DIR) diff --git a/examples/quantizing_moe/deepseek_moe_w8a8_int8.py b/examples/quantizing_moe/deepseek_moe_w8a8_int8.py new file mode 100644 index 000000000..3ec506c34 --- /dev/null +++ b/examples/quantizing_moe/deepseek_moe_w8a8_int8.py @@ -0,0 +1,101 @@ +import torch +from datasets import load_dataset +from packaging.version import Version +from transformers import AutoModelForCausalLM, AutoTokenizer, __version__ + +from llmcompressor import oneshot +from llmcompressor.modifiers.quantization import GPTQModifier +from llmcompressor.utils import dispatch_for_generation + +# NOTE: transformers 4.49.0 has an attribute error with DeepSeek. 
+# Please consider either downgrading your transformers version to a +# previous version or upgrading to a version where this bug is fixed + +# select a Mixture of Experts model for quantization +MODEL_ID = "deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct" + +model = AutoModelForCausalLM.from_pretrained( + MODEL_ID, torch_dtype=torch.bfloat16, trust_remote_code=True +) +tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) + +# Select calibration dataset. +# its recommended to use more calibration samples for MoE models so each expert is hit +DATASET_ID = "HuggingFaceH4/ultrachat_200k" +DATASET_SPLIT = "train_sft" +NUM_CALIBRATION_SAMPLES = 2048 +MAX_SEQUENCE_LENGTH = 2048 + + +# Load dataset and preprocess. +ds = load_dataset(DATASET_ID, split=f"{DATASET_SPLIT}[:{NUM_CALIBRATION_SAMPLES}]") +ds = ds.shuffle(seed=42) + + +def preprocess(example): + return { + "text": tokenizer.apply_chat_template( + example["messages"], + tokenize=False, + ) + } + + +ds = ds.map(preprocess) + + +# Tokenize inputs. +def tokenize(sample): + return tokenizer( + sample["text"], + padding=False, + max_length=MAX_SEQUENCE_LENGTH, + truncation=True, + add_special_tokens=False, + ) + + +ds = ds.map(tokenize, remove_columns=ds.column_names) + +# define a llmcompressor recipe for INT8 W8A8 quantization +# since the MoE gate layers are sensitive to quantization, we add them to the ignore +# list so they remain at full precision +recipe = [ + GPTQModifier( + targets="Linear", + scheme="W8A8", + ignore=["lm_head", "re:.*mlp.gate$"], + ), +] + +oneshot( + model=model, + dataset=ds, + recipe=recipe, + max_seq_length=MAX_SEQUENCE_LENGTH, + num_calibration_samples=NUM_CALIBRATION_SAMPLES, + trust_remote_code_model=True, +) + +# Confirm generations of the quantized model look sane. +# Generation is broken for deepseek models when using the latest transformers package +if Version(__version__) < Version("4.48"): + print("========== SAMPLE GENERATION ==============") + dispatch_for_generation(model) + SAMPLE_INPUT = ["I love quantization because"] + tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) + inputs = tokenizer(SAMPLE_INPUT, return_tensors="pt", padding=True).to(model.device) + output = model.generate(**inputs, max_length=50) + text_output = tokenizer.batch_decode(output) + print(text_output) + print("==========================================") +else: + print( + "WARNING: cannot perform sample generation of " + "deepseek models with transformers >= 4.48" + ) + +# Save to disk in compressed-tensors format. 
+SAVE_DIR = MODEL_ID.rstrip("/").split("/")[-1] + "-W8A8" +model.save_pretrained(SAVE_DIR, save_compressed=True) +tokenizer.save_pretrained(SAVE_DIR) diff --git a/examples/quantizing_moe/deepseek_recipe_w4a16.yaml b/examples/quantizing_moe/deepseek_recipe_w4a16.yaml new file mode 100644 index 000000000..23f276e2f --- /dev/null +++ b/examples/quantizing_moe/deepseek_recipe_w4a16.yaml @@ -0,0 +1,8 @@ +quant_stage: + quant_modifiers: + GPTQModifier: + ignore: [lm_head, "re:.*mlp.gate$"] + config_groups: + group_0: + weights: {num_bits: 4, type: int, symmetric: true, strategy: channel, dynamic: false} + targets: [Linear] diff --git a/examples/quantizing_moe/mixtral_moe_w8a8_fp8.py b/examples/quantizing_moe/mixtral_moe_w8a8_fp8.py index 5021c7947..a17bf873d 100644 --- a/examples/quantizing_moe/mixtral_moe_w8a8_fp8.py +++ b/examples/quantizing_moe/mixtral_moe_w8a8_fp8.py @@ -1,84 +1,56 @@ -import torch -from datasets import load_dataset -from transformers import AutoModelForCausalLM, AutoTokenizer +from typing import List + +from packaging.version import Version +from transformers import AutoModelForCausalLM, AutoTokenizer, __version__ from llmcompressor import oneshot from llmcompressor.modifiers.quantization import QuantizationModifier from llmcompressor.utils import dispatch_for_generation -# select a Mixture of Experts model for quantization MODEL_ID = "mistralai/Mixtral-8x7B-Instruct-v0.1" -model = AutoModelForCausalLM.from_pretrained( - MODEL_ID, torch_dtype=torch.bfloat16, trust_remote_code=True -) +model = AutoModelForCausalLM.from_pretrained(MODEL_ID, torch_dtype="auto") tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) -# Select calibration dataset. -DATASET_ID = "HuggingFaceH4/ultrachat_200k" -DATASET_SPLIT = "train_sft" -NUM_CALIBRATION_SAMPLES = 512 -MAX_SEQUENCE_LENGTH = 2048 - - -# Load dataset and preprocess. -ds = load_dataset(DATASET_ID, split=f"{DATASET_SPLIT}[:{NUM_CALIBRATION_SAMPLES}]") -ds = ds.shuffle(seed=42) - - -def preprocess(example): - return { - "text": tokenizer.apply_chat_template( - example["messages"], - tokenize=False, - ) - } - - -ds = ds.map(preprocess) - - -# Tokenize inputs. -def tokenize(sample): - return tokenizer( - sample["text"], - padding=False, - max_length=MAX_SEQUENCE_LENGTH, - truncation=True, - add_special_tokens=False, - ) +# Dataset config parameters +DATASET_ID = "open_platypus" +DATASET_SPLIT = "train" +MAX_SEQ_LENGTH = 2048 +NUM_CALIBRATION_SAMPLES = 512 -ds = ds.map(tokenize, remove_columns=ds.column_names) +# Recipe +layers_to_ignore: List[str] = [ + "lm_head", + "re:.*block_sparse_moe.gate", # does not quantize well +] +recipe = QuantizationModifier(scheme="FP8", targets="Linear", ignore=layers_to_ignore) -# Configure the quantization algorithm to run. 
-# since the MoE gate layers are sensitive to quantization, we add them to the ignore -# list so they remain at full precision -recipe = QuantizationModifier( - scheme="FP8", - targets="Linear", - ignore=[ - "lm_head", - "re:.*block_sparse_moe.gate", # does not quantize well - ], -) oneshot( model=model, - dataset=ds, + tokenizer=tokenizer, + dataset=DATASET_ID, + splits={"calibration": f"{DATASET_SPLIT}[:{NUM_CALIBRATION_SAMPLES}]"}, recipe=recipe, - max_seq_length=MAX_SEQUENCE_LENGTH, + max_seq_length=MAX_SEQ_LENGTH, num_calibration_samples=NUM_CALIBRATION_SAMPLES, - trust_remote_code_model=True, ) -print("========== SAMPLE GENERATION ==============") -dispatch_for_generation(model) -sample = tokenizer("Hello my name is", return_tensors="pt") -sample = {key: value.to("cuda") for key, value in sample.items()} -output = model.generate(**sample, max_new_tokens=100) -print(tokenizer.decode(output[0])) -print("==========================================") +# Confirm generations of the quantized model look sane. +# Generation is broken for deepseek models when using the latest transformers package +if Version(__version__) < Version("4.48"): + print("========== SAMPLE GENERATION ==============") + dispatch_for_generation(model) + input_ids = tokenizer("Hello my name is", return_tensors="pt").input_ids.to("cuda") + output = model.generate(input_ids, max_new_tokens=20) + print(tokenizer.decode(output[0])) + print("==========================================") +else: + print( + "WARNING: cannot perform sample generation of " + "deepseek models with transformers >= 4.48" + ) # Save to disk in compressed-tensors format. SAVE_DIR = MODEL_ID.rstrip("/").split("/")[-1] + "-FP8" diff --git a/examples/quantizing_moe/qwen_moe_w4a16.py b/examples/quantizing_moe/qwen_moe_w4a16.py index 2531e6528..40a78a9b7 100644 --- a/examples/quantizing_moe/qwen_moe_w4a16.py +++ b/examples/quantizing_moe/qwen_moe_w4a16.py @@ -73,13 +73,12 @@ def tokenize(sample): # Confirm generations of the quantized model look sane. print("========== SAMPLE GENERATION ==============") dispatch_for_generation(model) -sample = tokenizer("Hello my name is", return_tensors="pt") -sample = {key: value.to("cuda") for key, value in sample.items()} -output = model.generate(**sample, max_new_tokens=100) +input_ids = tokenizer("Hello my name is", return_tensors="pt").input_ids.to("cuda") +output = model.generate(input_ids, max_new_tokens=20) print(tokenizer.decode(output[0])) print("==========================================") # Save to disk in compressed-tensors format. 
-SAVE_DIR = MODEL_ID.rstrip("/").split("/")[-1] + "-W4A16-G128" +SAVE_DIR = MODEL_ID.rstrip("/").split("/")[-1] + "-quantized.w4a16" model.save_pretrained(SAVE_DIR, save_compressed=True) tokenizer.save_pretrained(SAVE_DIR) diff --git a/src/llmcompressor/modeling/__init__.py b/src/llmcompressor/modeling/__init__.py deleted file mode 100644 index e2c22ed1f..000000000 --- a/src/llmcompressor/modeling/__init__.py +++ /dev/null @@ -1,3 +0,0 @@ -# flake8: noqa - -from .prepare import * diff --git a/src/llmcompressor/modeling/deepseek_v3.py b/src/llmcompressor/modeling/deepseek_v3.py deleted file mode 100644 index 4b885ff64..000000000 --- a/src/llmcompressor/modeling/deepseek_v3.py +++ /dev/null @@ -1,48 +0,0 @@ -import torch -from transformers.models.deepseek_v3.modeling_deepseek_v3 import DeepseekV3MoE - - -class DeepseekV3MoECalibrate(torch.nn.Module): - def __init__(self, config, experts, gate, shared_experts): - super().__init__() - self.config = config - self.experts = experts - self.gate = gate - self.shared_experts = shared_experts - - def forward(self, hidden_states): - residuals = hidden_states - orig_shape = hidden_states.shape - topk_indices, topk_weights = self.gate(hidden_states) - hidden_states = hidden_states.view(-1, hidden_states.shape[-1]) - - # Begin MoE - final_hidden_states = torch.zeros_like(hidden_states, dtype=topk_weights.dtype) - expert_mask = torch.nn.functional.one_hot( - topk_indices, num_classes=len(self.experts) - ) - expert_mask = expert_mask.permute(2, 0, 1) - - for expert_idx in range(len(self.experts)): - expert = self.experts[expert_idx] - mask = expert_mask[expert_idx] - token_indices, weight_indices = torch.where(mask) - - expert_weights = topk_weights[token_indices, weight_indices] - expert_input = hidden_states[token_indices] - expert_output = expert(expert_input) - weighted_output = expert_output * expert_weights.unsqueeze(-1) - - if token_indices.numel() > 0: - final_hidden_states.index_add_(0, token_indices, weighted_output) - # End MoE - - hidden_states = final_hidden_states.type(hidden_states.dtype).view(*orig_shape) - hidden_states = hidden_states + self.shared_experts(residuals) - return hidden_states - - -def replace(module: DeepseekV3MoE) -> DeepseekV3MoECalibrate: - return DeepseekV3MoECalibrate( - module.config, module.experts, module.gate, module.shared_experts - ) diff --git a/src/llmcompressor/modeling/prepare.py b/src/llmcompressor/modeling/prepare.py deleted file mode 100644 index a8dedf8ee..000000000 --- a/src/llmcompressor/modeling/prepare.py +++ /dev/null @@ -1,22 +0,0 @@ -import torch -from transformers import PreTrainedModel -from transformers.models.deepseek_v3.modeling_deepseek_v3 import DeepseekV3MoE - -from llmcompressor.modeling.deepseek_v3 import replace as replace_DeepseekV3MoE -from llmcompressor.utils.module import module_bfs - -__all__ = ["prepare_for_quantization"] - -replacements = { - DeepseekV3MoE: replace_DeepseekV3MoE, -} - - -def prepare_for_quantization(model: PreTrainedModel) -> PreTrainedModel: - def replace(module: torch.nn.Module) -> torch.nn.Module: - if module.__class__ in replacements: - return replacements[module.__class__](module) - else: - return module - - return module_bfs(model, replace, progress=True) diff --git a/src/llmcompressor/utils/module.py b/src/llmcompressor/utils/module.py deleted file mode 100644 index a02aa8b4a..000000000 --- a/src/llmcompressor/utils/module.py +++ /dev/null @@ -1,27 +0,0 @@ -from typing import Callable, Union - -import torch -import tqdm - -__all__ = ["module_bfs"] - - -def 
module_bfs( - module: torch.nn.Module, - func: Callable[[torch.nn.Module], torch.nn.Module], - pre: bool = True, - progress: Union[bool, tqdm.tqdm] = False, -) -> torch.nn.Module: - if progress is True: - total = len(list(module.modules())) - progress = tqdm.tqdm(total=total) - if pre: - module = func(module) - for name, child in list(module.named_children()): - module.add_module(name, module_bfs(child, func, pre, progress)) - if not pre: - module = func(module) - if isinstance(progress, tqdm.tqdm): - progress.update(1) - - return module From e77a31bbf9b0c1a0a75613dc222ca456db8867df Mon Sep 17 00:00:00 2001 From: Kyle Sayers Date: Fri, 20 Jun 2025 13:49:31 -0400 Subject: [PATCH 12/22] clean examples Signed-off-by: Kyle Sayers --- examples/quantizing_moe/deepseek_moe_w4a16.py | 125 ------------------ .../quantizing_moe/deepseek_moe_w8a8_fp8.py | 99 -------------- .../quantizing_moe/deepseek_moe_w8a8_int8.py | 101 -------------- .../quantizing_moe/deepseek_recipe_w4a16.yaml | 8 -- ...epseekv3_example.py => mixtral_example.py} | 46 +++---- .../quantizing_moe/mixtral_moe_w8a8_fp8.py | 58 -------- .../{qwen_moe_w4a16.py => qwen_example.py} | 7 +- tests/examples/test_quantizing_moe.py | 11 +- 8 files changed, 30 insertions(+), 425 deletions(-) delete mode 100644 examples/quantizing_moe/deepseek_moe_w4a16.py delete mode 100644 examples/quantizing_moe/deepseek_moe_w8a8_fp8.py delete mode 100644 examples/quantizing_moe/deepseek_moe_w8a8_int8.py delete mode 100644 examples/quantizing_moe/deepseek_recipe_w4a16.yaml rename examples/quantizing_moe/{deepseekv3_example.py => mixtral_example.py} (57%) delete mode 100644 examples/quantizing_moe/mixtral_moe_w8a8_fp8.py rename examples/quantizing_moe/{qwen_moe_w4a16.py => qwen_example.py} (90%) diff --git a/examples/quantizing_moe/deepseek_moe_w4a16.py b/examples/quantizing_moe/deepseek_moe_w4a16.py deleted file mode 100644 index 9880e9248..000000000 --- a/examples/quantizing_moe/deepseek_moe_w4a16.py +++ /dev/null @@ -1,125 +0,0 @@ -import torch -from datasets import load_dataset -from packaging.version import Version -from transformers import AutoModelForCausalLM, AutoTokenizer, __version__ - -from llmcompressor import oneshot -from llmcompressor.utils import dispatch_for_generation - -# NOTE: transformers 4.49.0 has an attribute error with DeepSeek. -# Please consider either downgrading your transformers version to a -# previous version or upgrading to a version where this bug is fixed - -# select a Mixture of Experts model for quantization -MODEL_ID = "deepseek-ai/DeepSeek-V2.5" - -model = AutoModelForCausalLM.from_pretrained( - MODEL_ID, torch_dtype=torch.bfloat16, trust_remote_code=True -) -tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) - -# Select calibration dataset. -DATASET_ID = "HuggingFaceH4/ultrachat_200k" -DATASET_SPLIT = "train_sft" -NUM_CALIBRATION_SAMPLES = 512 -MAX_SEQUENCE_LENGTH = 2048 - - -# Load dataset and preprocess. -ds = load_dataset(DATASET_ID, split=f"{DATASET_SPLIT}[:{NUM_CALIBRATION_SAMPLES}]") -ds = ds.shuffle(seed=42) - - -def preprocess(example): - return { - "text": tokenizer.apply_chat_template( - example["messages"], - tokenize=False, - ) - } - - -ds = ds.map(preprocess) - - -# Tokenize inputs. 
-def tokenize(sample): - return tokenizer( - sample["text"], - padding=False, - max_length=MAX_SEQUENCE_LENGTH, - truncation=True, - add_special_tokens=False, - ) - - -ds = ds.map(tokenize, remove_columns=ds.column_names) - -# define a llmcompressor recipe for W416 quantization -# since the MoE gate layers are sensitive to quantization, we add them to the ignore -# list so they remain at full precision -recipe = "deepseek_recipe_w4a16.yaml" - -oneshot( - model=model, - dataset=ds, - recipe=recipe, - max_seq_length=MAX_SEQUENCE_LENGTH, - num_calibration_samples=NUM_CALIBRATION_SAMPLES, - save_compressed=True, - trust_remote_code_model=True, -) - -# Confirm generations of the quantized model look sane. -# Generation is broken for deepseek models when using the latest transformers package -if Version(__version__) < Version("4.48"): - print("========== SAMPLE GENERATION ==============") - dispatch_for_generation(model) - input_ids = tokenizer("Hello my name is", return_tensors="pt").input_ids.to("cuda") - output = model.generate(input_ids, max_new_tokens=20) - print(tokenizer.decode(output[0])) - print("==========================================") -else: - print( - "WARNING: cannot perform sample generation of " - "deepseek models with transformers >= 4.48" - ) - -# Save to disk in compressed-tensors format. -SAVE_DIR = MODEL_ID.rstrip("/").split("/")[-1] + "-W4A16" -model.save_pretrained(SAVE_DIR, save_compressed=True) -tokenizer.save_pretrained(SAVE_DIR) - - -# Run the model on vLLM -try: - from vllm import LLM, SamplingParams - - vllm_installed = True -except ImportError: - vllm_installed = False - -if vllm_installed: - print("vLLM installed, running using vLLM") - sampling_params = SamplingParams(temperature=0.80, top_p=0.95) - llm = LLM( - model=SAVE_DIR, - tensor_parallel_size=2, - trust_remote_code=True, - max_model_len=1042, - dtype=torch.half, - ) - prompts = [ - "The capital of France is", - "The president of the US is", - "My name is", - ] - - outputs = llm.generate(prompts, sampling_params) - print("================= vLLM GENERATION ======================") - for output in outputs: - assert output - prompt = output.prompt - generated_text = output.outputs[0].text - print("PROMPT", prompt) - print("GENERATED TEXT", generated_text) diff --git a/examples/quantizing_moe/deepseek_moe_w8a8_fp8.py b/examples/quantizing_moe/deepseek_moe_w8a8_fp8.py deleted file mode 100644 index 0bc9c24df..000000000 --- a/examples/quantizing_moe/deepseek_moe_w8a8_fp8.py +++ /dev/null @@ -1,99 +0,0 @@ -from datasets import load_dataset -from packaging.version import Version -from transformers import AutoModelForCausalLM, AutoTokenizer, __version__ - -from llmcompressor import oneshot -from llmcompressor.modifiers.quantization import QuantizationModifier -from llmcompressor.utils import dispatch_for_generation - -# NOTE: transformers 4.49.0 has an attribute error with DeepSeek. -# Please consider either downgrading your transformers version to a -# previous version or upgrading to a version where this bug is fixed - -# select a Mixture of Experts model for quantization -MODEL_ID = "deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct" - -model = AutoModelForCausalLM.from_pretrained( - MODEL_ID, torch_dtype="auto", trust_remote_code=True -) -tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) - -# Select calibration dataset. 
-# its recommended to use more calibration samples for MoE models so each expert is hit -DATASET_ID = "HuggingFaceH4/ultrachat_200k" -DATASET_SPLIT = "train_sft" -NUM_CALIBRATION_SAMPLES = 2048 -MAX_SEQUENCE_LENGTH = 2048 - - -# Load dataset and preprocess. -ds = load_dataset(DATASET_ID, split=f"{DATASET_SPLIT}[:{NUM_CALIBRATION_SAMPLES}]") -ds = ds.shuffle(seed=42) - - -def preprocess(example): - return { - "text": tokenizer.apply_chat_template( - example["messages"], - tokenize=False, - ) - } - - -ds = ds.map(preprocess) - - -# Tokenize inputs. -def tokenize(sample): - return tokenizer( - sample["text"], - padding=False, - max_length=MAX_SEQUENCE_LENGTH, - truncation=True, - add_special_tokens=False, - ) - - -ds = ds.map(tokenize, remove_columns=ds.column_names) - -# define a llmcompressor recipe for FP8 W8A8 quantization -# since the MoE gate layers are sensitive to quantization, we add them to the ignore -# list so they remain at full precision -recipe = [ - QuantizationModifier( - targets="Linear", - scheme="FP8", - ignore=["lm_head", "re:.*mlp.gate$"], - ), -] - -oneshot( - model=model, - dataset=ds, - recipe=recipe, - max_seq_length=MAX_SEQUENCE_LENGTH, - num_calibration_samples=NUM_CALIBRATION_SAMPLES, - trust_remote_code_model=True, -) - -# Confirm generations of the quantized model look sane. -# Generation is broken for deepseek models when using the latest transformers package -if Version(__version__) < Version("4.48"): - print("========== SAMPLE GENERATION ==============") - dispatch_for_generation(model) - SAMPLE_INPUT = ["I love quantization because"] - tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) - inputs = tokenizer(SAMPLE_INPUT, return_tensors="pt", padding=True).to(model.device) - output = model.generate(**inputs, max_length=50) - text_output = tokenizer.batch_decode(output) - print(text_output) -else: - print( - "WARNING: cannot perform sample generation of " - "deepseek models with transformers >= 4.48" - ) - -# Save to disk in compressed-tensors format. -SAVE_DIR = MODEL_ID.rstrip("/").split("/")[-1] + "-FP8" -model.save_pretrained(SAVE_DIR, save_compressed=True) -tokenizer.save_pretrained(SAVE_DIR) diff --git a/examples/quantizing_moe/deepseek_moe_w8a8_int8.py b/examples/quantizing_moe/deepseek_moe_w8a8_int8.py deleted file mode 100644 index 3ec506c34..000000000 --- a/examples/quantizing_moe/deepseek_moe_w8a8_int8.py +++ /dev/null @@ -1,101 +0,0 @@ -import torch -from datasets import load_dataset -from packaging.version import Version -from transformers import AutoModelForCausalLM, AutoTokenizer, __version__ - -from llmcompressor import oneshot -from llmcompressor.modifiers.quantization import GPTQModifier -from llmcompressor.utils import dispatch_for_generation - -# NOTE: transformers 4.49.0 has an attribute error with DeepSeek. -# Please consider either downgrading your transformers version to a -# previous version or upgrading to a version where this bug is fixed - -# select a Mixture of Experts model for quantization -MODEL_ID = "deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct" - -model = AutoModelForCausalLM.from_pretrained( - MODEL_ID, torch_dtype=torch.bfloat16, trust_remote_code=True -) -tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) - -# Select calibration dataset. -# its recommended to use more calibration samples for MoE models so each expert is hit -DATASET_ID = "HuggingFaceH4/ultrachat_200k" -DATASET_SPLIT = "train_sft" -NUM_CALIBRATION_SAMPLES = 2048 -MAX_SEQUENCE_LENGTH = 2048 - - -# Load dataset and preprocess. 
-ds = load_dataset(DATASET_ID, split=f"{DATASET_SPLIT}[:{NUM_CALIBRATION_SAMPLES}]") -ds = ds.shuffle(seed=42) - - -def preprocess(example): - return { - "text": tokenizer.apply_chat_template( - example["messages"], - tokenize=False, - ) - } - - -ds = ds.map(preprocess) - - -# Tokenize inputs. -def tokenize(sample): - return tokenizer( - sample["text"], - padding=False, - max_length=MAX_SEQUENCE_LENGTH, - truncation=True, - add_special_tokens=False, - ) - - -ds = ds.map(tokenize, remove_columns=ds.column_names) - -# define a llmcompressor recipe for INT8 W8A8 quantization -# since the MoE gate layers are sensitive to quantization, we add them to the ignore -# list so they remain at full precision -recipe = [ - GPTQModifier( - targets="Linear", - scheme="W8A8", - ignore=["lm_head", "re:.*mlp.gate$"], - ), -] - -oneshot( - model=model, - dataset=ds, - recipe=recipe, - max_seq_length=MAX_SEQUENCE_LENGTH, - num_calibration_samples=NUM_CALIBRATION_SAMPLES, - trust_remote_code_model=True, -) - -# Confirm generations of the quantized model look sane. -# Generation is broken for deepseek models when using the latest transformers package -if Version(__version__) < Version("4.48"): - print("========== SAMPLE GENERATION ==============") - dispatch_for_generation(model) - SAMPLE_INPUT = ["I love quantization because"] - tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) - inputs = tokenizer(SAMPLE_INPUT, return_tensors="pt", padding=True).to(model.device) - output = model.generate(**inputs, max_length=50) - text_output = tokenizer.batch_decode(output) - print(text_output) - print("==========================================") -else: - print( - "WARNING: cannot perform sample generation of " - "deepseek models with transformers >= 4.48" - ) - -# Save to disk in compressed-tensors format. -SAVE_DIR = MODEL_ID.rstrip("/").split("/")[-1] + "-W8A8" -model.save_pretrained(SAVE_DIR, save_compressed=True) -tokenizer.save_pretrained(SAVE_DIR) diff --git a/examples/quantizing_moe/deepseek_recipe_w4a16.yaml b/examples/quantizing_moe/deepseek_recipe_w4a16.yaml deleted file mode 100644 index 23f276e2f..000000000 --- a/examples/quantizing_moe/deepseek_recipe_w4a16.yaml +++ /dev/null @@ -1,8 +0,0 @@ -quant_stage: - quant_modifiers: - GPTQModifier: - ignore: [lm_head, "re:.*mlp.gate$"] - config_groups: - group_0: - weights: {num_bits: 4, type: int, symmetric: true, strategy: channel, dynamic: false} - targets: [Linear] diff --git a/examples/quantizing_moe/deepseekv3_example.py b/examples/quantizing_moe/mixtral_example.py similarity index 57% rename from examples/quantizing_moe/deepseekv3_example.py rename to examples/quantizing_moe/mixtral_example.py index 1b4c334ff..49b08c722 100644 --- a/examples/quantizing_moe/deepseekv3_example.py +++ b/examples/quantizing_moe/mixtral_example.py @@ -1,28 +1,26 @@ +import torch from datasets import load_dataset from transformers import AutoModelForCausalLM, AutoTokenizer -from llmcompressor.modeling import prepare_for_quantization -from llmcompressor.modifiers.quantization import GPTQModifier -from llmcompressor.transformers import oneshot +from llmcompressor import oneshot +from llmcompressor.modifiers.quantization import QuantizationModifier from llmcompressor.utils import dispatch_for_generation -# Select model and load it. 
-# For DeepSeekv3, we require a full precision model in order to properly calibrate -# `DeepSeek-V3-BF16` is a DeepSeek-V3 FP8 model which has been converted to BF16 -model_id = "RedHatAI/DeepSeek-V3-BF16" -model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype="auto") -tokenizer = AutoTokenizer.from_pretrained(model_id) -model = prepare_for_quantization(model) +# select a Mixture of Experts model for quantization +MODEL_ID = "mistralai/Mixtral-8x7B-Instruct-v0.1" + +model = AutoModelForCausalLM.from_pretrained( + MODEL_ID, torch_dtype=torch.bfloat16, trust_remote_code=True +) +tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) # Select calibration dataset. DATASET_ID = "HuggingFaceH4/ultrachat_200k" DATASET_SPLIT = "train_sft" - -# Select number of samples. 512 samples is a good place to start. -# Increasing the number of samples can improve accuracy. NUM_CALIBRATION_SAMPLES = 512 MAX_SEQUENCE_LENGTH = 2048 + # Load dataset and preprocess. ds = load_dataset(DATASET_ID, split=f"{DATASET_SPLIT}[:{NUM_CALIBRATION_SAMPLES}]") ds = ds.shuffle(seed=42) @@ -56,33 +54,33 @@ def tokenize(sample): # Configure the quantization algorithm to run. # since the MoE gate layers are sensitive to quantization, we add them to the ignore # list so they remain at full precision -recipe = GPTQModifier( - targets="Linear", scheme="W4A16", ignore=["lm_head", "re:.*mlp.gate$"] +recipe = QuantizationModifier( + scheme="W4A16", + targets="Linear", + ignore=[ + "lm_head", + "re:.*block_sparse_moe.gate", # does not quantize well + ], ) -# Apply algorithms. -# due to the large size of DeepSeekV3, we specify sequential targets such that -# only one MLP is loaded into GPU memory at a time oneshot( model=model, dataset=ds, recipe=recipe, max_seq_length=MAX_SEQUENCE_LENGTH, num_calibration_samples=NUM_CALIBRATION_SAMPLES, - sequential_targets=["DeepseekV3Attention", "DeepseekV3MLP"], + trust_remote_code_model=True, ) -# Confirm generations of the quantized model look sane. -print("\n\n") print("========== SAMPLE GENERATION ==============") dispatch_for_generation(model) sample = tokenizer("Hello my name is", return_tensors="pt") sample = {key: value.to("cuda") for key, value in sample.items()} output = model.generate(**sample, max_new_tokens=100) print(tokenizer.decode(output[0])) -print("==========================================\n\n") +print("==========================================") -# Save to disk compressed. -SAVE_DIR = model_id.rstrip("/").split("/")[-1] + "-W4A16-G128" +# Save to disk in compressed-tensors format. 
+SAVE_DIR = MODEL_ID.rstrip("/").split("/")[-1] + "-W4A16-G128" model.save_pretrained(SAVE_DIR, save_compressed=True) tokenizer.save_pretrained(SAVE_DIR) diff --git a/examples/quantizing_moe/mixtral_moe_w8a8_fp8.py b/examples/quantizing_moe/mixtral_moe_w8a8_fp8.py deleted file mode 100644 index a17bf873d..000000000 --- a/examples/quantizing_moe/mixtral_moe_w8a8_fp8.py +++ /dev/null @@ -1,58 +0,0 @@ -from typing import List - -from packaging.version import Version -from transformers import AutoModelForCausalLM, AutoTokenizer, __version__ - -from llmcompressor import oneshot -from llmcompressor.modifiers.quantization import QuantizationModifier -from llmcompressor.utils import dispatch_for_generation - -MODEL_ID = "mistralai/Mixtral-8x7B-Instruct-v0.1" - -model = AutoModelForCausalLM.from_pretrained(MODEL_ID, torch_dtype="auto") -tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) - - -# Dataset config parameters -DATASET_ID = "open_platypus" -DATASET_SPLIT = "train" -MAX_SEQ_LENGTH = 2048 -NUM_CALIBRATION_SAMPLES = 512 - -# Recipe -layers_to_ignore: List[str] = [ - "lm_head", - "re:.*block_sparse_moe.gate", # does not quantize well -] -recipe = QuantizationModifier(scheme="FP8", targets="Linear", ignore=layers_to_ignore) - - -oneshot( - model=model, - tokenizer=tokenizer, - dataset=DATASET_ID, - splits={"calibration": f"{DATASET_SPLIT}[:{NUM_CALIBRATION_SAMPLES}]"}, - recipe=recipe, - max_seq_length=MAX_SEQ_LENGTH, - num_calibration_samples=NUM_CALIBRATION_SAMPLES, -) - -# Confirm generations of the quantized model look sane. -# Generation is broken for deepseek models when using the latest transformers package -if Version(__version__) < Version("4.48"): - print("========== SAMPLE GENERATION ==============") - dispatch_for_generation(model) - input_ids = tokenizer("Hello my name is", return_tensors="pt").input_ids.to("cuda") - output = model.generate(input_ids, max_new_tokens=20) - print(tokenizer.decode(output[0])) - print("==========================================") -else: - print( - "WARNING: cannot perform sample generation of " - "deepseek models with transformers >= 4.48" - ) - -# Save to disk in compressed-tensors format. -SAVE_DIR = MODEL_ID.rstrip("/").split("/")[-1] + "-FP8" -model.save_pretrained(SAVE_DIR, save_compressed=True) -tokenizer.save_pretrained(SAVE_DIR) diff --git a/examples/quantizing_moe/qwen_moe_w4a16.py b/examples/quantizing_moe/qwen_example.py similarity index 90% rename from examples/quantizing_moe/qwen_moe_w4a16.py rename to examples/quantizing_moe/qwen_example.py index 40a78a9b7..2531e6528 100644 --- a/examples/quantizing_moe/qwen_moe_w4a16.py +++ b/examples/quantizing_moe/qwen_example.py @@ -73,12 +73,13 @@ def tokenize(sample): # Confirm generations of the quantized model look sane. print("========== SAMPLE GENERATION ==============") dispatch_for_generation(model) -input_ids = tokenizer("Hello my name is", return_tensors="pt").input_ids.to("cuda") -output = model.generate(input_ids, max_new_tokens=20) +sample = tokenizer("Hello my name is", return_tensors="pt") +sample = {key: value.to("cuda") for key, value in sample.items()} +output = model.generate(**sample, max_new_tokens=100) print(tokenizer.decode(output[0])) print("==========================================") # Save to disk in compressed-tensors format. 
-SAVE_DIR = MODEL_ID.rstrip("/").split("/")[-1] + "-quantized.w4a16" +SAVE_DIR = MODEL_ID.rstrip("/").split("/")[-1] + "-W4A16-G128" model.save_pretrained(SAVE_DIR, save_compressed=True) tokenizer.save_pretrained(SAVE_DIR) diff --git a/tests/examples/test_quantizing_moe.py b/tests/examples/test_quantizing_moe.py index 49686d25c..848d5e3bf 100644 --- a/tests/examples/test_quantizing_moe.py +++ b/tests/examples/test_quantizing_moe.py @@ -44,14 +44,11 @@ def test_doc_example_command(self, example_dir: str, tmp_path: Path): "script_filename", [ pytest.param( - "deepseek_moe_w4a16.py", - marks=[ - pytest.mark.multi_gpu, - pytest.mark.skip(reason="exceptionally long run time"), - ], + "deepseekv2_5_example.py", + marks=pytest.mark.skip(reason="exceptionally long run time"), ), - pytest.param("deepseek_moe_w8a8_fp8.py"), - pytest.param("deepseek_moe_w8a8_int8.py", marks=pytest.mark.multi_gpu), + pytest.param("mixtral_example.py"), + pytest.param("qwen_example.py"), ], ) def test_deepseek_example_script( From 366ac257c801d29427154200a9aa3cb4f6be5080 Mon Sep 17 00:00:00 2001 From: Kyle Sayers Date: Fri, 20 Jun 2025 13:51:36 -0400 Subject: [PATCH 13/22] revert examples changes Signed-off-by: Kyle Sayers --- examples/quantizing_moe/deepseek_moe_w4a16.py | 125 ++++++++++++++++++ .../quantizing_moe/deepseek_moe_w8a8_fp8.py | 99 ++++++++++++++ .../quantizing_moe/deepseek_moe_w8a8_int8.py | 101 ++++++++++++++ .../quantizing_moe/deepseek_recipe_w4a16.yaml | 8 ++ .../quantizing_moe/mixtral_moe_w8a8_fp8.py | 58 ++++++++ examples/quantizing_moe/qwen_moe_w4a16.py | 84 ++++++++++++ 6 files changed, 475 insertions(+) create mode 100644 examples/quantizing_moe/deepseek_moe_w4a16.py create mode 100644 examples/quantizing_moe/deepseek_moe_w8a8_fp8.py create mode 100644 examples/quantizing_moe/deepseek_moe_w8a8_int8.py create mode 100644 examples/quantizing_moe/deepseek_recipe_w4a16.yaml create mode 100644 examples/quantizing_moe/mixtral_moe_w8a8_fp8.py create mode 100644 examples/quantizing_moe/qwen_moe_w4a16.py diff --git a/examples/quantizing_moe/deepseek_moe_w4a16.py b/examples/quantizing_moe/deepseek_moe_w4a16.py new file mode 100644 index 000000000..9880e9248 --- /dev/null +++ b/examples/quantizing_moe/deepseek_moe_w4a16.py @@ -0,0 +1,125 @@ +import torch +from datasets import load_dataset +from packaging.version import Version +from transformers import AutoModelForCausalLM, AutoTokenizer, __version__ + +from llmcompressor import oneshot +from llmcompressor.utils import dispatch_for_generation + +# NOTE: transformers 4.49.0 has an attribute error with DeepSeek. +# Please consider either downgrading your transformers version to a +# previous version or upgrading to a version where this bug is fixed + +# select a Mixture of Experts model for quantization +MODEL_ID = "deepseek-ai/DeepSeek-V2.5" + +model = AutoModelForCausalLM.from_pretrained( + MODEL_ID, torch_dtype=torch.bfloat16, trust_remote_code=True +) +tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) + +# Select calibration dataset. +DATASET_ID = "HuggingFaceH4/ultrachat_200k" +DATASET_SPLIT = "train_sft" +NUM_CALIBRATION_SAMPLES = 512 +MAX_SEQUENCE_LENGTH = 2048 + + +# Load dataset and preprocess. +ds = load_dataset(DATASET_ID, split=f"{DATASET_SPLIT}[:{NUM_CALIBRATION_SAMPLES}]") +ds = ds.shuffle(seed=42) + + +def preprocess(example): + return { + "text": tokenizer.apply_chat_template( + example["messages"], + tokenize=False, + ) + } + + +ds = ds.map(preprocess) + + +# Tokenize inputs. 
+def tokenize(sample): + return tokenizer( + sample["text"], + padding=False, + max_length=MAX_SEQUENCE_LENGTH, + truncation=True, + add_special_tokens=False, + ) + + +ds = ds.map(tokenize, remove_columns=ds.column_names) + +# define a llmcompressor recipe for W416 quantization +# since the MoE gate layers are sensitive to quantization, we add them to the ignore +# list so they remain at full precision +recipe = "deepseek_recipe_w4a16.yaml" + +oneshot( + model=model, + dataset=ds, + recipe=recipe, + max_seq_length=MAX_SEQUENCE_LENGTH, + num_calibration_samples=NUM_CALIBRATION_SAMPLES, + save_compressed=True, + trust_remote_code_model=True, +) + +# Confirm generations of the quantized model look sane. +# Generation is broken for deepseek models when using the latest transformers package +if Version(__version__) < Version("4.48"): + print("========== SAMPLE GENERATION ==============") + dispatch_for_generation(model) + input_ids = tokenizer("Hello my name is", return_tensors="pt").input_ids.to("cuda") + output = model.generate(input_ids, max_new_tokens=20) + print(tokenizer.decode(output[0])) + print("==========================================") +else: + print( + "WARNING: cannot perform sample generation of " + "deepseek models with transformers >= 4.48" + ) + +# Save to disk in compressed-tensors format. +SAVE_DIR = MODEL_ID.rstrip("/").split("/")[-1] + "-W4A16" +model.save_pretrained(SAVE_DIR, save_compressed=True) +tokenizer.save_pretrained(SAVE_DIR) + + +# Run the model on vLLM +try: + from vllm import LLM, SamplingParams + + vllm_installed = True +except ImportError: + vllm_installed = False + +if vllm_installed: + print("vLLM installed, running using vLLM") + sampling_params = SamplingParams(temperature=0.80, top_p=0.95) + llm = LLM( + model=SAVE_DIR, + tensor_parallel_size=2, + trust_remote_code=True, + max_model_len=1042, + dtype=torch.half, + ) + prompts = [ + "The capital of France is", + "The president of the US is", + "My name is", + ] + + outputs = llm.generate(prompts, sampling_params) + print("================= vLLM GENERATION ======================") + for output in outputs: + assert output + prompt = output.prompt + generated_text = output.outputs[0].text + print("PROMPT", prompt) + print("GENERATED TEXT", generated_text) diff --git a/examples/quantizing_moe/deepseek_moe_w8a8_fp8.py b/examples/quantizing_moe/deepseek_moe_w8a8_fp8.py new file mode 100644 index 000000000..0bc9c24df --- /dev/null +++ b/examples/quantizing_moe/deepseek_moe_w8a8_fp8.py @@ -0,0 +1,99 @@ +from datasets import load_dataset +from packaging.version import Version +from transformers import AutoModelForCausalLM, AutoTokenizer, __version__ + +from llmcompressor import oneshot +from llmcompressor.modifiers.quantization import QuantizationModifier +from llmcompressor.utils import dispatch_for_generation + +# NOTE: transformers 4.49.0 has an attribute error with DeepSeek. +# Please consider either downgrading your transformers version to a +# previous version or upgrading to a version where this bug is fixed + +# select a Mixture of Experts model for quantization +MODEL_ID = "deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct" + +model = AutoModelForCausalLM.from_pretrained( + MODEL_ID, torch_dtype="auto", trust_remote_code=True +) +tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) + +# Select calibration dataset. 
+# its recommended to use more calibration samples for MoE models so each expert is hit +DATASET_ID = "HuggingFaceH4/ultrachat_200k" +DATASET_SPLIT = "train_sft" +NUM_CALIBRATION_SAMPLES = 2048 +MAX_SEQUENCE_LENGTH = 2048 + + +# Load dataset and preprocess. +ds = load_dataset(DATASET_ID, split=f"{DATASET_SPLIT}[:{NUM_CALIBRATION_SAMPLES}]") +ds = ds.shuffle(seed=42) + + +def preprocess(example): + return { + "text": tokenizer.apply_chat_template( + example["messages"], + tokenize=False, + ) + } + + +ds = ds.map(preprocess) + + +# Tokenize inputs. +def tokenize(sample): + return tokenizer( + sample["text"], + padding=False, + max_length=MAX_SEQUENCE_LENGTH, + truncation=True, + add_special_tokens=False, + ) + + +ds = ds.map(tokenize, remove_columns=ds.column_names) + +# define a llmcompressor recipe for FP8 W8A8 quantization +# since the MoE gate layers are sensitive to quantization, we add them to the ignore +# list so they remain at full precision +recipe = [ + QuantizationModifier( + targets="Linear", + scheme="FP8", + ignore=["lm_head", "re:.*mlp.gate$"], + ), +] + +oneshot( + model=model, + dataset=ds, + recipe=recipe, + max_seq_length=MAX_SEQUENCE_LENGTH, + num_calibration_samples=NUM_CALIBRATION_SAMPLES, + trust_remote_code_model=True, +) + +# Confirm generations of the quantized model look sane. +# Generation is broken for deepseek models when using the latest transformers package +if Version(__version__) < Version("4.48"): + print("========== SAMPLE GENERATION ==============") + dispatch_for_generation(model) + SAMPLE_INPUT = ["I love quantization because"] + tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) + inputs = tokenizer(SAMPLE_INPUT, return_tensors="pt", padding=True).to(model.device) + output = model.generate(**inputs, max_length=50) + text_output = tokenizer.batch_decode(output) + print(text_output) +else: + print( + "WARNING: cannot perform sample generation of " + "deepseek models with transformers >= 4.48" + ) + +# Save to disk in compressed-tensors format. +SAVE_DIR = MODEL_ID.rstrip("/").split("/")[-1] + "-FP8" +model.save_pretrained(SAVE_DIR, save_compressed=True) +tokenizer.save_pretrained(SAVE_DIR) diff --git a/examples/quantizing_moe/deepseek_moe_w8a8_int8.py b/examples/quantizing_moe/deepseek_moe_w8a8_int8.py new file mode 100644 index 000000000..3ec506c34 --- /dev/null +++ b/examples/quantizing_moe/deepseek_moe_w8a8_int8.py @@ -0,0 +1,101 @@ +import torch +from datasets import load_dataset +from packaging.version import Version +from transformers import AutoModelForCausalLM, AutoTokenizer, __version__ + +from llmcompressor import oneshot +from llmcompressor.modifiers.quantization import GPTQModifier +from llmcompressor.utils import dispatch_for_generation + +# NOTE: transformers 4.49.0 has an attribute error with DeepSeek. +# Please consider either downgrading your transformers version to a +# previous version or upgrading to a version where this bug is fixed + +# select a Mixture of Experts model for quantization +MODEL_ID = "deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct" + +model = AutoModelForCausalLM.from_pretrained( + MODEL_ID, torch_dtype=torch.bfloat16, trust_remote_code=True +) +tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) + +# Select calibration dataset. +# its recommended to use more calibration samples for MoE models so each expert is hit +DATASET_ID = "HuggingFaceH4/ultrachat_200k" +DATASET_SPLIT = "train_sft" +NUM_CALIBRATION_SAMPLES = 2048 +MAX_SEQUENCE_LENGTH = 2048 + + +# Load dataset and preprocess. 
+ds = load_dataset(DATASET_ID, split=f"{DATASET_SPLIT}[:{NUM_CALIBRATION_SAMPLES}]") +ds = ds.shuffle(seed=42) + + +def preprocess(example): + return { + "text": tokenizer.apply_chat_template( + example["messages"], + tokenize=False, + ) + } + + +ds = ds.map(preprocess) + + +# Tokenize inputs. +def tokenize(sample): + return tokenizer( + sample["text"], + padding=False, + max_length=MAX_SEQUENCE_LENGTH, + truncation=True, + add_special_tokens=False, + ) + + +ds = ds.map(tokenize, remove_columns=ds.column_names) + +# define a llmcompressor recipe for INT8 W8A8 quantization +# since the MoE gate layers are sensitive to quantization, we add them to the ignore +# list so they remain at full precision +recipe = [ + GPTQModifier( + targets="Linear", + scheme="W8A8", + ignore=["lm_head", "re:.*mlp.gate$"], + ), +] + +oneshot( + model=model, + dataset=ds, + recipe=recipe, + max_seq_length=MAX_SEQUENCE_LENGTH, + num_calibration_samples=NUM_CALIBRATION_SAMPLES, + trust_remote_code_model=True, +) + +# Confirm generations of the quantized model look sane. +# Generation is broken for deepseek models when using the latest transformers package +if Version(__version__) < Version("4.48"): + print("========== SAMPLE GENERATION ==============") + dispatch_for_generation(model) + SAMPLE_INPUT = ["I love quantization because"] + tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) + inputs = tokenizer(SAMPLE_INPUT, return_tensors="pt", padding=True).to(model.device) + output = model.generate(**inputs, max_length=50) + text_output = tokenizer.batch_decode(output) + print(text_output) + print("==========================================") +else: + print( + "WARNING: cannot perform sample generation of " + "deepseek models with transformers >= 4.48" + ) + +# Save to disk in compressed-tensors format. 
+SAVE_DIR = MODEL_ID.rstrip("/").split("/")[-1] + "-W8A8" +model.save_pretrained(SAVE_DIR, save_compressed=True) +tokenizer.save_pretrained(SAVE_DIR) diff --git a/examples/quantizing_moe/deepseek_recipe_w4a16.yaml b/examples/quantizing_moe/deepseek_recipe_w4a16.yaml new file mode 100644 index 000000000..23f276e2f --- /dev/null +++ b/examples/quantizing_moe/deepseek_recipe_w4a16.yaml @@ -0,0 +1,8 @@ +quant_stage: + quant_modifiers: + GPTQModifier: + ignore: [lm_head, "re:.*mlp.gate$"] + config_groups: + group_0: + weights: {num_bits: 4, type: int, symmetric: true, strategy: channel, dynamic: false} + targets: [Linear] diff --git a/examples/quantizing_moe/mixtral_moe_w8a8_fp8.py b/examples/quantizing_moe/mixtral_moe_w8a8_fp8.py new file mode 100644 index 000000000..a17bf873d --- /dev/null +++ b/examples/quantizing_moe/mixtral_moe_w8a8_fp8.py @@ -0,0 +1,58 @@ +from typing import List + +from packaging.version import Version +from transformers import AutoModelForCausalLM, AutoTokenizer, __version__ + +from llmcompressor import oneshot +from llmcompressor.modifiers.quantization import QuantizationModifier +from llmcompressor.utils import dispatch_for_generation + +MODEL_ID = "mistralai/Mixtral-8x7B-Instruct-v0.1" + +model = AutoModelForCausalLM.from_pretrained(MODEL_ID, torch_dtype="auto") +tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) + + +# Dataset config parameters +DATASET_ID = "open_platypus" +DATASET_SPLIT = "train" +MAX_SEQ_LENGTH = 2048 +NUM_CALIBRATION_SAMPLES = 512 + +# Recipe +layers_to_ignore: List[str] = [ + "lm_head", + "re:.*block_sparse_moe.gate", # does not quantize well +] +recipe = QuantizationModifier(scheme="FP8", targets="Linear", ignore=layers_to_ignore) + + +oneshot( + model=model, + tokenizer=tokenizer, + dataset=DATASET_ID, + splits={"calibration": f"{DATASET_SPLIT}[:{NUM_CALIBRATION_SAMPLES}]"}, + recipe=recipe, + max_seq_length=MAX_SEQ_LENGTH, + num_calibration_samples=NUM_CALIBRATION_SAMPLES, +) + +# Confirm generations of the quantized model look sane. +# Generation is broken for deepseek models when using the latest transformers package +if Version(__version__) < Version("4.48"): + print("========== SAMPLE GENERATION ==============") + dispatch_for_generation(model) + input_ids = tokenizer("Hello my name is", return_tensors="pt").input_ids.to("cuda") + output = model.generate(input_ids, max_new_tokens=20) + print(tokenizer.decode(output[0])) + print("==========================================") +else: + print( + "WARNING: cannot perform sample generation of " + "deepseek models with transformers >= 4.48" + ) + +# Save to disk in compressed-tensors format. 
+SAVE_DIR = MODEL_ID.rstrip("/").split("/")[-1] + "-FP8" +model.save_pretrained(SAVE_DIR, save_compressed=True) +tokenizer.save_pretrained(SAVE_DIR) diff --git a/examples/quantizing_moe/qwen_moe_w4a16.py b/examples/quantizing_moe/qwen_moe_w4a16.py new file mode 100644 index 000000000..40a78a9b7 --- /dev/null +++ b/examples/quantizing_moe/qwen_moe_w4a16.py @@ -0,0 +1,84 @@ +import torch +from datasets import load_dataset +from transformers import AutoModelForCausalLM, AutoTokenizer + +from llmcompressor.modifiers.quantization import GPTQModifier +from llmcompressor.transformers import oneshot +from llmcompressor.utils import dispatch_for_generation + +# select a Mixture of Experts model for quantization +MODEL_ID = "Qwen/Qwen1.5-MoE-A2.7B-Chat" + +model = AutoModelForCausalLM.from_pretrained( + MODEL_ID, torch_dtype=torch.bfloat16, trust_remote_code=True +) +tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) + +# Select calibration dataset. +DATASET_ID = "HuggingFaceH4/ultrachat_200k" +DATASET_SPLIT = "train_sft" +NUM_CALIBRATION_SAMPLES = 512 +MAX_SEQUENCE_LENGTH = 2048 + + +# Load dataset and preprocess. +ds = load_dataset(DATASET_ID, split=DATASET_SPLIT) +ds = ds.shuffle(seed=42).select(range(NUM_CALIBRATION_SAMPLES)) + + +def preprocess(example): + return { + "text": tokenizer.apply_chat_template( + example["messages"], + tokenize=False, + ) + } + + +ds = ds.map(preprocess) + + +# Tokenize inputs. +def tokenize(sample): + return tokenizer( + sample["text"], + padding=False, + max_length=MAX_SEQUENCE_LENGTH, + truncation=True, + add_special_tokens=False, + ) + + +ds = ds.map(tokenize, remove_columns=ds.column_names) + +# define a llmcompressor recipe for W416 quantization with a group size of 128 +# since the MoE gate layers are sensitive to quantization, we add them to the ignore +# list so they remain at full precision +recipe = GPTQModifier( + targets="Linear", + scheme="W4A16", + ignore=["lm_head", "re:.*mlp.gate$", "re:.*mlp.shared_expert_gate$"], +) + +oneshot( + model=model, + dataset=ds, + recipe=recipe, + max_seq_length=MAX_SEQUENCE_LENGTH, + num_calibration_samples=NUM_CALIBRATION_SAMPLES, + save_compressed=True, + trust_remote_code_model=True, +) + +# Confirm generations of the quantized model look sane. +print("========== SAMPLE GENERATION ==============") +dispatch_for_generation(model) +input_ids = tokenizer("Hello my name is", return_tensors="pt").input_ids.to("cuda") +output = model.generate(input_ids, max_new_tokens=20) +print(tokenizer.decode(output[0])) +print("==========================================") + +# Save to disk in compressed-tensors format. 
+SAVE_DIR = MODEL_ID.rstrip("/").split("/")[-1] + "-quantized.w4a16" +model.save_pretrained(SAVE_DIR, save_compressed=True) +tokenizer.save_pretrained(SAVE_DIR) From c44da345e8dd0d63d5cc0df3cb06bad52f968fa6 Mon Sep 17 00:00:00 2001 From: Kyle Sayers Date: Fri, 20 Jun 2025 13:52:17 -0400 Subject: [PATCH 14/22] revert extra examples Signed-off-by: Kyle Sayers --- .../quantizing_moe/deepseekv2_5_example.py | 94 ------------------- examples/quantizing_moe/mixtral_example.py | 86 ----------------- examples/quantizing_moe/qwen_example.py | 85 ----------------- 3 files changed, 265 deletions(-) delete mode 100644 examples/quantizing_moe/deepseekv2_5_example.py delete mode 100644 examples/quantizing_moe/mixtral_example.py delete mode 100644 examples/quantizing_moe/qwen_example.py diff --git a/examples/quantizing_moe/deepseekv2_5_example.py b/examples/quantizing_moe/deepseekv2_5_example.py deleted file mode 100644 index c2b3b0305..000000000 --- a/examples/quantizing_moe/deepseekv2_5_example.py +++ /dev/null @@ -1,94 +0,0 @@ -import torch -from datasets import load_dataset -from packaging.version import Version -from transformers import AutoModelForCausalLM, AutoTokenizer, __version__ - -from llmcompressor import oneshot -from llmcompressor.modifiers.quantization import GPTQModifier -from llmcompressor.utils import dispatch_for_generation - -# NOTE: transformers 4.49.0 has an attribute error with DeepSeek. -# Please consider either downgrading your transformers version to a -# previous version or upgrading to a version where this bug is fixed - -# select a Mixture of Experts model for quantization -MODEL_ID = "deepseek-ai/DeepSeek-V2.5" - -model = AutoModelForCausalLM.from_pretrained( - MODEL_ID, torch_dtype=torch.bfloat16, trust_remote_code=True -) -tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) - -# Select calibration dataset. -DATASET_ID = "HuggingFaceH4/ultrachat_200k" -DATASET_SPLIT = "train_sft" -NUM_CALIBRATION_SAMPLES = 512 -MAX_SEQUENCE_LENGTH = 2048 - - -# Load dataset and preprocess. -ds = load_dataset(DATASET_ID, split=f"{DATASET_SPLIT}[:{NUM_CALIBRATION_SAMPLES}]") -ds = ds.shuffle(seed=42) - - -def preprocess(example): - return { - "text": tokenizer.apply_chat_template( - example["messages"], - tokenize=False, - ) - } - - -ds = ds.map(preprocess) - - -# Tokenize inputs. -def tokenize(sample): - return tokenizer( - sample["text"], - padding=False, - max_length=MAX_SEQUENCE_LENGTH, - truncation=True, - add_special_tokens=False, - ) - - -ds = ds.map(tokenize, remove_columns=ds.column_names) - -# Configure the quantization algorithm to run. -# since the MoE gate layers are sensitive to quantization, we add them to the ignore -# list so they remain at full precision -recipe = GPTQModifier( - targets="Linear", scheme="W4A16", ignore=["lm_head", "re:.*mlp.gate$"] -) - -oneshot( - model=model, - dataset=ds, - recipe=recipe, - max_seq_length=MAX_SEQUENCE_LENGTH, - num_calibration_samples=NUM_CALIBRATION_SAMPLES, - trust_remote_code_model=True, -) - -# Confirm generations of the quantized model look sane. 
-# Generation is broken for deepseek models when using the latest transformers package -if Version(__version__) < Version("4.48"): - print("========== SAMPLE GENERATION ==============") - dispatch_for_generation(model) - sample = tokenizer("Hello my name is", return_tensors="pt") - sample = {key: value.to("cuda") for key, value in sample.items()} - output = model.generate(**sample, max_new_tokens=100) - print(tokenizer.decode(output[0])) - print("==========================================") -else: - print( - "WARNING: cannot perform sample generation of " - "deepseek models with transformers >= 4.48" - ) - -# Save to disk in compressed-tensors format. -SAVE_DIR = MODEL_ID.rstrip("/").split("/")[-1] + "-W4A16" -model.save_pretrained(SAVE_DIR, save_compressed=True) -tokenizer.save_pretrained(SAVE_DIR) diff --git a/examples/quantizing_moe/mixtral_example.py b/examples/quantizing_moe/mixtral_example.py deleted file mode 100644 index 49b08c722..000000000 --- a/examples/quantizing_moe/mixtral_example.py +++ /dev/null @@ -1,86 +0,0 @@ -import torch -from datasets import load_dataset -from transformers import AutoModelForCausalLM, AutoTokenizer - -from llmcompressor import oneshot -from llmcompressor.modifiers.quantization import QuantizationModifier -from llmcompressor.utils import dispatch_for_generation - -# select a Mixture of Experts model for quantization -MODEL_ID = "mistralai/Mixtral-8x7B-Instruct-v0.1" - -model = AutoModelForCausalLM.from_pretrained( - MODEL_ID, torch_dtype=torch.bfloat16, trust_remote_code=True -) -tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) - -# Select calibration dataset. -DATASET_ID = "HuggingFaceH4/ultrachat_200k" -DATASET_SPLIT = "train_sft" -NUM_CALIBRATION_SAMPLES = 512 -MAX_SEQUENCE_LENGTH = 2048 - - -# Load dataset and preprocess. -ds = load_dataset(DATASET_ID, split=f"{DATASET_SPLIT}[:{NUM_CALIBRATION_SAMPLES}]") -ds = ds.shuffle(seed=42) - - -def preprocess(example): - return { - "text": tokenizer.apply_chat_template( - example["messages"], - tokenize=False, - ) - } - - -ds = ds.map(preprocess) - - -# Tokenize inputs. -def tokenize(sample): - return tokenizer( - sample["text"], - padding=False, - max_length=MAX_SEQUENCE_LENGTH, - truncation=True, - add_special_tokens=False, - ) - - -ds = ds.map(tokenize, remove_columns=ds.column_names) - -# Configure the quantization algorithm to run. -# since the MoE gate layers are sensitive to quantization, we add them to the ignore -# list so they remain at full precision -recipe = QuantizationModifier( - scheme="W4A16", - targets="Linear", - ignore=[ - "lm_head", - "re:.*block_sparse_moe.gate", # does not quantize well - ], -) - -oneshot( - model=model, - dataset=ds, - recipe=recipe, - max_seq_length=MAX_SEQUENCE_LENGTH, - num_calibration_samples=NUM_CALIBRATION_SAMPLES, - trust_remote_code_model=True, -) - -print("========== SAMPLE GENERATION ==============") -dispatch_for_generation(model) -sample = tokenizer("Hello my name is", return_tensors="pt") -sample = {key: value.to("cuda") for key, value in sample.items()} -output = model.generate(**sample, max_new_tokens=100) -print(tokenizer.decode(output[0])) -print("==========================================") - -# Save to disk in compressed-tensors format. 
-SAVE_DIR = MODEL_ID.rstrip("/").split("/")[-1] + "-W4A16-G128" -model.save_pretrained(SAVE_DIR, save_compressed=True) -tokenizer.save_pretrained(SAVE_DIR) diff --git a/examples/quantizing_moe/qwen_example.py b/examples/quantizing_moe/qwen_example.py deleted file mode 100644 index 2531e6528..000000000 --- a/examples/quantizing_moe/qwen_example.py +++ /dev/null @@ -1,85 +0,0 @@ -import torch -from datasets import load_dataset -from transformers import AutoModelForCausalLM, AutoTokenizer - -from llmcompressor.modifiers.quantization import GPTQModifier -from llmcompressor.transformers import oneshot -from llmcompressor.utils import dispatch_for_generation - -# select a Mixture of Experts model for quantization -MODEL_ID = "Qwen/Qwen1.5-MoE-A2.7B-Chat" - -model = AutoModelForCausalLM.from_pretrained( - MODEL_ID, torch_dtype=torch.bfloat16, trust_remote_code=True -) -tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) - -# Select calibration dataset. -DATASET_ID = "HuggingFaceH4/ultrachat_200k" -DATASET_SPLIT = "train_sft" -NUM_CALIBRATION_SAMPLES = 512 -MAX_SEQUENCE_LENGTH = 2048 - - -# Load dataset and preprocess. -ds = load_dataset(DATASET_ID, split=DATASET_SPLIT) -ds = ds.shuffle(seed=42).select(range(NUM_CALIBRATION_SAMPLES)) - - -def preprocess(example): - return { - "text": tokenizer.apply_chat_template( - example["messages"], - tokenize=False, - ) - } - - -ds = ds.map(preprocess) - - -# Tokenize inputs. -def tokenize(sample): - return tokenizer( - sample["text"], - padding=False, - max_length=MAX_SEQUENCE_LENGTH, - truncation=True, - add_special_tokens=False, - ) - - -ds = ds.map(tokenize, remove_columns=ds.column_names) - -# define a llmcompressor recipe for W416 quantization with a group size of 128 -# since the MoE gate layers are sensitive to quantization, we add them to the ignore -# list so they remain at full precision -recipe = GPTQModifier( - targets="Linear", - scheme="W4A16", - ignore=["lm_head", "re:.*mlp.gate$", "re:.*mlp.shared_expert_gate$"], -) - -oneshot( - model=model, - dataset=ds, - recipe=recipe, - max_seq_length=MAX_SEQUENCE_LENGTH, - num_calibration_samples=NUM_CALIBRATION_SAMPLES, - save_compressed=True, - trust_remote_code_model=True, -) - -# Confirm generations of the quantized model look sane. -print("========== SAMPLE GENERATION ==============") -dispatch_for_generation(model) -sample = tokenizer("Hello my name is", return_tensors="pt") -sample = {key: value.to("cuda") for key, value in sample.items()} -output = model.generate(**sample, max_new_tokens=100) -print(tokenizer.decode(output[0])) -print("==========================================") - -# Save to disk in compressed-tensors format. 
-SAVE_DIR = MODEL_ID.rstrip("/").split("/")[-1] + "-W4A16-G128" -model.save_pretrained(SAVE_DIR, save_compressed=True) -tokenizer.save_pretrained(SAVE_DIR) From 2db2789532215d744db24e3a9a31e7beabb0b2f0 Mon Sep 17 00:00:00 2001 From: Kyle Sayers Date: Fri, 20 Jun 2025 13:54:24 -0400 Subject: [PATCH 15/22] revert examples changes Signed-off-by: Kyle Sayers --- examples/quantizing_moe/deepseek_moe_w4a16.py | 125 ++++++++++++++++++ .../quantizing_moe/deepseek_moe_w8a8_fp8.py | 99 ++++++++++++++ .../quantizing_moe/deepseek_moe_w8a8_int8.py | 101 ++++++++++++++ .../quantizing_moe/deepseek_recipe_w4a16.yaml | 8 ++ .../quantizing_moe/mixtral_moe_w8a8_fp8.py | 58 ++++++++ examples/quantizing_moe/qwen_moe_w4a16.py | 84 ++++++++++++ tests/examples/test_quantizing_moe.py | 11 +- 7 files changed, 482 insertions(+), 4 deletions(-) create mode 100644 examples/quantizing_moe/deepseek_moe_w4a16.py create mode 100644 examples/quantizing_moe/deepseek_moe_w8a8_fp8.py create mode 100644 examples/quantizing_moe/deepseek_moe_w8a8_int8.py create mode 100644 examples/quantizing_moe/deepseek_recipe_w4a16.yaml create mode 100644 examples/quantizing_moe/mixtral_moe_w8a8_fp8.py create mode 100644 examples/quantizing_moe/qwen_moe_w4a16.py diff --git a/examples/quantizing_moe/deepseek_moe_w4a16.py b/examples/quantizing_moe/deepseek_moe_w4a16.py new file mode 100644 index 000000000..9880e9248 --- /dev/null +++ b/examples/quantizing_moe/deepseek_moe_w4a16.py @@ -0,0 +1,125 @@ +import torch +from datasets import load_dataset +from packaging.version import Version +from transformers import AutoModelForCausalLM, AutoTokenizer, __version__ + +from llmcompressor import oneshot +from llmcompressor.utils import dispatch_for_generation + +# NOTE: transformers 4.49.0 has an attribute error with DeepSeek. +# Please consider either downgrading your transformers version to a +# previous version or upgrading to a version where this bug is fixed + +# select a Mixture of Experts model for quantization +MODEL_ID = "deepseek-ai/DeepSeek-V2.5" + +model = AutoModelForCausalLM.from_pretrained( + MODEL_ID, torch_dtype=torch.bfloat16, trust_remote_code=True +) +tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) + +# Select calibration dataset. +DATASET_ID = "HuggingFaceH4/ultrachat_200k" +DATASET_SPLIT = "train_sft" +NUM_CALIBRATION_SAMPLES = 512 +MAX_SEQUENCE_LENGTH = 2048 + + +# Load dataset and preprocess. +ds = load_dataset(DATASET_ID, split=f"{DATASET_SPLIT}[:{NUM_CALIBRATION_SAMPLES}]") +ds = ds.shuffle(seed=42) + + +def preprocess(example): + return { + "text": tokenizer.apply_chat_template( + example["messages"], + tokenize=False, + ) + } + + +ds = ds.map(preprocess) + + +# Tokenize inputs. +def tokenize(sample): + return tokenizer( + sample["text"], + padding=False, + max_length=MAX_SEQUENCE_LENGTH, + truncation=True, + add_special_tokens=False, + ) + + +ds = ds.map(tokenize, remove_columns=ds.column_names) + +# define a llmcompressor recipe for W416 quantization +# since the MoE gate layers are sensitive to quantization, we add them to the ignore +# list so they remain at full precision +recipe = "deepseek_recipe_w4a16.yaml" + +oneshot( + model=model, + dataset=ds, + recipe=recipe, + max_seq_length=MAX_SEQUENCE_LENGTH, + num_calibration_samples=NUM_CALIBRATION_SAMPLES, + save_compressed=True, + trust_remote_code_model=True, +) + +# Confirm generations of the quantized model look sane. 
+# Generation is broken for deepseek models when using the latest transformers package +if Version(__version__) < Version("4.48"): + print("========== SAMPLE GENERATION ==============") + dispatch_for_generation(model) + input_ids = tokenizer("Hello my name is", return_tensors="pt").input_ids.to("cuda") + output = model.generate(input_ids, max_new_tokens=20) + print(tokenizer.decode(output[0])) + print("==========================================") +else: + print( + "WARNING: cannot perform sample generation of " + "deepseek models with transformers >= 4.48" + ) + +# Save to disk in compressed-tensors format. +SAVE_DIR = MODEL_ID.rstrip("/").split("/")[-1] + "-W4A16" +model.save_pretrained(SAVE_DIR, save_compressed=True) +tokenizer.save_pretrained(SAVE_DIR) + + +# Run the model on vLLM +try: + from vllm import LLM, SamplingParams + + vllm_installed = True +except ImportError: + vllm_installed = False + +if vllm_installed: + print("vLLM installed, running using vLLM") + sampling_params = SamplingParams(temperature=0.80, top_p=0.95) + llm = LLM( + model=SAVE_DIR, + tensor_parallel_size=2, + trust_remote_code=True, + max_model_len=1042, + dtype=torch.half, + ) + prompts = [ + "The capital of France is", + "The president of the US is", + "My name is", + ] + + outputs = llm.generate(prompts, sampling_params) + print("================= vLLM GENERATION ======================") + for output in outputs: + assert output + prompt = output.prompt + generated_text = output.outputs[0].text + print("PROMPT", prompt) + print("GENERATED TEXT", generated_text) diff --git a/examples/quantizing_moe/deepseek_moe_w8a8_fp8.py b/examples/quantizing_moe/deepseek_moe_w8a8_fp8.py new file mode 100644 index 000000000..0bc9c24df --- /dev/null +++ b/examples/quantizing_moe/deepseek_moe_w8a8_fp8.py @@ -0,0 +1,99 @@ +from datasets import load_dataset +from packaging.version import Version +from transformers import AutoModelForCausalLM, AutoTokenizer, __version__ + +from llmcompressor import oneshot +from llmcompressor.modifiers.quantization import QuantizationModifier +from llmcompressor.utils import dispatch_for_generation + +# NOTE: transformers 4.49.0 has an attribute error with DeepSeek. +# Please consider either downgrading your transformers version to a +# previous version or upgrading to a version where this bug is fixed + +# select a Mixture of Experts model for quantization +MODEL_ID = "deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct" + +model = AutoModelForCausalLM.from_pretrained( + MODEL_ID, torch_dtype="auto", trust_remote_code=True +) +tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) + +# Select calibration dataset. +# its recommended to use more calibration samples for MoE models so each expert is hit +DATASET_ID = "HuggingFaceH4/ultrachat_200k" +DATASET_SPLIT = "train_sft" +NUM_CALIBRATION_SAMPLES = 2048 +MAX_SEQUENCE_LENGTH = 2048 + + +# Load dataset and preprocess. +ds = load_dataset(DATASET_ID, split=f"{DATASET_SPLIT}[:{NUM_CALIBRATION_SAMPLES}]") +ds = ds.shuffle(seed=42) + + +def preprocess(example): + return { + "text": tokenizer.apply_chat_template( + example["messages"], + tokenize=False, + ) + } + + +ds = ds.map(preprocess) + + +# Tokenize inputs. 
+def tokenize(sample): + return tokenizer( + sample["text"], + padding=False, + max_length=MAX_SEQUENCE_LENGTH, + truncation=True, + add_special_tokens=False, + ) + + +ds = ds.map(tokenize, remove_columns=ds.column_names) + +# define a llmcompressor recipe for FP8 W8A8 quantization +# since the MoE gate layers are sensitive to quantization, we add them to the ignore +# list so they remain at full precision +recipe = [ + QuantizationModifier( + targets="Linear", + scheme="FP8", + ignore=["lm_head", "re:.*mlp.gate$"], + ), +] + +oneshot( + model=model, + dataset=ds, + recipe=recipe, + max_seq_length=MAX_SEQUENCE_LENGTH, + num_calibration_samples=NUM_CALIBRATION_SAMPLES, + trust_remote_code_model=True, +) + +# Confirm generations of the quantized model look sane. +# Generation is broken for deepseek models when using the latest transformers package +if Version(__version__) < Version("4.48"): + print("========== SAMPLE GENERATION ==============") + dispatch_for_generation(model) + SAMPLE_INPUT = ["I love quantization because"] + tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) + inputs = tokenizer(SAMPLE_INPUT, return_tensors="pt", padding=True).to(model.device) + output = model.generate(**inputs, max_length=50) + text_output = tokenizer.batch_decode(output) + print(text_output) +else: + print( + "WARNING: cannot perform sample generation of " + "deepseek models with transformers >= 4.48" + ) + +# Save to disk in compressed-tensors format. +SAVE_DIR = MODEL_ID.rstrip("/").split("/")[-1] + "-FP8" +model.save_pretrained(SAVE_DIR, save_compressed=True) +tokenizer.save_pretrained(SAVE_DIR) diff --git a/examples/quantizing_moe/deepseek_moe_w8a8_int8.py b/examples/quantizing_moe/deepseek_moe_w8a8_int8.py new file mode 100644 index 000000000..3ec506c34 --- /dev/null +++ b/examples/quantizing_moe/deepseek_moe_w8a8_int8.py @@ -0,0 +1,101 @@ +import torch +from datasets import load_dataset +from packaging.version import Version +from transformers import AutoModelForCausalLM, AutoTokenizer, __version__ + +from llmcompressor import oneshot +from llmcompressor.modifiers.quantization import GPTQModifier +from llmcompressor.utils import dispatch_for_generation + +# NOTE: transformers 4.49.0 has an attribute error with DeepSeek. +# Please consider either downgrading your transformers version to a +# previous version or upgrading to a version where this bug is fixed + +# select a Mixture of Experts model for quantization +MODEL_ID = "deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct" + +model = AutoModelForCausalLM.from_pretrained( + MODEL_ID, torch_dtype=torch.bfloat16, trust_remote_code=True +) +tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) + +# Select calibration dataset. +# its recommended to use more calibration samples for MoE models so each expert is hit +DATASET_ID = "HuggingFaceH4/ultrachat_200k" +DATASET_SPLIT = "train_sft" +NUM_CALIBRATION_SAMPLES = 2048 +MAX_SEQUENCE_LENGTH = 2048 + + +# Load dataset and preprocess. +ds = load_dataset(DATASET_ID, split=f"{DATASET_SPLIT}[:{NUM_CALIBRATION_SAMPLES}]") +ds = ds.shuffle(seed=42) + + +def preprocess(example): + return { + "text": tokenizer.apply_chat_template( + example["messages"], + tokenize=False, + ) + } + + +ds = ds.map(preprocess) + + +# Tokenize inputs. 
+def tokenize(sample): + return tokenizer( + sample["text"], + padding=False, + max_length=MAX_SEQUENCE_LENGTH, + truncation=True, + add_special_tokens=False, + ) + + +ds = ds.map(tokenize, remove_columns=ds.column_names) + +# define a llmcompressor recipe for INT8 W8A8 quantization +# since the MoE gate layers are sensitive to quantization, we add them to the ignore +# list so they remain at full precision +recipe = [ + GPTQModifier( + targets="Linear", + scheme="W8A8", + ignore=["lm_head", "re:.*mlp.gate$"], + ), +] + +oneshot( + model=model, + dataset=ds, + recipe=recipe, + max_seq_length=MAX_SEQUENCE_LENGTH, + num_calibration_samples=NUM_CALIBRATION_SAMPLES, + trust_remote_code_model=True, +) + +# Confirm generations of the quantized model look sane. +# Generation is broken for deepseek models when using the latest transformers package +if Version(__version__) < Version("4.48"): + print("========== SAMPLE GENERATION ==============") + dispatch_for_generation(model) + SAMPLE_INPUT = ["I love quantization because"] + tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) + inputs = tokenizer(SAMPLE_INPUT, return_tensors="pt", padding=True).to(model.device) + output = model.generate(**inputs, max_length=50) + text_output = tokenizer.batch_decode(output) + print(text_output) + print("==========================================") +else: + print( + "WARNING: cannot perform sample generation of " + "deepseek models with transformers >= 4.48" + ) + +# Save to disk in compressed-tensors format. +SAVE_DIR = MODEL_ID.rstrip("/").split("/")[-1] + "-W8A8" +model.save_pretrained(SAVE_DIR, save_compressed=True) +tokenizer.save_pretrained(SAVE_DIR) diff --git a/examples/quantizing_moe/deepseek_recipe_w4a16.yaml b/examples/quantizing_moe/deepseek_recipe_w4a16.yaml new file mode 100644 index 000000000..23f276e2f --- /dev/null +++ b/examples/quantizing_moe/deepseek_recipe_w4a16.yaml @@ -0,0 +1,8 @@ +quant_stage: + quant_modifiers: + GPTQModifier: + ignore: [lm_head, "re:.*mlp.gate$"] + config_groups: + group_0: + weights: {num_bits: 4, type: int, symmetric: true, strategy: channel, dynamic: false} + targets: [Linear] diff --git a/examples/quantizing_moe/mixtral_moe_w8a8_fp8.py b/examples/quantizing_moe/mixtral_moe_w8a8_fp8.py new file mode 100644 index 000000000..a17bf873d --- /dev/null +++ b/examples/quantizing_moe/mixtral_moe_w8a8_fp8.py @@ -0,0 +1,58 @@ +from typing import List + +from packaging.version import Version +from transformers import AutoModelForCausalLM, AutoTokenizer, __version__ + +from llmcompressor import oneshot +from llmcompressor.modifiers.quantization import QuantizationModifier +from llmcompressor.utils import dispatch_for_generation + +MODEL_ID = "mistralai/Mixtral-8x7B-Instruct-v0.1" + +model = AutoModelForCausalLM.from_pretrained(MODEL_ID, torch_dtype="auto") +tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) + + +# Dataset config parameters +DATASET_ID = "open_platypus" +DATASET_SPLIT = "train" +MAX_SEQ_LENGTH = 2048 +NUM_CALIBRATION_SAMPLES = 512 + +# Recipe +layers_to_ignore: List[str] = [ + "lm_head", + "re:.*block_sparse_moe.gate", # does not quantize well +] +recipe = QuantizationModifier(scheme="FP8", targets="Linear", ignore=layers_to_ignore) + + +oneshot( + model=model, + tokenizer=tokenizer, + dataset=DATASET_ID, + splits={"calibration": f"{DATASET_SPLIT}[:{NUM_CALIBRATION_SAMPLES}]"}, + recipe=recipe, + max_seq_length=MAX_SEQ_LENGTH, + num_calibration_samples=NUM_CALIBRATION_SAMPLES, +) + +# Confirm generations of the quantized model look sane. 
+# Generation is broken for deepseek models when using the latest transformers package +if Version(__version__) < Version("4.48"): + print("========== SAMPLE GENERATION ==============") + dispatch_for_generation(model) + input_ids = tokenizer("Hello my name is", return_tensors="pt").input_ids.to("cuda") + output = model.generate(input_ids, max_new_tokens=20) + print(tokenizer.decode(output[0])) + print("==========================================") +else: + print( + "WARNING: cannot perform sample generation of " + "deepseek models with transformers >= 4.48" + ) + +# Save to disk in compressed-tensors format. +SAVE_DIR = MODEL_ID.rstrip("/").split("/")[-1] + "-FP8" +model.save_pretrained(SAVE_DIR, save_compressed=True) +tokenizer.save_pretrained(SAVE_DIR) diff --git a/examples/quantizing_moe/qwen_moe_w4a16.py b/examples/quantizing_moe/qwen_moe_w4a16.py new file mode 100644 index 000000000..40a78a9b7 --- /dev/null +++ b/examples/quantizing_moe/qwen_moe_w4a16.py @@ -0,0 +1,84 @@ +import torch +from datasets import load_dataset +from transformers import AutoModelForCausalLM, AutoTokenizer + +from llmcompressor.modifiers.quantization import GPTQModifier +from llmcompressor.transformers import oneshot +from llmcompressor.utils import dispatch_for_generation + +# select a Mixture of Experts model for quantization +MODEL_ID = "Qwen/Qwen1.5-MoE-A2.7B-Chat" + +model = AutoModelForCausalLM.from_pretrained( + MODEL_ID, torch_dtype=torch.bfloat16, trust_remote_code=True +) +tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) + +# Select calibration dataset. +DATASET_ID = "HuggingFaceH4/ultrachat_200k" +DATASET_SPLIT = "train_sft" +NUM_CALIBRATION_SAMPLES = 512 +MAX_SEQUENCE_LENGTH = 2048 + + +# Load dataset and preprocess. +ds = load_dataset(DATASET_ID, split=DATASET_SPLIT) +ds = ds.shuffle(seed=42).select(range(NUM_CALIBRATION_SAMPLES)) + + +def preprocess(example): + return { + "text": tokenizer.apply_chat_template( + example["messages"], + tokenize=False, + ) + } + + +ds = ds.map(preprocess) + + +# Tokenize inputs. +def tokenize(sample): + return tokenizer( + sample["text"], + padding=False, + max_length=MAX_SEQUENCE_LENGTH, + truncation=True, + add_special_tokens=False, + ) + + +ds = ds.map(tokenize, remove_columns=ds.column_names) + +# define a llmcompressor recipe for W416 quantization with a group size of 128 +# since the MoE gate layers are sensitive to quantization, we add them to the ignore +# list so they remain at full precision +recipe = GPTQModifier( + targets="Linear", + scheme="W4A16", + ignore=["lm_head", "re:.*mlp.gate$", "re:.*mlp.shared_expert_gate$"], +) + +oneshot( + model=model, + dataset=ds, + recipe=recipe, + max_seq_length=MAX_SEQUENCE_LENGTH, + num_calibration_samples=NUM_CALIBRATION_SAMPLES, + save_compressed=True, + trust_remote_code_model=True, +) + +# Confirm generations of the quantized model look sane. +print("========== SAMPLE GENERATION ==============") +dispatch_for_generation(model) +input_ids = tokenizer("Hello my name is", return_tensors="pt").input_ids.to("cuda") +output = model.generate(input_ids, max_new_tokens=20) +print(tokenizer.decode(output[0])) +print("==========================================") + +# Save to disk in compressed-tensors format. 
+SAVE_DIR = MODEL_ID.rstrip("/").split("/")[-1] + "-quantized.w4a16" +model.save_pretrained(SAVE_DIR, save_compressed=True) +tokenizer.save_pretrained(SAVE_DIR) diff --git a/tests/examples/test_quantizing_moe.py b/tests/examples/test_quantizing_moe.py index 50e86c2c8..49686d25c 100644 --- a/tests/examples/test_quantizing_moe.py +++ b/tests/examples/test_quantizing_moe.py @@ -44,11 +44,14 @@ def test_doc_example_command(self, example_dir: str, tmp_path: Path): "script_filename", [ pytest.param( - "deepseekv3_example.py", - marks=pytest.mark.skip(reason="exceptionally long run time"), + "deepseek_moe_w4a16.py", + marks=[ + pytest.mark.multi_gpu, + pytest.mark.skip(reason="exceptionally long run time"), + ], ), - pytest.param("mixtral_example.py"), - pytest.param("qwen_example.py"), + pytest.param("deepseek_moe_w8a8_fp8.py"), + pytest.param("deepseek_moe_w8a8_int8.py", marks=pytest.mark.multi_gpu), ], ) def test_deepseek_example_script( From 0dc2381dd12303ae1c71127827e1efd1bc56de63 Mon Sep 17 00:00:00 2001 From: Kyle Sayers Date: Fri, 20 Jun 2025 13:55:09 -0400 Subject: [PATCH 16/22] remove extra examples Signed-off-by: Kyle Sayers --- examples/quantizing_moe/mixtral_example.py | 86 ---------------------- examples/quantizing_moe/qwen_example.py | 85 --------------------- 2 files changed, 171 deletions(-) delete mode 100644 examples/quantizing_moe/mixtral_example.py delete mode 100644 examples/quantizing_moe/qwen_example.py diff --git a/examples/quantizing_moe/mixtral_example.py b/examples/quantizing_moe/mixtral_example.py deleted file mode 100644 index 49b08c722..000000000 --- a/examples/quantizing_moe/mixtral_example.py +++ /dev/null @@ -1,86 +0,0 @@ -import torch -from datasets import load_dataset -from transformers import AutoModelForCausalLM, AutoTokenizer - -from llmcompressor import oneshot -from llmcompressor.modifiers.quantization import QuantizationModifier -from llmcompressor.utils import dispatch_for_generation - -# select a Mixture of Experts model for quantization -MODEL_ID = "mistralai/Mixtral-8x7B-Instruct-v0.1" - -model = AutoModelForCausalLM.from_pretrained( - MODEL_ID, torch_dtype=torch.bfloat16, trust_remote_code=True -) -tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) - -# Select calibration dataset. -DATASET_ID = "HuggingFaceH4/ultrachat_200k" -DATASET_SPLIT = "train_sft" -NUM_CALIBRATION_SAMPLES = 512 -MAX_SEQUENCE_LENGTH = 2048 - - -# Load dataset and preprocess. -ds = load_dataset(DATASET_ID, split=f"{DATASET_SPLIT}[:{NUM_CALIBRATION_SAMPLES}]") -ds = ds.shuffle(seed=42) - - -def preprocess(example): - return { - "text": tokenizer.apply_chat_template( - example["messages"], - tokenize=False, - ) - } - - -ds = ds.map(preprocess) - - -# Tokenize inputs. -def tokenize(sample): - return tokenizer( - sample["text"], - padding=False, - max_length=MAX_SEQUENCE_LENGTH, - truncation=True, - add_special_tokens=False, - ) - - -ds = ds.map(tokenize, remove_columns=ds.column_names) - -# Configure the quantization algorithm to run. 
-# since the MoE gate layers are sensitive to quantization, we add them to the ignore -# list so they remain at full precision -recipe = QuantizationModifier( - scheme="W4A16", - targets="Linear", - ignore=[ - "lm_head", - "re:.*block_sparse_moe.gate", # does not quantize well - ], -) - -oneshot( - model=model, - dataset=ds, - recipe=recipe, - max_seq_length=MAX_SEQUENCE_LENGTH, - num_calibration_samples=NUM_CALIBRATION_SAMPLES, - trust_remote_code_model=True, -) - -print("========== SAMPLE GENERATION ==============") -dispatch_for_generation(model) -sample = tokenizer("Hello my name is", return_tensors="pt") -sample = {key: value.to("cuda") for key, value in sample.items()} -output = model.generate(**sample, max_new_tokens=100) -print(tokenizer.decode(output[0])) -print("==========================================") - -# Save to disk in compressed-tensors format. -SAVE_DIR = MODEL_ID.rstrip("/").split("/")[-1] + "-W4A16-G128" -model.save_pretrained(SAVE_DIR, save_compressed=True) -tokenizer.save_pretrained(SAVE_DIR) diff --git a/examples/quantizing_moe/qwen_example.py b/examples/quantizing_moe/qwen_example.py deleted file mode 100644 index 2531e6528..000000000 --- a/examples/quantizing_moe/qwen_example.py +++ /dev/null @@ -1,85 +0,0 @@ -import torch -from datasets import load_dataset -from transformers import AutoModelForCausalLM, AutoTokenizer - -from llmcompressor.modifiers.quantization import GPTQModifier -from llmcompressor.transformers import oneshot -from llmcompressor.utils import dispatch_for_generation - -# select a Mixture of Experts model for quantization -MODEL_ID = "Qwen/Qwen1.5-MoE-A2.7B-Chat" - -model = AutoModelForCausalLM.from_pretrained( - MODEL_ID, torch_dtype=torch.bfloat16, trust_remote_code=True -) -tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) - -# Select calibration dataset. -DATASET_ID = "HuggingFaceH4/ultrachat_200k" -DATASET_SPLIT = "train_sft" -NUM_CALIBRATION_SAMPLES = 512 -MAX_SEQUENCE_LENGTH = 2048 - - -# Load dataset and preprocess. -ds = load_dataset(DATASET_ID, split=DATASET_SPLIT) -ds = ds.shuffle(seed=42).select(range(NUM_CALIBRATION_SAMPLES)) - - -def preprocess(example): - return { - "text": tokenizer.apply_chat_template( - example["messages"], - tokenize=False, - ) - } - - -ds = ds.map(preprocess) - - -# Tokenize inputs. -def tokenize(sample): - return tokenizer( - sample["text"], - padding=False, - max_length=MAX_SEQUENCE_LENGTH, - truncation=True, - add_special_tokens=False, - ) - - -ds = ds.map(tokenize, remove_columns=ds.column_names) - -# define a llmcompressor recipe for W416 quantization with a group size of 128 -# since the MoE gate layers are sensitive to quantization, we add them to the ignore -# list so they remain at full precision -recipe = GPTQModifier( - targets="Linear", - scheme="W4A16", - ignore=["lm_head", "re:.*mlp.gate$", "re:.*mlp.shared_expert_gate$"], -) - -oneshot( - model=model, - dataset=ds, - recipe=recipe, - max_seq_length=MAX_SEQUENCE_LENGTH, - num_calibration_samples=NUM_CALIBRATION_SAMPLES, - save_compressed=True, - trust_remote_code_model=True, -) - -# Confirm generations of the quantized model look sane. -print("========== SAMPLE GENERATION ==============") -dispatch_for_generation(model) -sample = tokenizer("Hello my name is", return_tensors="pt") -sample = {key: value.to("cuda") for key, value in sample.items()} -output = model.generate(**sample, max_new_tokens=100) -print(tokenizer.decode(output[0])) -print("==========================================") - -# Save to disk in compressed-tensors format. 
-SAVE_DIR = MODEL_ID.rstrip("/").split("/")[-1] + "-W4A16-G128" -model.save_pretrained(SAVE_DIR, save_compressed=True) -tokenizer.save_pretrained(SAVE_DIR) From b70aba7e240fc6c626146c6d4bbdf7a46f0d3979 Mon Sep 17 00:00:00 2001 From: Kyle Sayers Date: Fri, 20 Jun 2025 13:55:39 -0400 Subject: [PATCH 17/22] revert examples tests changes Signed-off-by: Kyle Sayers --- tests/examples/test_quantizing_moe.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/tests/examples/test_quantizing_moe.py b/tests/examples/test_quantizing_moe.py index 848d5e3bf..49686d25c 100644 --- a/tests/examples/test_quantizing_moe.py +++ b/tests/examples/test_quantizing_moe.py @@ -44,11 +44,14 @@ def test_doc_example_command(self, example_dir: str, tmp_path: Path): "script_filename", [ pytest.param( - "deepseekv2_5_example.py", - marks=pytest.mark.skip(reason="exceptionally long run time"), + "deepseek_moe_w4a16.py", + marks=[ + pytest.mark.multi_gpu, + pytest.mark.skip(reason="exceptionally long run time"), + ], ), - pytest.param("mixtral_example.py"), - pytest.param("qwen_example.py"), + pytest.param("deepseek_moe_w8a8_fp8.py"), + pytest.param("deepseek_moe_w8a8_int8.py", marks=pytest.mark.multi_gpu), ], ) def test_deepseek_example_script( From 5e5657be1babff32db1a5a1eb13389c72c1eca50 Mon Sep 17 00:00:00 2001 From: Kyle Sayers Date: Fri, 20 Jun 2025 13:58:01 -0400 Subject: [PATCH 18/22] Revert "revert extra examples" This reverts commit c44da345e8dd0d63d5cc0df3cb06bad52f968fa6. --- .../quantizing_moe/deepseekv2_5_example.py | 94 +++++++++++++++++++ examples/quantizing_moe/mixtral_example.py | 86 +++++++++++++++++ examples/quantizing_moe/qwen_example.py | 85 +++++++++++++++++ 3 files changed, 265 insertions(+) create mode 100644 examples/quantizing_moe/deepseekv2_5_example.py create mode 100644 examples/quantizing_moe/mixtral_example.py create mode 100644 examples/quantizing_moe/qwen_example.py diff --git a/examples/quantizing_moe/deepseekv2_5_example.py b/examples/quantizing_moe/deepseekv2_5_example.py new file mode 100644 index 000000000..c2b3b0305 --- /dev/null +++ b/examples/quantizing_moe/deepseekv2_5_example.py @@ -0,0 +1,94 @@ +import torch +from datasets import load_dataset +from packaging.version import Version +from transformers import AutoModelForCausalLM, AutoTokenizer, __version__ + +from llmcompressor import oneshot +from llmcompressor.modifiers.quantization import GPTQModifier +from llmcompressor.utils import dispatch_for_generation + +# NOTE: transformers 4.49.0 has an attribute error with DeepSeek. +# Please consider either downgrading your transformers version to a +# previous version or upgrading to a version where this bug is fixed + +# select a Mixture of Experts model for quantization +MODEL_ID = "deepseek-ai/DeepSeek-V2.5" + +model = AutoModelForCausalLM.from_pretrained( + MODEL_ID, torch_dtype=torch.bfloat16, trust_remote_code=True +) +tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) + +# Select calibration dataset. +DATASET_ID = "HuggingFaceH4/ultrachat_200k" +DATASET_SPLIT = "train_sft" +NUM_CALIBRATION_SAMPLES = 512 +MAX_SEQUENCE_LENGTH = 2048 + + +# Load dataset and preprocess. +ds = load_dataset(DATASET_ID, split=f"{DATASET_SPLIT}[:{NUM_CALIBRATION_SAMPLES}]") +ds = ds.shuffle(seed=42) + + +def preprocess(example): + return { + "text": tokenizer.apply_chat_template( + example["messages"], + tokenize=False, + ) + } + + +ds = ds.map(preprocess) + + +# Tokenize inputs. 
+def tokenize(sample): + return tokenizer( + sample["text"], + padding=False, + max_length=MAX_SEQUENCE_LENGTH, + truncation=True, + add_special_tokens=False, + ) + + +ds = ds.map(tokenize, remove_columns=ds.column_names) + +# Configure the quantization algorithm to run. +# since the MoE gate layers are sensitive to quantization, we add them to the ignore +# list so they remain at full precision +recipe = GPTQModifier( + targets="Linear", scheme="W4A16", ignore=["lm_head", "re:.*mlp.gate$"] +) + +oneshot( + model=model, + dataset=ds, + recipe=recipe, + max_seq_length=MAX_SEQUENCE_LENGTH, + num_calibration_samples=NUM_CALIBRATION_SAMPLES, + trust_remote_code_model=True, +) + +# Confirm generations of the quantized model look sane. +# Generation is broken for deepseek models when using the latest transformers package +if Version(__version__) < Version("4.48"): + print("========== SAMPLE GENERATION ==============") + dispatch_for_generation(model) + sample = tokenizer("Hello my name is", return_tensors="pt") + sample = {key: value.to("cuda") for key, value in sample.items()} + output = model.generate(**sample, max_new_tokens=100) + print(tokenizer.decode(output[0])) + print("==========================================") +else: + print( + "WARNING: cannot perform sample generation of " + "deepseek models with transformers >= 4.48" + ) + +# Save to disk in compressed-tensors format. +SAVE_DIR = MODEL_ID.rstrip("/").split("/")[-1] + "-W4A16" +model.save_pretrained(SAVE_DIR, save_compressed=True) +tokenizer.save_pretrained(SAVE_DIR) diff --git a/examples/quantizing_moe/mixtral_example.py b/examples/quantizing_moe/mixtral_example.py new file mode 100644 index 000000000..49b08c722 --- /dev/null +++ b/examples/quantizing_moe/mixtral_example.py @@ -0,0 +1,86 @@ +import torch +from datasets import load_dataset +from transformers import AutoModelForCausalLM, AutoTokenizer + +from llmcompressor import oneshot +from llmcompressor.modifiers.quantization import QuantizationModifier +from llmcompressor.utils import dispatch_for_generation + +# select a Mixture of Experts model for quantization +MODEL_ID = "mistralai/Mixtral-8x7B-Instruct-v0.1" + +model = AutoModelForCausalLM.from_pretrained( + MODEL_ID, torch_dtype=torch.bfloat16, trust_remote_code=True +) +tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) + +# Select calibration dataset. +DATASET_ID = "HuggingFaceH4/ultrachat_200k" +DATASET_SPLIT = "train_sft" +NUM_CALIBRATION_SAMPLES = 512 +MAX_SEQUENCE_LENGTH = 2048 + + +# Load dataset and preprocess. +ds = load_dataset(DATASET_ID, split=f"{DATASET_SPLIT}[:{NUM_CALIBRATION_SAMPLES}]") +ds = ds.shuffle(seed=42) + + +def preprocess(example): + return { + "text": tokenizer.apply_chat_template( + example["messages"], + tokenize=False, + ) + } + + +ds = ds.map(preprocess) + + +# Tokenize inputs. +def tokenize(sample): + return tokenizer( + sample["text"], + padding=False, + max_length=MAX_SEQUENCE_LENGTH, + truncation=True, + add_special_tokens=False, + ) + + +ds = ds.map(tokenize, remove_columns=ds.column_names) + +# Configure the quantization algorithm to run. 
+# since the MoE gate layers are sensitive to quantization, we add them to the ignore +# list so they remain at full precision +recipe = QuantizationModifier( + scheme="W4A16", + targets="Linear", + ignore=[ + "lm_head", + "re:.*block_sparse_moe.gate", # does not quantize well + ], +) + +oneshot( + model=model, + dataset=ds, + recipe=recipe, + max_seq_length=MAX_SEQUENCE_LENGTH, + num_calibration_samples=NUM_CALIBRATION_SAMPLES, + trust_remote_code_model=True, +) + +print("========== SAMPLE GENERATION ==============") +dispatch_for_generation(model) +sample = tokenizer("Hello my name is", return_tensors="pt") +sample = {key: value.to("cuda") for key, value in sample.items()} +output = model.generate(**sample, max_new_tokens=100) +print(tokenizer.decode(output[0])) +print("==========================================") + +# Save to disk in compressed-tensors format. +SAVE_DIR = MODEL_ID.rstrip("/").split("/")[-1] + "-W4A16-G128" +model.save_pretrained(SAVE_DIR, save_compressed=True) +tokenizer.save_pretrained(SAVE_DIR) diff --git a/examples/quantizing_moe/qwen_example.py b/examples/quantizing_moe/qwen_example.py new file mode 100644 index 000000000..2531e6528 --- /dev/null +++ b/examples/quantizing_moe/qwen_example.py @@ -0,0 +1,85 @@ +import torch +from datasets import load_dataset +from transformers import AutoModelForCausalLM, AutoTokenizer + +from llmcompressor.modifiers.quantization import GPTQModifier +from llmcompressor.transformers import oneshot +from llmcompressor.utils import dispatch_for_generation + +# select a Mixture of Experts model for quantization +MODEL_ID = "Qwen/Qwen1.5-MoE-A2.7B-Chat" + +model = AutoModelForCausalLM.from_pretrained( + MODEL_ID, torch_dtype=torch.bfloat16, trust_remote_code=True +) +tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) + +# Select calibration dataset. +DATASET_ID = "HuggingFaceH4/ultrachat_200k" +DATASET_SPLIT = "train_sft" +NUM_CALIBRATION_SAMPLES = 512 +MAX_SEQUENCE_LENGTH = 2048 + + +# Load dataset and preprocess. +ds = load_dataset(DATASET_ID, split=DATASET_SPLIT) +ds = ds.shuffle(seed=42).select(range(NUM_CALIBRATION_SAMPLES)) + + +def preprocess(example): + return { + "text": tokenizer.apply_chat_template( + example["messages"], + tokenize=False, + ) + } + + +ds = ds.map(preprocess) + + +# Tokenize inputs. +def tokenize(sample): + return tokenizer( + sample["text"], + padding=False, + max_length=MAX_SEQUENCE_LENGTH, + truncation=True, + add_special_tokens=False, + ) + + +ds = ds.map(tokenize, remove_columns=ds.column_names) + +# define a llmcompressor recipe for W416 quantization with a group size of 128 +# since the MoE gate layers are sensitive to quantization, we add them to the ignore +# list so they remain at full precision +recipe = GPTQModifier( + targets="Linear", + scheme="W4A16", + ignore=["lm_head", "re:.*mlp.gate$", "re:.*mlp.shared_expert_gate$"], +) + +oneshot( + model=model, + dataset=ds, + recipe=recipe, + max_seq_length=MAX_SEQUENCE_LENGTH, + num_calibration_samples=NUM_CALIBRATION_SAMPLES, + save_compressed=True, + trust_remote_code_model=True, +) + +# Confirm generations of the quantized model look sane. +print("========== SAMPLE GENERATION ==============") +dispatch_for_generation(model) +sample = tokenizer("Hello my name is", return_tensors="pt") +sample = {key: value.to("cuda") for key, value in sample.items()} +output = model.generate(**sample, max_new_tokens=100) +print(tokenizer.decode(output[0])) +print("==========================================") + +# Save to disk in compressed-tensors format. 
+SAVE_DIR = MODEL_ID.rstrip("/").split("/")[-1] + "-W4A16-G128" +model.save_pretrained(SAVE_DIR, save_compressed=True) +tokenizer.save_pretrained(SAVE_DIR) From 48123509037db3d2be970ad5b33b452fd73262c2 Mon Sep 17 00:00:00 2001 From: Kyle Sayers Date: Fri, 20 Jun 2025 14:03:47 -0400 Subject: [PATCH 19/22] clean up examples Signed-off-by: Kyle Sayers --- examples/quantizing_moe/README.md | 44 +++--- examples/quantizing_moe/deepseek_moe_w4a16.py | 125 ------------------ .../quantizing_moe/deepseek_moe_w8a8_fp8.py | 99 -------------- .../quantizing_moe/deepseek_moe_w8a8_int8.py | 101 -------------- .../quantizing_moe/deepseek_recipe_w4a16.yaml | 8 -- ...wen_moe_w4a16.py => deepseekv3_example.py} | 48 ++++--- .../quantizing_moe/mixtral_moe_w8a8_fp8.py | 58 -------- src/llmcompressor/modeling/__init__.py | 3 + src/llmcompressor/modeling/deepseek_v3.py | 48 +++++++ src/llmcompressor/modeling/prepare.py | 22 +++ src/llmcompressor/utils/module.py | 27 ++++ tests/examples/test_quantizing_moe.py | 15 ++- 12 files changed, 156 insertions(+), 442 deletions(-) delete mode 100644 examples/quantizing_moe/deepseek_moe_w4a16.py delete mode 100644 examples/quantizing_moe/deepseek_moe_w8a8_fp8.py delete mode 100644 examples/quantizing_moe/deepseek_moe_w8a8_int8.py delete mode 100644 examples/quantizing_moe/deepseek_recipe_w4a16.yaml rename examples/quantizing_moe/{qwen_moe_w4a16.py => deepseekv3_example.py} (52%) delete mode 100644 examples/quantizing_moe/mixtral_moe_w8a8_fp8.py create mode 100644 src/llmcompressor/modeling/__init__.py create mode 100644 src/llmcompressor/modeling/deepseek_v3.py create mode 100644 src/llmcompressor/modeling/prepare.py create mode 100644 src/llmcompressor/utils/module.py diff --git a/examples/quantizing_moe/README.md b/examples/quantizing_moe/README.md index 70243caf1..f2a162d91 100644 --- a/examples/quantizing_moe/README.md +++ b/examples/quantizing_moe/README.md @@ -1,6 +1,6 @@ -# Quantizing Mixtral-8x7B-Instruct-v0.1 Model with FP8 +# Quantizing Mixtral-8x7B-Instruct-v0.1 Model with W4A16 -This directory contains an example script for quantizing the `Mixtral-8x7B-Instruct-v0.1` model using the static per-tensor FP8 quantization scheme. +This directory contains an example script for quantizing the `Mixtral-8x7B-Instruct-v0.1` model using the static per-tensor W4A16 quantization scheme. ## Installation @@ -17,17 +17,17 @@ pip install -e . The provided example script demonstrates an end-to-end process for applying the quantization algorithm: ```bash -python3 mixtral_moe_w8a8_fp8.py +python3 mixtral_example.py ``` ## Creating a Quantized MoE Model -This example leverages `llm-compressor` and `compressed-tensors` to create an FP8-quantized `Mixtral-8x7B-Instruct-v0.1` model. The model is calibrated and trained using the `open_platypus` dataset. +This example leverages `llm-compressor` and `compressed-tensors` to create an W4A16-quantized `Mixtral-8x7B-Instruct-v0.1` model. The model is calibrated and trained using the `ultrachat_200k` dataset. You can follow the detailed steps below or simply run the example script with: ```bash -python mixtral_moe_w8a8_fp8.py +python mixtral_example.py ``` ### Step 1: Select a Model, Dataset, and Recipe @@ -36,24 +36,24 @@ In this step, you'll choose a baseline model for quantization, a dataset for cal - **Models**: Can be referenced from a local directory or retrieved from the Hugging Face Hub. - **Datasets**: Can also be from a local directory or the Hugging Face Hub. 
-- **Recipes**: These are YAML files or Python modifier objects that describe how a model should be optimized during or after training. In this example, we use a `QuantizationModifier` object with the scheme set to `FP8`.
+- **Recipes**: These are YAML files or Python modifier objects that describe how a model should be optimized during or after training. In this example, we use a `QuantizationModifier` object with the scheme set to `W4A16`.
 
 ```python
 from llmcompressor.modifiers.quantization import QuantizationModifier
 
-recipe = QuantizationModifier(scheme="FP8", targets="Linear", ignore=["lm_head", "re:.*block_sparse_moe.gate"])
+recipe = QuantizationModifier(scheme="W4A16", targets="Linear", ignore=["lm_head", "re:.*block_sparse_moe.gate"])
 ```
 
 NOTE: `.*block_sparse_moe.gate` layers do not quantize well, hence they are ignored!
 
 ### Step 2: Run Quantization Using Oneshot
 
-The `oneshot` method applies the selected recipe to your model and dataset without requiring any fine-tuning. The model will be sparsified and saved to `Mixtral-8x7B-Instruct-v0.1-FP8`.
+The `oneshot` method applies the selected recipe to your model and dataset without requiring any fine-tuning. The model will be sparsified and saved to `Mixtral-8x7B-Instruct-v0.1-W4A16-G128`.
 
 ```python
 from llmcompressor import oneshot
 
-output_dir = "Mixtral-8x7B-Instruct-v0.1-FP8"
+output_dir = "Mixtral-8x7B-Instruct-v0.1-W4A16-G128"
 
 oneshot(
     model=model,
@@ -74,7 +74,7 @@ NOTE: Only per-tensor quantization is supported in vLLM as of now (`vllm==0.6.1`
 
 The repository supports multiple quantization techniques configured via a recipe. Supported strategies include `tensor`, `group`, and `channel` quantization.
 
-In the above example, FP8 per-tensor quantization is used as specified by the `FP8` scheme. For other preset schemes, refer to the [quantization schemes](https://github.com/neuralmagic/compressed-tensors/blob/main/src/compressed_tensors/quantization/quant_scheme.py) in the `compressed-tensors` library.
+In the above example, quantization is specified by the `W4A16` scheme. For other preset schemes, refer to the [quantization schemes](https://github.com/neuralmagic/compressed-tensors/blob/main/src/compressed_tensors/quantization/quant_scheme.py) in the `compressed-tensors` library.
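For instance, switching to another preset only requires changing the `scheme` argument of the recipe. The snippet below is an illustrative sketch, not part of the patched README: it reuses the `GPTQModifier` and the Mixtral ignore list that appear elsewhere in these examples, with the `W8A8` preset standing in for whichever preset scheme you choose.

```python
from llmcompressor.modifiers.quantization import GPTQModifier

# Same targets/ignore lists as the W4A16 example above; only the preset changes.
# MoE gate layers stay at full precision because they do not quantize well.
recipe = GPTQModifier(
    targets="Linear",
    scheme="W8A8",  # any preset scheme from compressed-tensors can be used here
    ignore=["lm_head", "re:.*block_sparse_moe.gate"],
)
```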
A custom scheme can also be specified using `config_groups`: @@ -84,18 +84,18 @@ A custom scheme can also be specified using `config_groups`: from llmcompressor.modifiers.quantization.gptq import GPTQModifier config_groups = { - "group_0": { - "targets": ["Linear"], - "input_activations": None, - "output_activations": None, - "weights": { - "num_bits": 8, - "type": "int", - "symmetric": true, - "strategy": "group", - "group_size": 128, - } - } + "group_0": { + "targets": ["Linear"], + "input_activations": None, + "output_activations": None, + "weights": { + "num_bits": 8, + "type": "int", + "symmetric": true, + "strategy": "group", + "group_size": 128, + } + } } recipe = GPTQModifier(config_groups=config_groups) diff --git a/examples/quantizing_moe/deepseek_moe_w4a16.py b/examples/quantizing_moe/deepseek_moe_w4a16.py deleted file mode 100644 index 9880e9248..000000000 --- a/examples/quantizing_moe/deepseek_moe_w4a16.py +++ /dev/null @@ -1,125 +0,0 @@ -import torch -from datasets import load_dataset -from packaging.version import Version -from transformers import AutoModelForCausalLM, AutoTokenizer, __version__ - -from llmcompressor import oneshot -from llmcompressor.utils import dispatch_for_generation - -# NOTE: transformers 4.49.0 has an attribute error with DeepSeek. -# Please consider either downgrading your transformers version to a -# previous version or upgrading to a version where this bug is fixed - -# select a Mixture of Experts model for quantization -MODEL_ID = "deepseek-ai/DeepSeek-V2.5" - -model = AutoModelForCausalLM.from_pretrained( - MODEL_ID, torch_dtype=torch.bfloat16, trust_remote_code=True -) -tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) - -# Select calibration dataset. -DATASET_ID = "HuggingFaceH4/ultrachat_200k" -DATASET_SPLIT = "train_sft" -NUM_CALIBRATION_SAMPLES = 512 -MAX_SEQUENCE_LENGTH = 2048 - - -# Load dataset and preprocess. -ds = load_dataset(DATASET_ID, split=f"{DATASET_SPLIT}[:{NUM_CALIBRATION_SAMPLES}]") -ds = ds.shuffle(seed=42) - - -def preprocess(example): - return { - "text": tokenizer.apply_chat_template( - example["messages"], - tokenize=False, - ) - } - - -ds = ds.map(preprocess) - - -# Tokenize inputs. -def tokenize(sample): - return tokenizer( - sample["text"], - padding=False, - max_length=MAX_SEQUENCE_LENGTH, - truncation=True, - add_special_tokens=False, - ) - - -ds = ds.map(tokenize, remove_columns=ds.column_names) - -# define a llmcompressor recipe for W416 quantization -# since the MoE gate layers are sensitive to quantization, we add them to the ignore -# list so they remain at full precision -recipe = "deepseek_recipe_w4a16.yaml" - -oneshot( - model=model, - dataset=ds, - recipe=recipe, - max_seq_length=MAX_SEQUENCE_LENGTH, - num_calibration_samples=NUM_CALIBRATION_SAMPLES, - save_compressed=True, - trust_remote_code_model=True, -) - -# Confirm generations of the quantized model look sane. -# Generation is broken for deepseek models when using the latest transformers package -if Version(__version__) < Version("4.48"): - print("========== SAMPLE GENERATION ==============") - dispatch_for_generation(model) - input_ids = tokenizer("Hello my name is", return_tensors="pt").input_ids.to("cuda") - output = model.generate(input_ids, max_new_tokens=20) - print(tokenizer.decode(output[0])) - print("==========================================") -else: - print( - "WARNING: cannot perform sample generation of " - "deepseek models with transformers >= 4.48" - ) - -# Save to disk in compressed-tensors format. 
-SAVE_DIR = MODEL_ID.rstrip("/").split("/")[-1] + "-W4A16" -model.save_pretrained(SAVE_DIR, save_compressed=True) -tokenizer.save_pretrained(SAVE_DIR) - - -# Run the model on vLLM -try: - from vllm import LLM, SamplingParams - - vllm_installed = True -except ImportError: - vllm_installed = False - -if vllm_installed: - print("vLLM installed, running using vLLM") - sampling_params = SamplingParams(temperature=0.80, top_p=0.95) - llm = LLM( - model=SAVE_DIR, - tensor_parallel_size=2, - trust_remote_code=True, - max_model_len=1042, - dtype=torch.half, - ) - prompts = [ - "The capital of France is", - "The president of the US is", - "My name is", - ] - - outputs = llm.generate(prompts, sampling_params) - print("================= vLLM GENERATION ======================") - for output in outputs: - assert output - prompt = output.prompt - generated_text = output.outputs[0].text - print("PROMPT", prompt) - print("GENERATED TEXT", generated_text) diff --git a/examples/quantizing_moe/deepseek_moe_w8a8_fp8.py b/examples/quantizing_moe/deepseek_moe_w8a8_fp8.py deleted file mode 100644 index 0bc9c24df..000000000 --- a/examples/quantizing_moe/deepseek_moe_w8a8_fp8.py +++ /dev/null @@ -1,99 +0,0 @@ -from datasets import load_dataset -from packaging.version import Version -from transformers import AutoModelForCausalLM, AutoTokenizer, __version__ - -from llmcompressor import oneshot -from llmcompressor.modifiers.quantization import QuantizationModifier -from llmcompressor.utils import dispatch_for_generation - -# NOTE: transformers 4.49.0 has an attribute error with DeepSeek. -# Please consider either downgrading your transformers version to a -# previous version or upgrading to a version where this bug is fixed - -# select a Mixture of Experts model for quantization -MODEL_ID = "deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct" - -model = AutoModelForCausalLM.from_pretrained( - MODEL_ID, torch_dtype="auto", trust_remote_code=True -) -tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) - -# Select calibration dataset. -# its recommended to use more calibration samples for MoE models so each expert is hit -DATASET_ID = "HuggingFaceH4/ultrachat_200k" -DATASET_SPLIT = "train_sft" -NUM_CALIBRATION_SAMPLES = 2048 -MAX_SEQUENCE_LENGTH = 2048 - - -# Load dataset and preprocess. -ds = load_dataset(DATASET_ID, split=f"{DATASET_SPLIT}[:{NUM_CALIBRATION_SAMPLES}]") -ds = ds.shuffle(seed=42) - - -def preprocess(example): - return { - "text": tokenizer.apply_chat_template( - example["messages"], - tokenize=False, - ) - } - - -ds = ds.map(preprocess) - - -# Tokenize inputs. -def tokenize(sample): - return tokenizer( - sample["text"], - padding=False, - max_length=MAX_SEQUENCE_LENGTH, - truncation=True, - add_special_tokens=False, - ) - - -ds = ds.map(tokenize, remove_columns=ds.column_names) - -# define a llmcompressor recipe for FP8 W8A8 quantization -# since the MoE gate layers are sensitive to quantization, we add them to the ignore -# list so they remain at full precision -recipe = [ - QuantizationModifier( - targets="Linear", - scheme="FP8", - ignore=["lm_head", "re:.*mlp.gate$"], - ), -] - -oneshot( - model=model, - dataset=ds, - recipe=recipe, - max_seq_length=MAX_SEQUENCE_LENGTH, - num_calibration_samples=NUM_CALIBRATION_SAMPLES, - trust_remote_code_model=True, -) - -# Confirm generations of the quantized model look sane. 
-# Generation is broken for deepseek models when using the latest transformers package -if Version(__version__) < Version("4.48"): - print("========== SAMPLE GENERATION ==============") - dispatch_for_generation(model) - SAMPLE_INPUT = ["I love quantization because"] - tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) - inputs = tokenizer(SAMPLE_INPUT, return_tensors="pt", padding=True).to(model.device) - output = model.generate(**inputs, max_length=50) - text_output = tokenizer.batch_decode(output) - print(text_output) -else: - print( - "WARNING: cannot perform sample generation of " - "deepseek models with transformers >= 4.48" - ) - -# Save to disk in compressed-tensors format. -SAVE_DIR = MODEL_ID.rstrip("/").split("/")[-1] + "-FP8" -model.save_pretrained(SAVE_DIR, save_compressed=True) -tokenizer.save_pretrained(SAVE_DIR) diff --git a/examples/quantizing_moe/deepseek_moe_w8a8_int8.py b/examples/quantizing_moe/deepseek_moe_w8a8_int8.py deleted file mode 100644 index 3ec506c34..000000000 --- a/examples/quantizing_moe/deepseek_moe_w8a8_int8.py +++ /dev/null @@ -1,101 +0,0 @@ -import torch -from datasets import load_dataset -from packaging.version import Version -from transformers import AutoModelForCausalLM, AutoTokenizer, __version__ - -from llmcompressor import oneshot -from llmcompressor.modifiers.quantization import GPTQModifier -from llmcompressor.utils import dispatch_for_generation - -# NOTE: transformers 4.49.0 has an attribute error with DeepSeek. -# Please consider either downgrading your transformers version to a -# previous version or upgrading to a version where this bug is fixed - -# select a Mixture of Experts model for quantization -MODEL_ID = "deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct" - -model = AutoModelForCausalLM.from_pretrained( - MODEL_ID, torch_dtype=torch.bfloat16, trust_remote_code=True -) -tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) - -# Select calibration dataset. -# its recommended to use more calibration samples for MoE models so each expert is hit -DATASET_ID = "HuggingFaceH4/ultrachat_200k" -DATASET_SPLIT = "train_sft" -NUM_CALIBRATION_SAMPLES = 2048 -MAX_SEQUENCE_LENGTH = 2048 - - -# Load dataset and preprocess. -ds = load_dataset(DATASET_ID, split=f"{DATASET_SPLIT}[:{NUM_CALIBRATION_SAMPLES}]") -ds = ds.shuffle(seed=42) - - -def preprocess(example): - return { - "text": tokenizer.apply_chat_template( - example["messages"], - tokenize=False, - ) - } - - -ds = ds.map(preprocess) - - -# Tokenize inputs. -def tokenize(sample): - return tokenizer( - sample["text"], - padding=False, - max_length=MAX_SEQUENCE_LENGTH, - truncation=True, - add_special_tokens=False, - ) - - -ds = ds.map(tokenize, remove_columns=ds.column_names) - -# define a llmcompressor recipe for INT8 W8A8 quantization -# since the MoE gate layers are sensitive to quantization, we add them to the ignore -# list so they remain at full precision -recipe = [ - GPTQModifier( - targets="Linear", - scheme="W8A8", - ignore=["lm_head", "re:.*mlp.gate$"], - ), -] - -oneshot( - model=model, - dataset=ds, - recipe=recipe, - max_seq_length=MAX_SEQUENCE_LENGTH, - num_calibration_samples=NUM_CALIBRATION_SAMPLES, - trust_remote_code_model=True, -) - -# Confirm generations of the quantized model look sane. 
-# Generation is broken for deepseek models when using the latest transformers package -if Version(__version__) < Version("4.48"): - print("========== SAMPLE GENERATION ==============") - dispatch_for_generation(model) - SAMPLE_INPUT = ["I love quantization because"] - tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) - inputs = tokenizer(SAMPLE_INPUT, return_tensors="pt", padding=True).to(model.device) - output = model.generate(**inputs, max_length=50) - text_output = tokenizer.batch_decode(output) - print(text_output) - print("==========================================") -else: - print( - "WARNING: cannot perform sample generation of " - "deepseek models with transformers >= 4.48" - ) - -# Save to disk in compressed-tensors format. -SAVE_DIR = MODEL_ID.rstrip("/").split("/")[-1] + "-W8A8" -model.save_pretrained(SAVE_DIR, save_compressed=True) -tokenizer.save_pretrained(SAVE_DIR) diff --git a/examples/quantizing_moe/deepseek_recipe_w4a16.yaml b/examples/quantizing_moe/deepseek_recipe_w4a16.yaml deleted file mode 100644 index 23f276e2f..000000000 --- a/examples/quantizing_moe/deepseek_recipe_w4a16.yaml +++ /dev/null @@ -1,8 +0,0 @@ -quant_stage: - quant_modifiers: - GPTQModifier: - ignore: [lm_head, "re:.*mlp.gate$"] - config_groups: - group_0: - weights: {num_bits: 4, type: int, symmetric: true, strategy: channel, dynamic: false} - targets: [Linear] diff --git a/examples/quantizing_moe/qwen_moe_w4a16.py b/examples/quantizing_moe/deepseekv3_example.py similarity index 52% rename from examples/quantizing_moe/qwen_moe_w4a16.py rename to examples/quantizing_moe/deepseekv3_example.py index 40a78a9b7..1b4c334ff 100644 --- a/examples/quantizing_moe/qwen_moe_w4a16.py +++ b/examples/quantizing_moe/deepseekv3_example.py @@ -1,29 +1,31 @@ -import torch from datasets import load_dataset from transformers import AutoModelForCausalLM, AutoTokenizer +from llmcompressor.modeling import prepare_for_quantization from llmcompressor.modifiers.quantization import GPTQModifier from llmcompressor.transformers import oneshot from llmcompressor.utils import dispatch_for_generation -# select a Mixture of Experts model for quantization -MODEL_ID = "Qwen/Qwen1.5-MoE-A2.7B-Chat" - -model = AutoModelForCausalLM.from_pretrained( - MODEL_ID, torch_dtype=torch.bfloat16, trust_remote_code=True -) -tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) +# Select model and load it. +# For DeepSeekv3, we require a full precision model in order to properly calibrate +# `DeepSeek-V3-BF16` is a DeepSeek-V3 FP8 model which has been converted to BF16 +model_id = "RedHatAI/DeepSeek-V3-BF16" +model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype="auto") +tokenizer = AutoTokenizer.from_pretrained(model_id) +model = prepare_for_quantization(model) # Select calibration dataset. DATASET_ID = "HuggingFaceH4/ultrachat_200k" DATASET_SPLIT = "train_sft" + +# Select number of samples. 512 samples is a good place to start. +# Increasing the number of samples can improve accuracy. NUM_CALIBRATION_SAMPLES = 512 MAX_SEQUENCE_LENGTH = 2048 - # Load dataset and preprocess. 
-ds = load_dataset(DATASET_ID, split=DATASET_SPLIT) -ds = ds.shuffle(seed=42).select(range(NUM_CALIBRATION_SAMPLES)) +ds = load_dataset(DATASET_ID, split=f"{DATASET_SPLIT}[:{NUM_CALIBRATION_SAMPLES}]") +ds = ds.shuffle(seed=42) def preprocess(example): @@ -51,34 +53,36 @@ def tokenize(sample): ds = ds.map(tokenize, remove_columns=ds.column_names) -# define a llmcompressor recipe for W416 quantization with a group size of 128 +# Configure the quantization algorithm to run. # since the MoE gate layers are sensitive to quantization, we add them to the ignore # list so they remain at full precision recipe = GPTQModifier( - targets="Linear", - scheme="W4A16", - ignore=["lm_head", "re:.*mlp.gate$", "re:.*mlp.shared_expert_gate$"], + targets="Linear", scheme="W4A16", ignore=["lm_head", "re:.*mlp.gate$"] ) +# Apply algorithms. +# due to the large size of DeepSeekV3, we specify sequential targets such that +# only one MLP is loaded into GPU memory at a time oneshot( model=model, dataset=ds, recipe=recipe, max_seq_length=MAX_SEQUENCE_LENGTH, num_calibration_samples=NUM_CALIBRATION_SAMPLES, - save_compressed=True, - trust_remote_code_model=True, + sequential_targets=["DeepseekV3Attention", "DeepseekV3MLP"], ) # Confirm generations of the quantized model look sane. +print("\n\n") print("========== SAMPLE GENERATION ==============") dispatch_for_generation(model) -input_ids = tokenizer("Hello my name is", return_tensors="pt").input_ids.to("cuda") -output = model.generate(input_ids, max_new_tokens=20) +sample = tokenizer("Hello my name is", return_tensors="pt") +sample = {key: value.to("cuda") for key, value in sample.items()} +output = model.generate(**sample, max_new_tokens=100) print(tokenizer.decode(output[0])) -print("==========================================") +print("==========================================\n\n") -# Save to disk in compressed-tensors format. -SAVE_DIR = MODEL_ID.rstrip("/").split("/")[-1] + "-quantized.w4a16" +# Save to disk compressed. +SAVE_DIR = model_id.rstrip("/").split("/")[-1] + "-W4A16-G128" model.save_pretrained(SAVE_DIR, save_compressed=True) tokenizer.save_pretrained(SAVE_DIR) diff --git a/examples/quantizing_moe/mixtral_moe_w8a8_fp8.py b/examples/quantizing_moe/mixtral_moe_w8a8_fp8.py deleted file mode 100644 index a17bf873d..000000000 --- a/examples/quantizing_moe/mixtral_moe_w8a8_fp8.py +++ /dev/null @@ -1,58 +0,0 @@ -from typing import List - -from packaging.version import Version -from transformers import AutoModelForCausalLM, AutoTokenizer, __version__ - -from llmcompressor import oneshot -from llmcompressor.modifiers.quantization import QuantizationModifier -from llmcompressor.utils import dispatch_for_generation - -MODEL_ID = "mistralai/Mixtral-8x7B-Instruct-v0.1" - -model = AutoModelForCausalLM.from_pretrained(MODEL_ID, torch_dtype="auto") -tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) - - -# Dataset config parameters -DATASET_ID = "open_platypus" -DATASET_SPLIT = "train" -MAX_SEQ_LENGTH = 2048 -NUM_CALIBRATION_SAMPLES = 512 - -# Recipe -layers_to_ignore: List[str] = [ - "lm_head", - "re:.*block_sparse_moe.gate", # does not quantize well -] -recipe = QuantizationModifier(scheme="FP8", targets="Linear", ignore=layers_to_ignore) - - -oneshot( - model=model, - tokenizer=tokenizer, - dataset=DATASET_ID, - splits={"calibration": f"{DATASET_SPLIT}[:{NUM_CALIBRATION_SAMPLES}]"}, - recipe=recipe, - max_seq_length=MAX_SEQ_LENGTH, - num_calibration_samples=NUM_CALIBRATION_SAMPLES, -) - -# Confirm generations of the quantized model look sane. 
-# Generation is broken for deepseek models when using the latest transformers package -if Version(__version__) < Version("4.48"): - print("========== SAMPLE GENERATION ==============") - dispatch_for_generation(model) - input_ids = tokenizer("Hello my name is", return_tensors="pt").input_ids.to("cuda") - output = model.generate(input_ids, max_new_tokens=20) - print(tokenizer.decode(output[0])) - print("==========================================") -else: - print( - "WARNING: cannot perform sample generation of " - "deepseek models with transformers >= 4.48" - ) - -# Save to disk in compressed-tensors format. -SAVE_DIR = MODEL_ID.rstrip("/").split("/")[-1] + "-FP8" -model.save_pretrained(SAVE_DIR, save_compressed=True) -tokenizer.save_pretrained(SAVE_DIR) diff --git a/src/llmcompressor/modeling/__init__.py b/src/llmcompressor/modeling/__init__.py new file mode 100644 index 000000000..e2c22ed1f --- /dev/null +++ b/src/llmcompressor/modeling/__init__.py @@ -0,0 +1,3 @@ +# flake8: noqa + +from .prepare import * diff --git a/src/llmcompressor/modeling/deepseek_v3.py b/src/llmcompressor/modeling/deepseek_v3.py new file mode 100644 index 000000000..4b885ff64 --- /dev/null +++ b/src/llmcompressor/modeling/deepseek_v3.py @@ -0,0 +1,48 @@ +import torch +from transformers.models.deepseek_v3.modeling_deepseek_v3 import DeepseekV3MoE + + +class DeepseekV3MoECalibrate(torch.nn.Module): + def __init__(self, config, experts, gate, shared_experts): + super().__init__() + self.config = config + self.experts = experts + self.gate = gate + self.shared_experts = shared_experts + + def forward(self, hidden_states): + residuals = hidden_states + orig_shape = hidden_states.shape + topk_indices, topk_weights = self.gate(hidden_states) + hidden_states = hidden_states.view(-1, hidden_states.shape[-1]) + + # Begin MoE + final_hidden_states = torch.zeros_like(hidden_states, dtype=topk_weights.dtype) + expert_mask = torch.nn.functional.one_hot( + topk_indices, num_classes=len(self.experts) + ) + expert_mask = expert_mask.permute(2, 0, 1) + + for expert_idx in range(len(self.experts)): + expert = self.experts[expert_idx] + mask = expert_mask[expert_idx] + token_indices, weight_indices = torch.where(mask) + + expert_weights = topk_weights[token_indices, weight_indices] + expert_input = hidden_states[token_indices] + expert_output = expert(expert_input) + weighted_output = expert_output * expert_weights.unsqueeze(-1) + + if token_indices.numel() > 0: + final_hidden_states.index_add_(0, token_indices, weighted_output) + # End MoE + + hidden_states = final_hidden_states.type(hidden_states.dtype).view(*orig_shape) + hidden_states = hidden_states + self.shared_experts(residuals) + return hidden_states + + +def replace(module: DeepseekV3MoE) -> DeepseekV3MoECalibrate: + return DeepseekV3MoECalibrate( + module.config, module.experts, module.gate, module.shared_experts + ) diff --git a/src/llmcompressor/modeling/prepare.py b/src/llmcompressor/modeling/prepare.py new file mode 100644 index 000000000..a8dedf8ee --- /dev/null +++ b/src/llmcompressor/modeling/prepare.py @@ -0,0 +1,22 @@ +import torch +from transformers import PreTrainedModel +from transformers.models.deepseek_v3.modeling_deepseek_v3 import DeepseekV3MoE + +from llmcompressor.modeling.deepseek_v3 import replace as replace_DeepseekV3MoE +from llmcompressor.utils.module import module_bfs + +__all__ = ["prepare_for_quantization"] + +replacements = { + DeepseekV3MoE: replace_DeepseekV3MoE, +} + + +def prepare_for_quantization(model: PreTrainedModel) -> PreTrainedModel: + 
def replace(module: torch.nn.Module) -> torch.nn.Module: + if module.__class__ in replacements: + return replacements[module.__class__](module) + else: + return module + + return module_bfs(model, replace, progress=True) diff --git a/src/llmcompressor/utils/module.py b/src/llmcompressor/utils/module.py new file mode 100644 index 000000000..a02aa8b4a --- /dev/null +++ b/src/llmcompressor/utils/module.py @@ -0,0 +1,27 @@ +from typing import Callable, Union + +import torch +import tqdm + +__all__ = ["module_bfs"] + + +def module_bfs( + module: torch.nn.Module, + func: Callable[[torch.nn.Module], torch.nn.Module], + pre: bool = True, + progress: Union[bool, tqdm.tqdm] = False, +) -> torch.nn.Module: + if progress is True: + total = len(list(module.modules())) + progress = tqdm.tqdm(total=total) + if pre: + module = func(module) + for name, child in list(module.named_children()): + module.add_module(name, module_bfs(child, func, pre, progress)) + if not pre: + module = func(module) + if isinstance(progress, tqdm.tqdm): + progress.update(1) + + return module diff --git a/tests/examples/test_quantizing_moe.py b/tests/examples/test_quantizing_moe.py index 49686d25c..1f5a53a56 100644 --- a/tests/examples/test_quantizing_moe.py +++ b/tests/examples/test_quantizing_moe.py @@ -44,14 +44,15 @@ def test_doc_example_command(self, example_dir: str, tmp_path: Path): "script_filename", [ pytest.param( - "deepseek_moe_w4a16.py", - marks=[ - pytest.mark.multi_gpu, - pytest.mark.skip(reason="exceptionally long run time"), - ], + "deepseekv2_5_example.py", + marks=pytest.mark.skip(reason="exceptionally long run time"), ), - pytest.param("deepseek_moe_w8a8_fp8.py"), - pytest.param("deepseek_moe_w8a8_int8.py", marks=pytest.mark.multi_gpu), + pytest.param( + "deepseekv3_example.py", + marks=pytest.mark.skip(reason="exceptionally long run time"), + ), + pytest.param("mixtral_example.py"), + pytest.param("qwen_example.py"), ], ) def test_deepseek_example_script( From 626000d1b80cc90bce483334c71f2b3f952c9f48 Mon Sep 17 00:00:00 2001 From: Kyle Sayers Date: Thu, 26 Jun 2025 14:26:10 -0400 Subject: [PATCH 20/22] merge with main src Signed-off-by: Kyle Sayers --- src/llmcompressor/args/dataset_arguments.py | 7 - src/llmcompressor/args/model_arguments.py | 7 - src/llmcompressor/entrypoints/oneshot.py | 2 - src/llmcompressor/metrics/logger.py | 169 ++++++++---------- src/llmcompressor/modeling/deepseek_v3.py | 4 + src/llmcompressor/modeling/prepare.py | 22 ++- .../modifiers/obcq/sgpt_sparsify.py | 4 +- .../modifiers/quantization/calibration.py | 15 ++ src/llmcompressor/observers/min_max.py | 1 + src/llmcompressor/observers/mse.py | 9 +- src/llmcompressor/pipelines/basic/pipeline.py | 6 +- .../pipelines/layer_sequential/pipeline.py | 7 +- .../pipelines/sequential/pipeline.py | 7 +- .../compressed_tensors_utils.py | 19 -- src/llmcompressor/utils/dev.py | 28 +-- 15 files changed, 117 insertions(+), 190 deletions(-) diff --git a/src/llmcompressor/args/dataset_arguments.py b/src/llmcompressor/args/dataset_arguments.py index 677d09daa..33193cde2 100644 --- a/src/llmcompressor/args/dataset_arguments.py +++ b/src/llmcompressor/args/dataset_arguments.py @@ -197,10 +197,3 @@ class DatasetArguments(CustomDatasetArguments): "definition" }, ) - model_input_device: Optional[str] = field( - default=None, - metadata={ - "help": "Device to put model inputs on for calibration. 
" - "If none is specified, the model input device is inferred from the model" - }, - ) diff --git a/src/llmcompressor/args/model_arguments.py b/src/llmcompressor/args/model_arguments.py index ea3c3936a..e68bd16aa 100644 --- a/src/llmcompressor/args/model_arguments.py +++ b/src/llmcompressor/args/model_arguments.py @@ -80,13 +80,6 @@ class ModelArguments: default=True, metadata={"help": "Whether to compress sparse models during save"}, ) - oneshot_device: Optional[str] = field( - default="cuda", - metadata={ - "help": "This argument is deprecated and nonfunctional " - "and will be removed in future release" - }, - ) model_revision: str = field( default="main", metadata={ diff --git a/src/llmcompressor/entrypoints/oneshot.py b/src/llmcompressor/entrypoints/oneshot.py index 945c71943..707aafedf 100644 --- a/src/llmcompressor/entrypoints/oneshot.py +++ b/src/llmcompressor/entrypoints/oneshot.py @@ -208,7 +208,6 @@ def oneshot( tie_word_embeddings: bool = False, trust_remote_code_model: bool = False, save_compressed: bool = True, - oneshot_device: str = "cuda:0", model_revision: str = "main", # Recipe arguments recipe: Optional[Union[str, List[str]]] = None, @@ -259,7 +258,6 @@ def oneshot( :param trust_remote_code_model: Whether to allow for custom models to execute their own modeling files. :param save_compressed: Whether to compress sparse models during save. - :param oneshot_device: Device to run oneshot calibration on. :param model_revision: The specific model version to use (can be branch name, tag, or commit id). diff --git a/src/llmcompressor/metrics/logger.py b/src/llmcompressor/metrics/logger.py index b4c2f9505..8c895143b 100644 --- a/src/llmcompressor/metrics/logger.py +++ b/src/llmcompressor/metrics/logger.py @@ -2,18 +2,18 @@ Contains code for loggers that help visualize the information from each modifier """ -import logging import os import time import warnings from abc import ABC from contextlib import contextmanager from datetime import datetime -from logging import CRITICAL, DEBUG, ERROR, INFO, WARN, Logger from pathlib import Path from types import ModuleType from typing import Any, Callable, Dict, List, Optional, Union +from loguru import logger + from llmcompressor.metrics.utils import ( FrequencyManager, FrequencyType, @@ -52,16 +52,8 @@ "WANDBLogger", "SparsificationGroupLogger", "LoggerManager", - "LOGGING_LEVELS", ] ALL_TOKEN = "__ALL__" -LOGGING_LEVELS = { - "debug": DEBUG, - "info": INFO, - "warn": WARN, - "error": ERROR, - "critical": CRITICAL, -} DEFAULT_TAG = "defaul_tag" @@ -231,11 +223,12 @@ def lambda_func( def log_hyperparams( self, params: Dict, - level: Optional[int] = None, + level: Optional[Union[int, str]] = None, ) -> bool: """ :param params: Each key-value pair in the dictionary is the name of the hyper parameter and it's corresponding value. + :param level: minimum severity level for the log message :return: True if logged, False otherwise. 
""" if not self.enabled: @@ -256,7 +249,7 @@ def log_scalar( value: float, step: Optional[int] = None, wall_time: Optional[float] = None, - level: Optional[int] = None, + level: Optional[Union[int, str]] = None, ) -> bool: """ :param tag: identifying tag to log the value with @@ -264,6 +257,7 @@ def log_scalar( :param step: global step for when the value was taken :param wall_time: global wall time for when the value was taken, defaults to time.time() + :param level: minimum severity level for the log message :param kwargs: additional logging arguments to support Python and custom loggers :return: True if logged, False otherwise. """ @@ -285,7 +279,7 @@ def log_scalars( values: Dict[str, float], step: Optional[int] = None, wall_time: Optional[float] = None, - level: Optional[int] = None, + level: Optional[Union[int, str]] = None, ) -> bool: """ :param tag: identifying tag to log the values with @@ -293,6 +287,7 @@ def log_scalars( :param step: global step for when the values were taken :param wall_time: global wall time for when the values were taken, defaults to time.time() + :param level: minimum severity level for the log message :param kwargs: additional logging arguments to support Python and custom loggers :return: True if logged, False otherwise. """ @@ -313,22 +308,20 @@ class PythonLogger(LambdaLogger): """ Modifier metrics that handles printing values into a python metrics instance. - :param logger: a metrics instance to log to, if None then will create it's own - :param log_level: default level to log any incoming data at on the logging.Logger - instance when an explicit log level isn't provided :param name: name given to the metrics, used for identification; defaults to python :param enabled: True to log, False otherwise """ + # Class-level variable to track if file sink is created + _global_file_sink_id = None + def __init__( self, - logger: Logger = None, - log_level: int = None, name: str = "python", enabled: bool = True, ): - self._logger = logger or self._create_default_logger(log_level=log_level) + self._create_default_logger() super().__init__( lambda_func=self._log_lambda, @@ -336,17 +329,7 @@ def __init__( enabled=enabled, ) - def __getattr__(self, item): - return getattr(self._logger, item) - - @property - def logger(self) -> Logger: - """ - :return: a metrics instance to log to, if None then will create it's own - """ - return self._logger - - def _create_default_logger(self, log_level: Optional[int] = None) -> logging.Logger: + def _create_default_logger(self) -> None: """ Create a default modifier metrics, with a file handler logging at the debug level @@ -355,24 +338,9 @@ def _create_default_logger(self, log_level: Optional[int] = None) -> logging.Log :param log_level: logging level for the console metrics :return: metrics """ - logger = logging.getLogger(__name__) - - # Console handler, for logging high level modifier logs - # must be created before the file handler - # as file handler is also a stream handler - if not any( - isinstance(handler, logging.StreamHandler) for handler in logger.handlers - ): - stream_handler = logging.StreamHandler() - stream_handler.setLevel( - log_level or logging.getLogger("llmcompressor").level - ) - logger.addHandler(stream_handler) # File handler setup, for logging modifier debug statements - if not any( - isinstance(handler, logging.FileHandler) for handler in logger.handlers - ): + if PythonLogger._global_file_sink_id is None: base_log_path = ( os.environ.get("NM_TEST_LOG_DIR") if os.environ.get("NM_TEST_MODE") @@ -382,19 
+350,11 @@ def _create_default_logger(self, log_level: Optional[int] = None) -> logging.Log dt_string = now.strftime("%d-%m-%Y_%H.%M.%S") log_path = os.path.join(base_log_path, f"{dt_string}.log") os.makedirs(base_log_path, exist_ok=True) - file_handler = logging.FileHandler( - log_path, - delay=True, + PythonLogger._global_file_sink_id = logger.add( + log_path, level="DEBUG", delay=True ) - file_handler.setLevel(LOGGING_LEVELS["debug"]) - logger.addHandler(file_handler) logger.info(f"Logging all LLM Compressor modifier-level logs to {log_path}") - logger.setLevel(LOGGING_LEVELS["debug"]) - logger.propagate = False - - return logger - def _log_lambda( self, tag: Optional[str], @@ -402,7 +362,7 @@ def _log_lambda( values: Optional[Dict[str, float]], step: Optional[int], wall_time: Optional[float], - level: Optional[int] = None, + level: Optional[Union[int, str]] = None, ) -> bool: """ :param tag: identifying tag to log the values with @@ -411,13 +371,22 @@ def _log_lambda( :param step: global step for when the values were taken :param wall_time: global wall time for when the values were taken, defaults to time.time() - :param level: level to log at. Corresponds to default logging package levels + :param level: minimum severity level for the log message :return: True if logged, False otherwise. """ if not level: - level = LOGGING_LEVELS["debug"] + level = "DEBUG" + + def is_higher_than_debug(lev: Optional[Union[int, str]] = None) -> bool: + """Check if the given level is higher than DEBUG level.""" + debug_level_no = logger.level("DEBUG").no + if isinstance(lev, int): + return level > debug_level_no + elif isinstance(lev, str): + return logger.level(lev).no > debug_level_no + return False - if level > LOGGING_LEVELS["debug"]: + if is_higher_than_debug(level): if step is not None: format = "%s %s step %s: %s" log_args = [ @@ -433,7 +402,7 @@ def _log_lambda( format = "%s %s [%s - %s]: %s" log_args = [self.name, tag, step, wall_time, values or value] - self._logger.log(level, format, *log_args) + logger.log(level, format, *log_args) return True @@ -443,7 +412,7 @@ def log_string( string: Optional[str], step: Optional[int], wall_time: Optional[float] = None, - level: Optional[int] = None, + level: Optional[Union[int, str]] = None, ) -> bool: """ :param tag: identifying tag to log the values with @@ -451,7 +420,7 @@ def log_string( :param step: global step for when the values were taken :param wall_time: global wall time for when the values were taken, defaults to time.time() - :param level: level to log at. Corresponds to default logging package levels + :param level: minimum severity level for the log message :return: True if logged, False otherwise. """ if not wall_time: @@ -540,7 +509,7 @@ def _log_lambda( values: Optional[Dict[str, float]], step: Optional[int], wall_time: Optional[float], - level: Optional[int] = None, + level: Optional[Union[int, str]] = None, ) -> bool: if value is not None: self._writer.add_scalar(tag, value, step, wall_time) @@ -614,7 +583,7 @@ def _log_lambda( values: Optional[Dict[str, float]], step: Optional[int], wall_time: Optional[float], - level: Optional[int] = None, + level: Optional[Union[int, str]] = None, ) -> bool: params = {} @@ -656,11 +625,10 @@ class SparsificationGroupLogger(BaseLogger): :param lambda_func: an optional lambda function to call back into with any logs. The expected call sequence is (tag, value, values, step, wall_time) -> bool The return type is True if logged and False otherwise. 
- :param python: an optional argument for logging to a python metrics. - May be a logging.Logger instance to log to, True to create a metrics instance, - or non truthy to not log anything (False, None) + :param python: an bool argument for logging to a python metrics. + True to create a metrics instance, or False to not log anything :param python_log_level: if python, - the level to log any incoming data at on the logging.Logger instance + the level to log any incoming data at on the loguru.logger instance :param tensorboard: an optional argument for logging to a tensorboard writer. May be a SummaryWriter instance to log to, a string representing the directory to create a new SummaryWriter to log to, True to create a new SummaryWriter, @@ -688,8 +656,8 @@ def __init__( bool, ] ] = None, - python: Optional[Union[bool, Logger]] = None, - python_log_level: int = logging.INFO, + python: bool = False, + python_log_level: Optional[Union[int, str]] = "INFO", tensorboard: Optional[Union[bool, str, SummaryWriter]] = None, wandb_: Optional[Union[bool, Dict]] = None, name: str = "sparsification", @@ -706,8 +674,6 @@ def __init__( if python: self._loggers.append( PythonLogger( - logger=python if isinstance(python, Logger) else None, - log_level=python_log_level, name=name, enabled=enabled, ) @@ -741,8 +707,8 @@ def enabled(self, value: bool): """ self._enabled = value - for logger in self._loggers: - logger.enabled = value + for log in self._loggers: + log.enabled = value @property def loggers(self) -> List[BaseLogger]: @@ -751,13 +717,13 @@ def loggers(self) -> List[BaseLogger]: """ return self._loggers - def log_hyperparams(self, params: Dict, level: Optional[int] = None): + def log_hyperparams(self, params: Dict, level: Optional[Union[int, str]] = None): """ :param params: Each key-value pair in the dictionary is the name of the hyper parameter and it's corresponding value. 
""" - for logger in self._loggers: - logger.log_hyperparams(params, level) + for log in self._loggers: + log.log_hyperparams(params, level) def log_scalar( self, @@ -765,7 +731,7 @@ def log_scalar( value: float, step: Optional[int] = None, wall_time: Optional[float] = None, - level: Optional[int] = None, + level: Optional[Union[int, str]] = None, ): """ :param tag: identifying tag to log the value with @@ -773,9 +739,10 @@ def log_scalar( :param step: global step for when the value was taken :param wall_time: global wall time for when the value was taken, defaults to time.time() + :param level: minimum severity level for the log message """ - for logger in self._loggers: - logger.log_scalar(tag, value, step, wall_time, level) + for log in self._loggers: + log.log_scalar(tag, value, step, wall_time, level) def log_scalars( self, @@ -783,7 +750,7 @@ def log_scalars( values: Dict[str, float], step: Optional[int] = None, wall_time: Optional[float] = None, - level: Optional[int] = None, + level: Optional[Union[int, str]] = None, ): """ :param tag: identifying tag to log the values with @@ -791,9 +758,10 @@ def log_scalars( :param step: global step for when the values were taken :param wall_time: global wall time for when the values were taken, defaults to time.time() + :param level: minimum severity level for the log message """ - for logger in self._loggers: - logger.log_scalars(tag, values, step, wall_time, level) + for log in self._loggers: + log.log_scalars(tag, values, step, wall_time, level) class LoggerManager(ABC): @@ -956,7 +924,7 @@ def log_scalar( step: Optional[int] = None, wall_time: Optional[float] = None, log_types: Union[str, List[str]] = ALL_TOKEN, - level: Optional[int] = None, + level: Optional[Union[int, str]] = None, ): """ (Note: this method is deprecated and will be removed in a future version, @@ -966,6 +934,7 @@ def log_scalar( :param value: value to save :param step: global step for when the value was taken :param wall_time: global wall time for when the value was taken + :param level: minimum severity level for the log message :param kwargs: additional logging arguments to support Python and custom loggers :return: True if logged, False otherwise. """ @@ -986,7 +955,7 @@ def log_scalars( step: Optional[int] = None, wall_time: Optional[float] = None, log_types: Union[str, List[str]] = ALL_TOKEN, - level: Optional[int] = None, + level: Optional[Union[int, str]] = None, ): """ (Note: this method is deprecated and will be removed in a future version, @@ -996,6 +965,7 @@ def log_scalars( :param values: values to save :param step: global step for when the values were taken :param wall_time: global wall time for when the values were taken + :param level: minimum severity level for the log message :param kwargs: additional logging arguments to support Python and custom loggers :return: True if logged, False otherwise. 
""" @@ -1013,7 +983,7 @@ def log_hyperparams( self, params: Dict, log_types: Union[str, List[str]] = ALL_TOKEN, - level: Optional[int] = None, + level: Optional[Union[int, str]] = None, ): """ (Note: this method is deprecated and will be removed in a future version, @@ -1036,7 +1006,7 @@ def log_string( step: Optional[int] = None, wall_time: Optional[float] = None, log_types: Union[str, List[str]] = ALL_TOKEN, - level: Optional[int] = None, + level: Optional[Union[int, str]] = None, ): """ (Note: this method is deprecated and will be removed in a future version, @@ -1047,6 +1017,7 @@ def log_string( :param step: global step for when the values were taken :param wall_time: global wall time for when the values were taken :param kwargs: additional logging arguments to support Python and custom loggers + :param level: minimum severity level for the log message :return: True if logged, False otherwise. """ self.system.log_string( @@ -1119,13 +1090,14 @@ def log_string( step: Optional[int] = None, wall_time: Optional[float] = None, log_types: Union[str, List[str]] = ALL_TOKEN, - level: Optional[int] = None, + level: Optional[Union[int, str]] = None, ): """ :param tag: identifying tag to log the values with :param values: values to save :param step: global step for when the values were taken :param wall_time: global wall time for when the values were taken + :param level: minimum severity level for the log message :param kwargs: additional logging arguments to support Python and custom loggers :return: True if logged, False otherwise. """ @@ -1151,7 +1123,7 @@ def debug(self, tag, string, *args, **kwargs): :param kwargs: additional arguments to pass to the metrics, see `log_string` for more details """ - kwargs["level"] = logging.DEBUG + kwargs["level"] = "DEBUG" self.log_string(tag=tag, string=string, *args, **kwargs) def info(self, tag, string, *args, **kwargs): @@ -1166,7 +1138,7 @@ def info(self, tag, string, *args, **kwargs): :param kwargs: additional arguments to pass to the metrics, see `log_string` for more details """ - kwargs["level"] = logging.INFO + kwargs["level"] = "INFO" self.log_string(tag=tag, string=string, *args, **kwargs) def warning(self, tag, string, *args, **kwargs): @@ -1181,7 +1153,7 @@ def warning(self, tag, string, *args, **kwargs): :param kwargs: additional arguments to pass to the metrics, see `log_string` for more details """ - kwargs["level"] = logging.WARNING + kwargs["level"] = "WARNING" self.log_string(tag=tag, string=string, *args, **kwargs) def warn(self, tag, string, *args, **kwargs): @@ -1204,7 +1176,7 @@ def error(self, tag, string, *args, **kwargs): :param kwargs: additional arguments to pass to the metrics, see `log_string` for more details """ - kwargs["level"] = logging.ERROR + kwargs["level"] = "ERROR" self.log_string(tag=tag, string=string, *args, **kwargs) def critical(self, tag, string, *args, **kwargs): @@ -1219,7 +1191,7 @@ def critical(self, tag, string, *args, **kwargs): :param kwargs: additional arguments to pass to the metrics, see `log_string` for more details """ - kwargs["level"] = logging.CRITICAL + kwargs["level"] = "CRITICAL" self.log_string(tag=tag, string=string, *args, **kwargs) @@ -1232,11 +1204,12 @@ def log_hyperparams( self, params: Dict, log_types: Union[str, List[str]] = ALL_TOKEN, - level: Optional[int] = None, + level: Optional[Union[int, str]] = None, ): """ :param params: Each key-value pair in the dictionary is the name of the hyper parameter and it's corresponding value. 
+ :param level: minimum severity level for the log message """ for log in self.loggers: if log.enabled and (log_types == ALL_TOKEN or log.name in log_types): @@ -1249,13 +1222,14 @@ def log_scalar( step: Optional[int] = None, wall_time: Optional[float] = None, log_types: Union[str, List[str]] = ALL_TOKEN, - level: Optional[int] = None, + level: Optional[Union[int, str]] = None, ): """ :param tag: identifying tag to log the value with :param value: value to save :param step: global step for when the value was taken :param wall_time: global wall time for when the value was taken + :param level: minimum severity level for the log message :param kwargs: additional logging arguments to support Python and custom loggers :return: True if logged, False otherwise. """ @@ -1276,13 +1250,14 @@ def log_scalars( step: Optional[int] = None, wall_time: Optional[float] = None, log_types: Union[str, List[str]] = ALL_TOKEN, - level: Optional[int] = None, + level: Optional[Union[int, str]] = None, ): """ :param tag: identifying tag to log the values with :param values: values to save :param step: global step for when the values were taken :param wall_time: global wall time for when the values were taken + :param level: minimum severity level for the log message :param kwargs: additional logging arguments to support Python and custom loggers :return: True if logged, False otherwise. """ diff --git a/src/llmcompressor/modeling/deepseek_v3.py b/src/llmcompressor/modeling/deepseek_v3.py index 4b885ff64..c5de440ce 100644 --- a/src/llmcompressor/modeling/deepseek_v3.py +++ b/src/llmcompressor/modeling/deepseek_v3.py @@ -3,6 +3,10 @@ class DeepseekV3MoECalibrate(torch.nn.Module): + """ + Patched DeepseekV3MoE which sends all tokens to all experts for calibration + """ + def __init__(self, config, experts, gate, shared_experts): super().__init__() self.config = config diff --git a/src/llmcompressor/modeling/prepare.py b/src/llmcompressor/modeling/prepare.py index a8dedf8ee..6944327b0 100644 --- a/src/llmcompressor/modeling/prepare.py +++ b/src/llmcompressor/modeling/prepare.py @@ -1,22 +1,20 @@ -import torch +from compressed_tensors.utils import replace_module from transformers import PreTrainedModel -from transformers.models.deepseek_v3.modeling_deepseek_v3 import DeepseekV3MoE from llmcompressor.modeling.deepseek_v3 import replace as replace_DeepseekV3MoE -from llmcompressor.utils.module import module_bfs -__all__ = ["prepare_for_quantization"] +__all__ = ["prepare_for_calibration"] replacements = { - DeepseekV3MoE: replace_DeepseekV3MoE, + "DeepseekV3MoE": replace_DeepseekV3MoE, } -def prepare_for_quantization(model: PreTrainedModel) -> PreTrainedModel: - def replace(module: torch.nn.Module) -> torch.nn.Module: - if module.__class__ in replacements: - return replacements[module.__class__](module) - else: - return module +def prepare_for_calibration(model: PreTrainedModel) -> PreTrainedModel: + for name, module in model.named_modules(): + cls_name = module.__class__.__name__ + if cls_name in replacements: + new_module = replacements[cls_name](module) + replace_module(model, name, new_module) - return module_bfs(model, replace, progress=True) + return model diff --git a/src/llmcompressor/modifiers/obcq/sgpt_sparsify.py b/src/llmcompressor/modifiers/obcq/sgpt_sparsify.py index c43014a72..f327a4c34 100644 --- a/src/llmcompressor/modifiers/obcq/sgpt_sparsify.py +++ b/src/llmcompressor/modifiers/obcq/sgpt_sparsify.py @@ -3,6 +3,7 @@ import torch import transformers +from loguru import logger SGPT_PRECISION = 
torch.float32 @@ -108,11 +109,12 @@ def sparsify_weight( H = torch.linalg.cholesky(H, upper=True) Hinv = H except torch._C._LinAlgError: - raise torch._C._LinAlgError( + logger.warning( "Failed to invert hessian due to numerical instability. Consider " "increasing SparseGPTModifier.dampening_frac, increasing the number " "of calibration samples, or shuffling the calibration dataset" ) + Hinv = H = torch.eye(num_columns, dtype=H.dtype, device=H.device) # sparsity mask # TODO: consider computing sparsity mask in the same way and place as gptq diff --git a/src/llmcompressor/modifiers/quantization/calibration.py b/src/llmcompressor/modifiers/quantization/calibration.py index 63e1c2a24..b10a4cb31 100644 --- a/src/llmcompressor/modifiers/quantization/calibration.py +++ b/src/llmcompressor/modifiers/quantization/calibration.py @@ -18,6 +18,12 @@ from llmcompressor.observers import Observer from llmcompressor.utils.helpers import getattr_chain +DEFAULT_MAXSHRINK = 0.20 +DEFAULT_PATIENCE = 5 +DEFAULT_AVERAGING_CONSTANT = 0.01 +DEFAULT_GRID = 100.0 +DEFAULT_NORM = 2.4 + __all__ = [ "initialize_observer", "update_weight_zp_scale", @@ -60,9 +66,18 @@ def initialize_observer( False, DynamicType.LOCAL, ): + observer_kwargs = quantization_args.observer_kwargs or {} observer = Observer.load_from_registry( quantization_args.observer, quantization_args=quantization_args, + averaging_constant=observer_kwargs.get( + "averaging_constant", DEFAULT_AVERAGING_CONSTANT + ), + # used by mse observer only, will be ignored by minmax observer + maxshrink=observer_kwargs.get("maxshrink", DEFAULT_MAXSHRINK), + patience=observer_kwargs.get("patience", DEFAULT_PATIENCE), + grid=observer_kwargs.get("grid", DEFAULT_GRID), + norm=observer_kwargs.get("norm", DEFAULT_NORM), ) module.register_module(f"{base_name}_observer", observer) diff --git a/src/llmcompressor/observers/min_max.py b/src/llmcompressor/observers/min_max.py index eca56b6c4..ce5c0e779 100644 --- a/src/llmcompressor/observers/min_max.py +++ b/src/llmcompressor/observers/min_max.py @@ -22,6 +22,7 @@ def __init__( self, quantization_args: QuantizationArgs, averaging_constant: float = 0.01, + **kwargs, ): super().__init__(quantization_args=quantization_args) diff --git a/src/llmcompressor/observers/mse.py b/src/llmcompressor/observers/mse.py index 73e70c202..419155f07 100644 --- a/src/llmcompressor/observers/mse.py +++ b/src/llmcompressor/observers/mse.py @@ -20,18 +20,19 @@ class MovingAverageMSEObserver(Observer): def __init__( self, quantization_args: QuantizationArgs, + maxshrink: float = 0.2, + patience: int = 5, averaging_constant: float = 0.01, grid: float = 100.0, norm: float = 2.4, + **kwargs, ): super().__init__(quantization_args=quantization_args) - kwargs = quantization_args.observer_kwargs or {} - self.maxshrink = kwargs.get("maxshrink", 0.20) - self.patience = kwargs.get("patience", 5) - self.min_val = {} self.max_val = {} + self.maxshrink = maxshrink + self.patience = patience self.averaging_constant = averaging_constant self.grid = grid self.norm = norm diff --git a/src/llmcompressor/pipelines/basic/pipeline.py b/src/llmcompressor/pipelines/basic/pipeline.py index 87f463bfe..605358ae9 100644 --- a/src/llmcompressor/pipelines/basic/pipeline.py +++ b/src/llmcompressor/pipelines/basic/pipeline.py @@ -2,6 +2,7 @@ import torch import tqdm +from compressed_tensors.utils import get_execution_device from torch.utils.data.dataloader import DataLoader from llmcompressor.core import LifecycleCallbacks @@ -9,7 +10,6 @@ from llmcompressor.pipelines.registry 
import CalibrationPipeline from llmcompressor.pytorch.utils.helpers import tensors_to_device from llmcompressor.utils import calibration_forward_context, dispatch_for_generation -from llmcompressor.utils.dev import infer_model_device if TYPE_CHECKING: from llmcompressor.args.dataset_arguments import DatasetArguments @@ -38,9 +38,7 @@ def __call__( :param dataset_args: dataset arguments relevant to pipelines """ dispatch_for_generation(model) # basic dispatch is identical to generation - model_device = getattr(dataset_args, "model_input_device") - if model_device is None: - model_device = infer_model_device(model) + model_device = get_execution_device(model) LifecycleCallbacks.calibration_epoch_start() diff --git a/src/llmcompressor/pipelines/layer_sequential/pipeline.py b/src/llmcompressor/pipelines/layer_sequential/pipeline.py index 089f7dc8c..e5a608708 100644 --- a/src/llmcompressor/pipelines/layer_sequential/pipeline.py +++ b/src/llmcompressor/pipelines/layer_sequential/pipeline.py @@ -2,7 +2,7 @@ import torch import tqdm -from compressed_tensors.utils import disable_offloading +from compressed_tensors.utils import disable_offloading, get_execution_device from torch.utils.data.dataloader import DataLoader from llmcompressor.core import LifecycleCallbacks, active_session @@ -19,7 +19,6 @@ dispatch_for_sequential, get_sequential_targets, ) -from llmcompressor.utils.dev import infer_model_device from llmcompressor.utils.helpers import DisableQuantization, calibration_forward_context if TYPE_CHECKING: @@ -61,9 +60,7 @@ def __call__( # prepare model for sequential onloading dispatch_for_sequential(model) - model_device = getattr(dataset_args, "model_input_device") - if model_device is None: - model_device = infer_model_device(model) + model_device = get_execution_device(model) # find layers modifiers = session.get_modifiers() diff --git a/src/llmcompressor/pipelines/sequential/pipeline.py b/src/llmcompressor/pipelines/sequential/pipeline.py index c9ce9cd30..9a2b8f3c9 100644 --- a/src/llmcompressor/pipelines/sequential/pipeline.py +++ b/src/llmcompressor/pipelines/sequential/pipeline.py @@ -1,7 +1,7 @@ from typing import TYPE_CHECKING import torch -from compressed_tensors.utils import disable_offloading +from compressed_tensors.utils import disable_offloading, get_execution_device from torch.utils.data.dataloader import DataLoader from tqdm import tqdm @@ -14,7 +14,6 @@ get_sequential_targets, trace_subgraphs, ) -from llmcompressor.utils.dev import infer_model_device from llmcompressor.utils.helpers import DisableQuantization, calibration_forward_context if TYPE_CHECKING: @@ -55,9 +54,7 @@ def __call__( # prepare model for sequential onloading dispatch_for_sequential(model) - model_device = getattr(dataset_args, "model_input_device") - if model_device is None: - model_device = infer_model_device(model) + model_device = get_execution_device(model) # prepare to trace subgraphs modifiers = session.get_modifiers() diff --git a/src/llmcompressor/transformers/sparsification/compressed_tensors_utils.py b/src/llmcompressor/transformers/sparsification/compressed_tensors_utils.py index 4832e3b7f..0465b2a8d 100644 --- a/src/llmcompressor/transformers/sparsification/compressed_tensors_utils.py +++ b/src/llmcompressor/transformers/sparsification/compressed_tensors_utils.py @@ -1,11 +1,9 @@ import os -import re import weakref from functools import wraps from typing import Optional import torch -import transformers from accelerate.accelerator import get_state_dict_offloaded_model from compressed_tensors 
import ( CompressionFormat, @@ -86,11 +84,6 @@ def save_pretrained_wrapper( :param kwargs: additional kwargs to pass on to model.save_pretrained """ - # HACK: Override the dtype_byte_size function in transformers to - # support float8 types. Fix is posted upstream - # https://github.com/huggingface/transformers/pull/30488 - transformers.modeling_utils.dtype_byte_size = new_dtype_byte_size - # compress model using compressor compressor = get_model_compressor( model=model, @@ -128,18 +121,6 @@ def save_pretrained_wrapper( model.save_pretrained = save_pretrained_compressed(model.save_pretrained) -# HACK: Override the dtype_byte_size function in transformers to support float8 types -# Fix is posted upstream https://github.com/huggingface/transformers/pull/30488 -def new_dtype_byte_size(dtype): - if dtype == torch.bool: - return 1 / 8 - bit_search = re.search(r"[^\d](\d+)_?", str(dtype)) - if bit_search is None: - raise ValueError(f"`dtype` is not a valid dtype: {dtype}.") - bit_size = int(bit_search.groups()[0]) - return bit_size // 8 - - def patch_tied_tensors_bug(model: torch.nn.Module): """ Patches bug where HF transformers will fail to untie weights under specific diff --git a/src/llmcompressor/utils/dev.py b/src/llmcompressor/utils/dev.py index 558ef816e..f0feb6c04 100644 --- a/src/llmcompressor/utils/dev.py +++ b/src/llmcompressor/utils/dev.py @@ -7,7 +7,7 @@ import torch from accelerate import dispatch_model, infer_auto_device_map from accelerate.utils import get_balanced_memory -from compressed_tensors.utils import has_offloaded_params, remove_dispatch +from compressed_tensors.utils import remove_dispatch from huggingface_hub import snapshot_download from safetensors.torch import save_file from transformers import AutoModelForCausalLM, PreTrainedModel @@ -20,7 +20,6 @@ "skip_weights_download", "patch_transformers_logger_level", "dispatch_for_generation", - "infer_model_device", ] @@ -141,28 +140,3 @@ def dispatch_for_generation(model: PreTrainedModel) -> PreTrainedModel: ) return dispatch_model(model, device_map=device_map) - - -def infer_model_device(model: PreTrainedModel) -> torch.device: - """ - Gets the model's execution device (the device that model inputs should be on) - using non-guaranteed but reasonable assumptions about module and parameter order. 
- - If a model is offloaded, assume that modules execute in the same order - that they are returned by `model.modules()` - - If a model is not offloaded, assume that parameters are used in the same order - that they are returned by `model.parameters()` - - :param model: model whose execution device is being inferred - :return: device which model inputs should be put on - """ - for module in model.modules(): - if has_offloaded_params(module): - return module._hf_hook.execution_device - - first_param = next(model.parameters(), None) - if first_param is None: - return torch.device("cpu") - - return first_param.device From 863377ea1dc4333bc35ffa7817d32111e11ca4b2 Mon Sep 17 00:00:00 2001 From: Kyle Sayers Date: Thu, 26 Jun 2025 14:27:30 -0400 Subject: [PATCH 21/22] remove extra file Signed-off-by: Kyle Sayers --- src/llmcompressor/utils/module.py | 27 --------------------------- 1 file changed, 27 deletions(-) delete mode 100644 src/llmcompressor/utils/module.py diff --git a/src/llmcompressor/utils/module.py b/src/llmcompressor/utils/module.py deleted file mode 100644 index a02aa8b4a..000000000 --- a/src/llmcompressor/utils/module.py +++ /dev/null @@ -1,27 +0,0 @@ -from typing import Callable, Union - -import torch -import tqdm - -__all__ = ["module_bfs"] - - -def module_bfs( - module: torch.nn.Module, - func: Callable[[torch.nn.Module], torch.nn.Module], - pre: bool = True, - progress: Union[bool, tqdm.tqdm] = False, -) -> torch.nn.Module: - if progress is True: - total = len(list(module.modules())) - progress = tqdm.tqdm(total=total) - if pre: - module = func(module) - for name, child in list(module.named_children()): - module.add_module(name, module_bfs(child, func, pre, progress)) - if not pre: - module = func(module) - if isinstance(progress, tqdm.tqdm): - progress.update(1) - - return module From 2f5de103abc4cf5fdd9197af9029ef8182021a5a Mon Sep 17 00:00:00 2001 From: Kyle Sayers Date: Thu, 26 Jun 2025 14:31:24 -0400 Subject: [PATCH 22/22] convert to fp8 examples Signed-off-by: Kyle Sayers --- examples/quantizing_moe/README.md | 14 +++++++------- examples/quantizing_moe/mixtral_example.py | 4 ++-- examples/quantizing_moe/qwen_example.py | 4 ++-- 3 files changed, 11 insertions(+), 11 deletions(-) diff --git a/examples/quantizing_moe/README.md b/examples/quantizing_moe/README.md index f2a162d91..8a9b257f4 100644 --- a/examples/quantizing_moe/README.md +++ b/examples/quantizing_moe/README.md @@ -1,6 +1,6 @@ -# Quantizing Mixtral-8x7B-Instruct-v0.1 Model with W4A16 +# Quantizing Mixtral-8x7B-Instruct-v0.1 Model with FP8 -This directory contains an example script for quantizing the `Mixtral-8x7B-Instruct-v0.1` model using the static per-tensor W4A16 quantization scheme. +This directory contains an example script for quantizing the `Mixtral-8x7B-Instruct-v0.1` model using the static per-tensor FP8 quantization scheme. ## Installation @@ -22,7 +22,7 @@ python3 mixtral_example.py ## Creating a Quantized MoE Model -This example leverages `llm-compressor` and `compressed-tensors` to create an W4A16-quantized `Mixtral-8x7B-Instruct-v0.1` model. The model is calibrated and trained using the `ultrachat_200k` dataset. +This example leverages `llm-compressor` and `compressed-tensors` to create an FP8-quantized `Mixtral-8x7B-Instruct-v0.1` model. The model is calibrated and trained using the `ultrachat_200k` dataset. 
You can follow the detailed steps below or simply run the example script with: @@ -36,24 +36,24 @@ In this step, you'll choose a baseline model for quantization, a dataset for cal - **Models**: Can be referenced from a local directory or retrieved from the Hugging Face Hub. - **Datasets**: Can also be from a local directory or the Hugging Face Hub. -- **Recipes**: These are YAML files or Python modifier objects that describe how a model should be optimized during or after training. In this example, we use a `QuantizationModifier` object with the scheme set to `W4A16`. +- **Recipes**: These are YAML files or Python modifier objects that describe how a model should be optimized during or after training. In this example, we use a `QuantizationModifier` object with the scheme set to `FP8`. ```python from llmcompressor.modifiers.quantization import QuantizationModifier -recipe = QuantizationModifier(scheme="W4A16", targets="Linear", ignore=["lm_head", "re:.*block_sparse_moe.gate"]) +recipe = QuantizationModifier(scheme="FP8", targets="Linear", ignore=["lm_head", "re:.*block_sparse_moe.gate"]) ``` NOTE: `.*block_sparse_moe.gate` layers do not quantize well, hence they are ignored! ### Step 2: Run Quantization Using Oneshot -The `oneshot` method applies the selected recipe to your model and dataset without requiring any fine-tuning. The model will be sparsified and saved to `Mixtral-8x7B-Instruct-v0.1-W4A16-G128`. +The `oneshot` method applies the selected recipe to your model and dataset without requiring any fine-tuning. The model will be sparsified and saved to `Mixtral-8x7B-Instruct-v0.1-FP8`. ```python from llmcompressor import oneshot -output_dir = "Mixtral-8x7B-Instruct-v0.1-W4A16-G128" +output_dir = "Mixtral-8x7B-Instruct-v0.1-FP8" oneshot( model=model, diff --git a/examples/quantizing_moe/mixtral_example.py b/examples/quantizing_moe/mixtral_example.py index 49b08c722..5021c7947 100644 --- a/examples/quantizing_moe/mixtral_example.py +++ b/examples/quantizing_moe/mixtral_example.py @@ -55,7 +55,7 @@ def tokenize(sample): # since the MoE gate layers are sensitive to quantization, we add them to the ignore # list so they remain at full precision recipe = QuantizationModifier( - scheme="W4A16", + scheme="FP8", targets="Linear", ignore=[ "lm_head", @@ -81,6 +81,6 @@ def tokenize(sample): print("==========================================") # Save to disk in compressed-tensors format. -SAVE_DIR = MODEL_ID.rstrip("/").split("/")[-1] + "-W4A16-G128" +SAVE_DIR = MODEL_ID.rstrip("/").split("/")[-1] + "-FP8" model.save_pretrained(SAVE_DIR, save_compressed=True) tokenizer.save_pretrained(SAVE_DIR) diff --git a/examples/quantizing_moe/qwen_example.py b/examples/quantizing_moe/qwen_example.py index 2531e6528..bb00b530e 100644 --- a/examples/quantizing_moe/qwen_example.py +++ b/examples/quantizing_moe/qwen_example.py @@ -56,7 +56,7 @@ def tokenize(sample): # list so they remain at full precision recipe = GPTQModifier( targets="Linear", - scheme="W4A16", + scheme="FP8", ignore=["lm_head", "re:.*mlp.gate$", "re:.*mlp.shared_expert_gate$"], ) @@ -80,6 +80,6 @@ def tokenize(sample): print("==========================================") # Save to disk in compressed-tensors format. -SAVE_DIR = MODEL_ID.rstrip("/").split("/")[-1] + "-W4A16-G128" +SAVE_DIR = MODEL_ID.rstrip("/").split("/")[-1] + "-FP8" model.save_pretrained(SAVE_DIR, save_compressed=True) tokenizer.save_pretrained(SAVE_DIR)
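
For readers following the series end to end, below is a minimal sketch of the calibration flow as it stands after the final `prepare.py` above, which exposes `prepare_for_calibration` (keyed by module class name and applied via `compressed_tensors.utils.replace_module`). The sketch condenses the example scripts in this series rather than reproducing any one of them verbatim; the dataset name, split slicing, and save-directory suffix are illustrative assumptions, and models without a registered MoE class pass through `prepare_for_calibration` unchanged.

```python
from transformers import AutoModelForCausalLM, AutoTokenizer

from llmcompressor import oneshot
from llmcompressor.modeling import prepare_for_calibration
from llmcompressor.modifiers.quantization import GPTQModifier

# Full-precision (BF16) checkpoint, as in the DeepSeek-V3 example in this series
MODEL_ID = "RedHatAI/DeepSeek-V3-BF16"

model = AutoModelForCausalLM.from_pretrained(MODEL_ID, torch_dtype="auto")
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)

# Swap DeepseekV3MoE blocks for the calibration variant that routes every
# token to every expert; other architectures are returned unchanged.
model = prepare_for_calibration(model)

# W4A16 GPTQ, keeping the sensitive MoE gates and lm_head at full precision
recipe = GPTQModifier(
    targets="Linear",
    scheme="W4A16",
    ignore=["lm_head", "re:.*mlp.gate$"],
)

oneshot(
    model=model,
    tokenizer=tokenizer,
    dataset="ultrachat_200k",  # assumed registered dataset name; any calibration set works
    splits={"calibration": "train_sft[:512]"},
    recipe=recipe,
    max_seq_length=2048,
    num_calibration_samples=512,
    # only one attention/MLP block is onloaded to GPU memory at a time
    sequential_targets=["DeepseekV3Attention", "DeepseekV3MLP"],
)

SAVE_DIR = MODEL_ID.split("/")[-1] + "-W4A16-G128"
model.save_pretrained(SAVE_DIR, save_compressed=True)
tokenizer.save_pretrained(SAVE_DIR)
```

If a quick sanity generation is wanted before saving, `dispatch_for_generation(model)` from `llmcompressor.utils` can be used exactly as in the updated `mixtral_example.py` and `qwen_example.py` scripts.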