From 1aea4ddd2339e096c704e066e40640b59eadde2d Mon Sep 17 00:00:00 2001 From: Kyle Sayers Date: Tue, 3 Jun 2025 17:31:24 -0400 Subject: [PATCH 01/36] wip: alignment context Signed-off-by: Kyle Sayers --- examples/quantization_w4a16/llama3_example.py | 5 ++- src/llmcompressor/pipelines/basic/pipeline.py | 2 + src/llmcompressor/pipelines/registry.py | 4 +- .../pipelines/sequential/helpers.py | 35 ++++++++++++++- .../pipelines/sequential/pipeline.py | 45 ++++++++++--------- 5 files changed, 67 insertions(+), 24 deletions(-) diff --git a/examples/quantization_w4a16/llama3_example.py b/examples/quantization_w4a16/llama3_example.py index 09bf63fb8..1016a8081 100644 --- a/examples/quantization_w4a16/llama3_example.py +++ b/examples/quantization_w4a16/llama3_example.py @@ -1,3 +1,5 @@ +import torch +from compressed_tensors import force_cpu_offload from datasets import load_dataset from transformers import AutoModelForCausalLM, AutoTokenizer @@ -9,9 +11,10 @@ model = AutoModelForCausalLM.from_pretrained( MODEL_ID, - device_map="auto", + # device_map="auto", torch_dtype="auto", ) +force_cpu_offload(model, execution_device=torch.device("cuda")) tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) # Select calibration dataset. diff --git a/src/llmcompressor/pipelines/basic/pipeline.py b/src/llmcompressor/pipelines/basic/pipeline.py index 15b94786a..431dc1965 100644 --- a/src/llmcompressor/pipelines/basic/pipeline.py +++ b/src/llmcompressor/pipelines/basic/pipeline.py @@ -37,6 +37,8 @@ def __call__( :param dataloader: loads data for calibration :param dataset_args: dataset arguments relevant to pipelines """ + # TODO: warn about cpu offloading + model_device = get_execution_device(model) LifecycleCallbacks.calibration_epoch_start() diff --git a/src/llmcompressor/pipelines/registry.py b/src/llmcompressor/pipelines/registry.py index 77d6e79ab..0a4cbb645 100644 --- a/src/llmcompressor/pipelines/registry.py +++ b/src/llmcompressor/pipelines/registry.py @@ -75,12 +75,12 @@ def _validate_infer_pipeline(modifiers: List[Modifier]) -> str: quant_modifier = active_qmods[0] config = quant_modifier.resolve_quantization_config() if config.requires_calibration_data(): - return "basic" + return "sequential" else: return "datafree" if any(isinstance(modifier, SmoothQuantModifier) for modifier in modifiers): - return "basic" + return "sequential" return "datafree" diff --git a/src/llmcompressor/pipelines/sequential/helpers.py b/src/llmcompressor/pipelines/sequential/helpers.py index b7937a2fc..38f1cdadf 100644 --- a/src/llmcompressor/pipelines/sequential/helpers.py +++ b/src/llmcompressor/pipelines/sequential/helpers.py @@ -5,6 +5,7 @@ from typing import Any, Dict, List, Optional, Set import torch +from accelerate.hooks import AlignDevicesHook from compressed_tensors import has_offloaded_params from compressed_tensors.quantization import find_name_or_class_matches from loguru import logger @@ -23,7 +24,12 @@ from .ast_helpers import autowrap_forwards -__all__ = ["trace_subgraphs", "Subgraph", "get_targets_from_modifiers"] +__all__ = [ + "trace_subgraphs", + "Subgraph", + "get_targets_from_modifiers", + "keep_onload_context", +] @dataclass @@ -485,3 +491,30 @@ def is_ancestor(module: Module) -> bool: is_ancestor(model) return ancestors + + +@contextlib.contextmanager +def keep_onload_context(): + original_pre_forward = AlignDevicesHook.pre_forward + onloaded_modules = dict() + + # onload once and disable any future onloading/offloading steps + def keep_onload_pre_forward(self: AlignDevicesHook, module, *args, **kwargs): + 
ret = original_pre_forward(self, module, *args, **kwargs) + if module not in onloaded_modules: + onloaded_modules[module] = (self, self.offload) + self.offload = False + return ret + + # use the patched pre_forward function within the context + with patch_attr(AlignDevicesHook, "pre_forward", keep_onload_pre_forward): + yield + + # manually offload all modules that were onloaded + for module, (hook, offload) in onloaded_modules.items(): + hook.offload = offload + hook.post_forward(module, None) + + +# def is_cpu_offloaded(): +# diff --git a/src/llmcompressor/pipelines/sequential/pipeline.py b/src/llmcompressor/pipelines/sequential/pipeline.py index 22c47d894..fcb91d803 100644 --- a/src/llmcompressor/pipelines/sequential/pipeline.py +++ b/src/llmcompressor/pipelines/sequential/pipeline.py @@ -1,9 +1,9 @@ from typing import TYPE_CHECKING import torch -import tqdm from compressed_tensors.utils import get_execution_device from torch.utils.data.dataloader import DataLoader +from tqdm import tqdm from llmcompressor.core import LifecycleCallbacks, active_session from llmcompressor.modifiers.utils.hooks import HooksMixin @@ -11,6 +11,7 @@ from llmcompressor.pipelines.registry import CalibrationPipeline from llmcompressor.pipelines.sequential.helpers import ( get_targets_from_modifiers, + keep_onload_context, trace_subgraphs, ) from llmcompressor.utils.helpers import DisableQuantization, calibration_forward_context @@ -51,6 +52,8 @@ def __call__( """ session = active_session() + # TODO: warn about not cpu offloading + # prepare to trace subgraphs modifiers = session.get_modifiers() sequential_targets = get_targets_from_modifiers(modifiers, model) @@ -59,37 +62,39 @@ def __call__( # trace subgraphs sample_input = next(iter(dataloader)) subgraphs = trace_subgraphs(model, sample_input, sequential_targets, ignore) + num_subgraphs = len(subgraphs) LifecycleCallbacks.calibration_epoch_start() with calibration_forward_context(model), DisableQuantization(model): # prepare intermediates cache model_device = get_execution_device(model) - intermediates = IntermediatesCache.from_dataloader(dataloader, model_device) + activations = IntermediatesCache.from_dataloader(dataloader, model_device) - num_subgraphs = len(subgraphs) for subgraph_index, subgraph in enumerate(subgraphs): # prepare tqdm description texts calib_desc = f"({subgraph_index + 1}/{num_subgraphs}): Calibrating" prop_desc = f"({subgraph_index + 1}/{num_subgraphs}): Propagating" - # do a preliminary pass to trigger modifier hooks - for batch_idx in tqdm.tqdm(range(len(dataloader)), desc=calib_desc): - inputs = intermediates.fetch(batch_idx, subgraph.input_names) - subgraph.forward(model, **inputs) - - LifecycleCallbacks.sequential_epoch_end() - - # this pass does not trigger modifier hooks - # and is only used for capturing outputs from newly compressed modules - with HooksMixin.disable_hooks(): - for batch_idx in tqdm.tqdm(range(len(dataloader)), desc=prop_desc): - inputs = intermediates.fetch(batch_idx, subgraph.input_names) - output = subgraph.forward(model, **inputs) - - if subgraph_index < num_subgraphs - 1: - intermediates.update(batch_idx, output) - intermediates.delete(batch_idx, subgraph.consumed_names) + # reduce memory movement by keeping modules onloaded + with keep_onload_context(): + # do a preliminary pass to trigger modifier hooks + for batch_idx in tqdm(range(len(dataloader)), desc=calib_desc): + inputs = activations.fetch(batch_idx, subgraph.input_names) + subgraph.forward(model, **inputs) + + 
LifecycleCallbacks.sequential_epoch_end() + + # this pass does not trigger modifier hooks + # and is only used for capturing outputs of newly compressed modules + with HooksMixin.disable_hooks(): + for batch_idx in tqdm(range(len(dataloader)), desc=prop_desc): + inputs = activations.fetch(batch_idx, subgraph.input_names) + output = subgraph.forward(model, **inputs) + + if subgraph_index < num_subgraphs - 1: + activations.update(batch_idx, output) + activations.delete(batch_idx, subgraph.consumed_names) # redundant, finish any remaining compression LifecycleCallbacks.calibration_epoch_end() From 6705bf4e5e8c3c05407e5a8b4ad6d38100f22d90 Mon Sep 17 00:00:00 2001 From: Brian Dellabetta Date: Thu, 5 Jun 2025 22:19:31 +0000 Subject: [PATCH 02/36] touchups based on remaining steps Signed-off-by: Brian Dellabetta --- examples/quantization_w4a16/llama3_example.py | 5 +- src/llmcompressor/entrypoints/oneshot.py | 38 ++++++++++++- src/llmcompressor/modifiers/awq/base.py | 1 - src/llmcompressor/pipelines/basic/pipeline.py | 2 - .../pipelines/layer_sequential/pipeline.py | 54 +++++++++++-------- .../pipelines/sequential/pipeline.py | 2 - 6 files changed, 70 insertions(+), 32 deletions(-) diff --git a/examples/quantization_w4a16/llama3_example.py b/examples/quantization_w4a16/llama3_example.py index 1016a8081..6ac0328d7 100644 --- a/examples/quantization_w4a16/llama3_example.py +++ b/examples/quantization_w4a16/llama3_example.py @@ -1,5 +1,4 @@ import torch -from compressed_tensors import force_cpu_offload from datasets import load_dataset from transformers import AutoModelForCausalLM, AutoTokenizer @@ -11,10 +10,9 @@ model = AutoModelForCausalLM.from_pretrained( MODEL_ID, - # device_map="auto", + device_map="cpu", torch_dtype="auto", ) -force_cpu_offload(model, execution_device=torch.device("cuda")) tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) # Select calibration dataset. @@ -67,6 +65,7 @@ def tokenize(sample): recipe=recipe, max_seq_length=MAX_SEQUENCE_LENGTH, num_calibration_samples=NUM_CALIBRATION_SAMPLES, + oneshot_device=torch.device("cuda") if torch.cuda.is_available() else None, ) # Confirm generations of the quantized model look sane. 
diff --git a/src/llmcompressor/entrypoints/oneshot.py b/src/llmcompressor/entrypoints/oneshot.py index bedca7392..730c280f9 100644 --- a/src/llmcompressor/entrypoints/oneshot.py +++ b/src/llmcompressor/entrypoints/oneshot.py @@ -2,6 +2,9 @@ from datetime import datetime from typing import Optional +import torch +from compressed_tensors import force_cpu_offload +from compressed_tensors.utils import get_execution_device from loguru import logger from torch.utils.data import DataLoader from transformers import PreTrainedModel @@ -10,7 +13,11 @@ from llmcompressor.core.session_functions import active_session from llmcompressor.datasets import get_calibration_dataloader from llmcompressor.entrypoints.utils import post_process, pre_process -from llmcompressor.pipelines.registry import CalibrationPipeline +from llmcompressor.pipelines import ( + CalibrationPipeline, + LayerSequentialPipeline, + SequentialPipeline, +) __all__ = ["Oneshot", "oneshot"] @@ -186,6 +193,35 @@ def apply_recipe_modifiers( user_pipeline = self.dataset_args.pipeline modifiers = session.get_modifiers() pipeline = CalibrationPipeline.from_modifiers(modifiers, user=user_pipeline) + + model_exec_device = get_execution_device(self.model) + + # Sequential pipelines onload models layer by layer to minimize GPU memory usage + if isinstance(pipeline, (SequentialPipeline, LayerSequentialPipeline)): + # unless pure cpu run, throw warning if model lives on oneshot_device + if ( + model_exec_device + == self.model_args.oneshot_device + != torch.device("cpu") + ): + logger.warning( + f"Model device {model_exec_device} is the same as oneshot" + " execution device. If you encounter OOM errors, consider" + " loading the model up on CPU, so that more memory is available" + " for the oneshot algorithm to run on GPU. 
Example available at" + " examples/quantization_w4a16/llama3_example.py" + ) + + # set cpu offload for model + elif ( + model_exec_device + == torch.device("cpu") + != self.model_args.oneshot_device + ): + force_cpu_offload( + self.model, execution_devce=self.model_args.oneshot_device + ) + pipeline(self.model, calibration_dataloader, self.dataset_args) session.finalize() diff --git a/src/llmcompressor/modifiers/awq/base.py b/src/llmcompressor/modifiers/awq/base.py index f95aaaea8..e5e02b62f 100644 --- a/src/llmcompressor/modifiers/awq/base.py +++ b/src/llmcompressor/modifiers/awq/base.py @@ -34,7 +34,6 @@ __all__ = ["AWQModifier"] -# TODO (Brian INFERENG-531) Add support for offloaded models class AWQModifier(Modifier, QuantizationMixin): """ Implements the AWQ (Activation-Weighted Quantization) algorithm, diff --git a/src/llmcompressor/pipelines/basic/pipeline.py b/src/llmcompressor/pipelines/basic/pipeline.py index 431dc1965..15b94786a 100644 --- a/src/llmcompressor/pipelines/basic/pipeline.py +++ b/src/llmcompressor/pipelines/basic/pipeline.py @@ -37,8 +37,6 @@ def __call__( :param dataloader: loads data for calibration :param dataset_args: dataset arguments relevant to pipelines """ - # TODO: warn about cpu offloading - model_device = get_execution_device(model) LifecycleCallbacks.calibration_epoch_start() diff --git a/src/llmcompressor/pipelines/layer_sequential/pipeline.py b/src/llmcompressor/pipelines/layer_sequential/pipeline.py index 9cb2f3708..d4b79f188 100644 --- a/src/llmcompressor/pipelines/layer_sequential/pipeline.py +++ b/src/llmcompressor/pipelines/layer_sequential/pipeline.py @@ -14,7 +14,10 @@ to_next_layer_kwargs, ) from llmcompressor.pipelines.registry import CalibrationPipeline -from llmcompressor.pipelines.sequential.helpers import get_targets_from_modifiers +from llmcompressor.pipelines.sequential.helpers import ( + get_targets_from_modifiers, + keep_onload_context, +) from llmcompressor.utils.helpers import DisableQuantization, calibration_forward_context if TYPE_CHECKING: @@ -73,29 +76,34 @@ def __call__( calib_desc = f"({layer_index + 1}/{num_layers}): Calibrating" prop_desc = f"({layer_index + 1}/{num_layers}): Propagating" - # do a preliminary pass to trigger modifier hooks - for batch_idx in tqdm.tqdm(range(len(dataloader)), desc=calib_desc): - inputs = intermediates.fetch(batch_idx) - layer(**inputs) - - LifecycleCallbacks.sequential_epoch_end() - - # this pass does not trigger modifier hooks - # and is only used for capturing outputs from newly compressed modules - with HooksMixin.disable_hooks(): - for batch_idx in tqdm.tqdm(range(len(dataloader)), desc=prop_desc): + # reduce memory movement by keeping modules onloaded + with keep_onload_context(): + # do a preliminary pass to trigger modifier hooks + for batch_idx in tqdm.tqdm(range(len(dataloader)), desc=calib_desc): inputs = intermediates.fetch(batch_idx) - output = layer(**inputs) - - if layer_index < num_layers - 1: - next_layer = layers[layer_index + 1] - output = to_next_layer_kwargs(output, next_layer) - output = maybe_inject_pos_embeddings( - output, next_layer, inputs - ) - - intermediates.delete(batch_idx) - intermediates.update(batch_idx, output) + layer(**inputs) + + LifecycleCallbacks.sequential_epoch_end() + + # this pass does not trigger modifier hooks + # and is only used for capturing outputs from + # newly compressed modules + with HooksMixin.disable_hooks(): + for batch_idx in tqdm.tqdm( + range(len(dataloader)), desc=prop_desc + ): + inputs = intermediates.fetch(batch_idx) + output = 
layer(**inputs) + + if layer_index < num_layers - 1: + next_layer = layers[layer_index + 1] + output = to_next_layer_kwargs(output, next_layer) + output = maybe_inject_pos_embeddings( + output, next_layer, inputs + ) + + intermediates.delete(batch_idx) + intermediates.update(batch_idx, output) # redundant, finish any remaining compression LifecycleCallbacks.calibration_epoch_end() diff --git a/src/llmcompressor/pipelines/sequential/pipeline.py b/src/llmcompressor/pipelines/sequential/pipeline.py index fcb91d803..c043d2c8a 100644 --- a/src/llmcompressor/pipelines/sequential/pipeline.py +++ b/src/llmcompressor/pipelines/sequential/pipeline.py @@ -52,8 +52,6 @@ def __call__( """ session = active_session() - # TODO: warn about not cpu offloading - # prepare to trace subgraphs modifiers = session.get_modifiers() sequential_targets = get_targets_from_modifiers(modifiers, model) From cf1f87d4422c2333faa13e84a36412a374c3af7a Mon Sep 17 00:00:00 2001 From: Kyle Sayers Date: Thu, 5 Jun 2025 21:40:22 -0400 Subject: [PATCH 03/36] implement oneshot_device, pipeline warnings Signed-off-by: Kyle Sayers --- src/llmcompressor/args/model_arguments.py | 2 +- src/llmcompressor/entrypoints/oneshot.py | 38 +------------------ src/llmcompressor/entrypoints/utils.py | 12 ++++++ .../pipelines/layer_sequential/pipeline.py | 11 ++++++ src/llmcompressor/pipelines/registry.py | 8 +--- .../pipelines/sequential/pipeline.py | 11 ++++++ 6 files changed, 38 insertions(+), 44 deletions(-) diff --git a/src/llmcompressor/args/model_arguments.py b/src/llmcompressor/args/model_arguments.py index 870f6d772..9cf8a687c 100644 --- a/src/llmcompressor/args/model_arguments.py +++ b/src/llmcompressor/args/model_arguments.py @@ -81,7 +81,7 @@ class ModelArguments: metadata={"help": "Whether to compress sparse models during save"}, ) oneshot_device: Optional[str] = field( - default="cuda:0", + default="cuda", metadata={"help": "Device to run oneshot calibration on"}, ) model_revision: str = field( diff --git a/src/llmcompressor/entrypoints/oneshot.py b/src/llmcompressor/entrypoints/oneshot.py index 730c280f9..54a36abfe 100644 --- a/src/llmcompressor/entrypoints/oneshot.py +++ b/src/llmcompressor/entrypoints/oneshot.py @@ -2,9 +2,6 @@ from datetime import datetime from typing import Optional -import torch -from compressed_tensors import force_cpu_offload -from compressed_tensors.utils import get_execution_device from loguru import logger from torch.utils.data import DataLoader from transformers import PreTrainedModel @@ -13,11 +10,7 @@ from llmcompressor.core.session_functions import active_session from llmcompressor.datasets import get_calibration_dataloader from llmcompressor.entrypoints.utils import post_process, pre_process -from llmcompressor.pipelines import ( - CalibrationPipeline, - LayerSequentialPipeline, - SequentialPipeline, -) +from llmcompressor.pipelines import CalibrationPipeline __all__ = ["Oneshot", "oneshot"] @@ -193,35 +186,6 @@ def apply_recipe_modifiers( user_pipeline = self.dataset_args.pipeline modifiers = session.get_modifiers() pipeline = CalibrationPipeline.from_modifiers(modifiers, user=user_pipeline) - - model_exec_device = get_execution_device(self.model) - - # Sequential pipelines onload models layer by layer to minimize GPU memory usage - if isinstance(pipeline, (SequentialPipeline, LayerSequentialPipeline)): - # unless pure cpu run, throw warning if model lives on oneshot_device - if ( - model_exec_device - == self.model_args.oneshot_device - != torch.device("cpu") - ): - logger.warning( - f"Model device 
{model_exec_device} is the same as oneshot" - " execution device. If you encounter OOM errors, consider" - " loading the model up on CPU, so that more memory is available" - " for the oneshot algorithm to run on GPU. Example available at" - " examples/quantization_w4a16/llama3_example.py" - ) - - # set cpu offload for model - elif ( - model_exec_device - == torch.device("cpu") - != self.model_args.oneshot_device - ): - force_cpu_offload( - self.model, execution_devce=self.model_args.oneshot_device - ) - pipeline(self.model, calibration_dataloader, self.dataset_args) session.finalize() diff --git a/src/llmcompressor/entrypoints/utils.py b/src/llmcompressor/entrypoints/utils.py index 0186628f0..f63734985 100644 --- a/src/llmcompressor/entrypoints/utils.py +++ b/src/llmcompressor/entrypoints/utils.py @@ -3,6 +3,8 @@ from pathlib import PosixPath from typing import Optional, Tuple +import torch +from compressed_tensors.utils import force_cpu_offload from loguru import logger from torch.nn import Module from transformers import ( @@ -62,6 +64,16 @@ def pre_process(model_args: "ModelArguments"): # untie tie_word_embeddings weights patch_tied_tensors_bug(model_args.model) + # offload to cpu if possible + if "cuda" in str(model_args.oneshot_device) and torch.cuda.is_available(): + # TODO: consider renaming function to something like "offload_dispatch_model" + # TODO: modify function to remove any hooks if they already exist (making sure + # to move to cpu when removing hook + force_cpu_offload(model_args.model, model_args.oneshot_device) + + else: + logger.warning("CUDA is not available! Compressing model on CPU instead") + # wrap model.save_pretrained modify_save_pretrained(model_args.model) diff --git a/src/llmcompressor/pipelines/layer_sequential/pipeline.py b/src/llmcompressor/pipelines/layer_sequential/pipeline.py index d4b79f188..3130f75d9 100644 --- a/src/llmcompressor/pipelines/layer_sequential/pipeline.py +++ b/src/llmcompressor/pipelines/layer_sequential/pipeline.py @@ -2,6 +2,7 @@ import torch import tqdm +from loguru import logger from torch.utils.data.dataloader import DataLoader from llmcompressor.core import LifecycleCallbacks, active_session @@ -57,6 +58,16 @@ def __call__( """ session = active_session() + # check for offloading + if model.device != torch.device("meta"): + logger.warning( + "Attemping to use sequential pipeline with a model which is not " + "offloaded to the cpu. Deploying a model in this way may lead to more " + "memory usage than is required. 
It is recommended to set " + '`oneshot_device="cuda"` or call `force_cpu_offload` on your model ' + "before compressing" + ) + # find layers modifiers = session.get_modifiers() sequential_targets, _ = get_targets_from_modifiers(modifiers, model) diff --git a/src/llmcompressor/pipelines/registry.py b/src/llmcompressor/pipelines/registry.py index 0a4cbb645..f472c0f0d 100644 --- a/src/llmcompressor/pipelines/registry.py +++ b/src/llmcompressor/pipelines/registry.py @@ -18,6 +18,7 @@ __all__ = ["CalibrationPipeline"] SEQUENTIAL_MODIFIERS = (AWQModifier, GPTQModifier, SparsityModifierBase) +NEED_DATA = (SmoothQuantModifier, *SEQUENTIAL_MODIFIERS) class CalibrationPipeline(ABC, RegistryMixin): @@ -60,7 +61,7 @@ def from_modifiers( @staticmethod def _validate_infer_pipeline(modifiers: List[Modifier]) -> str: - if any(isinstance(modifier, SEQUENTIAL_MODIFIERS) for modifier in modifiers): + if any(isinstance(modifier, NEED_DATA) for modifier in modifiers): return "sequential" active_qmods = _get_active_quant_modifiers(modifiers) @@ -76,11 +77,6 @@ def _validate_infer_pipeline(modifiers: List[Modifier]) -> str: config = quant_modifier.resolve_quantization_config() if config.requires_calibration_data(): return "sequential" - else: - return "datafree" - - if any(isinstance(modifier, SmoothQuantModifier) for modifier in modifiers): - return "sequential" return "datafree" diff --git a/src/llmcompressor/pipelines/sequential/pipeline.py b/src/llmcompressor/pipelines/sequential/pipeline.py index c043d2c8a..4af40f772 100644 --- a/src/llmcompressor/pipelines/sequential/pipeline.py +++ b/src/llmcompressor/pipelines/sequential/pipeline.py @@ -2,6 +2,7 @@ import torch from compressed_tensors.utils import get_execution_device +from loguru import logger from torch.utils.data.dataloader import DataLoader from tqdm import tqdm @@ -52,6 +53,16 @@ def __call__( """ session = active_session() + # check for offloading + if model.device != torch.device("meta"): + logger.warning( + "Attemping to use sequential pipeline with a model which is not " + "offloaded to the cpu. Deploying a model in this way may lead to more " + "memory usage than is required. It is recommended to set " + '`oneshot_device="cuda"` or call `force_cpu_offload` on your model ' + "before compressing" + ) + # prepare to trace subgraphs modifiers = session.get_modifiers() sequential_targets = get_targets_from_modifiers(modifiers, model) From 97c8d303fd4b40030bf0e41741f041849862e4de Mon Sep 17 00:00:00 2001 From: Kyle Sayers Date: Thu, 5 Jun 2025 21:42:19 -0400 Subject: [PATCH 04/36] simplify example Signed-off-by: Kyle Sayers --- examples/quantization_w4a16/llama3_example.py | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/examples/quantization_w4a16/llama3_example.py b/examples/quantization_w4a16/llama3_example.py index 6ac0328d7..df5a8f826 100644 --- a/examples/quantization_w4a16/llama3_example.py +++ b/examples/quantization_w4a16/llama3_example.py @@ -1,4 +1,3 @@ -import torch from datasets import load_dataset from transformers import AutoModelForCausalLM, AutoTokenizer @@ -8,11 +7,7 @@ # Select model and load it. MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct" -model = AutoModelForCausalLM.from_pretrained( - MODEL_ID, - device_map="cpu", - torch_dtype="auto", -) +model = AutoModelForCausalLM.from_pretrained(MODEL_ID, torch_dtype="auto") tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) # Select calibration dataset. 
@@ -65,7 +60,6 @@ def tokenize(sample): recipe=recipe, max_seq_length=MAX_SEQUENCE_LENGTH, num_calibration_samples=NUM_CALIBRATION_SAMPLES, - oneshot_device=torch.device("cuda") if torch.cuda.is_available() else None, ) # Confirm generations of the quantized model look sane. From ecfe15d85c01bef92f6396d5c51cf99b3ff4509e Mon Sep 17 00:00:00 2001 From: Kyle Sayers Date: Thu, 5 Jun 2025 21:46:13 -0400 Subject: [PATCH 05/36] move offloading outside of preprocess, which is shared with train Signed-off-by: Kyle Sayers --- src/llmcompressor/entrypoints/oneshot.py | 11 +++++++++++ src/llmcompressor/entrypoints/utils.py | 12 ------------ 2 files changed, 11 insertions(+), 12 deletions(-) diff --git a/src/llmcompressor/entrypoints/oneshot.py b/src/llmcompressor/entrypoints/oneshot.py index 54a36abfe..9659b7d7e 100644 --- a/src/llmcompressor/entrypoints/oneshot.py +++ b/src/llmcompressor/entrypoints/oneshot.py @@ -2,6 +2,8 @@ from datetime import datetime from typing import Optional +import torch +from compressed_tensors.utils import force_cpu_offload from loguru import logger from torch.utils.data import DataLoader from transformers import PreTrainedModel @@ -123,6 +125,15 @@ def __init__( # initialize the model and processor pre_process(model_args) + # offload to cpu if possible + if "cuda" in str(model_args.oneshot_device) and torch.cuda.is_available(): + # TODO: consider renaming function similar to "offload_dispatch_model" + # TODO: modify function to remove any hooks if they already exist (making + # sure to move to cpu when removing hook + force_cpu_offload(model_args.model, model_args.oneshot_device) + else: + logger.warning("CUDA is not available! Compressing model on CPU instead") + # Set instance attributes self.model = self.model_args.model self.processor = self.model_args.processor diff --git a/src/llmcompressor/entrypoints/utils.py b/src/llmcompressor/entrypoints/utils.py index f63734985..0186628f0 100644 --- a/src/llmcompressor/entrypoints/utils.py +++ b/src/llmcompressor/entrypoints/utils.py @@ -3,8 +3,6 @@ from pathlib import PosixPath from typing import Optional, Tuple -import torch -from compressed_tensors.utils import force_cpu_offload from loguru import logger from torch.nn import Module from transformers import ( @@ -64,16 +62,6 @@ def pre_process(model_args: "ModelArguments"): # untie tie_word_embeddings weights patch_tied_tensors_bug(model_args.model) - # offload to cpu if possible - if "cuda" in str(model_args.oneshot_device) and torch.cuda.is_available(): - # TODO: consider renaming function to something like "offload_dispatch_model" - # TODO: modify function to remove any hooks if they already exist (making sure - # to move to cpu when removing hook - force_cpu_offload(model_args.model, model_args.oneshot_device) - - else: - logger.warning("CUDA is not available! 
Compressing model on CPU instead") - # wrap model.save_pretrained modify_save_pretrained(model_args.model) From 6f8624457d445af435292be474bd0cfdcc8b9167 Mon Sep 17 00:00:00 2001 From: Kyle Sayers Date: Thu, 5 Jun 2025 21:48:59 -0400 Subject: [PATCH 06/36] cleanup Signed-off-by: Kyle Sayers --- src/llmcompressor/pipelines/sequential/helpers.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/src/llmcompressor/pipelines/sequential/helpers.py b/src/llmcompressor/pipelines/sequential/helpers.py index 38f1cdadf..ee282f73a 100644 --- a/src/llmcompressor/pipelines/sequential/helpers.py +++ b/src/llmcompressor/pipelines/sequential/helpers.py @@ -514,7 +514,3 @@ def keep_onload_pre_forward(self: AlignDevicesHook, module, *args, **kwargs): for module, (hook, offload) in onloaded_modules.items(): hook.offload = offload hook.post_forward(module, None) - - -# def is_cpu_offloaded(): -# From 929f678e371e9dbcdfc675407874203a70c1e393 Mon Sep 17 00:00:00 2001 From: Kyle Sayers Date: Thu, 5 Jun 2025 22:43:57 -0400 Subject: [PATCH 07/36] update examples, remove offload devicemap utils Signed-off-by: Kyle Sayers --- examples/awq/llama_example.py | 10 +- examples/awq/qwen3_moe_example.py | 11 +- examples/big_models_with_accelerate/README.md | 95 ------------- .../cpu_offloading_fp8.py | 26 ---- .../mult_gpus_int8_device_map.py | 81 ----------- .../multi_gpu_int8.py | 78 ----------- .../fp8_compressed_inference.py | 6 +- examples/multimodal_audio/whisper_example.py | 13 +- examples/multimodal_vision/gemma3_example.py | 4 +- .../multimodal_vision/idefics3_example.py | 4 +- examples/multimodal_vision/llava_example.py | 4 +- .../multimodal_vision/mistral3_example.py | 4 +- examples/multimodal_vision/mllama_example.py | 4 +- .../multimodal_vision/phi3_vision_example.py | 1 - examples/multimodal_vision/pixtral_example.py | 4 +- .../multimodal_vision/qwen2_vl_example.py | 6 +- .../multimodal_vision/qwen_2_5_vl_example.py | 6 +- .../llama7b_sparse_w4a16.py | 4 +- .../gemma2_fp8_kv_example.py | 12 +- .../llama3_fp8_kv_example.py | 8 +- .../phi3.5_fp8_kv_example.py | 6 +- examples/quantization_w4a16/llama3_example.py | 9 +- .../quantization_w4a16_fp4/llama3_example.py | 4 +- .../quantization_w4a4_fp4/llama3_example.py | 11 +- .../quantization_w8a8_fp8/gemma2_example.py | 4 +- .../llama3.2_vision_example.py | 4 +- .../quantization_w8a8_fp8/llama3_example.py | 4 +- .../quantization_w8a8_fp8/llava1.5_example.py | 4 +- .../quantization_w8a8_fp8/qwen2vl_example.py | 4 +- .../quantization_w8a8_fp8/whisper_example.py | 4 +- .../quantization_w8a8_int8/gemma2_example.py | 6 +- .../quantization_w8a8_int8/llama3_example.py | 6 +- examples/quantizing_moe/deepseek_moe_w4a16.py | 13 +- .../quantizing_moe/deepseek_moe_w8a8_fp8.py | 2 +- .../quantizing_moe/deepseek_moe_w8a8_int8.py | 13 +- .../quantizing_moe/mixtral_moe_w8a8_fp8.py | 10 +- examples/quantizing_moe/qwen_moe_w4a16.py | 13 +- .../llama3_8b_2of4.py | 4 +- .../transformers/compression/helpers.py | 132 +----------------- 39 files changed, 55 insertions(+), 579 deletions(-) delete mode 100644 examples/big_models_with_accelerate/README.md delete mode 100644 examples/big_models_with_accelerate/cpu_offloading_fp8.py delete mode 100644 examples/big_models_with_accelerate/mult_gpus_int8_device_map.py delete mode 100644 examples/big_models_with_accelerate/multi_gpu_int8.py diff --git a/examples/awq/llama_example.py b/examples/awq/llama_example.py index 9d2c724d7..7706db7e6 100644 --- a/examples/awq/llama_example.py +++ b/examples/awq/llama_example.py @@ -5,12 +5,10 @@ from 
llmcompressor.modifiers.awq import AWQModifier # Select model and load it. -MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct" +model_id = "meta-llama/Meta-Llama-3-8B-Instruct" -model = AutoModelForCausalLM.from_pretrained( - MODEL_ID, device_map="auto", torch_dtype="auto" -) -tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True) +model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype="auto") +tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True) # Select calibration dataset. DATASET_ID = "mit-han-lab/pile-val-backup" @@ -72,6 +70,6 @@ def tokenize(sample): print("==========================================\n\n") # Save to disk compressed. -SAVE_DIR = MODEL_ID.split("/")[-1] + "-awq-asym" +SAVE_DIR = model_id.split("/")[-1] + "-awq-asym" model.save_pretrained(SAVE_DIR, save_compressed=True) tokenizer.save_pretrained(SAVE_DIR) diff --git a/examples/awq/qwen3_moe_example.py b/examples/awq/qwen3_moe_example.py index b8f4a4ec1..5775284a1 100644 --- a/examples/awq/qwen3_moe_example.py +++ b/examples/awq/qwen3_moe_example.py @@ -5,12 +5,9 @@ from llmcompressor.modifiers.awq import AWQModifier # Select model and load it. -MODEL_ID = "Qwen/Qwen3-30B-A3B" - -model = AutoModelForCausalLM.from_pretrained( - MODEL_ID, device_map="auto", torch_dtype="auto" -) -tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True) +model_id = "Qwen/Qwen3-30B-A3B" +model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype="auto") +tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True) # Select calibration dataset. DATASET_ID = "mit-han-lab/pile-val-backup" @@ -77,6 +74,6 @@ def tokenize(sample): print("==========================================\n\n") # Save to disk compressed. -SAVE_DIR = MODEL_ID.split("/")[-1] + "-awq-sym" +SAVE_DIR = model_id.split("/")[-1] + "-awq-sym" model.save_pretrained(SAVE_DIR, save_compressed=True) tokenizer.save_pretrained(SAVE_DIR) diff --git a/examples/big_models_with_accelerate/README.md b/examples/big_models_with_accelerate/README.md deleted file mode 100644 index 801f46a2f..000000000 --- a/examples/big_models_with_accelerate/README.md +++ /dev/null @@ -1,95 +0,0 @@ -# Quantizing Big Models with HF Accelerate - -`llmcompressor` integrates with `accelerate` to support quantizing large models such as Llama 70B and 405B, or quantizing any model with limited GPU resources. - -## Overview - -[`accelerate`]((https://huggingface.co/docs/accelerate/en/index)) is a highly useful library in the Hugging Face ecosystem that supports for working with large models, including: -- Offloading parameters to CPU -- Sharding models across multiple GPUs with pipeline-parallelism - - -### Using `device_map` - -To enable `accelerate` features with `llmcompressor`, simple insert `device_map` in `from_pretrained` during model load. - -```python -from transformers import AutoModelForCausalLM -MODEL_ID = "meta-llama/Meta-Llama-3-70B-Instruct" - -# device_map="auto" triggers usage of accelerate -# if > 1 GPU, the model will be sharded across the GPUs -# if not enough GPU memory to fit the model, parameters are offloaded to the CPU -model = AutoModelForCausalLM.from_pretrained( - MODEL_ID, device_map="auto", torch_dtype="auto") -``` - -`llmcompressor` is designed to respect the `device_map`, so calls to `oneshot` -will work properly out of the box for basic quantization with `QuantizationModifier`, -even for CPU offloaded models. 
- -To enable CPU offloading for second-order quantization methods such as GPTQ, we need to -allocate additional memory upfront when computing the device map. Not doing so risks -potentially going out-of-memory. - -```python -from llmcompressor.transformers.compression.helpers import calculate_offload_device_map -from transformers import AutoModelForCausalLM -MODEL_ID = "meta-llama/Meta-Llama-3-70B-Instruct" - -# Load model, reserving memory in the device map for sequential GPTQ (adjust num_gpus as needed) -device_map = calculate_offload_device_map(MODEL_ID, reserve_for_hessians=True, num_gpus=1) -model = AutoModelForCausalLM.from_pretrained( - MODEL_ID, - device_map=device_map, - torch_dtype="auto", -) -``` - -### Practical Advice - -When working with `accelerate`, it is important to keep in mind that CPU offloading and naive pipeline-parallelism will slow down forward passes through the model. As a result, we need to take care to ensure that the quantization methods used fit well with the offloading scheme as methods that require many forward passes though the model will be slowed down. If more gpu memory is not available, consider reducing the precision of the loaded model to a lower-width dtype such as `torch.bfloat16`. - -## Examples - -We will show working examples for each use case: -- **CPU Offloading**: Quantize `Llama-70B` to `FP8` using `PTQ` with a single GPU -- **Multi-GPU**: Quantize `Llama-70B` to `INT8` using `GPTQ` and `SmoothQuant` with 2 GPUs - -### Installation - -Install `llmcompressor`: - -```bash -pip install llmcompressor -``` - -### CPU Offloading: `FP8` Quantization with `PTQ` - -CPU offloading is slow. As a result, we recommend using this feature only with data-free quantization methods. For example, when quantizing a model to `fp8`, we typically use simple `PTQ` to statically quantize the weights and use dynamic quantization for the activations. These methods do not require calibration data. - -- `cpu_offloading_fp8.py` demonstrates quantizing the weights and activations of `Llama-70B` to `fp8` on a single GPU: - -```bash -export CUDA_VISIBLE_DEVICES=0 -python cpu_offloading_fp8.py -``` - -The resulting model `./Meta-Llama-3-70B-Instruct-FP8-Dynamic` is ready to run with `vllm`! - -### Multi-GPU: `INT8` Quantization with `GPTQ` - -For quantization methods that require calibration data (e.g. `GPTQ`), CPU offloading is too slow. For these methods, `llmcompressor` can use `accelerate` multi-GPU to quantize models that are larger than a single GPU. For example, when quantizing a model to `int8`, we typically use `GPTQ` to statically quantize the weights, which requires calibration data. - -- `multi_gpu_int8.py` demonstrates quantizing the weights and activations of `Llama-70B` to `int8` on 2 A100s: - -```python -export CUDA_VISIBLE_DEVICES=0,1 -python multi_gpu_int8.py -``` - -The resulting model `./Meta-Llama-3-70B-Instruct-INT8-Dynamic` is quantized and ready to run with `vllm`! - -## Questions or Feature Request? 
- -Please open up an issue on `vllm-project/llm-compressor` diff --git a/examples/big_models_with_accelerate/cpu_offloading_fp8.py b/examples/big_models_with_accelerate/cpu_offloading_fp8.py deleted file mode 100644 index 248759ba4..000000000 --- a/examples/big_models_with_accelerate/cpu_offloading_fp8.py +++ /dev/null @@ -1,26 +0,0 @@ -from transformers import AutoModelForCausalLM, AutoTokenizer - -from llmcompressor import oneshot -from llmcompressor.modifiers.quantization import QuantizationModifier - -MODEL_ID = "meta-llama/Meta-Llama-3-70B-Instruct" -OUTPUT_DIR = MODEL_ID.split("/")[1] + "-FP8-Dynamic" - -# Load model -# Note: device_map="auto" will offload to CPU if not enough space on GPU. -model = AutoModelForCausalLM.from_pretrained( - MODEL_ID, device_map="auto", torch_dtype="auto", trust_remote_code=True -) - -# Configure the quantization scheme and algorithm (PTQ + FP8_DYNAMIC). -recipe = QuantizationModifier( - targets="Linear", scheme="FP8_DYNAMIC", ignore=["lm_head"] -) - -# Apply quantization and save in `compressed-tensors` format. -oneshot( - model=model, - recipe=recipe, - tokenizer=AutoTokenizer.from_pretrained(MODEL_ID), - output_dir=OUTPUT_DIR, -) diff --git a/examples/big_models_with_accelerate/mult_gpus_int8_device_map.py b/examples/big_models_with_accelerate/mult_gpus_int8_device_map.py deleted file mode 100644 index be9ecd86a..000000000 --- a/examples/big_models_with_accelerate/mult_gpus_int8_device_map.py +++ /dev/null @@ -1,81 +0,0 @@ -import torch -from datasets import load_dataset -from transformers import AutoModelForCausalLM, AutoTokenizer - -from llmcompressor import oneshot -from llmcompressor.modifiers.quantization import GPTQModifier -from llmcompressor.modifiers.smoothquant import SmoothQuantModifier -from llmcompressor.transformers.compression.helpers import calculate_offload_device_map - -MODEL_ID = "meta-llama/Meta-Llama-3-70B-Instruct" - -# adjust based off number of desired GPUs -# reserve_for_hessians=True reserves memory which is required by -# GPTQModifier and SparseGPTModifier -device_map = calculate_offload_device_map( - MODEL_ID, num_gpus=1, reserve_for_hessians=True, torch_dtype=torch.bfloat16 -) - -model = AutoModelForCausalLM.from_pretrained( - MODEL_ID, device_map=device_map, torch_dtype=torch.bfloat16 -) -tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) - -# Select calibration dataset. -DATASET_ID = "HuggingFaceH4/ultrachat_200k" -DATASET_SPLIT = "train_sft" -NUM_CALIBRATION_SAMPLES = 512 -MAX_SEQUENCE_LENGTH = 2048 - - -# Load dataset and preprocess. -ds = load_dataset(DATASET_ID, split=f"{DATASET_SPLIT}[:{NUM_CALIBRATION_SAMPLES}]") -ds = ds.shuffle(seed=42) - - -def preprocess(example): - return { - "text": tokenizer.apply_chat_template( - example["messages"], - tokenize=False, - ) - } - - -ds = ds.map(preprocess) - - -# Tokenize inputs. 
-def tokenize(sample): - return tokenizer( - sample["text"], - padding=False, - max_length=MAX_SEQUENCE_LENGTH, - truncation=True, - add_special_tokens=False, - ) - - -ds = ds.map(tokenize, remove_columns=ds.column_names) - -# define a llmcompressor recipe for W8A8 quantization -recipe = [ - SmoothQuantModifier(smoothing_strength=0.8), - GPTQModifier( - targets="Linear", - scheme="W8A8", - ignore=["lm_head"], - ), -] - -SAVE_DIR = MODEL_ID.split("/")[1] + "-INT8" - -oneshot( - model=model, - dataset=ds, - recipe=recipe, - max_seq_length=MAX_SEQUENCE_LENGTH, - num_calibration_samples=NUM_CALIBRATION_SAMPLES, - save_compressed=True, - output_dir=SAVE_DIR, -) diff --git a/examples/big_models_with_accelerate/multi_gpu_int8.py b/examples/big_models_with_accelerate/multi_gpu_int8.py deleted file mode 100644 index a8023456a..000000000 --- a/examples/big_models_with_accelerate/multi_gpu_int8.py +++ /dev/null @@ -1,78 +0,0 @@ -from datasets import load_dataset -from transformers import AutoModelForCausalLM, AutoTokenizer - -from llmcompressor import oneshot -from llmcompressor.modifiers.quantization import GPTQModifier - -MODEL_ID = "meta-llama/Meta-Llama-3-70B-Instruct" -SAVE_DIR = MODEL_ID.split("/")[1] + "-W8A8-Dynamic" - -# 1) Load model (device_map="auto" with shard the model over multiple GPUs!). -model = AutoModelForCausalLM.from_pretrained( - MODEL_ID, - device_map="auto", - torch_dtype="auto", - trust_remote_code=True, -) -tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) - -# 2) Prepare calibration dataset (in this case, we use ultrachat). -DATASET_ID = "HuggingFaceH4/ultrachat_200k" -DATASET_SPLIT = "train_sft" - -# Select number of samples. 512 samples is a good place to start. -# Increasing the number of samples can improve accuracy. -NUM_CALIBRATION_SAMPLES = 512 -MAX_SEQUENCE_LENGTH = 1024 - -# Load dataset and preprocess. -ds = load_dataset(DATASET_ID, split=f"{DATASET_SPLIT}[:{NUM_CALIBRATION_SAMPLES}]") -ds = ds.shuffle(seed=42) - - -def preprocess(example): - return { - "text": tokenizer.apply_chat_template( - example["messages"], - tokenize=False, - ) - } - - -ds = ds.map(preprocess) - - -# Tokenize inputs. -def tokenize(sample): - return tokenizer( - sample["text"], - padding=False, - max_length=MAX_SEQUENCE_LENGTH, - truncation=True, - add_special_tokens=False, - ) - - -ds = ds.map(tokenize, remove_columns=ds.column_names) - -# 3) Configure algorithms. In this case, we: -# * quantize the weights to int8 with GPTQ (static per channel) -# * quantize the activations to int8 (dynamic per token) -recipe = [ - GPTQModifier( - targets="Linear", scheme="W8A8", ignore=["lm_head"], dampening_frac=0.1 - ), -] - -# 4) Apply algorithms and save in `compressed-tensors` format. 
-# if you encounter GPU out-of-memory issues, consider using an explicit -# device map (see multi_gpus_int8_device_map.py) -oneshot( - model=model, - tokenizer=tokenizer, - dataset=ds, - recipe=recipe, - max_seq_length=MAX_SEQUENCE_LENGTH, - num_calibration_samples=NUM_CALIBRATION_SAMPLES, - output_dir=SAVE_DIR, -) diff --git a/examples/compressed_inference/fp8_compressed_inference.py b/examples/compressed_inference/fp8_compressed_inference.py index f0d0381d2..57debe2fd 100644 --- a/examples/compressed_inference/fp8_compressed_inference.py +++ b/examples/compressed_inference/fp8_compressed_inference.py @@ -19,11 +19,7 @@ "def fibonacci(n):", ] -compressed_model = AutoModelForCausalLM.from_pretrained( - MODEL_STUB, - torch_dtype="auto", - device_map="cuda:0", -) +compressed_model = AutoModelForCausalLM.from_pretrained(MODEL_STUB, torch_dtype="auto") # tokenize the sample data tokenizer = AutoTokenizer.from_pretrained(MODEL_STUB) diff --git a/examples/multimodal_audio/whisper_example.py b/examples/multimodal_audio/whisper_example.py index e5a292504..f19b0016a 100644 --- a/examples/multimodal_audio/whisper_example.py +++ b/examples/multimodal_audio/whisper_example.py @@ -6,15 +6,10 @@ from llmcompressor.modifiers.quantization import GPTQModifier # Select model and load it. -MODEL_ID = "openai/whisper-large-v3" - -model = WhisperForConditionalGeneration.from_pretrained( - MODEL_ID, - device_map="auto", - torch_dtype="auto", -) +model_id = "openai/whisper-large-v3" +model = WhisperForConditionalGeneration.from_pretrained(model_id, torch_dtype="auto") model.config.forced_decoder_ids = None -processor = WhisperProcessor.from_pretrained(MODEL_ID) +processor = WhisperProcessor.from_pretrained(model_id) # Configure processor the dataset task. processor.tokenizer.set_prefix_tokens(language="en", task="transcribe") @@ -106,6 +101,6 @@ def data_collator(batch): # and it was a great thing for what it was at the time but it's not a passive house # Save to disk compressed. -SAVE_DIR = MODEL_ID.split("/")[1] + "-W4A16-G128" +SAVE_DIR = model_id.split("/")[1] + "-W4A16-G128" model.save_pretrained(SAVE_DIR, save_compressed=True) processor.save_pretrained(SAVE_DIR) diff --git a/examples/multimodal_vision/gemma3_example.py b/examples/multimodal_vision/gemma3_example.py index 9ac9e0dd9..3310d82d4 100644 --- a/examples/multimodal_vision/gemma3_example.py +++ b/examples/multimodal_vision/gemma3_example.py @@ -8,9 +8,7 @@ # Load model. model_id = "google/gemma-3-4b-it" -model = Gemma3ForConditionalGeneration.from_pretrained( - model_id, device_map="auto", torch_dtype="auto" -) +model = Gemma3ForConditionalGeneration.from_pretrained(model_id, torch_dtype="auto") processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=True) # Oneshot arguments diff --git a/examples/multimodal_vision/idefics3_example.py b/examples/multimodal_vision/idefics3_example.py index a8157393d..71434868e 100644 --- a/examples/multimodal_vision/idefics3_example.py +++ b/examples/multimodal_vision/idefics3_example.py @@ -9,9 +9,7 @@ # Load model. 
model_id = "HuggingFaceM4/Idefics3-8B-Llama3" # or "HuggingFaceTB/SmolVLM-Instruct" -model = Idefics3ForConditionalGeneration.from_pretrained( - model_id, device_map="auto", torch_dtype="auto" -) +model = Idefics3ForConditionalGeneration.from_pretrained(model_id, torch_dtype="auto") processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=True) # Oneshot arguments diff --git a/examples/multimodal_vision/llava_example.py b/examples/multimodal_vision/llava_example.py index cbd0bd5d2..c5c370096 100644 --- a/examples/multimodal_vision/llava_example.py +++ b/examples/multimodal_vision/llava_example.py @@ -8,9 +8,7 @@ # Load model. model_id = "llava-hf/llava-1.5-7b-hf" -model = LlavaForConditionalGeneration.from_pretrained( - model_id, device_map="auto", torch_dtype="auto" -) +model = LlavaForConditionalGeneration.from_pretrained(model_id, torch_dtype="auto") processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=True) # Oneshot arguments diff --git a/examples/multimodal_vision/mistral3_example.py b/examples/multimodal_vision/mistral3_example.py index 3a45855a0..5ad1820f3 100644 --- a/examples/multimodal_vision/mistral3_example.py +++ b/examples/multimodal_vision/mistral3_example.py @@ -11,9 +11,7 @@ # Load model. model_id = "mistralai/Mistral-Small-3.1-24B-Instruct-2503" -model = Mistral3ForConditionalGeneration.from_pretrained( - model_id, device_map="auto", torch_dtype="auto" -) +model = Mistral3ForConditionalGeneration.from_pretrained(model_id, torch_dtype="auto") processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=True) # Use a custom calibration chat template, rather than the overly-verbose default diff --git a/examples/multimodal_vision/mllama_example.py b/examples/multimodal_vision/mllama_example.py index d4ddb28d6..9812bcf44 100644 --- a/examples/multimodal_vision/mllama_example.py +++ b/examples/multimodal_vision/mllama_example.py @@ -8,9 +8,7 @@ # Load model. model_id = "meta-llama/Llama-3.2-11B-Vision-Instruct" -model = MllamaForConditionalGeneration.from_pretrained( - model_id, device_map="auto", torch_dtype="auto" -) +model = MllamaForConditionalGeneration.from_pretrained(model_id, torch_dtype="auto") processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=True) # Oneshot arguments diff --git a/examples/multimodal_vision/phi3_vision_example.py b/examples/multimodal_vision/phi3_vision_example.py index df61b664b..537ff4dc4 100644 --- a/examples/multimodal_vision/phi3_vision_example.py +++ b/examples/multimodal_vision/phi3_vision_example.py @@ -12,7 +12,6 @@ model_id = "microsoft/Phi-3-vision-128k-instruct" model = AutoModelForCausalLM.from_pretrained( model_id, - device_map="auto", torch_dtype="auto", trust_remote_code=True, _attn_implementation="eager", diff --git a/examples/multimodal_vision/pixtral_example.py b/examples/multimodal_vision/pixtral_example.py index 940caa6ca..996eea885 100644 --- a/examples/multimodal_vision/pixtral_example.py +++ b/examples/multimodal_vision/pixtral_example.py @@ -8,9 +8,7 @@ # Load model. 
model_id = "mgoin/pixtral-12b" -model = LlavaForConditionalGeneration.from_pretrained( - model_id, device_map="auto", torch_dtype="auto" -) +model = LlavaForConditionalGeneration.from_pretrained(model_id, torch_dtype="auto") processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=True) # Oneshot arguments diff --git a/examples/multimodal_vision/qwen2_vl_example.py b/examples/multimodal_vision/qwen2_vl_example.py index 713035eee..cb64e3eb9 100644 --- a/examples/multimodal_vision/qwen2_vl_example.py +++ b/examples/multimodal_vision/qwen2_vl_example.py @@ -11,11 +11,7 @@ # Load model. model_id = "Qwen/Qwen2-VL-2B-Instruct" -model = Qwen2VLForConditionalGeneration.from_pretrained( - model_id, - device_map="auto", - torch_dtype="auto", -) +model = Qwen2VLForConditionalGeneration.from_pretrained(model_id, torch_dtype="auto") processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=True) # Oneshot arguments diff --git a/examples/multimodal_vision/qwen_2_5_vl_example.py b/examples/multimodal_vision/qwen_2_5_vl_example.py index 068229a12..83eea10dd 100644 --- a/examples/multimodal_vision/qwen_2_5_vl_example.py +++ b/examples/multimodal_vision/qwen_2_5_vl_example.py @@ -11,11 +11,7 @@ # Load model. model_id = "Qwen/Qwen2.5-VL-7B-Instruct" -model = Qwen2_5_VLForConditionalGeneration.from_pretrained( - model_id, - device_map="auto", - torch_dtype="auto", -) +model = Qwen2_5_VLForConditionalGeneration.from_pretrained(model_id, torch_dtype="auto") processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=True) # Oneshot arguments diff --git a/examples/quantization_2of4_sparse_w4a16/llama7b_sparse_w4a16.py b/examples/quantization_2of4_sparse_w4a16/llama7b_sparse_w4a16.py index 51bdad4d5..e63e9cd2d 100644 --- a/examples/quantization_2of4_sparse_w4a16/llama7b_sparse_w4a16.py +++ b/examples/quantization_2of4_sparse_w4a16/llama7b_sparse_w4a16.py @@ -6,9 +6,7 @@ # load the model in as bfloat16 to save on memory and compute model_stub = "neuralmagic/Llama-2-7b-ultrachat200k" -model = AutoModelForCausalLM.from_pretrained( - model_stub, torch_dtype=torch.bfloat16, device_map="auto" -) +model = AutoModelForCausalLM.from_pretrained(model_stub, torch_dtype=torch.bfloat16) tokenizer = AutoTokenizer.from_pretrained(model_stub) # uses LLM Compressor's built-in preprocessing for ultra chat diff --git a/examples/quantization_kv_cache/gemma2_fp8_kv_example.py b/examples/quantization_kv_cache/gemma2_fp8_kv_example.py index fba2dcce6..840e10a41 100644 --- a/examples/quantization_kv_cache/gemma2_fp8_kv_example.py +++ b/examples/quantization_kv_cache/gemma2_fp8_kv_example.py @@ -4,13 +4,9 @@ from llmcompressor import oneshot # Select model and load it. -MODEL_ID = "google/gemma-2-9b-it" -model = AutoModelForCausalLM.from_pretrained( - MODEL_ID, - device_map="auto", - torch_dtype="auto", -) -tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) +model_id = "google/gemma-2-9b-it" +model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype="auto") +tokenizer = AutoTokenizer.from_pretrained(model_id) # Select calibration dataset. DATASET_ID = "HuggingFaceH4/ultrachat_200k" @@ -98,6 +94,6 @@ def process_and_tokenize(example): print("==========================================\n\n") # Save to disk compressed. 
-SAVE_DIR = MODEL_ID.split("/")[1] + "-FP8-KV" +SAVE_DIR = model_id.split("/")[1] + "-FP8-KV" model.save_pretrained(SAVE_DIR, save_compressed=True) tokenizer.save_pretrained(SAVE_DIR) diff --git a/examples/quantization_kv_cache/llama3_fp8_kv_example.py b/examples/quantization_kv_cache/llama3_fp8_kv_example.py index 4bbecaae0..df866e117 100644 --- a/examples/quantization_kv_cache/llama3_fp8_kv_example.py +++ b/examples/quantization_kv_cache/llama3_fp8_kv_example.py @@ -5,13 +5,13 @@ from llmcompressor import oneshot # Select model and load it. -MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct" +model_id = "meta-llama/Meta-Llama-3-8B-Instruct" model = AutoModelForCausalLM.from_pretrained( - MODEL_ID, + model_id, device_map="auto", torch_dtype="auto", ) -tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) +tokenizer = AutoTokenizer.from_pretrained(model_id) # Select calibration dataset. DATASET_ID = "HuggingFaceH4/ultrachat_200k" @@ -96,6 +96,6 @@ def process_and_tokenize(example): print("==========================================\n\n") # Save to disk compressed. -SAVE_DIR = MODEL_ID.split("/")[1] + "-FP8-KV" +SAVE_DIR = model_id.split("/")[1] + "-FP8-KV" model.save_pretrained(SAVE_DIR, save_compressed=True) tokenizer.save_pretrained(SAVE_DIR) diff --git a/examples/quantization_kv_cache/phi3.5_fp8_kv_example.py b/examples/quantization_kv_cache/phi3.5_fp8_kv_example.py index 576092cf6..f22e0ea02 100644 --- a/examples/quantization_kv_cache/phi3.5_fp8_kv_example.py +++ b/examples/quantization_kv_cache/phi3.5_fp8_kv_example.py @@ -7,11 +7,7 @@ # Phi-3.5 is a special case for KV cache quantization because it has # fused QKV linear layers. MODEL_ID = "microsoft/Phi-3.5-mini-instruct" -model = AutoModelForCausalLM.from_pretrained( - MODEL_ID, - device_map="auto", - torch_dtype="auto", -) +model = AutoModelForCausalLM.from_pretrained(MODEL_ID, torch_dtype="auto") tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) # Select calibration dataset. diff --git a/examples/quantization_w4a16/llama3_example.py b/examples/quantization_w4a16/llama3_example.py index df5a8f826..7d7bb0448 100644 --- a/examples/quantization_w4a16/llama3_example.py +++ b/examples/quantization_w4a16/llama3_example.py @@ -5,10 +5,9 @@ from llmcompressor.transformers import oneshot # Select model and load it. -MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct" - -model = AutoModelForCausalLM.from_pretrained(MODEL_ID, torch_dtype="auto") -tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) +model_id = "meta-llama/Meta-Llama-3-8B-Instruct" +model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype="auto") +tokenizer = AutoTokenizer.from_pretrained(model_id) # Select calibration dataset. DATASET_ID = "HuggingFaceH4/ultrachat_200k" @@ -71,6 +70,6 @@ def tokenize(sample): print("==========================================\n\n") # Save to disk compressed. -SAVE_DIR = MODEL_ID.split("/")[1] + "-W4A16-G128" +SAVE_DIR = model_id.split("/")[1] + "-W4A16-G128" model.save_pretrained(SAVE_DIR, save_compressed=True) tokenizer.save_pretrained(SAVE_DIR) diff --git a/examples/quantization_w4a16_fp4/llama3_example.py b/examples/quantization_w4a16_fp4/llama3_example.py index d8573d271..4bd0f16b0 100644 --- a/examples/quantization_w4a16_fp4/llama3_example.py +++ b/examples/quantization_w4a16_fp4/llama3_example.py @@ -6,9 +6,7 @@ MODEL_ID = "meta-llama/Llama-3.1-8B-Instruct" # Load model. 
-model = AutoModelForCausalLM.from_pretrained( - MODEL_ID, device_map="auto", torch_dtype="auto" -) +model = AutoModelForCausalLM.from_pretrained(MODEL_ID, torch_dtype="auto") tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) # Configure the quantization algorithm and scheme. diff --git a/examples/quantization_w4a4_fp4/llama3_example.py b/examples/quantization_w4a4_fp4/llama3_example.py index f9d8f35dc..edff1a04c 100644 --- a/examples/quantization_w4a4_fp4/llama3_example.py +++ b/examples/quantization_w4a4_fp4/llama3_example.py @@ -4,13 +4,10 @@ from llmcompressor import oneshot from llmcompressor.modifiers.quantization import QuantizationModifier -MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct" - # Load model. -model = AutoModelForCausalLM.from_pretrained( - MODEL_ID, device_map="auto", torch_dtype="auto" -) -tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) +model_id = "meta-llama/Meta-Llama-3-8B-Instruct" +model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype="auto") +tokenizer = AutoTokenizer.from_pretrained(model_id) DATASET_ID = "HuggingFaceH4/ultrachat_200k" @@ -76,6 +73,6 @@ def tokenize(sample): # Save to disk in compressed-tensors format. -SAVE_DIR = MODEL_ID.split("/")[1] + "-NVFP4" +SAVE_DIR = model_id.split("/")[1] + "-NVFP4" model.save_pretrained(SAVE_DIR, save_compressed=True) tokenizer.save_pretrained(SAVE_DIR) diff --git a/examples/quantization_w8a8_fp8/gemma2_example.py b/examples/quantization_w8a8_fp8/gemma2_example.py index 77664f2d5..ed1efe2af 100644 --- a/examples/quantization_w8a8_fp8/gemma2_example.py +++ b/examples/quantization_w8a8_fp8/gemma2_example.py @@ -6,9 +6,7 @@ MODEL_ID = "google/gemma-2-27b-it" # 1) Load model. -model = AutoModelForCausalLM.from_pretrained( - MODEL_ID, device_map="auto", torch_dtype="auto" -) +model = AutoModelForCausalLM.from_pretrained(MODEL_ID, torch_dtype="auto") tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) # 2) Configure the quantization algorithm and scheme. diff --git a/examples/quantization_w8a8_fp8/llama3.2_vision_example.py b/examples/quantization_w8a8_fp8/llama3.2_vision_example.py index c99d0bfcc..e4d8bebac 100644 --- a/examples/quantization_w8a8_fp8/llama3.2_vision_example.py +++ b/examples/quantization_w8a8_fp8/llama3.2_vision_example.py @@ -6,9 +6,7 @@ MODEL_ID = "meta-llama/Llama-3.2-11B-Vision-Instruct" # Load model. -model = MllamaForConditionalGeneration.from_pretrained( - MODEL_ID, device_map="auto", torch_dtype="auto" -) +model = MllamaForConditionalGeneration.from_pretrained(MODEL_ID, torch_dtype="auto") processor = AutoProcessor.from_pretrained(MODEL_ID) # Configure the quantization algorithm and scheme. diff --git a/examples/quantization_w8a8_fp8/llama3_example.py b/examples/quantization_w8a8_fp8/llama3_example.py index a66200239..5227eabb2 100644 --- a/examples/quantization_w8a8_fp8/llama3_example.py +++ b/examples/quantization_w8a8_fp8/llama3_example.py @@ -6,9 +6,7 @@ MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct" # Load model. -model = AutoModelForCausalLM.from_pretrained( - MODEL_ID, device_map="auto", torch_dtype="auto" -) +model = AutoModelForCausalLM.from_pretrained(MODEL_ID, torch_dtype="auto") tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) # Configure the quantization algorithm and scheme. 
diff --git a/examples/quantization_w8a8_fp8/llava1.5_example.py b/examples/quantization_w8a8_fp8/llava1.5_example.py index 31cb4cb94..c05d94a80 100644 --- a/examples/quantization_w8a8_fp8/llava1.5_example.py +++ b/examples/quantization_w8a8_fp8/llava1.5_example.py @@ -6,9 +6,7 @@ MODEL_ID = "llava-hf/llava-1.5-7b-hf" # Load model. -model = LlavaForConditionalGeneration.from_pretrained( - MODEL_ID, device_map="auto", torch_dtype="auto" -) +model = LlavaForConditionalGeneration.from_pretrained(MODEL_ID, torch_dtype="auto") processor = AutoProcessor.from_pretrained(MODEL_ID) # Configure the quantization algorithm and scheme. diff --git a/examples/quantization_w8a8_fp8/qwen2vl_example.py b/examples/quantization_w8a8_fp8/qwen2vl_example.py index 564fc6644..c364fbb1e 100644 --- a/examples/quantization_w8a8_fp8/qwen2vl_example.py +++ b/examples/quantization_w8a8_fp8/qwen2vl_example.py @@ -6,9 +6,7 @@ MODEL_ID = "Qwen/Qwen2-VL-7B-Instruct" # Load model. -model = Qwen2VLForConditionalGeneration.from_pretrained( - MODEL_ID, device_map="auto", torch_dtype="auto" -) +model = Qwen2VLForConditionalGeneration.from_pretrained(MODEL_ID, torch_dtype="auto") processor = AutoProcessor.from_pretrained(MODEL_ID) # Configure the quantization algorithm and scheme. diff --git a/examples/quantization_w8a8_fp8/whisper_example.py b/examples/quantization_w8a8_fp8/whisper_example.py index 5efd08a57..7f504a41b 100644 --- a/examples/quantization_w8a8_fp8/whisper_example.py +++ b/examples/quantization_w8a8_fp8/whisper_example.py @@ -7,9 +7,7 @@ MODEL_ID = "openai/whisper-large-v2" # Load model. -model = WhisperForConditionalGeneration.from_pretrained( - MODEL_ID, device_map="auto", torch_dtype="auto" -) +model = WhisperForConditionalGeneration.from_pretrained(MODEL_ID, torch_dtype="auto") model.config.forced_decoder_ids = None processor = AutoProcessor.from_pretrained(MODEL_ID) processor.tokenizer.set_prefix_tokens(language="en", task="transcribe") diff --git a/examples/quantization_w8a8_int8/gemma2_example.py b/examples/quantization_w8a8_int8/gemma2_example.py index 0573b3249..ac7ff5f49 100644 --- a/examples/quantization_w8a8_int8/gemma2_example.py +++ b/examples/quantization_w8a8_int8/gemma2_example.py @@ -6,11 +6,7 @@ # 1) Select model and load it. MODEL_ID = "google/gemma-2-2b-it" -model = AutoModelForCausalLM.from_pretrained( - MODEL_ID, - device_map="auto", - torch_dtype="auto", -) +model = AutoModelForCausalLM.from_pretrained(MODEL_ID, torch_dtype="auto") tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) # 2) Prepare calibration dataset. diff --git a/examples/quantization_w8a8_int8/llama3_example.py b/examples/quantization_w8a8_int8/llama3_example.py index c475e9089..d3067de6f 100644 --- a/examples/quantization_w8a8_int8/llama3_example.py +++ b/examples/quantization_w8a8_int8/llama3_example.py @@ -7,11 +7,7 @@ # Select model and load it. MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct" -model = AutoModelForCausalLM.from_pretrained( - MODEL_ID, - device_map="auto", - torch_dtype="auto", -) +model = AutoModelForCausalLM.from_pretrained(MODEL_ID, torch_dtype="auto") tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) # Select calibration dataset. 
diff --git a/examples/quantizing_moe/deepseek_moe_w4a16.py b/examples/quantizing_moe/deepseek_moe_w4a16.py index 077bf6a1f..4c56a2c19 100644 --- a/examples/quantizing_moe/deepseek_moe_w4a16.py +++ b/examples/quantizing_moe/deepseek_moe_w4a16.py @@ -4,7 +4,6 @@ from transformers import AutoModelForCausalLM, AutoTokenizer, __version__ from llmcompressor import oneshot -from llmcompressor.transformers.compression.helpers import calculate_offload_device_map # NOTE: transformers 4.49.0 has an attribute error with DeepSeek. # Please consider either downgrading your transformers version to a @@ -13,18 +12,8 @@ # select a Mixture of Experts model for quantization MODEL_ID = "deepseek-ai/DeepSeek-V2.5" -# adjust based off number of desired GPUs -# if not enough memory is available, some layers will automatically be offlaoded to cpu -device_map = calculate_offload_device_map( - MODEL_ID, - reserve_for_hessians=True, - num_gpus=2, - torch_dtype=torch.bfloat16, - trust_remote_code=True, -) - model = AutoModelForCausalLM.from_pretrained( - MODEL_ID, device_map=device_map, torch_dtype=torch.bfloat16, trust_remote_code=True + MODEL_ID, torch_dtype=torch.bfloat16, trust_remote_code=True ) tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) diff --git a/examples/quantizing_moe/deepseek_moe_w8a8_fp8.py b/examples/quantizing_moe/deepseek_moe_w8a8_fp8.py index ac9ec8b19..261ac93f2 100644 --- a/examples/quantizing_moe/deepseek_moe_w8a8_fp8.py +++ b/examples/quantizing_moe/deepseek_moe_w8a8_fp8.py @@ -13,7 +13,7 @@ MODEL_ID = "deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct" model = AutoModelForCausalLM.from_pretrained( - MODEL_ID, device_map="auto", torch_dtype="auto", trust_remote_code=True + MODEL_ID, device_map="auto", trust_remote_code=True ) tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) diff --git a/examples/quantizing_moe/deepseek_moe_w8a8_int8.py b/examples/quantizing_moe/deepseek_moe_w8a8_int8.py index da5856fc5..1b8d80a66 100644 --- a/examples/quantizing_moe/deepseek_moe_w8a8_int8.py +++ b/examples/quantizing_moe/deepseek_moe_w8a8_int8.py @@ -5,7 +5,6 @@ from llmcompressor import oneshot from llmcompressor.modifiers.quantization import GPTQModifier -from llmcompressor.transformers.compression.helpers import calculate_offload_device_map # NOTE: transformers 4.49.0 has an attribute error with DeepSeek. 
# Please consider either downgrading your transformers version to a @@ -14,18 +13,8 @@ # select a Mixture of Experts model for quantization MODEL_ID = "deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct" -# adjust based off number of desired GPUs -# if not enough memory is available, some layers will automatically be offlaoded to cpu -device_map = calculate_offload_device_map( - MODEL_ID, - reserve_for_hessians=True, - num_gpus=2, - torch_dtype=torch.bfloat16, - trust_remote_code=True, -) - model = AutoModelForCausalLM.from_pretrained( - MODEL_ID, device_map=device_map, torch_dtype=torch.bfloat16, trust_remote_code=True + MODEL_ID, torch_dtype=torch.bfloat16, trust_remote_code=True ) tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) diff --git a/examples/quantizing_moe/mixtral_moe_w8a8_fp8.py b/examples/quantizing_moe/mixtral_moe_w8a8_fp8.py index 01489e50e..3dc821ce3 100644 --- a/examples/quantizing_moe/mixtral_moe_w8a8_fp8.py +++ b/examples/quantizing_moe/mixtral_moe_w8a8_fp8.py @@ -5,19 +5,11 @@ from llmcompressor import oneshot from llmcompressor.modifiers.quantization import QuantizationModifier -from llmcompressor.transformers.compression.helpers import calculate_offload_device_map MODEL_ID = "mistralai/Mixtral-8x7B-Instruct-v0.1" NUM_GPUS = 2 -# Adjust based off number of desired GPUs -device_map = calculate_offload_device_map( - MODEL_ID, reserve_for_hessians=True, num_gpus=NUM_GPUS, torch_dtype="auto" -) - -model = AutoModelForCausalLM.from_pretrained( - MODEL_ID, device_map=device_map, torch_dtype="auto" -) +model = AutoModelForCausalLM.from_pretrained(MODEL_ID, torch_dtype="auto") tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) diff --git a/examples/quantizing_moe/qwen_moe_w4a16.py b/examples/quantizing_moe/qwen_moe_w4a16.py index df98d0513..ebb4a5615 100644 --- a/examples/quantizing_moe/qwen_moe_w4a16.py +++ b/examples/quantizing_moe/qwen_moe_w4a16.py @@ -4,23 +4,12 @@ from llmcompressor.modifiers.quantization import GPTQModifier from llmcompressor.transformers import oneshot -from llmcompressor.transformers.compression.helpers import calculate_offload_device_map # select a Mixture of Experts model for quantization MODEL_ID = "Qwen/Qwen1.5-MoE-A2.7B-Chat" -# adjust based off number of desired GPUs -# if not enough memory is available, some layers will automatically be offloaded to cpu -device_map = calculate_offload_device_map( - MODEL_ID, - reserve_for_hessians=True, - num_gpus=2, - torch_dtype=torch.bfloat16, - trust_remote_code=True, -) - model = AutoModelForCausalLM.from_pretrained( - MODEL_ID, device_map=device_map, torch_dtype=torch.bfloat16, trust_remote_code=True + MODEL_ID, torch_dtype=torch.bfloat16, trust_remote_code=True ) tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) diff --git a/examples/sparse_2of4_quantization_fp8/llama3_8b_2of4.py b/examples/sparse_2of4_quantization_fp8/llama3_8b_2of4.py index 9fc681ecf..3952d0a90 100644 --- a/examples/sparse_2of4_quantization_fp8/llama3_8b_2of4.py +++ b/examples/sparse_2of4_quantization_fp8/llama3_8b_2of4.py @@ -75,9 +75,7 @@ def get_recipe(fp8_enabled): args = parse_args() # Load model and tokenizer -model = AutoModelForCausalLM.from_pretrained( - MODEL_ID, device_map="auto", torch_dtype="auto" -) +model = AutoModelForCausalLM.from_pretrained(MODEL_ID, torch_dtype="auto") tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) # Load and preprocess dataset diff --git a/src/llmcompressor/transformers/compression/helpers.py b/src/llmcompressor/transformers/compression/helpers.py index 179d5bc11..d02a08809 100644 --- 
a/src/llmcompressor/transformers/compression/helpers.py +++ b/src/llmcompressor/transformers/compression/helpers.py @@ -1,27 +1,20 @@ from collections import defaultdict -from typing import Dict, List, Optional, Tuple, Type, Union +from typing import Dict, List, Optional, Tuple -import psutil import torch -from accelerate import infer_auto_device_map, init_empty_weights from accelerate.accelerator import get_state_dict_offloaded_model from compressed_tensors.quantization.utils import iter_named_leaf_modules, module_type from compressed_tensors.utils import align_module_device from torch.nn.modules import Linear from tqdm import tqdm -from transformers import AutoModelForCausalLM from llmcompressor.pytorch.utils import get_linear_layers from llmcompressor.pytorch.utils.helpers import tensor_sparsity -from llmcompressor.utils.pytorch import get_layers, get_no_split_params __ALL__ = [ "tensor_follows_mask_structure", "infer_sparsity_structure_from_stage_modifiers", "infer_sparsity_structure_from_model", - "hessian_memory_requirements", - "custom_offload_device_map", - "calculate_offload_device_map", "infer_sparse_targets_and_ignores", "is_sparse_compression_target", ] @@ -111,36 +104,6 @@ def infer_sparsity_structure_from_model(model: torch.nn.Module) -> Optional[str] return None -def hessian_memory_requirements(model: torch.nn.Module) -> int: - """ - Determines the number of bytes needed to store Hessian data for a single - transformer layer in model. This is used for reserving memory for GPTQ - quantization - - :param model: model to calculate requirements for - :return: number of bytes required to reserve for GPTQ on a single layer - """ - transformer_layers = get_layers(get_no_split_params(model), model) - total_hessian_elems = {} - max_column_size = {} - for no_split_name, no_split_layer in transformer_layers.items(): - total_hessian_elems[no_split_name] = 0 - max_column_size[no_split_name] = 0 - for _name, module in no_split_layer.named_modules(): - if isinstance(module, Linear) and hasattr(module, "weight"): - column_size = module.weight.shape[1] - total_hessian_elems[no_split_name] += column_size * column_size - if column_size > max_column_size[no_split_name]: - # max extra memory for inverse calculation - max_column_size[no_split_name] = column_size - - max_total_hessian_elems = max(total_hessian_elems.values()) - overall_max_column_size = max(max_column_size.values()) - bytes_per_weight = 32 // 8 # hessians are float32 - inverse_reserved = overall_max_column_size * overall_max_column_size - return (max_total_hessian_elems + inverse_reserved) * bytes_per_weight - - def quantization_memory_requirement(model: torch.nn.Module) -> int: """ Determines the max number of bytes needed to store quantization scale and zp data @@ -168,99 +131,6 @@ def quantization_memory_requirement(model: torch.nn.Module) -> int: return total_elements * bytes_ratio -def custom_offload_device_map( - model_stub: str, - max_memory_per_gpu: Union[str, int], - num_gpus: int = 1, - model_cls: Type = AutoModelForCausalLM, - **model_kwargs, -) -> Dict[Union[int, str], Union[int, str]]: - """ - Calculates the optimal gpu mappings for model_stub stored as torch_dtype, where - each GPU is restricted to allocating a specific amount of memory. 
- - :param model_stub: local path or HF stub to calculate mapping for - :param max_memory_per_gpu: Max memory to allocate on each GPU, as either a string - such as "10GB" or an integer number of bytes - :param num_gpus: number of gpus to utilize - :param model_cls: model class to use when initializing model structure, - default is AutoModelForCausalLM - :param model_kwargs: keyword arguments to pass to model initializer - :return: memory mapping for layers of model_stub to be passed to from_pretrained() - """ - max_cpu_memory = psutil.virtual_memory().available - memory_limits = {device: max_memory_per_gpu for device in range(num_gpus)} - memory_limits["cpu"] = max_cpu_memory - - device_map = {} - with init_empty_weights(): - dummy_model = model_cls.from_pretrained(model_stub, **model_kwargs) - device_map = infer_auto_device_map( - dummy_model, - max_memory=memory_limits, - no_split_module_classes=dummy_model._no_split_modules, - ) - del dummy_model - - return device_map - - -def calculate_offload_device_map( - model_stub: str, - reserve_for_hessians=False, - num_gpus: int = 1, - torch_dtype: torch.dtype = torch.float16, - model_cls: Type = AutoModelForCausalLM, - **model_kwargs, -) -> Dict[Union[int, str], Union[int, str]]: - """ - Calculates the optimal gpu mappings for model_stub stored as torch_dtype. Takes - into account extra memory required for quantization and (optionally) GPTQ hessians - - :param model_stub: local path or HF stub to calculate mapping for - :param reserve_for_hessians: whether to reserve memory for GPTQ - :param num_gpus: number of gpus to utilize - :param model_cls: model class to use when initializing model structure, - default is AutoModelForCausalLM - :param model_kwargs: keyword arguments to pass to model initializer - :return: memory mapping for layers of model_stub to be passed to from_pretrained() - """ - max_cpu_memory = psutil.virtual_memory().available - max_gpu_memory = torch.cuda.mem_get_info(0)[0] - available_gpus = torch.cuda.device_count() - if available_gpus < num_gpus: - raise ValueError( - f"Requested {num_gpus} GPUs but only {available_gpus} are available." 
- ) - max_gpu_memory = [max_gpu_memory] * num_gpus - - device_map = {} - with init_empty_weights(): - dummy_model = model_cls.from_pretrained( - model_stub, torch_dtype=torch_dtype, **model_kwargs - ) - - reserved_memory = 0 - if reserve_for_hessians: - reserved_memory = hessian_memory_requirements(dummy_model) - reserved_memory += quantization_memory_requirement(dummy_model) - - memory_limits = { - idx: (max_memory - reserved_memory) - for idx, max_memory in enumerate(max_gpu_memory) - } - memory_limits["cpu"] = max_cpu_memory - - device_map = infer_auto_device_map( - dummy_model, - max_memory=memory_limits, - no_split_module_classes=dummy_model._no_split_modules, - ) - del dummy_model - - return device_map - - def infer_sparse_targets_and_ignores( model: torch.nn.Module, sparsity_structure: str, From a275f5343d7cb522935ebc923b7dffc6decf58b9 Mon Sep 17 00:00:00 2001 From: Kyle Sayers Date: Thu, 5 Jun 2025 23:15:58 -0400 Subject: [PATCH 08/36] update examples to load before generating Signed-off-by: Kyle Sayers --- examples/awq/llama_example.py | 13 ++++++++----- examples/awq/qwen3_moe_example.py | 13 ++++++++----- examples/multimodal_audio/whisper_example.py | 14 ++++++++------ examples/multimodal_vision/gemma3_example.py | 13 ++++++++----- examples/multimodal_vision/idefics3_example.py | 13 ++++++++----- examples/multimodal_vision/llava_example.py | 13 ++++++++----- examples/multimodal_vision/mistral3_example.py | 13 ++++++++----- examples/multimodal_vision/mllama_example.py | 13 ++++++++----- .../multimodal_vision/phi3_vision_example.py | 13 ++++++++----- examples/multimodal_vision/pixtral_example.py | 13 ++++++++----- examples/multimodal_vision/qwen2_vl_example.py | 14 ++++++++------ .../multimodal_vision/qwen_2_5_vl_example.py | 14 ++++++++------ .../gemma2_fp8_kv_example.py | 13 ++++++++----- .../llama3_fp8_kv_example.py | 13 ++++++++----- .../phi3.5_fp8_kv_example.py | 13 ++++++++----- examples/quantization_w4a16/llama3_example.py | 13 ++++++++----- .../quantization_w4a16_fp4/llama3_example.py | 15 +++++++++------ examples/quantization_w4a4_fp4/llama3_example.py | 15 +++++++++------ examples/quantization_w8a8_fp8/gemma2_example.py | 16 +++++++++------- .../llama3.2_vision_example.py | 11 ++++++++--- examples/quantization_w8a8_fp8/llama3_example.py | 13 ++++++++----- .../quantization_w8a8_fp8/llava1.5_example.py | 10 ++++++++-- .../quantization_w8a8_fp8/qwen2vl_example.py | 10 ++++++++-- .../quantization_w8a8_fp8/whisper_example.py | 13 ++++++++----- .../quantization_w8a8_int8/gemma2_example.py | 6 +++++- .../quantization_w8a8_int8/llama3_example.py | 13 ++++++++----- examples/quantizing_moe/deepseek_moe_w8a8_fp8.py | 3 +++ .../quantizing_moe/deepseek_moe_w8a8_int8.py | 3 +++ examples/quantizing_moe/mixtral_moe_w8a8_fp8.py | 3 +++ examples/quantizing_moe/qwen_moe_w4a16.py | 3 +++ .../llama3_8b_2of4.py | 11 +++++++---- 31 files changed, 227 insertions(+), 129 deletions(-) diff --git a/examples/awq/llama_example.py b/examples/awq/llama_example.py index 7706db7e6..d456f2b3b 100644 --- a/examples/awq/llama_example.py +++ b/examples/awq/llama_example.py @@ -61,6 +61,14 @@ def tokenize(sample): num_calibration_samples=NUM_CALIBRATION_SAMPLES, ) +# Save to disk compressed. +SAVE_DIR = model_id.split("/")[-1] + "-awq-asym" +model.save_pretrained(SAVE_DIR, save_compressed=True) +tokenizer.save_pretrained(SAVE_DIR) + +# Load model after saving +model = AutoModelForCausalLM.from_pretrained(SAVE_DIR, device_map="auto") + # Confirm generations of the quantized model look sane. 
print("\n\n") print("========== SAMPLE GENERATION ==============") @@ -68,8 +76,3 @@ def tokenize(sample): output = model.generate(input_ids, max_new_tokens=100) print(tokenizer.decode(output[0])) print("==========================================\n\n") - -# Save to disk compressed. -SAVE_DIR = model_id.split("/")[-1] + "-awq-asym" -model.save_pretrained(SAVE_DIR, save_compressed=True) -tokenizer.save_pretrained(SAVE_DIR) diff --git a/examples/awq/qwen3_moe_example.py b/examples/awq/qwen3_moe_example.py index 5775284a1..fa171f0af 100644 --- a/examples/awq/qwen3_moe_example.py +++ b/examples/awq/qwen3_moe_example.py @@ -65,6 +65,14 @@ def tokenize(sample): num_calibration_samples=NUM_CALIBRATION_SAMPLES, ) +# Save to disk compressed. +SAVE_DIR = model_id.split("/")[-1] + "-awq-sym" +model.save_pretrained(SAVE_DIR, save_compressed=True) +tokenizer.save_pretrained(SAVE_DIR) + +# Load model after saving +model = AutoModelForCausalLM.from_pretrained(SAVE_DIR, device_map="auto") + # Confirm generations of the quantized model look sane. print("\n\n") print("========== SAMPLE GENERATION ==============") @@ -72,8 +80,3 @@ def tokenize(sample): output = model.generate(input_ids, max_new_tokens=100) print(tokenizer.decode(output[0])) print("==========================================\n\n") - -# Save to disk compressed. -SAVE_DIR = model_id.split("/")[-1] + "-awq-sym" -model.save_pretrained(SAVE_DIR, save_compressed=True) -tokenizer.save_pretrained(SAVE_DIR) diff --git a/examples/multimodal_audio/whisper_example.py b/examples/multimodal_audio/whisper_example.py index f19b0016a..4992f78d1 100644 --- a/examples/multimodal_audio/whisper_example.py +++ b/examples/multimodal_audio/whisper_example.py @@ -83,6 +83,14 @@ def data_collator(batch): data_collator=data_collator, ) +# Save to disk compressed. +SAVE_DIR = model_id.split("/")[1] + "-W4A16-G128" +model.save_pretrained(SAVE_DIR, save_compressed=True) +processor.save_pretrained(SAVE_DIR) + +# Load model after saving +model = WhisperForConditionalGeneration.from_pretrained(SAVE_DIR, device_map="auto") + # Confirm generations of the quantized model look sane. print("\n\n") print("========== SAMPLE GENERATION ==============") @@ -92,15 +100,9 @@ def data_collator(batch): "input_features": torch.tensor(sample_features).to(model.device), "decoder_input_ids": torch.tensor(sample_decoder_ids).to(model.device), } - output = model.generate(**sample_input, language="en") print(processor.batch_decode(output, skip_special_tokens=True)) print("==========================================\n\n") # that's where you have a lot of windows in the south no actually that's passive solar # and passive solar is something that was developed and designed in the 1960s and 70s # and it was a great thing for what it was at the time but it's not a passive house - -# Save to disk compressed. -SAVE_DIR = model_id.split("/")[1] + "-W4A16-G128" -model.save_pretrained(SAVE_DIR, save_compressed=True) -processor.save_pretrained(SAVE_DIR) diff --git a/examples/multimodal_vision/gemma3_example.py b/examples/multimodal_vision/gemma3_example.py index 3310d82d4..8f4db44c5 100644 --- a/examples/multimodal_vision/gemma3_example.py +++ b/examples/multimodal_vision/gemma3_example.py @@ -46,6 +46,14 @@ def data_collator(batch): data_collator=data_collator, ) +# Save to disk compressed. 
+SAVE_DIR = model_id.split("/")[1] + "-W4A16-G128" +model.save_pretrained(SAVE_DIR, save_compressed=True) +processor.save_pretrained(SAVE_DIR) + +# Load model after saving +model = Gemma3ForConditionalGeneration.from_pretrained(SAVE_DIR, device_map="auto") + # Confirm generations of the quantized model look sane. print("========== SAMPLE GENERATION ==============") messages = [ @@ -65,8 +73,3 @@ def data_collator(batch): output = model.generate(**inputs, max_new_tokens=100) print(processor.decode(output[0], skip_special_tokens=True)) print("==========================================") - -# Save to disk compressed. -SAVE_DIR = model_id.split("/")[1] + "-W4A16-G128" -model.save_pretrained(SAVE_DIR, save_compressed=True) -processor.save_pretrained(SAVE_DIR) diff --git a/examples/multimodal_vision/idefics3_example.py b/examples/multimodal_vision/idefics3_example.py index 71434868e..fcc4559d8 100644 --- a/examples/multimodal_vision/idefics3_example.py +++ b/examples/multimodal_vision/idefics3_example.py @@ -92,6 +92,14 @@ def tokenize(sample): data_collator=data_collator, ) +# Save to disk compressed. +SAVE_DIR = model_id.split("/")[1] + "-W4A16-G128" +model.save_pretrained(SAVE_DIR, save_compressed=True) +processor.save_pretrained(SAVE_DIR) + +# Load model after saving +model = Idefics3ForConditionalGeneration.from_pretrained(SAVE_DIR, device_map="auto") + # Confirm generations of the quantized model look sane. print("========== SAMPLE GENERATION ==============") messages = [ @@ -111,8 +119,3 @@ def tokenize(sample): output = model.generate(**inputs, max_new_tokens=100) print(processor.decode(output[0], skip_special_tokens=True)) print("==========================================") - -# Save to disk compressed. -SAVE_DIR = model_id.split("/")[1] + "-W4A16-G128" -model.save_pretrained(SAVE_DIR, save_compressed=True) -processor.save_pretrained(SAVE_DIR) diff --git a/examples/multimodal_vision/llava_example.py b/examples/multimodal_vision/llava_example.py index c5c370096..abb781d48 100644 --- a/examples/multimodal_vision/llava_example.py +++ b/examples/multimodal_vision/llava_example.py @@ -47,6 +47,14 @@ def data_collator(batch): data_collator=data_collator, ) +# Save to disk compressed. +SAVE_DIR = model_id.split("/")[1] + "-W4A16-G128" +model.save_pretrained(SAVE_DIR, save_compressed=True) +processor.save_pretrained(SAVE_DIR) + +# Load model after saving +model = LlavaForConditionalGeneration.from_pretrained(SAVE_DIR, device_map="auto") + # Confirm generations of the quantized model look sane. print("========== SAMPLE GENERATION ==============") messages = [ @@ -66,8 +74,3 @@ def data_collator(batch): output = model.generate(**inputs, max_new_tokens=100) print(processor.decode(output[0], skip_special_tokens=True)) print("==========================================") - -# Save to disk compressed. -SAVE_DIR = model_id.split("/")[1] + "-W4A16-G128" -model.save_pretrained(SAVE_DIR, save_compressed=True) -processor.save_pretrained(SAVE_DIR) diff --git a/examples/multimodal_vision/mistral3_example.py b/examples/multimodal_vision/mistral3_example.py index 5ad1820f3..50b7e7dd5 100644 --- a/examples/multimodal_vision/mistral3_example.py +++ b/examples/multimodal_vision/mistral3_example.py @@ -60,6 +60,14 @@ def data_collator(batch): data_collator=data_collator, ) +# Save to disk compressed. 
+SAVE_DIR = model_id.split("/")[1] + "-W4A16-G128" +model.save_pretrained(SAVE_DIR, save_compressed=True) +processor.save_pretrained(SAVE_DIR) + +# Load model after saving +model = Mistral3ForConditionalGeneration.from_pretrained(SAVE_DIR, device_map="auto") + # Confirm generations of the quantized model look sane. print("========== SAMPLE GENERATION ==============") messages = [ @@ -80,8 +88,3 @@ def data_collator(batch): output = model.generate(**inputs, max_new_tokens=100) print(processor.decode(output[0], skip_special_tokens=True)) print("==========================================") - -# Save to disk compressed. -SAVE_DIR = model_id.split("/")[1] + "-W4A16-G128" -model.save_pretrained(SAVE_DIR, save_compressed=True) -processor.save_pretrained(SAVE_DIR) diff --git a/examples/multimodal_vision/mllama_example.py b/examples/multimodal_vision/mllama_example.py index 9812bcf44..7cd45c85a 100644 --- a/examples/multimodal_vision/mllama_example.py +++ b/examples/multimodal_vision/mllama_example.py @@ -47,6 +47,14 @@ def data_collator(batch): data_collator=data_collator, ) +# Save to disk compressed. +SAVE_DIR = model_id.split("/")[1] + "-W4A16-G128" +model.save_pretrained(SAVE_DIR, save_compressed=True) +processor.save_pretrained(SAVE_DIR) + +# Load model after saving +model = MllamaForConditionalGeneration.from_pretrained(SAVE_DIR, device_map="auto") + # Confirm generations of the quantized model look sane. print("========== SAMPLE GENERATION ==============") messages = [ @@ -66,8 +74,3 @@ def data_collator(batch): output = model.generate(**inputs, max_new_tokens=100) print(processor.decode(output[0], skip_special_tokens=True)) print("==========================================") - -# Save to disk compressed. -SAVE_DIR = model_id.split("/")[1] + "-W4A16-G128" -model.save_pretrained(SAVE_DIR, save_compressed=True) -processor.save_pretrained(SAVE_DIR) diff --git a/examples/multimodal_vision/phi3_vision_example.py b/examples/multimodal_vision/phi3_vision_example.py index 537ff4dc4..772001cb1 100644 --- a/examples/multimodal_vision/phi3_vision_example.py +++ b/examples/multimodal_vision/phi3_vision_example.py @@ -78,6 +78,14 @@ def data_collator(batch): ignore=["lm_head", "re:model.vision_embed_tokens.*"], ) +# Save to disk compressed. +SAVE_DIR = model_id.split("/")[1] + "-W4A16-G128" +model.save_pretrained(SAVE_DIR, save_compressed=True) +processor.save_pretrained(SAVE_DIR) + +# Load model after saving +model = AutoModelForCausalLM.from_pretrained(SAVE_DIR, device_map="auto") + # Perform oneshot oneshot( model=model, @@ -95,8 +103,3 @@ def data_collator(batch): output = model.generate(input_ids, max_new_tokens=20) print(processor.decode(output[0])) print("==========================================") - -# Save to disk compressed. -SAVE_DIR = model_id.split("/")[1] + "-W4A16-G128" -model.save_pretrained(SAVE_DIR, save_compressed=True) -processor.save_pretrained(SAVE_DIR) diff --git a/examples/multimodal_vision/pixtral_example.py b/examples/multimodal_vision/pixtral_example.py index 996eea885..71c1a0770 100644 --- a/examples/multimodal_vision/pixtral_example.py +++ b/examples/multimodal_vision/pixtral_example.py @@ -53,6 +53,14 @@ def data_collator(batch): data_collator=data_collator, ) +# Save to disk compressed. 
+SAVE_DIR = model_id.split("/")[1] + "-W4A16-G128" +model.save_pretrained(SAVE_DIR, save_compressed=True) +processor.save_pretrained(SAVE_DIR) + +# Load model after saving +model = LlavaForConditionalGeneration.from_pretrained(SAVE_DIR, device_map="auto") + # Confirm generations of the quantized model look sane. print("========== SAMPLE GENERATION ==============") messages = [ @@ -72,8 +80,3 @@ def data_collator(batch): output = model.generate(**inputs, max_new_tokens=100) print(processor.decode(output[0], skip_special_tokens=True)) print("==========================================") - -# Save to disk compressed. -SAVE_DIR = model_id.split("/")[1] + "-W4A16-G128" -model.save_pretrained(SAVE_DIR, save_compressed=True) -processor.save_pretrained(SAVE_DIR) diff --git a/examples/multimodal_vision/qwen2_vl_example.py b/examples/multimodal_vision/qwen2_vl_example.py index cb64e3eb9..27e0954d3 100644 --- a/examples/multimodal_vision/qwen2_vl_example.py +++ b/examples/multimodal_vision/qwen2_vl_example.py @@ -95,6 +95,14 @@ def data_collator(batch): data_collator=data_collator, ) +# Save to disk compressed. +SAVE_DIR = model_id.split("/")[1] + "-W4A16-G128" +model.save_pretrained(SAVE_DIR, save_compressed=True) +processor.save_pretrained(SAVE_DIR) + +# Load model after saving +model = Qwen2VLForConditionalGeneration.from_pretrained(SAVE_DIR, device_map="auto") + # Confirm generations of the quantized model look sane. print("========== SAMPLE GENERATION ==============") messages = [ @@ -123,9 +131,3 @@ def data_collator(batch): output = model.generate(**inputs, max_new_tokens=100) print(processor.decode(output[0], skip_special_tokens=True)) print("==========================================") - - -# Save to disk compressed. -SAVE_DIR = model_id.split("/")[1] + "-W4A16-G128" -model.save_pretrained(SAVE_DIR, save_compressed=True) -processor.save_pretrained(SAVE_DIR) diff --git a/examples/multimodal_vision/qwen_2_5_vl_example.py b/examples/multimodal_vision/qwen_2_5_vl_example.py index 83eea10dd..68798cf00 100644 --- a/examples/multimodal_vision/qwen_2_5_vl_example.py +++ b/examples/multimodal_vision/qwen_2_5_vl_example.py @@ -89,6 +89,14 @@ def data_collator(batch): data_collator=data_collator, ) +# Save to disk compressed. +SAVE_DIR = model_id.split("/")[1] + "-W4A16-G128" +model.save_pretrained(SAVE_DIR, save_compressed=True) +processor.save_pretrained(SAVE_DIR) + +# Load model after saving +model = Qwen2_5_VLForConditionalGeneration.from_pretrained(SAVE_DIR, device_map="auto") + # Confirm generations of the quantized model look sane. print("========== SAMPLE GENERATION ==============") messages = [ @@ -117,9 +125,3 @@ def data_collator(batch): output = model.generate(**inputs, max_new_tokens=100) print(processor.decode(output[0], skip_special_tokens=True)) print("==========================================") - - -# Save to disk compressed. -SAVE_DIR = model_id.split("/")[1] + "-W4A16-G128" -model.save_pretrained(SAVE_DIR, save_compressed=True) -processor.save_pretrained(SAVE_DIR) diff --git a/examples/quantization_kv_cache/gemma2_fp8_kv_example.py b/examples/quantization_kv_cache/gemma2_fp8_kv_example.py index 840e10a41..cf5501bc9 100644 --- a/examples/quantization_kv_cache/gemma2_fp8_kv_example.py +++ b/examples/quantization_kv_cache/gemma2_fp8_kv_example.py @@ -77,6 +77,14 @@ def process_and_tokenize(example): num_calibration_samples=NUM_CALIBRATION_SAMPLES, ) +# Save to disk compressed. 
+SAVE_DIR = model_id.split("/")[1] + "-FP8-KV" +model.save_pretrained(SAVE_DIR, save_compressed=True) +tokenizer.save_pretrained(SAVE_DIR) + +# Load model after saving +model = AutoModelForCausalLM.from_pretrained(SAVE_DIR, device_map="auto") + print( "Note: Inference with the quantized kv_cache is not supported. ", "Please use vLLM for inference with the quantized kv_cache.", @@ -92,8 +100,3 @@ def process_and_tokenize(example): output = model.generate(input_ids, max_new_tokens=100) print(tokenizer.decode(output[0])) print("==========================================\n\n") - -# Save to disk compressed. -SAVE_DIR = model_id.split("/")[1] + "-FP8-KV" -model.save_pretrained(SAVE_DIR, save_compressed=True) -tokenizer.save_pretrained(SAVE_DIR) diff --git a/examples/quantization_kv_cache/llama3_fp8_kv_example.py b/examples/quantization_kv_cache/llama3_fp8_kv_example.py index df866e117..0cb52e0fd 100644 --- a/examples/quantization_kv_cache/llama3_fp8_kv_example.py +++ b/examples/quantization_kv_cache/llama3_fp8_kv_example.py @@ -82,6 +82,14 @@ def process_and_tokenize(example): num_calibration_samples=NUM_CALIBRATION_SAMPLES, ) +# Save to disk compressed. +SAVE_DIR = model_id.split("/")[1] + "-FP8-KV" +model.save_pretrained(SAVE_DIR, save_compressed=True) +tokenizer.save_pretrained(SAVE_DIR) + +# Load model after saving +model = AutoModelForCausalLM.from_pretrained(SAVE_DIR, device_map="auto") + logger.info( "Running sample generation. ", "Note: Inference with the quantized kv_cache is not supported. ", @@ -94,8 +102,3 @@ def process_and_tokenize(example): output = model.generate(input_ids, max_new_tokens=100) print(tokenizer.decode(output[0])) print("==========================================\n\n") - -# Save to disk compressed. -SAVE_DIR = model_id.split("/")[1] + "-FP8-KV" -model.save_pretrained(SAVE_DIR, save_compressed=True) -tokenizer.save_pretrained(SAVE_DIR) diff --git a/examples/quantization_kv_cache/phi3.5_fp8_kv_example.py b/examples/quantization_kv_cache/phi3.5_fp8_kv_example.py index f22e0ea02..20e7fac4c 100644 --- a/examples/quantization_kv_cache/phi3.5_fp8_kv_example.py +++ b/examples/quantization_kv_cache/phi3.5_fp8_kv_example.py @@ -79,6 +79,14 @@ def process_and_tokenize(example): num_calibration_samples=NUM_CALIBRATION_SAMPLES, ) +# Save to disk compressed. +SAVE_DIR = MODEL_ID.split("/")[1] + "-FP8-KV" +model.save_pretrained(SAVE_DIR, save_compressed=True) +tokenizer.save_pretrained(SAVE_DIR) + +# Load model after saving +model = AutoModelForCausalLM.from_pretrained(SAVE_DIR, device_map="auto") + print( "Note: Inference with the quantized kv_cache is not supported. ", "Please use vLLM for inference with the quantized kv_cache.", @@ -90,8 +98,3 @@ def process_and_tokenize(example): output = model.generate(input_ids, max_new_tokens=100) print(tokenizer.decode(output[0])) print("==========================================\n\n") - -# Save to disk compressed. -SAVE_DIR = MODEL_ID.split("/")[1] + "-FP8-KV" -model.save_pretrained(SAVE_DIR, save_compressed=True) -tokenizer.save_pretrained(SAVE_DIR) diff --git a/examples/quantization_w4a16/llama3_example.py b/examples/quantization_w4a16/llama3_example.py index 7d7bb0448..9b0b0ffaa 100644 --- a/examples/quantization_w4a16/llama3_example.py +++ b/examples/quantization_w4a16/llama3_example.py @@ -61,6 +61,14 @@ def tokenize(sample): num_calibration_samples=NUM_CALIBRATION_SAMPLES, ) +# Save to disk compressed. 
+SAVE_DIR = model_id.split("/")[1] + "-W4A16-G128" +model.save_pretrained(SAVE_DIR, save_compressed=True) +tokenizer.save_pretrained(SAVE_DIR) + +# Load model after saving +model = AutoModelForCausalLM.from_pretrained(SAVE_DIR, device_map="auto") + # Confirm generations of the quantized model look sane. print("\n\n") print("========== SAMPLE GENERATION ==============") @@ -68,8 +76,3 @@ def tokenize(sample): output = model.generate(input_ids, max_new_tokens=100) print(tokenizer.decode(output[0])) print("==========================================\n\n") - -# Save to disk compressed. -SAVE_DIR = model_id.split("/")[1] + "-W4A16-G128" -model.save_pretrained(SAVE_DIR, save_compressed=True) -tokenizer.save_pretrained(SAVE_DIR) diff --git a/examples/quantization_w4a16_fp4/llama3_example.py b/examples/quantization_w4a16_fp4/llama3_example.py index 4bd0f16b0..378f4f988 100644 --- a/examples/quantization_w4a16_fp4/llama3_example.py +++ b/examples/quantization_w4a16_fp4/llama3_example.py @@ -17,15 +17,18 @@ # Apply quantization. oneshot(model=model, recipe=recipe) +# Save to disk in compressed-tensors format. +SAVE_DIR = MODEL_ID.split("/")[1] + "-NVFP4A16" +model.save_pretrained(SAVE_DIR, save_compressed=True) +tokenizer.save_pretrained(SAVE_DIR) + +# Load model after saving +model = AutoModelForCausalLM.from_pretrained(SAVE_DIR, device_map="auto") + +# Validate model generations print("\n\n") print("========== SAMPLE GENERATION ==============") input_ids = tokenizer("Hello my name is", return_tensors="pt").input_ids.to("cuda") output = model.generate(input_ids, max_new_tokens=100) print(tokenizer.decode(output[0])) print("==========================================\n\n") - - -# Save to disk in compressed-tensors format. -SAVE_DIR = MODEL_ID.split("/")[1] + "-NVFP4A16" -model.save_pretrained(SAVE_DIR, save_compressed=True) -tokenizer.save_pretrained(SAVE_DIR) diff --git a/examples/quantization_w4a4_fp4/llama3_example.py b/examples/quantization_w4a4_fp4/llama3_example.py index edff1a04c..d84b749c0 100644 --- a/examples/quantization_w4a4_fp4/llama3_example.py +++ b/examples/quantization_w4a4_fp4/llama3_example.py @@ -64,15 +64,18 @@ def tokenize(sample): num_calibration_samples=NUM_CALIBRATION_SAMPLES, ) +# Save to disk in compressed-tensors format. +SAVE_DIR = model_id.split("/")[1] + "-NVFP4" +model.save_pretrained(SAVE_DIR, save_compressed=True) +tokenizer.save_pretrained(SAVE_DIR) + +# Load model after saving +model = AutoModelForCausalLM.from_pretrained(SAVE_DIR, device_map="auto") + +# Validate model generations print("\n\n") print("========== SAMPLE GENERATION ==============") input_ids = tokenizer("Hello my name is", return_tensors="pt").input_ids.to("cuda") output = model.generate(input_ids, max_new_tokens=100) print(tokenizer.decode(output[0])) print("==========================================\n\n") - - -# Save to disk in compressed-tensors format. -SAVE_DIR = model_id.split("/")[1] + "-NVFP4" -model.save_pretrained(SAVE_DIR, save_compressed=True) -tokenizer.save_pretrained(SAVE_DIR) diff --git a/examples/quantization_w8a8_fp8/gemma2_example.py b/examples/quantization_w8a8_fp8/gemma2_example.py index ed1efe2af..713ce6609 100644 --- a/examples/quantization_w8a8_fp8/gemma2_example.py +++ b/examples/quantization_w8a8_fp8/gemma2_example.py @@ -18,13 +18,15 @@ ) # 3) Apply quantization and save in compressed-tensors format. 
-OUTPUT_DIR = MODEL_ID.split("/")[1] + "-FP8-Dynamic" -oneshot( - model=model, - recipe=recipe, - tokenizer=tokenizer, - output_dir=OUTPUT_DIR, -) +oneshot(model=model, recipe=recipe, tokenizer=tokenizer) + +# Save to disk in compressed-tensors format. +SAVE_DIR = MODEL_ID.split("/")[1] + "-FP8-Dynamic" +model.save_pretrained(SAVE_DIR, save_compressed=True) +tokenizer.save_pretrained(SAVE_DIR) + +# Load model after saving +model = AutoModelForCausalLM.from_pretrained(SAVE_DIR, device_map="auto") # Confirm generations of the quantized model look sane. # NOTE: transformers 4.49.0 results in a generation error with gemma2. diff --git a/examples/quantization_w8a8_fp8/llama3.2_vision_example.py b/examples/quantization_w8a8_fp8/llama3.2_vision_example.py index e4d8bebac..d3a63eb0c 100644 --- a/examples/quantization_w8a8_fp8/llama3.2_vision_example.py +++ b/examples/quantization_w8a8_fp8/llama3.2_vision_example.py @@ -19,15 +19,20 @@ ignore=["re:.*lm_head", "re:multi_modal_projector.*", "re:vision_model.*"], ) -# Apply quantization and save to disk in compressed-tensors format. -SAVE_DIR = MODEL_ID.split("/")[1] + "-FP8-Dynamic" +# Apply quantization. oneshot( model=model, recipe=recipe, - output_dir=SAVE_DIR, ) + +# Save to disk in compressed-tensors format. +SAVE_DIR = MODEL_ID.split("/")[1] + "-FP8-Dynamic" +model.save_pretrained(SAVE_DIR) processor.save_pretrained(SAVE_DIR) +# Load model after saving +model = MllamaForConditionalGeneration.from_pretrained(SAVE_DIR, device_map="auto") + # Confirm generations of the quantized model look sane. print("========== SAMPLE GENERATION ==============") input_ids = processor(text="Hello my name is", return_tensors="pt").input_ids.to("cuda") diff --git a/examples/quantization_w8a8_fp8/llama3_example.py b/examples/quantization_w8a8_fp8/llama3_example.py index 5227eabb2..badd6ac5e 100644 --- a/examples/quantization_w8a8_fp8/llama3_example.py +++ b/examples/quantization_w8a8_fp8/llama3_example.py @@ -20,14 +20,17 @@ # Apply quantization. oneshot(model=model, recipe=recipe) +# Save to disk in compressed-tensors format. +SAVE_DIR = MODEL_ID.split("/")[1] + "-FP8-Dynamic" +model.save_pretrained(SAVE_DIR) +tokenizer.save_pretrained(SAVE_DIR) + +# Load model after saving +model = AutoModelForCausalLM.from_pretrained(SAVE_DIR, device_map="auto") + # Confirm generations of the quantized model look sane. print("========== SAMPLE GENERATION ==============") input_ids = tokenizer("Hello my name is", return_tensors="pt").input_ids.to("cuda") output = model.generate(input_ids, max_new_tokens=20) print(tokenizer.decode(output[0])) print("==========================================") - -# Save to disk in compressed-tensors format. -SAVE_DIR = MODEL_ID.split("/")[1] + "-FP8-Dynamic" -model.save_pretrained(SAVE_DIR) -tokenizer.save_pretrained(SAVE_DIR) diff --git a/examples/quantization_w8a8_fp8/llava1.5_example.py b/examples/quantization_w8a8_fp8/llava1.5_example.py index c05d94a80..06d1483e1 100644 --- a/examples/quantization_w8a8_fp8/llava1.5_example.py +++ b/examples/quantization_w8a8_fp8/llava1.5_example.py @@ -19,11 +19,17 @@ ignore=["re:.*lm_head", "re:multi_modal_projector.*", "re:vision_tower.*"], ) -# Apply quantization and save to disk in compressed-tensors format. +# Apply quantization. +oneshot(model=model, recipe=recipe) + +# Save to disk in compressed-tensors format. 
SAVE_DIR = MODEL_ID.split("/")[1] + "-FP8-Dynamic" -oneshot(model=model, recipe=recipe, output_dir=SAVE_DIR) +model.save_pretrained(SAVE_DIR) processor.save_pretrained(SAVE_DIR) +# Load model after saving +model = LlavaForConditionalGeneration.from_pretrained(SAVE_DIR, device_map="auto") + # Confirm generations of the quantized model look sane. print("========== SAMPLE GENERATION ==============") input_ids = processor(text="Hello my name is", return_tensors="pt").input_ids.to("cuda") diff --git a/examples/quantization_w8a8_fp8/qwen2vl_example.py b/examples/quantization_w8a8_fp8/qwen2vl_example.py index c364fbb1e..1cf73f527 100644 --- a/examples/quantization_w8a8_fp8/qwen2vl_example.py +++ b/examples/quantization_w8a8_fp8/qwen2vl_example.py @@ -19,11 +19,17 @@ ignore=["re:.*lm_head", "re:visual.*"], ) -# Apply quantization and save to disk in compressed-tensors format. +# Apply quantization. +oneshot(model=model, recipe=recipe) + +# Save to disk in compressed-tensors format. SAVE_DIR = MODEL_ID.split("/")[1] + "-FP8-Dynamic" -oneshot(model=model, recipe=recipe, output_dir=SAVE_DIR) +model.save_pretrained(SAVE_DIR) processor.save_pretrained(SAVE_DIR) +# Load model after saving +model = Qwen2VLForConditionalGeneration.from_pretrained(SAVE_DIR, device_map="auto") + # Confirm generations of the quantized model look sane. print("========== SAMPLE GENERATION ==============") input_ids = processor(text="Hello my name is", return_tensors="pt").input_ids.to("cuda") diff --git a/examples/quantization_w8a8_fp8/whisper_example.py b/examples/quantization_w8a8_fp8/whisper_example.py index 7f504a41b..51ac804b7 100644 --- a/examples/quantization_w8a8_fp8/whisper_example.py +++ b/examples/quantization_w8a8_fp8/whisper_example.py @@ -23,6 +23,14 @@ # Apply quantization. oneshot(model=model, recipe=recipe) +# Save to disk in compressed-tensors format. +SAVE_DIR = MODEL_ID.split("/")[1] + "-FP8-Dynamic" +model.save_pretrained(SAVE_DIR, save_compressed=True) +processor.save_pretrained(SAVE_DIR) + +# Load model after saving +model = WhisperForConditionalGeneration.from_pretrained(SAVE_DIR, device_map="auto") + # Confirm generations of the quantized model look sane. print("========== SAMPLE GENERATION ==============") ds = load_dataset( @@ -37,8 +45,3 @@ print(processor.batch_decode(output_ids, skip_special_tokens=False)[0]) # Mr. Quilter is the apostle of the middle classes and we are glad to welcome his gospel print("==========================================") - -# Save to disk in compressed-tensors format. -SAVE_DIR = MODEL_ID.split("/")[1] + "-FP8-Dynamic" -model.save_pretrained(SAVE_DIR, save_compressed=True) -processor.save_pretrained(SAVE_DIR) diff --git a/examples/quantization_w8a8_int8/gemma2_example.py b/examples/quantization_w8a8_int8/gemma2_example.py index ac7ff5f49..a0ebe0079 100644 --- a/examples/quantization_w8a8_int8/gemma2_example.py +++ b/examples/quantization_w8a8_int8/gemma2_example.py @@ -54,15 +54,19 @@ def tokenize(sample): recipe = GPTQModifier(targets="Linear", scheme="W8A8", ignore=["lm_head"]) # 4) Apply quantization and save to disk compressed. +SAVE_DIR = MODEL_ID.split("/")[1] + "-INT8" oneshot( model=model, dataset=ds, recipe=recipe, max_seq_length=MAX_SEQUENCE_LENGTH, num_calibration_samples=NUM_CALIBRATION_SAMPLES, - output_dir=MODEL_ID.split("/")[1] + "-INT8", + output_dir=SAVE_DIR, ) +# Load model after saving +model = AutoModelForCausalLM.from_pretrained(SAVE_DIR, device_map="auto") + # Confirm generations of the quantized model look sane. 
# NOTE: transformers 4.49.0 results in a generation error with gemma2. # Consider either downgrading your transformers version to a previous version diff --git a/examples/quantization_w8a8_int8/llama3_example.py b/examples/quantization_w8a8_int8/llama3_example.py index d3067de6f..1894932f1 100644 --- a/examples/quantization_w8a8_int8/llama3_example.py +++ b/examples/quantization_w8a8_int8/llama3_example.py @@ -67,6 +67,14 @@ def tokenize(sample): num_calibration_samples=NUM_CALIBRATION_SAMPLES, ) +# Save to disk compressed. +SAVE_DIR = MODEL_ID.split("/")[1] + "-W8A8-Dynamic-Per-Token" +model.save_pretrained(SAVE_DIR, save_compressed=True) +tokenizer.save_pretrained(SAVE_DIR) + +# Load model after saving +model = AutoModelForCausalLM.from_pretrained(SAVE_DIR, device_map="auto") + # Confirm generations of the quantized model look sane. print("\n\n") print("========== SAMPLE GENERATION ==============") @@ -74,8 +82,3 @@ def tokenize(sample): output = model.generate(input_ids, max_new_tokens=100) print(tokenizer.decode(output[0])) print("==========================================\n\n") - -# Save to disk compressed. -SAVE_DIR = MODEL_ID.split("/")[1] + "-W8A8-Dynamic-Per-Token" -model.save_pretrained(SAVE_DIR, save_compressed=True) -tokenizer.save_pretrained(SAVE_DIR) diff --git a/examples/quantizing_moe/deepseek_moe_w8a8_fp8.py b/examples/quantizing_moe/deepseek_moe_w8a8_fp8.py index 261ac93f2..69dae05ad 100644 --- a/examples/quantizing_moe/deepseek_moe_w8a8_fp8.py +++ b/examples/quantizing_moe/deepseek_moe_w8a8_fp8.py @@ -79,6 +79,9 @@ def tokenize(sample): output_dir=SAVE_DIR, ) +# Load model after saving +model = AutoModelForCausalLM.from_pretrained(SAVE_DIR, device_map="auto") + # Confirm generations of the quantized model look sane. # Generation is broken for deepseek models when using the latest transformers package if Version(__version__) < Version("4.48"): diff --git a/examples/quantizing_moe/deepseek_moe_w8a8_int8.py b/examples/quantizing_moe/deepseek_moe_w8a8_int8.py index 1b8d80a66..aada68b31 100644 --- a/examples/quantizing_moe/deepseek_moe_w8a8_int8.py +++ b/examples/quantizing_moe/deepseek_moe_w8a8_int8.py @@ -80,6 +80,9 @@ def tokenize(sample): output_dir=SAVE_DIR, ) +# Load model after saving +model = AutoModelForCausalLM.from_pretrained(SAVE_DIR, device_map="auto") + # Confirm generations of the quantized model look sane. # Generation is broken for deepseek models when using the latest transformers package if Version(__version__) < Version("4.48"): diff --git a/examples/quantizing_moe/mixtral_moe_w8a8_fp8.py b/examples/quantizing_moe/mixtral_moe_w8a8_fp8.py index 3dc821ce3..15e3e67eb 100644 --- a/examples/quantizing_moe/mixtral_moe_w8a8_fp8.py +++ b/examples/quantizing_moe/mixtral_moe_w8a8_fp8.py @@ -43,6 +43,9 @@ output_dir=SAVE_DIR, ) +# Load model after saving +model = AutoModelForCausalLM.from_pretrained(SAVE_DIR, device_map="auto") + # Confirm generations of the quantized model look sane. # Generation is broken for deepseek models when using the latest transformers package if Version(__version__) < Version("4.48"): diff --git a/examples/quantizing_moe/qwen_moe_w4a16.py b/examples/quantizing_moe/qwen_moe_w4a16.py index ebb4a5615..d025004f0 100644 --- a/examples/quantizing_moe/qwen_moe_w4a16.py +++ b/examples/quantizing_moe/qwen_moe_w4a16.py @@ -73,6 +73,9 @@ def tokenize(sample): output_dir=SAVE_DIR, ) +# Load model after saving +model = AutoModelForCausalLM.from_pretrained(SAVE_DIR, device_map="auto") + # Confirm generations of the quantized model look sane. 
print("========== SAMPLE GENERATION ==============") input_ids = tokenizer("Hello my name is", return_tensors="pt").input_ids.to("cuda") diff --git a/examples/sparse_2of4_quantization_fp8/llama3_8b_2of4.py b/examples/sparse_2of4_quantization_fp8/llama3_8b_2of4.py index 3952d0a90..d51ad7670 100644 --- a/examples/sparse_2of4_quantization_fp8/llama3_8b_2of4.py +++ b/examples/sparse_2of4_quantization_fp8/llama3_8b_2of4.py @@ -97,13 +97,16 @@ def get_recipe(fp8_enabled): num_calibration_samples=NUM_CALIBRATION_SAMPLES, ) +# Save compressed model and tokenizer +model.save_pretrained(save_dir) +tokenizer.save_pretrained(save_dir) + +# Load model after saving +model = AutoModelForCausalLM.from_pretrained(save_dir, device_map="auto") + # Validate the compressed model print("\n========== SAMPLE GENERATION ==============") input_ids = tokenizer("Hello my name is", return_tensors="pt").input_ids.to("cuda") output = model.generate(input_ids, max_new_tokens=100) print(tokenizer.decode(output[0])) print("==========================================\n") - -# Save compressed model and tokenizer -model.save_pretrained(save_dir) -tokenizer.save_pretrained(save_dir) From 9d6c227e3e75f8e2736828246da5dd767b74bfe1 Mon Sep 17 00:00:00 2001 From: Kyle Sayers Date: Mon, 9 Jun 2025 20:42:44 -0400 Subject: [PATCH 09/36] remove hooks Signed-off-by: Kyle Sayers --- src/llmcompressor/entrypoints/oneshot.py | 4 ++++ src/llmcompressor/entrypoints/utils.py | 4 ++++ 2 files changed, 8 insertions(+) diff --git a/src/llmcompressor/entrypoints/oneshot.py b/src/llmcompressor/entrypoints/oneshot.py index 9659b7d7e..f8f4496d2 100644 --- a/src/llmcompressor/entrypoints/oneshot.py +++ b/src/llmcompressor/entrypoints/oneshot.py @@ -14,6 +14,8 @@ from llmcompressor.entrypoints.utils import post_process, pre_process from llmcompressor.pipelines import CalibrationPipeline +from accelerate.hooks import remove_hook_from_module + __all__ = ["Oneshot", "oneshot"] @@ -130,6 +132,8 @@ def __init__( # TODO: consider renaming function similar to "offload_dispatch_model" # TODO: modify function to remove any hooks if they already exist (making # sure to move to cpu when removing hook + # TODO: remove hook in util + remove_hook_from_module(model_args.model, recurse=True) force_cpu_offload(model_args.model, model_args.oneshot_device) else: logger.warning("CUDA is not available! Compressing model on CPU instead") diff --git a/src/llmcompressor/entrypoints/utils.py b/src/llmcompressor/entrypoints/utils.py index 0186628f0..23758b5f3 100644 --- a/src/llmcompressor/entrypoints/utils.py +++ b/src/llmcompressor/entrypoints/utils.py @@ -27,6 +27,7 @@ ) from llmcompressor.typing import Processor from llmcompressor.utils.fsdp.helpers import is_fsdp_model +from accelerate.hooks import remove_hook_from_module def pre_process(model_args: "ModelArguments"): @@ -105,6 +106,9 @@ def post_process( "Ex. 
`oneshot(..., output_dir=...)`" ) + # Remove any existing hooks (maybe added by oneshot sequential onloading) + remove_hook_from_module(model_args.model, recurse=True) + # Reset the one-time-use session upon completion if recipe_args is not None and recipe_args.clear_sparse_session: reset_session() From 8351ac9fa41d3f39137a9e6af458ead1765ab8a4 Mon Sep 17 00:00:00 2001 From: Kyle Sayers Date: Mon, 9 Jun 2025 20:47:22 -0400 Subject: [PATCH 10/36] name change Signed-off-by: Kyle Sayers --- src/llmcompressor/pipelines/registry.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/llmcompressor/pipelines/registry.py b/src/llmcompressor/pipelines/registry.py index f472c0f0d..2ac384866 100644 --- a/src/llmcompressor/pipelines/registry.py +++ b/src/llmcompressor/pipelines/registry.py @@ -18,7 +18,7 @@ __all__ = ["CalibrationPipeline"] SEQUENTIAL_MODIFIERS = (AWQModifier, GPTQModifier, SparsityModifierBase) -NEED_DATA = (SmoothQuantModifier, *SEQUENTIAL_MODIFIERS) +CALIBRATION_MODIFIERS = (SmoothQuantModifier, *SEQUENTIAL_MODIFIERS) class CalibrationPipeline(ABC, RegistryMixin): @@ -61,7 +61,7 @@ def from_modifiers( @staticmethod def _validate_infer_pipeline(modifiers: List[Modifier]) -> str: - if any(isinstance(modifier, NEED_DATA) for modifier in modifiers): + if any(isinstance(modifier, CALIBRATION_MODIFIERS) for modifier in modifiers): return "sequential" active_qmods = _get_active_quant_modifiers(modifiers) From ad71c5bf2178d18f0278d06cb7e494e6aadcfffa Mon Sep 17 00:00:00 2001 From: Kyle Sayers Date: Thu, 12 Jun 2025 14:01:41 -0400 Subject: [PATCH 11/36] cleanup and nits Signed-off-by: Kyle Sayers --- src/llmcompressor/entrypoints/oneshot.py | 4 --- .../pipelines/layer_sequential/pipeline.py | 4 +-- .../pipelines/sequential/helpers.py | 8 +++-- .../pipelines/sequential/pipeline.py | 4 +-- src/llmcompressor/utils/module.py | 29 ------------------- 5 files changed, 10 insertions(+), 39 deletions(-) delete mode 100644 src/llmcompressor/utils/module.py diff --git a/src/llmcompressor/entrypoints/oneshot.py b/src/llmcompressor/entrypoints/oneshot.py index 5575f5327..0a7cff81e 100644 --- a/src/llmcompressor/entrypoints/oneshot.py +++ b/src/llmcompressor/entrypoints/oneshot.py @@ -128,10 +128,6 @@ def __init__( # offload to cpu if possible if "cuda" in str(model_args.oneshot_device) and torch.cuda.is_available(): - # TODO: consider renaming function similar to "offload_dispatch_model" - # TODO: modify function to remove any hooks if they already exist (making - # sure to move to cpu when removing hook - # TODO: remove hook in util remove_hook_from_module(model_args.model, recurse=True) force_cpu_offload(model_args.model, model_args.oneshot_device) else: diff --git a/src/llmcompressor/pipelines/layer_sequential/pipeline.py b/src/llmcompressor/pipelines/layer_sequential/pipeline.py index 3130f75d9..3a0cd8cb6 100644 --- a/src/llmcompressor/pipelines/layer_sequential/pipeline.py +++ b/src/llmcompressor/pipelines/layer_sequential/pipeline.py @@ -16,8 +16,8 @@ ) from llmcompressor.pipelines.registry import CalibrationPipeline from llmcompressor.pipelines.sequential.helpers import ( + disable_offloading, get_targets_from_modifiers, - keep_onload_context, ) from llmcompressor.utils.helpers import DisableQuantization, calibration_forward_context @@ -88,7 +88,7 @@ def __call__( prop_desc = f"({layer_index + 1}/{num_layers}): Propagating" # reduce memory movement by keeping modules onloaded - with keep_onload_context(): + with disable_offloading(): # do a preliminary pass to trigger 
modifier hooks for batch_idx in tqdm.tqdm(range(len(dataloader)), desc=calib_desc): inputs = intermediates.fetch(batch_idx) diff --git a/src/llmcompressor/pipelines/sequential/helpers.py b/src/llmcompressor/pipelines/sequential/helpers.py index ee282f73a..6cb63acdd 100644 --- a/src/llmcompressor/pipelines/sequential/helpers.py +++ b/src/llmcompressor/pipelines/sequential/helpers.py @@ -28,7 +28,7 @@ "trace_subgraphs", "Subgraph", "get_targets_from_modifiers", - "keep_onload_context", + "disable_offloading", ] @@ -494,7 +494,11 @@ def is_ancestor(module: Module) -> bool: @contextlib.contextmanager -def keep_onload_context(): +def disable_offloading(): + """ + Keep modules onloaded and disable offloading until this context exits. + Affects modules which have been hooked with accelerate's `AlignDevicesHook` + """ original_pre_forward = AlignDevicesHook.pre_forward onloaded_modules = dict() diff --git a/src/llmcompressor/pipelines/sequential/pipeline.py b/src/llmcompressor/pipelines/sequential/pipeline.py index 4af40f772..ab794daa4 100644 --- a/src/llmcompressor/pipelines/sequential/pipeline.py +++ b/src/llmcompressor/pipelines/sequential/pipeline.py @@ -11,8 +11,8 @@ from llmcompressor.pipelines.cache import IntermediatesCache from llmcompressor.pipelines.registry import CalibrationPipeline from llmcompressor.pipelines.sequential.helpers import ( + disable_offloading, get_targets_from_modifiers, - keep_onload_context, trace_subgraphs, ) from llmcompressor.utils.helpers import DisableQuantization, calibration_forward_context @@ -86,7 +86,7 @@ def __call__( prop_desc = f"({subgraph_index + 1}/{num_subgraphs}): Propagating" # reduce memory movement by keeping modules onloaded - with keep_onload_context(): + with disable_offloading(): # do a preliminary pass to trigger modifier hooks for batch_idx in tqdm(range(len(dataloader)), desc=calib_desc): inputs = activations.fetch(batch_idx, subgraph.input_names) diff --git a/src/llmcompressor/utils/module.py b/src/llmcompressor/utils/module.py deleted file mode 100644 index 0867b3955..000000000 --- a/src/llmcompressor/utils/module.py +++ /dev/null @@ -1,29 +0,0 @@ -from typing import Callable, Union - -import tqdm -from torch.nn import Module - - -def module_bfs( - module: Module, - func: Callable[[Module], Module], - pre: bool = True, - progress: Union[bool, tqdm.tqdm] = False, -) -> Module: - if progress is True: - total = len(list(module.modules())) - progress = tqdm.tqdm(total=total) - - if pre: - module = func(module) - - for name, child in list(module.named_children()): - module.add_module(name, module_bfs(child, func, pre, progress)) - - if not pre: - module = func(module) - - if isinstance(progress, tqdm.tqdm): - progress.update(1) - - return module From 819df1ccb3a408c1f78a4e090341b7b9d6c0f92b Mon Sep 17 00:00:00 2001 From: Kyle Sayers Date: Thu, 12 Jun 2025 16:12:53 -0400 Subject: [PATCH 12/36] rename function Signed-off-by: Kyle Sayers --- src/llmcompressor/entrypoints/oneshot.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/llmcompressor/entrypoints/oneshot.py b/src/llmcompressor/entrypoints/oneshot.py index 0a7cff81e..8bcf061f5 100644 --- a/src/llmcompressor/entrypoints/oneshot.py +++ b/src/llmcompressor/entrypoints/oneshot.py @@ -4,7 +4,7 @@ import torch from accelerate.hooks import remove_hook_from_module -from compressed_tensors.utils import force_cpu_offload +from compressed_tensors.utils import offloaded_dispatch from loguru import logger from torch.utils.data import DataLoader from transformers import 
PreTrainedModel @@ -129,7 +129,7 @@ def __init__( # offload to cpu if possible if "cuda" in str(model_args.oneshot_device) and torch.cuda.is_available(): remove_hook_from_module(model_args.model, recurse=True) - force_cpu_offload(model_args.model, model_args.oneshot_device) + offloaded_dispatch(model_args.model, model_args.oneshot_device) else: logger.warning("CUDA is not available! Compressing model on CPU instead") From 7dd71b94cb5b55895f1f3c97e4f2566470007a2a Mon Sep 17 00:00:00 2001 From: Kyle Sayers Date: Thu, 12 Jun 2025 16:51:37 -0400 Subject: [PATCH 13/36] add dispatch utility Signed-off-by: Kyle Sayers --- examples/quantization_w4a16/llama3_example.py | 20 +++++++++---------- src/llmcompressor/utils/dev.py | 17 ++++++++++++++-- 2 files changed, 25 insertions(+), 12 deletions(-) diff --git a/examples/quantization_w4a16/llama3_example.py b/examples/quantization_w4a16/llama3_example.py index 9b0b0ffaa..d0f3485d4 100644 --- a/examples/quantization_w4a16/llama3_example.py +++ b/examples/quantization_w4a16/llama3_example.py @@ -3,6 +3,7 @@ from llmcompressor.modifiers.quantization import GPTQModifier from llmcompressor.transformers import oneshot +from llmcompressor.utils.dev import dispatch_for_generation # Select model and load it. model_id = "meta-llama/Meta-Llama-3-8B-Instruct" @@ -61,18 +62,17 @@ def tokenize(sample): num_calibration_samples=NUM_CALIBRATION_SAMPLES, ) -# Save to disk compressed. -SAVE_DIR = model_id.split("/")[1] + "-W4A16-G128" -model.save_pretrained(SAVE_DIR, save_compressed=True) -tokenizer.save_pretrained(SAVE_DIR) - -# Load model after saving -model = AutoModelForCausalLM.from_pretrained(SAVE_DIR, device_map="auto") - # Confirm generations of the quantized model look sane. print("\n\n") print("========== SAMPLE GENERATION ==============") -input_ids = tokenizer("Hello my name is", return_tensors="pt").input_ids.to("cuda") -output = model.generate(input_ids, max_new_tokens=100) +dispatch_for_generation(model) +sample = tokenizer("Hello my name is", return_tensors="pt") +sample = {key: value.to("cuda") for key, value in sample.items()} +output = model.generate(**sample, max_new_tokens=100) print(tokenizer.decode(output[0])) print("==========================================\n\n") + +# Save to disk compressed. 
+SAVE_DIR = model_id.split("/")[-1] + "-W4A16-G128" +model.save_pretrained(SAVE_DIR, save_compressed=True) +tokenizer.save_pretrained(SAVE_DIR) \ No newline at end of file diff --git a/src/llmcompressor/utils/dev.py b/src/llmcompressor/utils/dev.py index 4af08448b..b1e5c014d 100644 --- a/src/llmcompressor/utils/dev.py +++ b/src/llmcompressor/utils/dev.py @@ -2,7 +2,7 @@ import logging import os import tempfile -from typing import Type +from typing import Type, Dict, Any, Union import torch from huggingface_hub import snapshot_download @@ -10,10 +10,12 @@ from transformers import AutoModelForCausalLM, PreTrainedModel from transformers.modeling_utils import TORCH_INIT_FUNCTIONS from transformers.utils import SAFE_WEIGHTS_INDEX_NAME, WEIGHTS_INDEX_NAME +from accelerate import dispatch_model, infer_auto_device_map +from accelerate.utils import get_balanced_memory from llmcompressor.utils.helpers import patch_attr -__all__ = ["skip_weights_download", "patch_transformers_logger_level"] +__all__ = ["skip_weights_download", "patch_transformers_logger_level", "dispatch_for_generation"] @contextlib.contextmanager @@ -106,3 +108,14 @@ def patch_transformers_logger_level(level: int = logging.ERROR): transformers_logger.setLevel(level=level) yield transformers_logger.setLevel(level=restore_log_level) + + +def dispatch_for_generation(model: PreTrainedModel) -> PreTrainedModel: + max_memory = get_balanced_memory( + model, + dtype=model.dtype, + no_split_module_classes=model._get_no_split_modules("auto") + ) + device_map = infer_auto_device_map(model, dtype=model.dtype, max_memory=max_memory) + + return dispatch_model(model, device_map=device_map) \ No newline at end of file From 8ba0f2cf2d582ed6442b242f01755aa0c006e6c0 Mon Sep 17 00:00:00 2001 From: Kyle Sayers Date: Thu, 12 Jun 2025 17:07:28 -0400 Subject: [PATCH 14/36] apply style Signed-off-by: Kyle Sayers --- examples/quantization_w4a16/llama3_example.py | 2 +- src/llmcompressor/utils/dev.py | 16 ++++++++++------ 2 files changed, 11 insertions(+), 7 deletions(-) diff --git a/examples/quantization_w4a16/llama3_example.py b/examples/quantization_w4a16/llama3_example.py index d0f3485d4..d487a911b 100644 --- a/examples/quantization_w4a16/llama3_example.py +++ b/examples/quantization_w4a16/llama3_example.py @@ -75,4 +75,4 @@ def tokenize(sample): # Save to disk compressed. 
SAVE_DIR = model_id.split("/")[-1] + "-W4A16-G128" model.save_pretrained(SAVE_DIR, save_compressed=True) -tokenizer.save_pretrained(SAVE_DIR) \ No newline at end of file +tokenizer.save_pretrained(SAVE_DIR) diff --git a/src/llmcompressor/utils/dev.py b/src/llmcompressor/utils/dev.py index b1e5c014d..9c4bbbe5e 100644 --- a/src/llmcompressor/utils/dev.py +++ b/src/llmcompressor/utils/dev.py @@ -2,20 +2,24 @@ import logging import os import tempfile -from typing import Type, Dict, Any, Union +from typing import Type import torch +from accelerate import dispatch_model, infer_auto_device_map +from accelerate.utils import get_balanced_memory from huggingface_hub import snapshot_download from safetensors.torch import save_file from transformers import AutoModelForCausalLM, PreTrainedModel from transformers.modeling_utils import TORCH_INIT_FUNCTIONS from transformers.utils import SAFE_WEIGHTS_INDEX_NAME, WEIGHTS_INDEX_NAME -from accelerate import dispatch_model, infer_auto_device_map -from accelerate.utils import get_balanced_memory from llmcompressor.utils.helpers import patch_attr -__all__ = ["skip_weights_download", "patch_transformers_logger_level", "dispatch_for_generation"] +__all__ = [ + "skip_weights_download", + "patch_transformers_logger_level", + "dispatch_for_generation", +] @contextlib.contextmanager @@ -114,8 +118,8 @@ def dispatch_for_generation(model: PreTrainedModel) -> PreTrainedModel: max_memory = get_balanced_memory( model, dtype=model.dtype, - no_split_module_classes=model._get_no_split_modules("auto") + no_split_module_classes=model._get_no_split_modules("auto"), ) device_map = infer_auto_device_map(model, dtype=model.dtype, max_memory=max_memory) - return dispatch_model(model, device_map=device_map) \ No newline at end of file + return dispatch_model(model, device_map=device_map) From fbf2a6d1b035aa98d117f635e745ddb19f8543b8 Mon Sep 17 00:00:00 2001 From: Kyle Sayers Date: Fri, 13 Jun 2025 09:56:33 -0400 Subject: [PATCH 15/36] update examples Signed-off-by: Kyle Sayers --- examples/awq/llama_example.py | 21 ++++++++-------- examples/awq/qwen3_moe_example.py | 24 ++++++++++-------- .../fp8_compressed_inference.py | 6 ++++- examples/multimodal_audio/whisper_example.py | 22 ++++++++-------- examples/multimodal_vision/gemma3_example.py | 15 ++++++----- .../multimodal_vision/idefics3_example.py | 15 ++++++----- examples/multimodal_vision/llava_example.py | 15 ++++++----- .../multimodal_vision/mistral3_example.py | 15 ++++++----- examples/multimodal_vision/mllama_example.py | 15 ++++++----- .../multimodal_vision/phi3_vision_example.py | 15 ++++++----- examples/multimodal_vision/pixtral_example.py | 15 ++++++----- .../multimodal_vision/qwen2_vl_example.py | 16 ++++++------ .../multimodal_vision/qwen_2_5_vl_example.py | 16 ++++++------ .../llama7b_sparse_w4a16.py | 10 +++++--- .../gemma2_fp8_kv_example.py | 21 ++++++++-------- .../llama3_fp8_kv_example.py | 25 ++++++++----------- .../phi3.5_fp8_kv_example.py | 15 ++++++----- .../quantization_w4a16_fp4/llama3_example.py | 17 ++++++------- .../quantization_w4a4_fp4/llama3_example.py | 24 +++++++++--------- .../quantization_w8a8_fp8/gemma2_example.py | 21 +++++++++------- .../llama3.2_vision_example.py | 17 ++++++------- .../quantization_w8a8_fp8/llama3_example.py | 15 ++++++----- .../quantization_w8a8_fp8/llava1.5_example.py | 17 ++++++------- .../quantization_w8a8_fp8/qwen2vl_example.py | 17 ++++++------- .../quantization_w8a8_fp8/whisper_example.py | 15 ++++++----- .../quantization_w8a8_int8/gemma2_example.py | 14 ++++++----- 
.../quantization_w8a8_int8/llama3_example.py | 15 ++++++----- examples/quantizing_moe/deepseek_moe_w4a16.py | 13 +++++++++- .../quantizing_moe/deepseek_moe_w8a8_fp8.py | 17 ++++++------- .../quantizing_moe/deepseek_moe_w8a8_int8.py | 15 ++++++----- .../quantizing_moe/mixtral_moe_w8a8_fp8.py | 16 ++++++------ examples/quantizing_moe/qwen_moe_w4a16.py | 15 ++++++----- .../llama3_8b_2of4.py | 13 +++++----- examples/trl_mixin/ex_trl_constant.py | 4 +-- 34 files changed, 270 insertions(+), 276 deletions(-) diff --git a/examples/awq/llama_example.py b/examples/awq/llama_example.py index d456f2b3b..9d2c724d7 100644 --- a/examples/awq/llama_example.py +++ b/examples/awq/llama_example.py @@ -5,10 +5,12 @@ from llmcompressor.modifiers.awq import AWQModifier # Select model and load it. -model_id = "meta-llama/Meta-Llama-3-8B-Instruct" +MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct" -model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype="auto") -tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True) +model = AutoModelForCausalLM.from_pretrained( + MODEL_ID, device_map="auto", torch_dtype="auto" +) +tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True) # Select calibration dataset. DATASET_ID = "mit-han-lab/pile-val-backup" @@ -61,14 +63,6 @@ def tokenize(sample): num_calibration_samples=NUM_CALIBRATION_SAMPLES, ) -# Save to disk compressed. -SAVE_DIR = model_id.split("/")[-1] + "-awq-asym" -model.save_pretrained(SAVE_DIR, save_compressed=True) -tokenizer.save_pretrained(SAVE_DIR) - -# Load model after saving -model = AutoModelForCausalLM.from_pretrained(SAVE_DIR, device_map="auto") - # Confirm generations of the quantized model look sane. print("\n\n") print("========== SAMPLE GENERATION ==============") @@ -76,3 +70,8 @@ def tokenize(sample): output = model.generate(input_ids, max_new_tokens=100) print(tokenizer.decode(output[0])) print("==========================================\n\n") + +# Save to disk compressed. +SAVE_DIR = MODEL_ID.split("/")[-1] + "-awq-asym" +model.save_pretrained(SAVE_DIR, save_compressed=True) +tokenizer.save_pretrained(SAVE_DIR) diff --git a/examples/awq/qwen3_moe_example.py b/examples/awq/qwen3_moe_example.py index fa171f0af..3c16d2f43 100644 --- a/examples/awq/qwen3_moe_example.py +++ b/examples/awq/qwen3_moe_example.py @@ -3,11 +3,15 @@ from llmcompressor import oneshot from llmcompressor.modifiers.awq import AWQModifier +from llmcompressor.utils.dev import dispatch_for_generation # Select model and load it. -model_id = "Qwen/Qwen3-30B-A3B" -model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype="auto") -tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True) +MODEL_ID = "Qwen/Qwen3-30B-A3B" + +model = AutoModelForCausalLM.from_pretrained( + MODEL_ID, device_map="auto", torch_dtype="auto" +) +tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True) # Select calibration dataset. DATASET_ID = "mit-han-lab/pile-val-backup" @@ -65,18 +69,16 @@ def tokenize(sample): num_calibration_samples=NUM_CALIBRATION_SAMPLES, ) -# Save to disk compressed. -SAVE_DIR = model_id.split("/")[-1] + "-awq-sym" -model.save_pretrained(SAVE_DIR, save_compressed=True) -tokenizer.save_pretrained(SAVE_DIR) - -# Load model after saving -model = AutoModelForCausalLM.from_pretrained(SAVE_DIR, device_map="auto") - # Confirm generations of the quantized model look sane. 
print("\n\n") print("========== SAMPLE GENERATION ==============") +dispatch_for_generation(model) input_ids = tokenizer("Hello my name is", return_tensors="pt").input_ids.to("cuda") output = model.generate(input_ids, max_new_tokens=100) print(tokenizer.decode(output[0])) print("==========================================\n\n") + +# Save to disk compressed. +SAVE_DIR = MODEL_ID.split("/")[-1] + "-awq-sym" +model.save_pretrained(SAVE_DIR, save_compressed=True) +tokenizer.save_pretrained(SAVE_DIR) diff --git a/examples/compressed_inference/fp8_compressed_inference.py b/examples/compressed_inference/fp8_compressed_inference.py index 57debe2fd..f0d0381d2 100644 --- a/examples/compressed_inference/fp8_compressed_inference.py +++ b/examples/compressed_inference/fp8_compressed_inference.py @@ -19,7 +19,11 @@ "def fibonacci(n):", ] -compressed_model = AutoModelForCausalLM.from_pretrained(MODEL_STUB, torch_dtype="auto") +compressed_model = AutoModelForCausalLM.from_pretrained( + MODEL_STUB, + torch_dtype="auto", + device_map="cuda:0", +) # tokenize the sample data tokenizer = AutoTokenizer.from_pretrained(MODEL_STUB) diff --git a/examples/multimodal_audio/whisper_example.py b/examples/multimodal_audio/whisper_example.py index 4992f78d1..8a6f5d748 100644 --- a/examples/multimodal_audio/whisper_example.py +++ b/examples/multimodal_audio/whisper_example.py @@ -4,12 +4,14 @@ from llmcompressor import oneshot from llmcompressor.modifiers.quantization import GPTQModifier +from llmcompressor.utils.dev import dispatch_for_generation # Select model and load it. -model_id = "openai/whisper-large-v3" -model = WhisperForConditionalGeneration.from_pretrained(model_id, torch_dtype="auto") +MODEL_ID = "openai/whisper-large-v3" + +model = WhisperForConditionalGeneration.from_pretrained(MODEL_ID, torch_dtype="auto") model.config.forced_decoder_ids = None -processor = WhisperProcessor.from_pretrained(model_id) +processor = WhisperProcessor.from_pretrained(MODEL_ID) # Configure processor the dataset task. processor.tokenizer.set_prefix_tokens(language="en", task="transcribe") @@ -83,17 +85,10 @@ def data_collator(batch): data_collator=data_collator, ) -# Save to disk compressed. -SAVE_DIR = model_id.split("/")[1] + "-W4A16-G128" -model.save_pretrained(SAVE_DIR, save_compressed=True) -processor.save_pretrained(SAVE_DIR) - -# Load model after saving -model = WhisperForConditionalGeneration.from_pretrained(SAVE_DIR, device_map="auto") - # Confirm generations of the quantized model look sane. print("\n\n") print("========== SAMPLE GENERATION ==============") +dispatch_for_generation(model) sample_features = next(iter(ds))["input_features"] sample_decoder_ids = [processor.tokenizer.prefix_tokens] sample_input = { @@ -106,3 +101,8 @@ def data_collator(batch): # that's where you have a lot of windows in the south no actually that's passive solar # and passive solar is something that was developed and designed in the 1960s and 70s # and it was a great thing for what it was at the time but it's not a passive house + +# Save to disk compressed. 
+SAVE_DIR = MODEL_ID.split("/")[1] + "-W4A16-G128" +model.save_pretrained(SAVE_DIR, save_compressed=True) +processor.save_pretrained(SAVE_DIR) diff --git a/examples/multimodal_vision/gemma3_example.py b/examples/multimodal_vision/gemma3_example.py index 50c606377..2b9676cdd 100644 --- a/examples/multimodal_vision/gemma3_example.py +++ b/examples/multimodal_vision/gemma3_example.py @@ -5,6 +5,7 @@ from llmcompressor import oneshot from llmcompressor.modifiers.quantization import GPTQModifier +from llmcompressor.utils.dev import dispatch_for_generation # Load model. model_id = "google/gemma-3-4b-it" @@ -46,16 +47,9 @@ def data_collator(batch): data_collator=data_collator, ) -# Save to disk compressed. -SAVE_DIR = model_id.split("/")[1] + "-W4A16-G128" -model.save_pretrained(SAVE_DIR, save_compressed=True) -processor.save_pretrained(SAVE_DIR) - -# Load model after saving -model = Gemma3ForConditionalGeneration.from_pretrained(SAVE_DIR, device_map="auto") - # Confirm generations of the quantized model look sane. print("========== SAMPLE GENERATION ==============") +dispatch_for_generation(model) messages = [ { "role": "user", @@ -74,3 +68,8 @@ def data_collator(batch): output = model.generate(**inputs, max_new_tokens=100, disable_compile=True) print(processor.decode(output[0], skip_special_tokens=True)) print("==========================================") + +# Save to disk compressed. +SAVE_DIR = model_id.split("/")[1] + "-W4A16-G128" +model.save_pretrained(SAVE_DIR, save_compressed=True) +processor.save_pretrained(SAVE_DIR) diff --git a/examples/multimodal_vision/idefics3_example.py b/examples/multimodal_vision/idefics3_example.py index fcc4559d8..ede27ac5b 100644 --- a/examples/multimodal_vision/idefics3_example.py +++ b/examples/multimodal_vision/idefics3_example.py @@ -6,6 +6,7 @@ from llmcompressor import oneshot from llmcompressor.modifiers.quantization import GPTQModifier +from llmcompressor.utils.dev import dispatch_for_generation # Load model. model_id = "HuggingFaceM4/Idefics3-8B-Llama3" # or "HuggingFaceTB/SmolVLM-Instruct" @@ -92,16 +93,9 @@ def tokenize(sample): data_collator=data_collator, ) -# Save to disk compressed. -SAVE_DIR = model_id.split("/")[1] + "-W4A16-G128" -model.save_pretrained(SAVE_DIR, save_compressed=True) -processor.save_pretrained(SAVE_DIR) - -# Load model after saving -model = Idefics3ForConditionalGeneration.from_pretrained(SAVE_DIR, device_map="auto") - # Confirm generations of the quantized model look sane. print("========== SAMPLE GENERATION ==============") +dispatch_for_generation(model) messages = [ { "role": "user", @@ -119,3 +113,8 @@ def tokenize(sample): output = model.generate(**inputs, max_new_tokens=100) print(processor.decode(output[0], skip_special_tokens=True)) print("==========================================") + +# Save to disk compressed. +SAVE_DIR = model_id.split("/")[1] + "-W4A16-G128" +model.save_pretrained(SAVE_DIR, save_compressed=True) +processor.save_pretrained(SAVE_DIR) diff --git a/examples/multimodal_vision/llava_example.py b/examples/multimodal_vision/llava_example.py index abb781d48..a2bf9b020 100644 --- a/examples/multimodal_vision/llava_example.py +++ b/examples/multimodal_vision/llava_example.py @@ -5,6 +5,7 @@ from llmcompressor import oneshot from llmcompressor.modifiers.quantization import GPTQModifier +from llmcompressor.utils.dev import dispatch_for_generation # Load model. model_id = "llava-hf/llava-1.5-7b-hf" @@ -47,16 +48,9 @@ def data_collator(batch): data_collator=data_collator, ) -# Save to disk compressed. 
-SAVE_DIR = model_id.split("/")[1] + "-W4A16-G128" -model.save_pretrained(SAVE_DIR, save_compressed=True) -processor.save_pretrained(SAVE_DIR) - -# Load model after saving -model = LlavaForConditionalGeneration.from_pretrained(SAVE_DIR, device_map="auto") - # Confirm generations of the quantized model look sane. print("========== SAMPLE GENERATION ==============") +dispatch_for_generation(model) messages = [ { "role": "user", @@ -74,3 +68,8 @@ def data_collator(batch): output = model.generate(**inputs, max_new_tokens=100) print(processor.decode(output[0], skip_special_tokens=True)) print("==========================================") + +# Save to disk compressed. +SAVE_DIR = model_id.split("/")[1] + "-W4A16-G128" +model.save_pretrained(SAVE_DIR, save_compressed=True) +processor.save_pretrained(SAVE_DIR) diff --git a/examples/multimodal_vision/mistral3_example.py b/examples/multimodal_vision/mistral3_example.py index 50b7e7dd5..9413359e9 100644 --- a/examples/multimodal_vision/mistral3_example.py +++ b/examples/multimodal_vision/mistral3_example.py @@ -8,6 +8,7 @@ from llmcompressor import oneshot from llmcompressor.modifiers.quantization import GPTQModifier +from llmcompressor.utils.dev import dispatch_for_generation # Load model. model_id = "mistralai/Mistral-Small-3.1-24B-Instruct-2503" @@ -60,16 +61,9 @@ def data_collator(batch): data_collator=data_collator, ) -# Save to disk compressed. -SAVE_DIR = model_id.split("/")[1] + "-W4A16-G128" -model.save_pretrained(SAVE_DIR, save_compressed=True) -processor.save_pretrained(SAVE_DIR) - -# Load model after saving -model = Mistral3ForConditionalGeneration.from_pretrained(SAVE_DIR, device_map="auto") - # Confirm generations of the quantized model look sane. print("========== SAMPLE GENERATION ==============") +dispatch_for_generation(model) messages = [ { "role": "user", @@ -88,3 +82,8 @@ def data_collator(batch): output = model.generate(**inputs, max_new_tokens=100) print(processor.decode(output[0], skip_special_tokens=True)) print("==========================================") + +# Save to disk compressed. +SAVE_DIR = model_id.split("/")[1] + "-W4A16-G128" +model.save_pretrained(SAVE_DIR, save_compressed=True) +processor.save_pretrained(SAVE_DIR) diff --git a/examples/multimodal_vision/mllama_example.py b/examples/multimodal_vision/mllama_example.py index 7cd45c85a..2d92319f7 100644 --- a/examples/multimodal_vision/mllama_example.py +++ b/examples/multimodal_vision/mllama_example.py @@ -5,6 +5,7 @@ from llmcompressor import oneshot from llmcompressor.modifiers.quantization import GPTQModifier +from llmcompressor.utils.dev import dispatch_for_generation # Load model. model_id = "meta-llama/Llama-3.2-11B-Vision-Instruct" @@ -47,16 +48,9 @@ def data_collator(batch): data_collator=data_collator, ) -# Save to disk compressed. -SAVE_DIR = model_id.split("/")[1] + "-W4A16-G128" -model.save_pretrained(SAVE_DIR, save_compressed=True) -processor.save_pretrained(SAVE_DIR) - -# Load model after saving -model = MllamaForConditionalGeneration.from_pretrained(SAVE_DIR, device_map="auto") - # Confirm generations of the quantized model look sane. print("========== SAMPLE GENERATION ==============") +dispatch_for_generation(model) messages = [ { "role": "user", @@ -74,3 +68,8 @@ def data_collator(batch): output = model.generate(**inputs, max_new_tokens=100) print(processor.decode(output[0], skip_special_tokens=True)) print("==========================================") + +# Save to disk compressed. 
+SAVE_DIR = model_id.split("/")[1] + "-W4A16-G128" +model.save_pretrained(SAVE_DIR, save_compressed=True) +processor.save_pretrained(SAVE_DIR) diff --git a/examples/multimodal_vision/phi3_vision_example.py b/examples/multimodal_vision/phi3_vision_example.py index 772001cb1..2b6f66714 100644 --- a/examples/multimodal_vision/phi3_vision_example.py +++ b/examples/multimodal_vision/phi3_vision_example.py @@ -7,6 +7,7 @@ from llmcompressor import oneshot from llmcompressor.modifiers.quantization import GPTQModifier +from llmcompressor.utils.dev import dispatch_for_generation # Load model. model_id = "microsoft/Phi-3-vision-128k-instruct" @@ -78,14 +79,6 @@ def data_collator(batch): ignore=["lm_head", "re:model.vision_embed_tokens.*"], ) -# Save to disk compressed. -SAVE_DIR = model_id.split("/")[1] + "-W4A16-G128" -model.save_pretrained(SAVE_DIR, save_compressed=True) -processor.save_pretrained(SAVE_DIR) - -# Load model after saving -model = AutoModelForCausalLM.from_pretrained(SAVE_DIR, device_map="auto") - # Perform oneshot oneshot( model=model, @@ -99,7 +92,13 @@ def data_collator(batch): # Confirm generations of the quantized model look sane. print("========== SAMPLE GENERATION ==============") +dispatch_for_generation(model) input_ids = processor(text="Hello my name is", return_tensors="pt").input_ids.to("cuda") output = model.generate(input_ids, max_new_tokens=20) print(processor.decode(output[0])) print("==========================================") + +# Save to disk compressed. +SAVE_DIR = model_id.split("/")[1] + "-W4A16-G128" +model.save_pretrained(SAVE_DIR, save_compressed=True) +processor.save_pretrained(SAVE_DIR) diff --git a/examples/multimodal_vision/pixtral_example.py b/examples/multimodal_vision/pixtral_example.py index 71c1a0770..035af6061 100644 --- a/examples/multimodal_vision/pixtral_example.py +++ b/examples/multimodal_vision/pixtral_example.py @@ -5,6 +5,7 @@ from llmcompressor import oneshot from llmcompressor.modifiers.quantization import GPTQModifier +from llmcompressor.utils.dev import dispatch_for_generation # Load model. model_id = "mgoin/pixtral-12b" @@ -53,16 +54,9 @@ def data_collator(batch): data_collator=data_collator, ) -# Save to disk compressed. -SAVE_DIR = model_id.split("/")[1] + "-W4A16-G128" -model.save_pretrained(SAVE_DIR, save_compressed=True) -processor.save_pretrained(SAVE_DIR) - -# Load model after saving -model = LlavaForConditionalGeneration.from_pretrained(SAVE_DIR, device_map="auto") - # Confirm generations of the quantized model look sane. print("========== SAMPLE GENERATION ==============") +dispatch_for_generation(model) messages = [ { "role": "user", @@ -80,3 +74,8 @@ def data_collator(batch): output = model.generate(**inputs, max_new_tokens=100) print(processor.decode(output[0], skip_special_tokens=True)) print("==========================================") + +# Save to disk compressed. +SAVE_DIR = model_id.split("/")[1] + "-W4A16-G128" +model.save_pretrained(SAVE_DIR, save_compressed=True) +processor.save_pretrained(SAVE_DIR) diff --git a/examples/multimodal_vision/qwen2_vl_example.py b/examples/multimodal_vision/qwen2_vl_example.py index 27e0954d3..b072b6ff9 100644 --- a/examples/multimodal_vision/qwen2_vl_example.py +++ b/examples/multimodal_vision/qwen2_vl_example.py @@ -8,6 +8,7 @@ from llmcompressor import oneshot from llmcompressor.modifiers.quantization import GPTQModifier +from llmcompressor.utils.dev import dispatch_for_generation # Load model. 
model_id = "Qwen/Qwen2-VL-2B-Instruct" @@ -95,16 +96,9 @@ def data_collator(batch): data_collator=data_collator, ) -# Save to disk compressed. -SAVE_DIR = model_id.split("/")[1] + "-W4A16-G128" -model.save_pretrained(SAVE_DIR, save_compressed=True) -processor.save_pretrained(SAVE_DIR) - -# Load model after saving -model = Qwen2VLForConditionalGeneration.from_pretrained(SAVE_DIR, device_map="auto") - # Confirm generations of the quantized model look sane. print("========== SAMPLE GENERATION ==============") +dispatch_for_generation(model) messages = [ { "role": "user", @@ -131,3 +125,9 @@ def data_collator(batch): output = model.generate(**inputs, max_new_tokens=100) print(processor.decode(output[0], skip_special_tokens=True)) print("==========================================") + + +# Save to disk compressed. +SAVE_DIR = model_id.split("/")[1] + "-W4A16-G128" +model.save_pretrained(SAVE_DIR, save_compressed=True) +processor.save_pretrained(SAVE_DIR) diff --git a/examples/multimodal_vision/qwen_2_5_vl_example.py b/examples/multimodal_vision/qwen_2_5_vl_example.py index 68798cf00..8dffa5216 100644 --- a/examples/multimodal_vision/qwen_2_5_vl_example.py +++ b/examples/multimodal_vision/qwen_2_5_vl_example.py @@ -8,6 +8,7 @@ from llmcompressor.modifiers.quantization import GPTQModifier from llmcompressor.transformers import oneshot +from llmcompressor.utils.dev import dispatch_for_generation # Load model. model_id = "Qwen/Qwen2.5-VL-7B-Instruct" @@ -89,16 +90,9 @@ def data_collator(batch): data_collator=data_collator, ) -# Save to disk compressed. -SAVE_DIR = model_id.split("/")[1] + "-W4A16-G128" -model.save_pretrained(SAVE_DIR, save_compressed=True) -processor.save_pretrained(SAVE_DIR) - -# Load model after saving -model = Qwen2_5_VLForConditionalGeneration.from_pretrained(SAVE_DIR, device_map="auto") - # Confirm generations of the quantized model look sane. print("========== SAMPLE GENERATION ==============") +dispatch_for_generation(model) messages = [ { "role": "user", @@ -125,3 +119,9 @@ def data_collator(batch): output = model.generate(**inputs, max_new_tokens=100) print(processor.decode(output[0], skip_special_tokens=True)) print("==========================================") + + +# Save to disk compressed. +SAVE_DIR = model_id.split("/")[1] + "-W4A16-G128" +model.save_pretrained(SAVE_DIR, save_compressed=True) +processor.save_pretrained(SAVE_DIR) diff --git a/examples/quantization_2of4_sparse_w4a16/llama7b_sparse_w4a16.py b/examples/quantization_2of4_sparse_w4a16/llama7b_sparse_w4a16.py index 76a5f2972..6ed01e7d1 100644 --- a/examples/quantization_2of4_sparse_w4a16/llama7b_sparse_w4a16.py +++ b/examples/quantization_2of4_sparse_w4a16/llama7b_sparse_w4a16.py @@ -6,7 +6,9 @@ # load the model in as bfloat16 to save on memory and compute model_stub = "neuralmagic/Llama-2-7b-ultrachat200k" -model = AutoModelForCausalLM.from_pretrained(model_stub, torch_dtype=torch.bfloat16) +model = AutoModelForCausalLM.from_pretrained( + model_stub, torch_dtype=torch.bfloat16, device_map="auto" +) tokenizer = AutoTokenizer.from_pretrained(model_stub) # uses LLM Compressor's built-in preprocessing for ultra chat @@ -88,8 +90,8 @@ tokenizer.save_pretrained(f"{output_dir}/quantization_stage") logger.info( - "llmcompressor does not currently support running " + "llmcompressor does not currently support running ", "compressed models in the marlin24 format. " - "The model produced from this example can be " - "run on vLLM with dtype=torch.float16." 
+ "The model produced from this example can be ", + "run on vLLM with dtype=torch.float16.", ) diff --git a/examples/quantization_kv_cache/gemma2_fp8_kv_example.py b/examples/quantization_kv_cache/gemma2_fp8_kv_example.py index cf5501bc9..44691914a 100644 --- a/examples/quantization_kv_cache/gemma2_fp8_kv_example.py +++ b/examples/quantization_kv_cache/gemma2_fp8_kv_example.py @@ -2,11 +2,12 @@ from transformers import AutoModelForCausalLM, AutoTokenizer from llmcompressor import oneshot +from llmcompressor.utils.dev import dispatch_for_generation # Select model and load it. -model_id = "google/gemma-2-9b-it" -model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype="auto") -tokenizer = AutoTokenizer.from_pretrained(model_id) +MODEL_ID = "google/gemma-2-9b-it" +model = AutoModelForCausalLM.from_pretrained(MODEL_ID, torch_dtype="auto") +tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) # Select calibration dataset. DATASET_ID = "HuggingFaceH4/ultrachat_200k" @@ -77,14 +78,6 @@ def process_and_tokenize(example): num_calibration_samples=NUM_CALIBRATION_SAMPLES, ) -# Save to disk compressed. -SAVE_DIR = model_id.split("/")[1] + "-FP8-KV" -model.save_pretrained(SAVE_DIR, save_compressed=True) -tokenizer.save_pretrained(SAVE_DIR) - -# Load model after saving -model = AutoModelForCausalLM.from_pretrained(SAVE_DIR, device_map="auto") - print( "Note: Inference with the quantized kv_cache is not supported. ", "Please use vLLM for inference with the quantized kv_cache.", @@ -95,8 +88,14 @@ def process_and_tokenize(example): # Consider either downgrading your transformers version to a previous version # or use vLLM for sample generation. print("\n\n") +dispatch_for_generation(model) print("========== SAMPLE GENERATION ==============") input_ids = tokenizer("Hello my name is", return_tensors="pt").input_ids.to("cuda") output = model.generate(input_ids, max_new_tokens=100) print(tokenizer.decode(output[0])) print("==========================================\n\n") + +# Save to disk compressed. +SAVE_DIR = MODEL_ID.split("/")[1] + "-FP8-KV" +model.save_pretrained(SAVE_DIR, save_compressed=True) +tokenizer.save_pretrained(SAVE_DIR) diff --git a/examples/quantization_kv_cache/llama3_fp8_kv_example.py b/examples/quantization_kv_cache/llama3_fp8_kv_example.py index 0cb52e0fd..6aaa809bb 100644 --- a/examples/quantization_kv_cache/llama3_fp8_kv_example.py +++ b/examples/quantization_kv_cache/llama3_fp8_kv_example.py @@ -3,15 +3,12 @@ from transformers import AutoModelForCausalLM, AutoTokenizer from llmcompressor import oneshot +from llmcompressor.utils.dev import dispatch_for_generation # Select model and load it. -model_id = "meta-llama/Meta-Llama-3-8B-Instruct" -model = AutoModelForCausalLM.from_pretrained( - model_id, - device_map="auto", - torch_dtype="auto", -) -tokenizer = AutoTokenizer.from_pretrained(model_id) +MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct" +model = AutoModelForCausalLM.from_pretrained(MODEL_ID, torch_dtype="auto") +tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) # Select calibration dataset. DATASET_ID = "HuggingFaceH4/ultrachat_200k" @@ -82,14 +79,6 @@ def process_and_tokenize(example): num_calibration_samples=NUM_CALIBRATION_SAMPLES, ) -# Save to disk compressed. -SAVE_DIR = model_id.split("/")[1] + "-FP8-KV" -model.save_pretrained(SAVE_DIR, save_compressed=True) -tokenizer.save_pretrained(SAVE_DIR) - -# Load model after saving -model = AutoModelForCausalLM.from_pretrained(SAVE_DIR, device_map="auto") - logger.info( "Running sample generation. 
", "Note: Inference with the quantized kv_cache is not supported. ", @@ -98,7 +87,13 @@ def process_and_tokenize(example): # Confirm generations of the quantized model look sane. print("\n\n") print("========== SAMPLE GENERATION ==============") +dispatch_for_generation(model) input_ids = tokenizer("Hello my name is", return_tensors="pt").input_ids.to("cuda") output = model.generate(input_ids, max_new_tokens=100) print(tokenizer.decode(output[0])) print("==========================================\n\n") + +# Save to disk compressed. +SAVE_DIR = MODEL_ID.split("/")[1] + "-FP8-KV" +model.save_pretrained(SAVE_DIR, save_compressed=True) +tokenizer.save_pretrained(SAVE_DIR) diff --git a/examples/quantization_kv_cache/phi3.5_fp8_kv_example.py b/examples/quantization_kv_cache/phi3.5_fp8_kv_example.py index 20e7fac4c..112f0d0b9 100644 --- a/examples/quantization_kv_cache/phi3.5_fp8_kv_example.py +++ b/examples/quantization_kv_cache/phi3.5_fp8_kv_example.py @@ -2,6 +2,7 @@ from transformers import AutoModelForCausalLM, AutoTokenizer from llmcompressor import oneshot +from llmcompressor.utils.dev import dispatch_for_generation # Select model and load it. # Phi-3.5 is a special case for KV cache quantization because it has @@ -79,14 +80,6 @@ def process_and_tokenize(example): num_calibration_samples=NUM_CALIBRATION_SAMPLES, ) -# Save to disk compressed. -SAVE_DIR = MODEL_ID.split("/")[1] + "-FP8-KV" -model.save_pretrained(SAVE_DIR, save_compressed=True) -tokenizer.save_pretrained(SAVE_DIR) - -# Load model after saving -model = AutoModelForCausalLM.from_pretrained(SAVE_DIR, device_map="auto") - print( "Note: Inference with the quantized kv_cache is not supported. ", "Please use vLLM for inference with the quantized kv_cache.", @@ -94,7 +87,13 @@ def process_and_tokenize(example): # Confirm generations of the quantized model look sane. print("\n\n") print("========== SAMPLE GENERATION ==============") +dispatch_for_generation(model) input_ids = tokenizer("Hello my name is", return_tensors="pt").input_ids.to("cuda") output = model.generate(input_ids, max_new_tokens=100) print(tokenizer.decode(output[0])) print("==========================================\n\n") + +# Save to disk compressed. +SAVE_DIR = MODEL_ID.split("/")[1] + "-FP8-KV" +model.save_pretrained(SAVE_DIR, save_compressed=True) +tokenizer.save_pretrained(SAVE_DIR) diff --git a/examples/quantization_w4a16_fp4/llama3_example.py b/examples/quantization_w4a16_fp4/llama3_example.py index 378f4f988..b6048e086 100644 --- a/examples/quantization_w4a16_fp4/llama3_example.py +++ b/examples/quantization_w4a16_fp4/llama3_example.py @@ -2,6 +2,7 @@ from llmcompressor import oneshot from llmcompressor.modifiers.quantization import QuantizationModifier +from llmcompressor.utils.dev import dispatch_for_generation MODEL_ID = "meta-llama/Llama-3.1-8B-Instruct" @@ -17,18 +18,16 @@ # Apply quantization. oneshot(model=model, recipe=recipe) -# Save to disk in compressed-tensors format. 
-SAVE_DIR = MODEL_ID.split("/")[1] + "-NVFP4A16" -model.save_pretrained(SAVE_DIR, save_compressed=True) -tokenizer.save_pretrained(SAVE_DIR) - -# Load model after saving -model = AutoModelForCausalLM.from_pretrained(SAVE_DIR, device_map="auto") - -# Validate model generations print("\n\n") print("========== SAMPLE GENERATION ==============") +dispatch_for_generation(model) input_ids = tokenizer("Hello my name is", return_tensors="pt").input_ids.to("cuda") output = model.generate(input_ids, max_new_tokens=100) print(tokenizer.decode(output[0])) print("==========================================\n\n") + + +# Save to disk in compressed-tensors format. +SAVE_DIR = MODEL_ID.split("/")[1] + "-NVFP4A16" +model.save_pretrained(SAVE_DIR, save_compressed=True) +tokenizer.save_pretrained(SAVE_DIR) diff --git a/examples/quantization_w4a4_fp4/llama3_example.py b/examples/quantization_w4a4_fp4/llama3_example.py index d84b749c0..0bd484f9c 100644 --- a/examples/quantization_w4a4_fp4/llama3_example.py +++ b/examples/quantization_w4a4_fp4/llama3_example.py @@ -3,11 +3,13 @@ from llmcompressor import oneshot from llmcompressor.modifiers.quantization import QuantizationModifier +from llmcompressor.utils.dev import dispatch_for_generation + +MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct" # Load model. -model_id = "meta-llama/Meta-Llama-3-8B-Instruct" -model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype="auto") -tokenizer = AutoTokenizer.from_pretrained(model_id) +model = AutoModelForCausalLM.from_pretrained(MODEL_ID, torch_dtype="auto") +tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) DATASET_ID = "HuggingFaceH4/ultrachat_200k" @@ -64,18 +66,16 @@ def tokenize(sample): num_calibration_samples=NUM_CALIBRATION_SAMPLES, ) -# Save to disk in compressed-tensors format. -SAVE_DIR = model_id.split("/")[1] + "-NVFP4" -model.save_pretrained(SAVE_DIR, save_compressed=True) -tokenizer.save_pretrained(SAVE_DIR) - -# Load model after saving -model = AutoModelForCausalLM.from_pretrained(SAVE_DIR, device_map="auto") - -# Validate model generations print("\n\n") print("========== SAMPLE GENERATION ==============") +dispatch_for_generation(model) input_ids = tokenizer("Hello my name is", return_tensors="pt").input_ids.to("cuda") output = model.generate(input_ids, max_new_tokens=100) print(tokenizer.decode(output[0])) print("==========================================\n\n") + + +# Save to disk in compressed-tensors format. +SAVE_DIR = MODEL_ID.split("/")[1] + "-NVFP4" +model.save_pretrained(SAVE_DIR, save_compressed=True) +tokenizer.save_pretrained(SAVE_DIR) diff --git a/examples/quantization_w8a8_fp8/gemma2_example.py b/examples/quantization_w8a8_fp8/gemma2_example.py index 374add135..9509d1505 100644 --- a/examples/quantization_w8a8_fp8/gemma2_example.py +++ b/examples/quantization_w8a8_fp8/gemma2_example.py @@ -2,6 +2,7 @@ from llmcompressor import oneshot from llmcompressor.modifiers.quantization import QuantizationModifier +from llmcompressor.utils.dev import dispatch_for_generation MODEL_ID = "google/gemma-2-27b-it" @@ -18,22 +19,24 @@ ) # 3) Apply quantization and save in compressed-tensors format. -oneshot(model=model, recipe=recipe, tokenizer=tokenizer) - -# 4) Save to disk in compressed-tensors format. 
-SAVE_DIR = MODEL_ID.split("/")[1] + "-FP8-Dynamic" -model.save_pretrained(SAVE_DIR, save_compressed=True) -tokenizer.save_pretrained(SAVE_DIR) - -# Load model after saving -model = AutoModelForCausalLM.from_pretrained(SAVE_DIR, device_map="auto") +oneshot( + model=model, + recipe=recipe, + tokenizer=tokenizer, +) # Confirm generations of the quantized model look sane. # NOTE: transformers 4.49.0 results in a generation error with gemma2. # Consider either downgrading your transformers version to a previous version # or use vLLM for sample generation. print("========== SAMPLE GENERATION ==============") +dispatch_for_generation(model) input_ids = tokenizer("Hello my name is", return_tensors="pt").input_ids.to("cuda") output = model.generate(input_ids, max_new_tokens=20) print(tokenizer.decode(output[0])) print("==========================================") + +# 4) Save to disk in compressed-tensors format. +SAVE_DIR = MODEL_ID.split("/")[1] + "-FP8-Dynamic" +model.save_pretrained(SAVE_DIR, save_compressed=True) +tokenizer.save_pretrained(SAVE_DIR) diff --git a/examples/quantization_w8a8_fp8/llama3.2_vision_example.py b/examples/quantization_w8a8_fp8/llama3.2_vision_example.py index 105e05483..a79214d36 100644 --- a/examples/quantization_w8a8_fp8/llama3.2_vision_example.py +++ b/examples/quantization_w8a8_fp8/llama3.2_vision_example.py @@ -2,6 +2,7 @@ from llmcompressor import oneshot from llmcompressor.modifiers.quantization import QuantizationModifier +from llmcompressor.utils.dev import dispatch_for_generation MODEL_ID = "meta-llama/Llama-3.2-11B-Vision-Instruct" @@ -19,20 +20,18 @@ ignore=["re:.*lm_head", "re:multi_modal_projector.*", "re:vision_model.*"], ) -# Apply quantization. +# Apply quantization and save to disk in compressed-tensors format. oneshot(model=model, recipe=recipe) -# Save to disk in compressed-tensors format. -SAVE_DIR = MODEL_ID.split("/")[1] + "-FP8-Dynamic" -model.save_pretrained(SAVE_DIR) -processor.save_pretrained(SAVE_DIR) - -# Load model after saving -model = MllamaForConditionalGeneration.from_pretrained(SAVE_DIR, device_map="auto") - # Confirm generations of the quantized model look sane. print("========== SAMPLE GENERATION ==============") +dispatch_for_generation(model) input_ids = processor(text="Hello my name is", return_tensors="pt").input_ids.to("cuda") output = model.generate(input_ids, max_new_tokens=20) print(processor.decode(output[0])) print("==========================================") + +# Save to disk in compressed-tensors format. +SAVE_DIR = MODEL_ID.split("/")[1] + "-FP8-Dynamic" +model.save_pretrained(SAVE_DIR, save_compressed=True) +processor.save_pretrained(SAVE_DIR) diff --git a/examples/quantization_w8a8_fp8/llama3_example.py b/examples/quantization_w8a8_fp8/llama3_example.py index badd6ac5e..440f0b584 100644 --- a/examples/quantization_w8a8_fp8/llama3_example.py +++ b/examples/quantization_w8a8_fp8/llama3_example.py @@ -2,6 +2,7 @@ from llmcompressor import oneshot from llmcompressor.modifiers.quantization import QuantizationModifier +from llmcompressor.utils.dev import dispatch_for_generation MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct" @@ -20,17 +21,15 @@ # Apply quantization. oneshot(model=model, recipe=recipe) -# Save to disk in compressed-tensors format. 
-SAVE_DIR = MODEL_ID.split("/")[1] + "-FP8-Dynamic" -model.save_pretrained(SAVE_DIR) -tokenizer.save_pretrained(SAVE_DIR) - -# Load model after saving -model = AutoModelForCausalLM.from_pretrained(SAVE_DIR, device_map="auto") - # Confirm generations of the quantized model look sane. print("========== SAMPLE GENERATION ==============") +dispatch_for_generation(model) input_ids = tokenizer("Hello my name is", return_tensors="pt").input_ids.to("cuda") output = model.generate(input_ids, max_new_tokens=20) print(tokenizer.decode(output[0])) print("==========================================") + +# Save to disk in compressed-tensors format. +SAVE_DIR = MODEL_ID.split("/")[1] + "-FP8-Dynamic" +model.save_pretrained(SAVE_DIR) +tokenizer.save_pretrained(SAVE_DIR) diff --git a/examples/quantization_w8a8_fp8/llava1.5_example.py b/examples/quantization_w8a8_fp8/llava1.5_example.py index 06d1483e1..9c1731f03 100644 --- a/examples/quantization_w8a8_fp8/llava1.5_example.py +++ b/examples/quantization_w8a8_fp8/llava1.5_example.py @@ -2,6 +2,7 @@ from llmcompressor import oneshot from llmcompressor.modifiers.quantization import QuantizationModifier +from llmcompressor.utils.dev import dispatch_for_generation MODEL_ID = "llava-hf/llava-1.5-7b-hf" @@ -19,20 +20,18 @@ ignore=["re:.*lm_head", "re:multi_modal_projector.*", "re:vision_tower.*"], ) -# Apply quantization. +# Apply quantization and save to disk in compressed-tensors format. oneshot(model=model, recipe=recipe) -# Save to disk in compressed-tensors format. -SAVE_DIR = MODEL_ID.split("/")[1] + "-FP8-Dynamic" -model.save_pretrained(SAVE_DIR) -processor.save_pretrained(SAVE_DIR) - -# Load model after saving -model = LlavaForConditionalGeneration.from_pretrained(SAVE_DIR, device_map="auto") - # Confirm generations of the quantized model look sane. print("========== SAMPLE GENERATION ==============") +dispatch_for_generation(model) input_ids = processor(text="Hello my name is", return_tensors="pt").input_ids.to("cuda") output = model.generate(input_ids, max_new_tokens=20) print(processor.decode(output[0])) print("==========================================") + +# Save to disk in compressed-tensors format. +SAVE_DIR = MODEL_ID.split("/")[1] + "-FP8-Dynamic" +model.save_pretrained(SAVE_DIR, save_compressed=True) +processor.save_pretrained(SAVE_DIR) diff --git a/examples/quantization_w8a8_fp8/qwen2vl_example.py b/examples/quantization_w8a8_fp8/qwen2vl_example.py index 1cf73f527..2e7d02803 100644 --- a/examples/quantization_w8a8_fp8/qwen2vl_example.py +++ b/examples/quantization_w8a8_fp8/qwen2vl_example.py @@ -2,6 +2,7 @@ from llmcompressor import oneshot from llmcompressor.modifiers.quantization import QuantizationModifier +from llmcompressor.utils.dev import dispatch_for_generation MODEL_ID = "Qwen/Qwen2-VL-7B-Instruct" @@ -19,20 +20,18 @@ ignore=["re:.*lm_head", "re:visual.*"], ) -# Apply quantization. +# Apply quantization and save to disk in compressed-tensors format. oneshot(model=model, recipe=recipe) -# Save to disk in compressed-tensors format. -SAVE_DIR = MODEL_ID.split("/")[1] + "-FP8-Dynamic" -model.save_pretrained(SAVE_DIR) -processor.save_pretrained(SAVE_DIR) - -# Load model after saving -model = Qwen2VLForConditionalGeneration.from_pretrained(SAVE_DIR, device_map="auto") - # Confirm generations of the quantized model look sane. 
print("========== SAMPLE GENERATION ==============") +dispatch_for_generation(model) input_ids = processor(text="Hello my name is", return_tensors="pt").input_ids.to("cuda") output = model.generate(input_ids, max_new_tokens=20) print(processor.decode(output[0])) print("==========================================") + +# Save to disk in compressed-tensors format. +SAVE_DIR = MODEL_ID.split("/")[1] + "-FP8-Dynamic" +model.save_pretrained(SAVE_DIR, save_compressed=True) +processor.save_pretrained(SAVE_DIR) diff --git a/examples/quantization_w8a8_fp8/whisper_example.py b/examples/quantization_w8a8_fp8/whisper_example.py index 51ac804b7..b9fbb9d24 100644 --- a/examples/quantization_w8a8_fp8/whisper_example.py +++ b/examples/quantization_w8a8_fp8/whisper_example.py @@ -3,6 +3,7 @@ from llmcompressor import oneshot from llmcompressor.modifiers.quantization import QuantizationModifier +from llmcompressor.utils.dev import dispatch_for_generation MODEL_ID = "openai/whisper-large-v2" @@ -23,16 +24,9 @@ # Apply quantization. oneshot(model=model, recipe=recipe) -# Save to disk in compressed-tensors format. -SAVE_DIR = MODEL_ID.split("/")[1] + "-FP8-Dynamic" -model.save_pretrained(SAVE_DIR, save_compressed=True) -processor.save_pretrained(SAVE_DIR) - -# Load model after saving -model = WhisperForConditionalGeneration.from_pretrained(SAVE_DIR, device_map="auto") - # Confirm generations of the quantized model look sane. print("========== SAMPLE GENERATION ==============") +dispatch_for_generation(model) ds = load_dataset( "hf-internal-testing/librispeech_asr_dummy", "clean", split="validation[:1]" ) @@ -45,3 +39,8 @@ print(processor.batch_decode(output_ids, skip_special_tokens=False)[0]) # Mr. Quilter is the apostle of the middle classes and we are glad to welcome his gospel print("==========================================") + +# Save to disk in compressed-tensors format. +SAVE_DIR = MODEL_ID.split("/")[1] + "-FP8-Dynamic" +model.save_pretrained(SAVE_DIR, save_compressed=True) +processor.save_pretrained(SAVE_DIR) diff --git a/examples/quantization_w8a8_int8/gemma2_example.py b/examples/quantization_w8a8_int8/gemma2_example.py index a0ebe0079..1a11effa3 100644 --- a/examples/quantization_w8a8_int8/gemma2_example.py +++ b/examples/quantization_w8a8_int8/gemma2_example.py @@ -3,6 +3,7 @@ from llmcompressor import oneshot from llmcompressor.modifiers.quantization import GPTQModifier +from llmcompressor.utils.dev import dispatch_for_generation # 1) Select model and load it. MODEL_ID = "google/gemma-2-2b-it" @@ -53,26 +54,27 @@ def tokenize(sample): # * quantize the activations to int8 (dynamic per token) recipe = GPTQModifier(targets="Linear", scheme="W8A8", ignore=["lm_head"]) -# 4) Apply quantization and save to disk compressed. -SAVE_DIR = MODEL_ID.split("/")[1] + "-INT8" +# 4) Apply quantization oneshot( model=model, dataset=ds, recipe=recipe, max_seq_length=MAX_SEQUENCE_LENGTH, num_calibration_samples=NUM_CALIBRATION_SAMPLES, - output_dir=SAVE_DIR, ) -# Load model after saving -model = AutoModelForCausalLM.from_pretrained(SAVE_DIR, device_map="auto") - # Confirm generations of the quantized model look sane. # NOTE: transformers 4.49.0 results in a generation error with gemma2. # Consider either downgrading your transformers version to a previous version # or use vLLM for sample generation. 
print("========== SAMPLE GENERATION ==============") +dispatch_for_generation(model) input_ids = tokenizer("Hello my name is", return_tensors="pt").input_ids.to("cuda") output = model.generate(input_ids, max_new_tokens=20) print(tokenizer.decode(output[0])) print("==========================================") + +# 5) Save to disk in compressed-tensors format. +SAVE_DIR = MODEL_ID.split("/")[1] + "-INT8" +model.save_pretrained(SAVE_DIR, save_compressed=True) +tokenizer.save_pretrained(SAVE_DIR) diff --git a/examples/quantization_w8a8_int8/llama3_example.py b/examples/quantization_w8a8_int8/llama3_example.py index 1894932f1..6fa738656 100644 --- a/examples/quantization_w8a8_int8/llama3_example.py +++ b/examples/quantization_w8a8_int8/llama3_example.py @@ -4,6 +4,7 @@ from llmcompressor import oneshot from llmcompressor.modifiers.quantization import GPTQModifier from llmcompressor.modifiers.smoothquant import SmoothQuantModifier +from llmcompressor.utils.dev import dispatch_for_generation # Select model and load it. MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct" @@ -67,18 +68,16 @@ def tokenize(sample): num_calibration_samples=NUM_CALIBRATION_SAMPLES, ) -# Save to disk compressed. -SAVE_DIR = MODEL_ID.split("/")[1] + "-W8A8-Dynamic-Per-Token" -model.save_pretrained(SAVE_DIR, save_compressed=True) -tokenizer.save_pretrained(SAVE_DIR) - -# Load model after saving -model = AutoModelForCausalLM.from_pretrained(SAVE_DIR, device_map="auto") - # Confirm generations of the quantized model look sane. print("\n\n") print("========== SAMPLE GENERATION ==============") +dispatch_for_generation(model) input_ids = tokenizer("Hello my name is", return_tensors="pt").input_ids.to("cuda") output = model.generate(input_ids, max_new_tokens=100) print(tokenizer.decode(output[0])) print("==========================================\n\n") + +# Save to disk compressed. +SAVE_DIR = MODEL_ID.split("/")[1] + "-W8A8-Dynamic-Per-Token" +model.save_pretrained(SAVE_DIR, save_compressed=True) +tokenizer.save_pretrained(SAVE_DIR) diff --git a/examples/quantizing_moe/deepseek_moe_w4a16.py b/examples/quantizing_moe/deepseek_moe_w4a16.py index ccba53d2f..6d311858d 100644 --- a/examples/quantizing_moe/deepseek_moe_w4a16.py +++ b/examples/quantizing_moe/deepseek_moe_w4a16.py @@ -4,6 +4,7 @@ from transformers import AutoModelForCausalLM, AutoTokenizer, __version__ from llmcompressor import oneshot +from llmcompressor.transformers.compression.helpers import calculate_offload_device_map # NOTE: transformers 4.49.0 has an attribute error with DeepSeek. 
# Please consider either downgrading your transformers version to a @@ -12,8 +13,18 @@ # select a Mixture of Experts model for quantization MODEL_ID = "deepseek-ai/DeepSeek-V2.5" +# adjust based off number of desired GPUs +# if not enough memory is available, some layers will automatically be offlaoded to cpu +device_map = calculate_offload_device_map( + MODEL_ID, + reserve_for_hessians=True, + num_gpus=2, + torch_dtype=torch.bfloat16, + trust_remote_code=True, +) + model = AutoModelForCausalLM.from_pretrained( - MODEL_ID, torch_dtype=torch.bfloat16, trust_remote_code=True + MODEL_ID, device_map=device_map, torch_dtype=torch.bfloat16, trust_remote_code=True ) tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) diff --git a/examples/quantizing_moe/deepseek_moe_w8a8_fp8.py b/examples/quantizing_moe/deepseek_moe_w8a8_fp8.py index 5cdfe995d..03eaafc69 100644 --- a/examples/quantizing_moe/deepseek_moe_w8a8_fp8.py +++ b/examples/quantizing_moe/deepseek_moe_w8a8_fp8.py @@ -4,6 +4,7 @@ from llmcompressor import oneshot from llmcompressor.modifiers.quantization import QuantizationModifier +from llmcompressor.utils.dev import dispatch_for_generation # NOTE: transformers 4.49.0 has an attribute error with DeepSeek. # Please consider either downgrading your transformers version to a @@ -13,7 +14,7 @@ MODEL_ID = "deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct" model = AutoModelForCausalLM.from_pretrained( - MODEL_ID, device_map="auto", trust_remote_code=True + MODEL_ID, torch_dtype="auto", trust_remote_code=True ) tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) @@ -75,18 +76,11 @@ def tokenize(sample): trust_remote_code_model=True, ) -# Save to disk in compressed-tensors format. -SAVE_DIR = MODEL_ID.split("/")[1] + "-FP8" -model.save_pretrained(SAVE_DIR, save_compressed=True) -tokenizer.save_pretrained(SAVE_DIR) - -# Load model after saving -model = AutoModelForCausalLM.from_pretrained(SAVE_DIR, device_map="auto") - # Confirm generations of the quantized model look sane. # Generation is broken for deepseek models when using the latest transformers package if Version(__version__) < Version("4.48"): print("========== SAMPLE GENERATION ==============") + dispatch_for_generation(model) SAMPLE_INPUT = ["I love quantization because"] tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) inputs = tokenizer(SAMPLE_INPUT, return_tensors="pt", padding=True).to(model.device) @@ -98,3 +92,8 @@ def tokenize(sample): "WARNING: cannot perform sample generation of " "deepseek models with transformers >= 4.48" ) + +# Save to disk in compressed-tensors format. +SAVE_DIR = MODEL_ID.split("/")[1] + "-FP8" +model.save_pretrained(SAVE_DIR, save_compressed=True) +tokenizer.save_pretrained(SAVE_DIR) diff --git a/examples/quantizing_moe/deepseek_moe_w8a8_int8.py b/examples/quantizing_moe/deepseek_moe_w8a8_int8.py index abddef9dc..d4249c278 100644 --- a/examples/quantizing_moe/deepseek_moe_w8a8_int8.py +++ b/examples/quantizing_moe/deepseek_moe_w8a8_int8.py @@ -5,6 +5,7 @@ from llmcompressor import oneshot from llmcompressor.modifiers.quantization import GPTQModifier +from llmcompressor.utils.dev import dispatch_for_generation # NOTE: transformers 4.49.0 has an attribute error with DeepSeek. # Please consider either downgrading your transformers version to a @@ -76,18 +77,11 @@ def tokenize(sample): trust_remote_code_model=True, ) -# Save to disk in compressed-tensors format. 
-SAVE_DIR = MODEL_ID.split("/")[1] + "-W8A8" -model.save_pretrained(SAVE_DIR, save_compressed=True) -tokenizer.save_pretrained(SAVE_DIR) - -# Load model after saving -model = AutoModelForCausalLM.from_pretrained(SAVE_DIR, device_map="auto") - # Confirm generations of the quantized model look sane. # Generation is broken for deepseek models when using the latest transformers package if Version(__version__) < Version("4.48"): print("========== SAMPLE GENERATION ==============") + dispatch_for_generation(model) SAMPLE_INPUT = ["I love quantization because"] tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) inputs = tokenizer(SAMPLE_INPUT, return_tensors="pt", padding=True).to(model.device) @@ -100,3 +94,8 @@ def tokenize(sample): "WARNING: cannot perform sample generation of " "deepseek models with transformers >= 4.48" ) + +# Save to disk in compressed-tensors format. +SAVE_DIR = MODEL_ID.split("/")[1] + "-W8A8" +model.save_pretrained(SAVE_DIR, save_compressed=True) +tokenizer.save_pretrained(SAVE_DIR) diff --git a/examples/quantizing_moe/mixtral_moe_w8a8_fp8.py b/examples/quantizing_moe/mixtral_moe_w8a8_fp8.py index df0876088..3361e86bc 100644 --- a/examples/quantizing_moe/mixtral_moe_w8a8_fp8.py +++ b/examples/quantizing_moe/mixtral_moe_w8a8_fp8.py @@ -5,9 +5,9 @@ from llmcompressor import oneshot from llmcompressor.modifiers.quantization import QuantizationModifier +from llmcompressor.utils.dev import dispatch_for_generation MODEL_ID = "mistralai/Mixtral-8x7B-Instruct-v0.1" -NUM_GPUS = 2 model = AutoModelForCausalLM.from_pretrained(MODEL_ID, torch_dtype="auto") tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) @@ -37,18 +37,11 @@ num_calibration_samples=NUM_CALIBRATION_SAMPLES, ) -# Save to disk in compressed-tensors format. -SAVE_DIR = f"{MODEL_ID.split('/')[-1]}-FP8" -model.save_pretrained(SAVE_DIR, save_compressed=True) -tokenizer.save_pretrained(SAVE_DIR) - -# Load model after saving -model = AutoModelForCausalLM.from_pretrained(SAVE_DIR, device_map="auto") - # Confirm generations of the quantized model look sane. # Generation is broken for deepseek models when using the latest transformers package if Version(__version__) < Version("4.48"): print("========== SAMPLE GENERATION ==============") + dispatch_for_generation(model) input_ids = tokenizer("Hello my name is", return_tensors="pt").input_ids.to("cuda") output = model.generate(input_ids, max_new_tokens=20) print(tokenizer.decode(output[0])) @@ -58,3 +51,8 @@ "WARNING: cannot perform sample generation of " "deepseek models with transformers >= 4.48" ) + +# Save to disk in compressed-tensors format. +SAVE_DIR = f"{MODEL_ID.split('/')[-1]}-FP8" +model.save_pretrained(SAVE_DIR, save_compressed=True) +tokenizer.save_pretrained(SAVE_DIR) diff --git a/examples/quantizing_moe/qwen_moe_w4a16.py b/examples/quantizing_moe/qwen_moe_w4a16.py index 18a6c74ff..15b26f656 100644 --- a/examples/quantizing_moe/qwen_moe_w4a16.py +++ b/examples/quantizing_moe/qwen_moe_w4a16.py @@ -4,6 +4,7 @@ from llmcompressor.modifiers.quantization import GPTQModifier from llmcompressor.transformers import oneshot +from llmcompressor.utils.dev import dispatch_for_generation # select a Mixture of Experts model for quantization MODEL_ID = "Qwen/Qwen1.5-MoE-A2.7B-Chat" @@ -69,17 +70,15 @@ def tokenize(sample): trust_remote_code_model=True, ) -# Save to disk in compressed-tensors format. 
-SAVE_DIR = MODEL_ID.split("/")[1] + "-quantized.w4a16" -model.save_pretrained(SAVE_DIR, save_compressed=True) -tokenizer.save_pretrained(SAVE_DIR) - -# Load model after saving -model = AutoModelForCausalLM.from_pretrained(SAVE_DIR, device_map="auto") - # Confirm generations of the quantized model look sane. print("========== SAMPLE GENERATION ==============") +dispatch_for_generation(model) input_ids = tokenizer("Hello my name is", return_tensors="pt").input_ids.to("cuda") output = model.generate(input_ids, max_new_tokens=20) print(tokenizer.decode(output[0])) print("==========================================") + +# Save to disk in compressed-tensors format. +SAVE_DIR = MODEL_ID.split("/")[1] + "-quantized.w4a16" +model.save_pretrained(SAVE_DIR, save_compressed=True) +tokenizer.save_pretrained(SAVE_DIR) diff --git a/examples/sparse_2of4_quantization_fp8/llama3_8b_2of4.py b/examples/sparse_2of4_quantization_fp8/llama3_8b_2of4.py index d51ad7670..5f941e87c 100644 --- a/examples/sparse_2of4_quantization_fp8/llama3_8b_2of4.py +++ b/examples/sparse_2of4_quantization_fp8/llama3_8b_2of4.py @@ -6,6 +6,7 @@ from llmcompressor import oneshot from llmcompressor.modifiers.obcq import SparseGPTModifier from llmcompressor.modifiers.quantization import QuantizationModifier +from llmcompressor.utils.dev import dispatch_for_generation # Configuration MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct" @@ -97,16 +98,14 @@ def get_recipe(fp8_enabled): num_calibration_samples=NUM_CALIBRATION_SAMPLES, ) -# Save compressed model and tokenizer -model.save_pretrained(save_dir) -tokenizer.save_pretrained(save_dir) - -# Load model after saving -model = AutoModelForCausalLM.from_pretrained(save_dir, device_map="auto") - # Validate the compressed model print("\n========== SAMPLE GENERATION ==============") +dispatch_for_generation(model) input_ids = tokenizer("Hello my name is", return_tensors="pt").input_ids.to("cuda") output = model.generate(input_ids, max_new_tokens=100) print(tokenizer.decode(output[0])) print("==========================================\n") + +# Save compressed model and tokenizer +model.save_pretrained(save_dir) +tokenizer.save_pretrained(save_dir) diff --git a/examples/trl_mixin/ex_trl_constant.py b/examples/trl_mixin/ex_trl_constant.py index c26e9f41d..ff8a370c9 100644 --- a/examples/trl_mixin/ex_trl_constant.py +++ b/examples/trl_mixin/ex_trl_constant.py @@ -7,9 +7,7 @@ model_path = "neuralmagic/Llama-2-7b-pruned50-retrained" output_dir = "./output_trl_sft_test_7b_gsm8k_sft_data" -model = AutoModelForCausalLM.from_pretrained( - model_path, torch_dtype="auto", device_map="auto" -) +model = AutoModelForCausalLM.from_pretrained(model_path, torch_dtype="auto") tokenizer = AutoTokenizer.from_pretrained(model_path) tokenizer.pad_token = tokenizer.eos_token From 91b349b3fd50ab06f1c66f72ab772cd04ca6147d Mon Sep 17 00:00:00 2001 From: Kyle Sayers Date: Fri, 13 Jun 2025 10:04:47 -0400 Subject: [PATCH 16/36] update examples 2 Signed-off-by: Kyle Sayers --- examples/awq/README.md | 6 +----- examples/awq/llama_example.py | 4 +--- examples/awq/qwen3_moe_example.py | 4 +--- examples/multimodal_audio/README.md | 6 +----- examples/multimodal_vision/README.md | 6 +----- .../llama7b_sparse_w4a16.py | 7 ++++--- examples/quantization_kv_cache/README.md | 6 +----- examples/quantization_w4a16/README.md | 4 +--- examples/quantization_w8a8_fp8/README.md | 3 +-- examples/quantization_w8a8_int8/README.md | 4 +--- examples/quantizing_moe/deepseek_moe_w4a16.py | 15 +++------------ 11 files changed, 16 insertions(+), 49 
deletions(-) diff --git a/examples/awq/README.md b/examples/awq/README.md index 0a837d6f3..fd4cb4b62 100644 --- a/examples/awq/README.md +++ b/examples/awq/README.md @@ -18,11 +18,7 @@ recipe = [ To use your own model, start with an existing example change the `model_id` to match your own model stub. ```python model_id = "path/to/your/model" -model = AutoModelForCausalLM.from_pretrained( - model_id, - device_map="auto", - torch_dtype="auto", -) +model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype="auto") ``` ## Adding Mappings ## diff --git a/examples/awq/llama_example.py b/examples/awq/llama_example.py index 9d2c724d7..0db10c478 100644 --- a/examples/awq/llama_example.py +++ b/examples/awq/llama_example.py @@ -7,9 +7,7 @@ # Select model and load it. MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct" -model = AutoModelForCausalLM.from_pretrained( - MODEL_ID, device_map="auto", torch_dtype="auto" -) +model = AutoModelForCausalLM.from_pretrained(MODEL_ID, torch_dtype="auto") tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True) # Select calibration dataset. diff --git a/examples/awq/qwen3_moe_example.py b/examples/awq/qwen3_moe_example.py index 3c16d2f43..5634621f6 100644 --- a/examples/awq/qwen3_moe_example.py +++ b/examples/awq/qwen3_moe_example.py @@ -8,9 +8,7 @@ # Select model and load it. MODEL_ID = "Qwen/Qwen3-30B-A3B" -model = AutoModelForCausalLM.from_pretrained( - MODEL_ID, device_map="auto", torch_dtype="auto" -) +model = AutoModelForCausalLM.from_pretrained(MODEL_ID, torch_dtype="auto") tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True) # Select calibration dataset. diff --git a/examples/multimodal_audio/README.md b/examples/multimodal_audio/README.md index d3d0631f9..e7ecca950 100644 --- a/examples/multimodal_audio/README.md +++ b/examples/multimodal_audio/README.md @@ -21,11 +21,7 @@ This directory contains example scripts for quantizing a variety of audio langua To use your own multimodal modal, start with an existing example change the `model_id` to match your own model stub. ```python3 model_id = "path/to/your/model" -model = AutoModelForCausalLM.from_pretrained( - model_id, - device_map="auto", - torch_dtype="auto", -) +model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype="auto") ``` ## Customizing GPTQModifier Parameters ## diff --git a/examples/multimodal_vision/README.md b/examples/multimodal_vision/README.md index 9d6d12295..c0d0808b4 100644 --- a/examples/multimodal_vision/README.md +++ b/examples/multimodal_vision/README.md @@ -25,11 +25,7 @@ This directory contains example scripts for quantizing a variety of vision-langu To use your own multimodal modal, start with an existing example change the `model_id` to match your own model stub. 
```python3 model_id = "path/to/your/model" -model = AutoModelForCausalLM.from_pretrained( - model_id, - device_map="auto", - torch_dtype="auto", -) +model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype="auto") ``` ## Customizing GPTQModifier Parameters ## diff --git a/examples/quantization_2of4_sparse_w4a16/llama7b_sparse_w4a16.py b/examples/quantization_2of4_sparse_w4a16/llama7b_sparse_w4a16.py index 6ed01e7d1..4bf505047 100644 --- a/examples/quantization_2of4_sparse_w4a16/llama7b_sparse_w4a16.py +++ b/examples/quantization_2of4_sparse_w4a16/llama7b_sparse_w4a16.py @@ -3,12 +3,11 @@ from transformers import AutoModelForCausalLM, AutoTokenizer from llmcompressor import oneshot, train +from llmcompressor.utils.dev import dispatch_for_generation # load the model in as bfloat16 to save on memory and compute model_stub = "neuralmagic/Llama-2-7b-ultrachat200k" -model = AutoModelForCausalLM.from_pretrained( - model_stub, torch_dtype=torch.bfloat16, device_map="auto" -) +model = AutoModelForCausalLM.from_pretrained(model_stub, torch_dtype=torch.bfloat16) tokenizer = AutoTokenizer.from_pretrained(model_stub) # uses LLM Compressor's built-in preprocessing for ultra chat @@ -71,6 +70,7 @@ ) # Sparse finetune +dispatch_for_generation(model) finetune_applied_model = train( model=oneshot_applied_model, **oneshot_kwargs, @@ -79,6 +79,7 @@ ) # Oneshot quantization +model.to("cpu") quantized_model = oneshot( model=finetune_applied_model, **oneshot_kwargs, diff --git a/examples/quantization_kv_cache/README.md b/examples/quantization_kv_cache/README.md index 8ec73dee3..826bc6322 100644 --- a/examples/quantization_kv_cache/README.md +++ b/examples/quantization_kv_cache/README.md @@ -39,11 +39,7 @@ Load the model using `AutoModelForCausalLM`: from transformers import AutoModelForCausalLM, AutoTokenizer MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct" -model = AutoModelForCausalLM.from_pretrained( - MODEL_ID, - device_map="auto", - torch_dtype="auto", -) +model = AutoModelForCausalLM.from_pretrained(MODEL_ID, torch_dtype="auto") tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) ``` diff --git a/examples/quantization_w4a16/README.md b/examples/quantization_w4a16/README.md index 27edb92a6..5a3a07d67 100644 --- a/examples/quantization_w4a16/README.md +++ b/examples/quantization_w4a16/README.md @@ -40,9 +40,7 @@ Load the model using `AutoModelForCausalLM` for handling quantized saving and lo from transformers import AutoTokenizer, AutoModelForCausalLM MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct" -model = AutoModelForCausalLM.from_pretrained( - MODEL_ID, device_map="auto", torch_dtype="auto", -) +model = AutoModelForCausalLM.from_pretrained(MODEL_ID, torch_dtype="auto") tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) ``` diff --git a/examples/quantization_w8a8_fp8/README.md b/examples/quantization_w8a8_fp8/README.md index 2b817ba1e..f0a7e8d6c 100644 --- a/examples/quantization_w8a8_fp8/README.md +++ b/examples/quantization_w8a8_fp8/README.md @@ -38,8 +38,7 @@ from transformers import AutoTokenizer, AutoModelForCausalLM MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct" -model = AutoModelForCausalLM.from_pretrained( - MODEL_ID, device_map="auto", torch_dtype="auto") +model = AutoModelForCausalLM.from_pretrained(MODEL_ID, torch_dtype="auto") tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) ``` diff --git a/examples/quantization_w8a8_int8/README.md b/examples/quantization_w8a8_int8/README.md index bda9d2d46..830ec2ac9 100644 --- a/examples/quantization_w8a8_int8/README.md +++ 
b/examples/quantization_w8a8_int8/README.md @@ -38,9 +38,7 @@ Load the model using `AutoModelForCausalLM` for handling quantized saving and lo from transformers import AutoTokenizer, AutoModelForCausalLM MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct" -model = AutoModelForCausalLM.from_pretrained( - MODEL_ID, device_map="auto", torch_dtype="auto", -) +model = AutoModelForCausalLM.from_pretrained(MODEL_ID, torch_dtype="auto") tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) ``` diff --git a/examples/quantizing_moe/deepseek_moe_w4a16.py b/examples/quantizing_moe/deepseek_moe_w4a16.py index 6d311858d..5d08dc703 100644 --- a/examples/quantizing_moe/deepseek_moe_w4a16.py +++ b/examples/quantizing_moe/deepseek_moe_w4a16.py @@ -4,7 +4,7 @@ from transformers import AutoModelForCausalLM, AutoTokenizer, __version__ from llmcompressor import oneshot -from llmcompressor.transformers.compression.helpers import calculate_offload_device_map +from llmcompressor.utils.dev import dispatch_for_generation # NOTE: transformers 4.49.0 has an attribute error with DeepSeek. # Please consider either downgrading your transformers version to a @@ -13,18 +13,8 @@ # select a Mixture of Experts model for quantization MODEL_ID = "deepseek-ai/DeepSeek-V2.5" -# adjust based off number of desired GPUs -# if not enough memory is available, some layers will automatically be offlaoded to cpu -device_map = calculate_offload_device_map( - MODEL_ID, - reserve_for_hessians=True, - num_gpus=2, - torch_dtype=torch.bfloat16, - trust_remote_code=True, -) - model = AutoModelForCausalLM.from_pretrained( - MODEL_ID, device_map=device_map, torch_dtype=torch.bfloat16, trust_remote_code=True + MODEL_ID, torch_dtype=torch.bfloat16, trust_remote_code=True ) tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) @@ -84,6 +74,7 @@ def tokenize(sample): # Generation is broken for deepseek models when using the latest transformers package if Version(__version__) < Version("4.48"): print("========== SAMPLE GENERATION ==============") + dispatch_for_generation(model) input_ids = tokenizer("Hello my name is", return_tensors="pt").input_ids.to("cuda") output = model.generate(input_ids, max_new_tokens=20) print(tokenizer.decode(output[0])) From 8e58e35b08ed96cdfa833f5b6df29f2e4822bcff Mon Sep 17 00:00:00 2001 From: Kyle Sayers Date: Fri, 13 Jun 2025 12:27:43 -0400 Subject: [PATCH 17/36] remove fallback_to_cpu, use ct utils Signed-off-by: Kyle Sayers --- src/llmcompressor/entrypoints/utils.py | 15 ++------ .../pipelines/layer_sequential/pipeline.py | 6 ++-- .../pipelines/sequential/helpers.py | 35 +------------------ .../pipelines/sequential/pipeline.py | 3 +- .../pytorch/model_load/helpers.py | 17 --------- 5 files changed, 6 insertions(+), 70 deletions(-) diff --git a/src/llmcompressor/entrypoints/utils.py b/src/llmcompressor/entrypoints/utils.py index 7667418e2..f648fa771 100644 --- a/src/llmcompressor/entrypoints/utils.py +++ b/src/llmcompressor/entrypoints/utils.py @@ -17,7 +17,7 @@ from llmcompressor.args import ModelArguments, RecipeArguments, TrainingArguments from llmcompressor.core import reset_session -from llmcompressor.pytorch.model_load.helpers import fallback_to_cpu, parse_dtype +from llmcompressor.pytorch.model_load.helpers import parse_dtype from llmcompressor.transformers.sparsification.compressed_tensors_utils import ( modify_save_pretrained, patch_tied_tensors_bug, @@ -197,20 +197,12 @@ def initialize_model_from_path( else model_args.model_name_or_path ) - # Fallback to CPU if GPU requested and not available - 
model_args.oneshot_device = fallback_to_cpu(model_args.oneshot_device) - - device_map = model_args.oneshot_device - if training_args is not None and training_args.do_train: - device_map = "auto" - model_kwargs = { "config": config, "cache_dir": model_args.cache_dir, "revision": model_args.model_revision, "use_auth_token": True if model_args.use_auth_token else None, "torch_dtype": parse_dtype(model_args.precision), - "device_map": device_map, "trust_remote_code": model_args.trust_remote_code_model, } @@ -220,10 +212,7 @@ def initialize_model_from_path( run_compressed=False ) - model = AutoModelForCausalLM.from_pretrained( - model_path, - **model_kwargs, - ) + model = AutoModelForCausalLM.from_pretrained(model_path, **model_kwargs) if "sequence_length" in model_kwargs: model.seqlen = model_kwargs["sequence_length"] diff --git a/src/llmcompressor/pipelines/layer_sequential/pipeline.py b/src/llmcompressor/pipelines/layer_sequential/pipeline.py index 3a0cd8cb6..2cfda0d0e 100644 --- a/src/llmcompressor/pipelines/layer_sequential/pipeline.py +++ b/src/llmcompressor/pipelines/layer_sequential/pipeline.py @@ -2,6 +2,7 @@ import torch import tqdm +from compressed_tensors.utils import disable_offloading from loguru import logger from torch.utils.data.dataloader import DataLoader @@ -15,10 +16,7 @@ to_next_layer_kwargs, ) from llmcompressor.pipelines.registry import CalibrationPipeline -from llmcompressor.pipelines.sequential.helpers import ( - disable_offloading, - get_targets_from_modifiers, -) +from llmcompressor.pipelines.sequential.helpers import get_targets_from_modifiers from llmcompressor.utils.helpers import DisableQuantization, calibration_forward_context if TYPE_CHECKING: diff --git a/src/llmcompressor/pipelines/sequential/helpers.py b/src/llmcompressor/pipelines/sequential/helpers.py index 6cb63acdd..b7937a2fc 100644 --- a/src/llmcompressor/pipelines/sequential/helpers.py +++ b/src/llmcompressor/pipelines/sequential/helpers.py @@ -5,7 +5,6 @@ from typing import Any, Dict, List, Optional, Set import torch -from accelerate.hooks import AlignDevicesHook from compressed_tensors import has_offloaded_params from compressed_tensors.quantization import find_name_or_class_matches from loguru import logger @@ -24,12 +23,7 @@ from .ast_helpers import autowrap_forwards -__all__ = [ - "trace_subgraphs", - "Subgraph", - "get_targets_from_modifiers", - "disable_offloading", -] +__all__ = ["trace_subgraphs", "Subgraph", "get_targets_from_modifiers"] @dataclass @@ -491,30 +485,3 @@ def is_ancestor(module: Module) -> bool: is_ancestor(model) return ancestors - - -@contextlib.contextmanager -def disable_offloading(): - """ - Keep modules onloaded and disable offloading until this context exits. 
- Affects modules which have been hooked with accelerate's `AlignDevicesHook` - """ - original_pre_forward = AlignDevicesHook.pre_forward - onloaded_modules = dict() - - # onload once and disable any future onloading/offloading steps - def keep_onload_pre_forward(self: AlignDevicesHook, module, *args, **kwargs): - ret = original_pre_forward(self, module, *args, **kwargs) - if module not in onloaded_modules: - onloaded_modules[module] = (self, self.offload) - self.offload = False - return ret - - # use the patched pre_forward function within the context - with patch_attr(AlignDevicesHook, "pre_forward", keep_onload_pre_forward): - yield - - # manually offload all modules that were onloaded - for module, (hook, offload) in onloaded_modules.items(): - hook.offload = offload - hook.post_forward(module, None) diff --git a/src/llmcompressor/pipelines/sequential/pipeline.py b/src/llmcompressor/pipelines/sequential/pipeline.py index ab794daa4..3e0490b70 100644 --- a/src/llmcompressor/pipelines/sequential/pipeline.py +++ b/src/llmcompressor/pipelines/sequential/pipeline.py @@ -1,7 +1,7 @@ from typing import TYPE_CHECKING import torch -from compressed_tensors.utils import get_execution_device +from compressed_tensors.utils import disable_offloading, get_execution_device from loguru import logger from torch.utils.data.dataloader import DataLoader from tqdm import tqdm @@ -11,7 +11,6 @@ from llmcompressor.pipelines.cache import IntermediatesCache from llmcompressor.pipelines.registry import CalibrationPipeline from llmcompressor.pipelines.sequential.helpers import ( - disable_offloading, get_targets_from_modifiers, trace_subgraphs, ) diff --git a/src/llmcompressor/pytorch/model_load/helpers.py b/src/llmcompressor/pytorch/model_load/helpers.py index 0ffbd053e..de4b061ec 100644 --- a/src/llmcompressor/pytorch/model_load/helpers.py +++ b/src/llmcompressor/pytorch/model_load/helpers.py @@ -15,7 +15,6 @@ __all__ = [ "copy_python_files_from_model_cache", - "fallback_to_cpu", "parse_dtype", "get_session_model", "get_completed_stages", @@ -71,22 +70,6 @@ def save_checkpoint( compressor.decompress_model(model) -def fallback_to_cpu(device: str) -> str: - """ - Takes in a device string and forces it to cpu if cuda is not available - - :param device: device id to check - :return: device modified for CUDA status - """ - if "cuda" in device and not torch.cuda.is_available(): - logger.warning( - f"Requested {device} but CUDA is not available, falling back to CPU" - ) - return "cpu" - - return device - - def parse_dtype(dtype_arg: Union[str, torch.dtype]) -> torch.dtype: """ :param dtype_arg: dtype or string to parse From 96631d16b0121f7f9d1269d6b2145789ddcb18cb Mon Sep 17 00:00:00 2001 From: Kyle Sayers Date: Sun, 15 Jun 2025 01:06:09 -0400 Subject: [PATCH 18/36] remove hook from module within utils function Signed-off-by: Kyle Sayers --- src/llmcompressor/entrypoints/oneshot.py | 6 +++--- src/llmcompressor/entrypoints/utils.py | 4 ---- 2 files changed, 3 insertions(+), 7 deletions(-) diff --git a/src/llmcompressor/entrypoints/oneshot.py b/src/llmcompressor/entrypoints/oneshot.py index 8bcf061f5..c1dae7933 100644 --- a/src/llmcompressor/entrypoints/oneshot.py +++ b/src/llmcompressor/entrypoints/oneshot.py @@ -3,7 +3,6 @@ from typing import Optional import torch -from accelerate.hooks import remove_hook_from_module from compressed_tensors.utils import offloaded_dispatch from loguru import logger from torch.utils.data import DataLoader @@ -128,8 +127,9 @@ def __init__( # offload to cpu if possible if "cuda" in 
str(model_args.oneshot_device) and torch.cuda.is_available(): - remove_hook_from_module(model_args.model, recurse=True) - offloaded_dispatch(model_args.model, model_args.oneshot_device) + offloaded_dispatch( + model_args.model, execution_device=model_args.oneshot_device + ) else: logger.warning("CUDA is not available! Compressing model on CPU instead") diff --git a/src/llmcompressor/entrypoints/utils.py b/src/llmcompressor/entrypoints/utils.py index f648fa771..4bbc31e82 100644 --- a/src/llmcompressor/entrypoints/utils.py +++ b/src/llmcompressor/entrypoints/utils.py @@ -3,7 +3,6 @@ from pathlib import PosixPath from typing import Optional, Tuple -from accelerate.hooks import remove_hook_from_module from loguru import logger from torch.nn import Module from transformers import ( @@ -106,9 +105,6 @@ def post_process( "Ex. `oneshot(..., output_dir=...)`" ) - # Remove any existing hooks (maybe added by oneshot sequential onloading) - remove_hook_from_module(model_args.model, recurse=True) - # Reset the one-time-use session upon completion if recipe_args is not None and recipe_args.clear_sparse_session: reset_session() From 96476fe07a10738832c8a86f06c54743a1c9f774 Mon Sep 17 00:00:00 2001 From: Kyle Sayers Date: Sun, 15 Jun 2025 01:09:46 -0400 Subject: [PATCH 19/36] remove unused util Signed-off-by: Kyle Sayers --- .../transformers/compression/helpers.py | 27 ------------------- 1 file changed, 27 deletions(-) diff --git a/src/llmcompressor/transformers/compression/helpers.py b/src/llmcompressor/transformers/compression/helpers.py index d02a08809..acb71b986 100644 --- a/src/llmcompressor/transformers/compression/helpers.py +++ b/src/llmcompressor/transformers/compression/helpers.py @@ -104,33 +104,6 @@ def infer_sparsity_structure_from_model(model: torch.nn.Module) -> Optional[str] return None -def quantization_memory_requirement(model: torch.nn.Module) -> int: - """ - Determines the max number of bytes needed to store quantization scale and zp data - - :param model: model to calculate requirements for - :return: number of bytes required to reserve for quantization - """ - - total_elements = 0 - for _, module in model.named_modules(): - if isinstance(module, Linear): - for param in module.parameters(): - # assume the max of group 128 and static scale/zp - # TODO: base this on the recipe instead instead of assuming max - - # potentially just bias term - max_quant_shape = param.shape[0] // 128 - - if len(param.size()) > 1: # weights - max_quant_shape *= param.shape[1] - - total_elements += max_quant_shape * 4 - - bytes_ratio = 32 // 16 # assuming float16 - return total_elements * bytes_ratio - - def infer_sparse_targets_and_ignores( model: torch.nn.Module, sparsity_structure: str, From cb965c91fd0309ad8f5a590f0a767d04255c6539 Mon Sep 17 00:00:00 2001 From: Kyle Sayers Date: Sun, 15 Jun 2025 22:45:46 -0400 Subject: [PATCH 20/36] docstring Signed-off-by: Kyle Sayers --- src/llmcompressor/utils/dev.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/src/llmcompressor/utils/dev.py b/src/llmcompressor/utils/dev.py index 1c67f6678..e773b48f1 100644 --- a/src/llmcompressor/utils/dev.py +++ b/src/llmcompressor/utils/dev.py @@ -116,6 +116,14 @@ def patch_transformers_logger_level(level: int = logging.ERROR): def dispatch_for_generation(model: PreTrainedModel) -> PreTrainedModel: + """ + Dispatch a model autoregressive generation. This means that modules are dispatched + evenly across avaiable devices and kept onloaded if possible. Removes any HF hooks + that may have existed previously. 
+ + :param model: model to dispatch + :return: model which is dispatched + """ remove_hook_from_module(model, recurse=True) max_memory = get_balanced_memory( model, From 8769b8591174fbb68677e57e8ca1ce743bdb1e24 Mon Sep 17 00:00:00 2001 From: Kyle Sayers Date: Sun, 15 Jun 2025 22:51:18 -0400 Subject: [PATCH 21/36] remove big model example tests Signed-off-by: Kyle Sayers --- .../test_big_models_with_accelerate.py | 74 ------------------- 1 file changed, 74 deletions(-) delete mode 100644 tests/examples/test_big_models_with_accelerate.py diff --git a/tests/examples/test_big_models_with_accelerate.py b/tests/examples/test_big_models_with_accelerate.py deleted file mode 100644 index 019017bdd..000000000 --- a/tests/examples/test_big_models_with_accelerate.py +++ /dev/null @@ -1,74 +0,0 @@ -from pathlib import Path - -import pytest - -from tests.examples.utils import ( - ReadMe, - copy_and_run_script, - gen_cmd_fail_message, - requires_gpu_count, -) - - -@pytest.fixture -def example_dir() -> str: - return "examples/big_models_with_accelerate" - - -@pytest.mark.example -class TestBigModelsWithAccelerate: - """ - Tests for examples in the "big_models_with_accelerate" example folder. - """ - - def test_readme_has_install_command(self, example_dir: str): - """ - Test that the README has a valid install command. - """ - readme_path = Path.cwd() / example_dir / "README.md" - readme = ReadMe(readme_path) - - code = readme.get_code_block_content(position=1, lang="shell") - assert "pip install" in code - - assert code.startswith("pip install llmcompressor") - - @pytest.mark.parametrize( - ("script_filename", "visible_gpus"), - [ - pytest.param("cpu_offloading_fp8.py", "0", id="cpu_offloading"), - pytest.param( - "multi_gpu_int8.py", - "", - id="multi_gpu_int8", - marks=[ - requires_gpu_count(2), - pytest.mark.multi_gpu, - ], - ), - pytest.param( - "mult_gpus_int8_device_map.py", - "0", - id="mult_gpus_int8_device_map", - ), - ], - ) - @requires_gpu_count(1) - def test_example_scripts( - self, - example_dir: str, - visible_gpus: str, - script_filename: str, - tmp_path: Path, - monkeypatch: pytest.MonkeyPatch, - ): - """ - Test for the example scripts in the folder. 
- """ - - if visible_gpus: - monkeypatch.setenv("CUDA_VISIBLE_DEVICES", visible_gpus) - - command, result = copy_and_run_script(tmp_path, example_dir, script_filename) - - assert result.returncode == 0, gen_cmd_fail_message(command, result) From a389d1488cfb29e549a1f92c4b65d917d763fcad Mon Sep 17 00:00:00 2001 From: Kyle Sayers Date: Sun, 15 Jun 2025 23:16:52 -0400 Subject: [PATCH 22/36] big modeling example readme Signed-off-by: Kyle Sayers --- .../cpu_offloading_fp8.py | 26 ------ .../mult_gpus_int8_device_map.py | 81 ------------------ .../multi_gpu_int8.py | 78 ----------------- .../README.md | 12 +++ .../assets/sequential_onloading.png | Bin 0 -> 71199 bytes examples/quantization_w4a16/llama3_example.py | 2 +- 6 files changed, 13 insertions(+), 186 deletions(-) delete mode 100644 examples/big_models_with_accelerate/cpu_offloading_fp8.py delete mode 100644 examples/big_models_with_accelerate/mult_gpus_int8_device_map.py delete mode 100644 examples/big_models_with_accelerate/multi_gpu_int8.py create mode 100644 examples/big_models_with_sequential_onloading/README.md create mode 100644 examples/big_models_with_sequential_onloading/assets/sequential_onloading.png diff --git a/examples/big_models_with_accelerate/cpu_offloading_fp8.py b/examples/big_models_with_accelerate/cpu_offloading_fp8.py deleted file mode 100644 index ded5ff8d6..000000000 --- a/examples/big_models_with_accelerate/cpu_offloading_fp8.py +++ /dev/null @@ -1,26 +0,0 @@ -from transformers import AutoModelForCausalLM, AutoTokenizer - -from llmcompressor import oneshot -from llmcompressor.modifiers.quantization import QuantizationModifier - -MODEL_ID = "meta-llama/Meta-Llama-3-70B-Instruct" -OUTPUT_DIR = MODEL_ID.rstrip("/").split("/")[-1] + "-FP8-Dynamic" - -# Load model -# Note: device_map="auto" will offload to CPU if not enough space on GPU. -model = AutoModelForCausalLM.from_pretrained( - MODEL_ID, device_map="auto", torch_dtype="auto", trust_remote_code=True -) - -# Configure the quantization scheme and algorithm (PTQ + FP8_DYNAMIC). -recipe = QuantizationModifier( - targets="Linear", scheme="FP8_DYNAMIC", ignore=["lm_head"] -) - -# Apply quantization and save in `compressed-tensors` format. -oneshot( - model=model, - recipe=recipe, - tokenizer=AutoTokenizer.from_pretrained(MODEL_ID), - output_dir=OUTPUT_DIR, -) diff --git a/examples/big_models_with_accelerate/mult_gpus_int8_device_map.py b/examples/big_models_with_accelerate/mult_gpus_int8_device_map.py deleted file mode 100644 index d98051d21..000000000 --- a/examples/big_models_with_accelerate/mult_gpus_int8_device_map.py +++ /dev/null @@ -1,81 +0,0 @@ -import torch -from datasets import load_dataset -from transformers import AutoModelForCausalLM, AutoTokenizer - -from llmcompressor import oneshot -from llmcompressor.modifiers.quantization import GPTQModifier -from llmcompressor.modifiers.smoothquant import SmoothQuantModifier -from llmcompressor.transformers.compression.helpers import calculate_offload_device_map - -MODEL_ID = "meta-llama/Meta-Llama-3-70B-Instruct" - -# adjust based off number of desired GPUs -# reserve_for_hessians=True reserves memory which is required by -# GPTQModifier and SparseGPTModifier -device_map = calculate_offload_device_map( - MODEL_ID, num_gpus=1, reserve_for_hessians=True, torch_dtype=torch.bfloat16 -) - -model = AutoModelForCausalLM.from_pretrained( - MODEL_ID, device_map=device_map, torch_dtype=torch.bfloat16 -) -tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) - -# Select calibration dataset. 
-DATASET_ID = "HuggingFaceH4/ultrachat_200k" -DATASET_SPLIT = "train_sft" -NUM_CALIBRATION_SAMPLES = 512 -MAX_SEQUENCE_LENGTH = 2048 - - -# Load dataset and preprocess. -ds = load_dataset(DATASET_ID, split=f"{DATASET_SPLIT}[:{NUM_CALIBRATION_SAMPLES}]") -ds = ds.shuffle(seed=42) - - -def preprocess(example): - return { - "text": tokenizer.apply_chat_template( - example["messages"], - tokenize=False, - ) - } - - -ds = ds.map(preprocess) - - -# Tokenize inputs. -def tokenize(sample): - return tokenizer( - sample["text"], - padding=False, - max_length=MAX_SEQUENCE_LENGTH, - truncation=True, - add_special_tokens=False, - ) - - -ds = ds.map(tokenize, remove_columns=ds.column_names) - -# define a llmcompressor recipe for W8A8 quantization -recipe = [ - SmoothQuantModifier(smoothing_strength=0.8), - GPTQModifier( - targets="Linear", - scheme="W8A8", - ignore=["lm_head"], - ), -] - -SAVE_DIR = MODEL_ID.rstrip("/").split("/")[-1] + "-INT8" - -oneshot( - model=model, - dataset=ds, - recipe=recipe, - max_seq_length=MAX_SEQUENCE_LENGTH, - num_calibration_samples=NUM_CALIBRATION_SAMPLES, - save_compressed=True, - output_dir=SAVE_DIR, -) diff --git a/examples/big_models_with_accelerate/multi_gpu_int8.py b/examples/big_models_with_accelerate/multi_gpu_int8.py deleted file mode 100644 index 9c1679eab..000000000 --- a/examples/big_models_with_accelerate/multi_gpu_int8.py +++ /dev/null @@ -1,78 +0,0 @@ -from datasets import load_dataset -from transformers import AutoModelForCausalLM, AutoTokenizer - -from llmcompressor import oneshot -from llmcompressor.modifiers.quantization import GPTQModifier - -MODEL_ID = "meta-llama/Meta-Llama-3-70B-Instruct" -SAVE_DIR = MODEL_ID.rstrip("/").split("/")[-1] + "-W8A8-Dynamic" - -# 1) Load model (device_map="auto" with shard the model over multiple GPUs!). -model = AutoModelForCausalLM.from_pretrained( - MODEL_ID, - device_map="auto", - torch_dtype="auto", - trust_remote_code=True, -) -tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) - -# 2) Prepare calibration dataset (in this case, we use ultrachat). -DATASET_ID = "HuggingFaceH4/ultrachat_200k" -DATASET_SPLIT = "train_sft" - -# Select number of samples. 512 samples is a good place to start. -# Increasing the number of samples can improve accuracy. -NUM_CALIBRATION_SAMPLES = 512 -MAX_SEQUENCE_LENGTH = 1024 - -# Load dataset and preprocess. -ds = load_dataset(DATASET_ID, split=f"{DATASET_SPLIT}[:{NUM_CALIBRATION_SAMPLES}]") -ds = ds.shuffle(seed=42) - - -def preprocess(example): - return { - "text": tokenizer.apply_chat_template( - example["messages"], - tokenize=False, - ) - } - - -ds = ds.map(preprocess) - - -# Tokenize inputs. -def tokenize(sample): - return tokenizer( - sample["text"], - padding=False, - max_length=MAX_SEQUENCE_LENGTH, - truncation=True, - add_special_tokens=False, - ) - - -ds = ds.map(tokenize, remove_columns=ds.column_names) - -# 3) Configure algorithms. In this case, we: -# * quantize the weights to int8 with GPTQ (static per channel) -# * quantize the activations to int8 (dynamic per token) -recipe = [ - GPTQModifier( - targets="Linear", scheme="W8A8", ignore=["lm_head"], dampening_frac=0.1 - ), -] - -# 4) Apply algorithms and save in `compressed-tensors` format. 
-# if you encounter GPU out-of-memory issues, consider using an explicit -# device map (see multi_gpus_int8_device_map.py) -oneshot( - model=model, - tokenizer=tokenizer, - dataset=ds, - recipe=recipe, - max_seq_length=MAX_SEQUENCE_LENGTH, - num_calibration_samples=NUM_CALIBRATION_SAMPLES, - output_dir=SAVE_DIR, -) diff --git a/examples/big_models_with_sequential_onloading/README.md b/examples/big_models_with_sequential_onloading/README.md new file mode 100644 index 000000000..f10e1e394 --- /dev/null +++ b/examples/big_models_with_sequential_onloading/README.md @@ -0,0 +1,12 @@ +## Big Modeling with Sequential Onloading ## +### What is Sequential Onloading? ### +Sequential onloading is a memory-efficient approach for compressing large language models (LLMs) using only a single GPU. Instead of loading the entire model into memory—which can easily require hundreds of gigabytes—this method loads and compresses one layer at a time. The outputs are offloaded before the next layer is processed, dramatically reducing peak memory usage while maintaining high compression fidelity. + +
+<p align="center">
+    <img alt="Sequential onloading" src="assets/sequential_onloading.png"/>
+</p>
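+
+A minimal sketch of this flow is shown below. The model stub, recipe, and dataset alias are illustrative placeholders rather than part of this example folder, and the `pipeline="basic"` escape hatch is covered in the section that follows.
+
+```python
+from transformers import AutoModelForCausalLM, AutoTokenizer
+
+from llmcompressor import oneshot
+from llmcompressor.modifiers.quantization import GPTQModifier
+from llmcompressor.utils.dev import dispatch_for_generation
+
+MODEL_ID = "meta-llama/Meta-Llama-3-70B-Instruct"  # illustrative model stub
+
+# Load without a device_map: weights stay in CPU memory, and each layer is
+# onloaded to the GPU only while it is being calibrated and compressed.
+model = AutoModelForCausalLM.from_pretrained(MODEL_ID, torch_dtype="auto")
+tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
+
+# Example W4A16 recipe; swap in whatever modifier your workflow needs.
+recipe = GPTQModifier(targets="Linear", scheme="W4A16", ignore=["lm_head"])
+
+# The sequential pipeline is the default; pass pipeline="basic" to disable it
+# (see "Using Sequential Onloading" below).
+oneshot(
+    model=model,
+    dataset="open_platypus",  # illustrative calibration dataset alias
+    recipe=recipe,
+    max_seq_length=2048,
+    num_calibration_samples=512,
+)
+
+# Dispatch across available GPUs only when generating samples afterwards.
+dispatch_for_generation(model)
+```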
+ +For more information, see the [RedHat AI blog post](https://developers.redhat.com/articles/2025/05/09/llm-compressor-optimize-llms-low-latency-deployments#generalizing_to_multimodal_and_moe_architectures) or the [LLM Compressor Office Hours Recording](https://www.youtube.com/watch?v=GrhuqQDmBk8). + +### Using Sequential Onloading ### +Sequential onloading is enabled by default within LLM Compressor. To disable sequential onloading, add the `pipeline="basic"` argument to the LLM Compressor `oneshot` function call. \ No newline at end of file diff --git a/examples/big_models_with_sequential_onloading/assets/sequential_onloading.png b/examples/big_models_with_sequential_onloading/assets/sequential_onloading.png new file mode 100644 index 0000000000000000000000000000000000000000..a499cb66a98205f3a13ff7b54fc9622b3d903015 GIT binary patch literal 71199 zcmeFXWmwc*_b5DsNQsmnjdVzNkBa2bNVhO_m$XQWG}57TBi-FC-Ce@a-Ml|;^?si7 zf6sM3pAU!Y8fMRqwbovH)!u6b$;*mAK_x^5fk00rB}5fLAOsrV!-(Nuhai0n&2eL@o`fkIC0iwAba5~Zes zdmY5~B4=v2!EnB zK|be=1p&}cfeS@b<~$T|3Y9J%5q2|SYEmnOBOLrsIL>d6#f~1?(bMwt!#A@}-8IxV zAYmH0O)KP{4&B{rl>1WDfIw7)D-Pea!XF>PfexVF$}vwt3aBo@rk9G;I!{VM(fN^W zE3l38Y|5~+^A9>a6ED=5+rvQQ`Nx-~5D=#;`49u9;cF89FL=Dtz6L1mcbL~3>oQoc z1)mydnjmauFrH3tEp8d!$#8f`L=6oQx8|u`q4CMTjNLdgzDeD>SfplAW#bKaQn0OF z`cuuQpVd0dF3Xp(gui{CdMiQ!(;VUBih+Jp12#t)?spUhw=os+&DJD1n)dhWHMaF| z!YbWQ(F3NCSI{M`4BJ1-DERZLnIaqq!~`F^9Ad)WXfZ`T?szgq94iwou_&x$GBTHG~xj7g?=~k1}TZcszDL z#j*n5bsC_QK&Vhq-=;Y zNR{OKiX_6_ygJ-4f(;x2~f)uO*BY6UNb#yR! zcq_@tm}S2T>%1HY3st=gQZ+noOM7>>(^j%A2`LjTUL`F-Fq zn77;S{Ie%@Px#}YB`m}C`-8xz`Jm2lwEWWh#bcix9)E=RKsYfrUdmu$6G?OmBwwBwaeyi|xjElm4RP(sAMT(0(lqXXPN%VAY!&c-< zNQ4#JblNN>-p0fBkJFF1Jm?nZzNai7+c)(!VpcAx_|`UW41#=o^uzD48sxn6Q;$nlP^{8rjdkb;s)zc4X!Y`0t zzcxWeU;{(27~A1&UrhzFVEc-amq@Tj`WsSUi+cCJ#v{+{mdb#i{xU}H7V)MVWsP_w zbWU`SQoEaRjq>yN?HmlVFB<9T^4z_z+J-+kP$UG&zC|1M`=wY;T=?2K?erJQiJx`Z5Q6x}E$#tacD&+X;BloP8O2 zapjsHT`0R_G$Ix^=O4P>;Cd1?g`F?yqHun~YVlrryM)z=!;UhF+~LdK%H8^^)wDX6 zAH_OIlp@OH&B;eMw)i-iBuTB;bKp4{L0Jo0VXCu82`Yq0gq{H0w<}Rck{ub28Oa&% zGo&+QN8CyYr=xdXW6)%WQ?F6j5x6|Qq`SnvM7hLy8)fkJFilj!H<}>YD7rA3ruUH) z?K;l8V6!);-#eAca*uP@$N0BRw@J3mxwyFOx#TLIbCt{pR@hYtnHi7o<$K6as#K|N z=4IzGj>YC#tBMqPD&2C0h?Qksjw+g5mS>g|&PdJ(&A2`B_m4u7dy~Vany#3xvl`in z#_wnKcJn9gFxr;U$?XZ+iIvRftO^>{sA4|xGR5QY4aWf;CjSYOLb^i1LTU|x@!@{|@;ofX7pKJMlEcsaS)iz-`_sv+z@bZ=+EP8uq#B_9PcznWf zTxKk}NHf<>w8|%ltuS5?%4|RW)Hpl)tWGBQR1WIVh zX2!M6#m$Dn7DHIfA#6Hl*~)py)5EQ9X*ZV97p=c-&cLm0>DqrnARXJGzh2s_o~l)lfKYl&8gp@_(cca+5fc>;FCXvF3u`vPpq6iJ_x2$OWaloEesH|xmZ zNN?n?VaC4aYl{PGsu3Ixzhpfe*UG(L92ns2OGh{U`wg{t$Vlt$NXYRSx z>KtB8MYV_RxlL@%WR0_nnrntj!Fl!IguZxL?2jQ=Y(WIUO;>I=&jY*j-t(Ew)7YwX z=g@PE{p|Oaou50cNUnc&%?0e2O=q{ot|x9>mstm-#*-};_wIW)`8ECYO1#GjC813$ zi#n!g<^$u|ur85I1`8y*31Xt5qr611KpVl3Le4|3#kU|$BG4k5!&$*&=UDqd_kvT< z;q5QS6gIi${N+<$7vBL4foDaer1&~mcjT?YuknUCF-Y*at;VYbB2*$GyZz}fJAyTy zIr7ZVxSE|TKR19lK#clVr-n<)42Ig-R%p6*1Ko(oFq>Z9IeoJHMDd+EA|)$A2dl{Cj5$eh zno-(g+I>074C>5XiR=Eb71cwxQI{>)s2ieIi$;t6_a>W)AJS`-8uYG_%TXoltEO3U z8+rXjFPjQV3#`~wwO)9}lO#0IUu(-M51u78&SlKayR6lh*T2EGlnh-P!;PXJ%`TTF z$>EY7q{`*248p52_KU^(*|?WEFX)Z&Z+zMP10o(J|BMJ zOV=`T(^sRfSN)Fhgl}G~1@yBaSTr_Nt2h_Bz! ziZ`Fmsu-*ITtTvqwH^X{Imx7ZUPF<;VrB$h-#Zn9;TL;Vy{cXw5xWm@O#H(0)#~b^ z>|&EB@eTK+s=De|%NJJkQ%4`nS0*@`et6|hL#qoNW@C3FcS*)e#>H#$m%>TyPTj! z{G8qO+C`MAY)6ur0Ey?-PH{_9ieMX`{Z7Hq^oG68CdO^&UEy;5_2rdH)r3z|-L2=3 zfri4nPhEbU!72EOGhLNr4QtL@?d#2bO*+fkd|q5fX4e(I<~&8&I{Vcx>JLw#djqjm zs8$}eLi-rYvOU@Zbw=l3_TJspXX8jBeO*t5~aOf$dsr zEta2%oiDg0o}Z1@MV=$==ZpLhNxFfR@IcKqfsP)}@*V{#TB|J1l|QklCjg)? 
z%t4%FFqToNkV00Kny?|8H5N&2zmsx;DC=X9MQjRF*rI9!ax1+AW(oQ2=UJx zS>XHOFC6$htoi+o5a|zk4E(|dJ|Dip|DBCM^9|wOHYYF!dZqM6QWE%9dT(cBWMyw+ z?NIVzv=it+v5`=>2Z8XYA3kuBic|*x{0UQKH3u~r2>*L)OJ;pTYXc+Z50*9$a6p0| z_<^RSk%K<@2TKbpd;SkX6#vZN2igzaEEMGbOmQ$5qEM5OCx2sYXGG4;%*xD4A&g2+ zPA+I?2<2B475lRs_$5SP;^1Jz&%)yD?9A-U!E9}3%)-XU$H&6T&ce>l1k7Nvcd>HN z|G;ErPx%|jUpS&h_V4XXZ5&Lkt;iqX>Kj-)ItWovJRth_?>C)BA58y)$;$rEwtx+? zJdCigF|)G#3maG}_|VHQZ~DQ=LS59<65tttLztJHRq&tr|G$y{VEnI@YX7y8mHWR} z{@2L=T=~x4$nK4`C4kaF_&>+>XYqdz{#hu<^04#&;>2$@|I-WbSr}E2<=-GVl!u*~1@P0`QOS_xD5Fd%(hGtO5iAgCs>?DSv?5Nq(G!`*tP(hT9XT z_X@1{784F3jqckglNa`UZn~pq?j^b71%Sc2~Ol@2Q z_fzYF>Ai)N>G*jRA&#Nz>sJ9=MF+}57PoZ^Rol*tDlc{QS&_jYc+8I=L~;<^L-P^U zfq<_h+*TCV-=2R4m$1Nb2ju@_D(y#B`VTnrp#K$wek;dsP=BG~CV^TrmSn(ie^C7C z_eB}{7wbP=@;30uQJ;9=9{q=!hki^NL4?1c{viO|fAhiPb( z5sC^5Siwi~$jD%FiEng%L5Mv!muR23E_ZXLk60*&?6LnyuJ;k;K^sFdCF6YMu9WR3 ziJx+3Z?tqOYb7_*zmdkPhYN8=M+&ukf3 z?6)d;m(na5$l7c%z`8#6;)jDR{7}-VAGtGh6aI!GpM{vkC9s&!mri4J1(M(WgdLd4 zs$ZV*Z3r>`&2PQyjRBR=7|ot23nC#J&_m`RTGh54ODNxw|6pLJSPJ9Xny*tO7*EQ3 z{q5jkQW6n9hwZaFxbKL6CfQMb6i}_H;5`jN^HxKPI`yiiVNRlZ{8tc+UN_g+V;L*b zJdx#k|AySDHKW!wH#aZu2=oGCaGWHM`G1z;Ns&oa0>-hlKK1ufr7 zzW9Tx8v)K&l9W8^5h4XLIfUYy4eH-DN7TTY#mejrzlX+4FS|cN2PTICB(a%eepccJ z8b+gI%D8{mH~=t4zPEE>b_)Yw&*CE~{<7{Z1VG##B0e0P0MM3`7}omd+{nW%0kSP{ zsD+R-#6kLzq_jn!{uttK5CB|n#`j$44$v@|q{2o1i|92rI1V>b*d>+&z{6K1>n{~?+>P}WEatvj(&8hYyY0t4-pa7Xem7sV$iMM1xsEEdtm1eur?{WP zx?h57^1G$I{c;HrZRo9l2gZjUNS;O{@3`Z94D{yMOM(1@=|lzA_q%@0EfbO6vnoGk%)P4yKSY@ z-R;`u#tuxJHgXiXJBRj6g19dEsmS#Vy70E6bKF#;+dVThKV+6FblMw)xGe|l%FAgE z#oHh7KAgU!=P`ob0x2nm%r(2CNR#$uDdvfj`*S5NTWgE}*Pd)uqeStc#U$*xHR{F8 zDli#`sN5#UU-dR-X~UpRv;|dns4D$ua^XE%q0q2*}Uc1 zAD=<@FquTkjw%Lba%HU2K$J_{hRDoPJn}ntVcjAH)_c`CUNYaOeL#qWq9FYda`!oO zZ&w^}+!zrFPt1ex$uO?=Rz$x2u3e)QOxL+~8@3t0TV-L5`~DLk*wd6BS$jyO6+eM# z00i+rS7u6qo?KHq-ySJETPzwbrZ!T<|BVPSD8VbVt~K@{CA?F;)D{z+&`A%PHU(dQ z*`r_UsJ0vb8J5ndDzh~OkY)D@kUF%y3>5ELz>#l5+n*Uh$fe@kc~<1p^;Q+R0SaLh z5HqzOkG<_)0GbdRla*o%5D}yqET5u=6&S^Oxjy*}+VS}rv_dT3uwBAD^5ylG>S(=b zdij2zXq@u>LGY#PA^YfXQC);(OIE2@!!PkNyFT@DB5{6v=5iz5 zv*Y5dH*b%-`^u;xjk;IC>KD80Wma_?5tLS2viQX6-7k}qF5gv{j=OV}ZES~>MW>Yv z6|f!S-2q3Kd}f)DxcId)Ylx5so8cwETm8!@w@W}(qOE6@jyaLnG~(Rf3*?Dw3ku3d z5)JOfB12QStaEFYmr*~bH$qfMtpuxUq`(d;gbjW7WxHi5G+r>BW^OONgQR&g4> zw$Ki#8Tj?s6_?cvUzsPQpY3kU&wi*_gU@ExXZM^jDAsBdj<;?sPgZvnUSG(Pt?G5? 
z{VpUE);o@HDs>Kq-)Xr|G}9?J52)vHO;L4nF6OV$v>NMVwox+;oo#aKq>$OV`LUyV z{Hss*17o&aCEVH~)9tjh=P6!({FAyO{mNMb68o8Dp#hC1$3Yk)qEZzKRS$=!nBMR!6=$IMXc4M>xWUVY|K`h3}w)gi_^E*G@op5Yu$E}G7c{z&+ zOxGVY#b=qFoo|RUelg0cOIZ}z9UmvIbyF=9>l?2><`HIZLBBLfsVcsADtEa*U3!Ak zwF4$!1jP4}M%EKkO%kNv8U^ykMYLmrjCsRqukF3D-4D90psPV6ndAK%7D->3k%Eg{DH>@h9+uffWA003Sty*4Ja^2T>)`@np9=2Eu^!WCO z%};XDYV(Z_ouQD#$ZPE97Vf9e`P7~(Eju-q z# z%6Hq}Qm2wYc`w{AXf)eNFiIwXZGrX{Fm!ZBYnpoiQ`hehZ!*?5-E4C@T}cgX~)-thAJ2BNRE3g1*iZ50TVH7UdeV}?^RDA zU=(pb5`CSh=6yOBGV5Zclgm72;9VtnSM%)Mdw5E!+`VDrfrJ9HA|Vg^{Q=KBb1qlg z>EzS|^L?_|?NW`@+_qwjGxImbw8Hu3D$Ir0_xOxOYb!BDg^RggFWdG7T+-$DQVbcJ zV8{NbI3Y3G?z7J+UnGWV8~ODRcx`_@Qe?XM@N;@UKCF`k;-Y==Cd8fTZqNAm1okc{ z43h>0cLz5Ra3|oUp2{7-O!eMjfxry#ewZ|x8dXV8cV{h{=qq1)UTy0I$gt5W=`3_L z=8{_89d~|d?y8kzBkWnwx!=5MfeyWbA?*#nie^<1=HG`{|6H#fH%-Qa9!942JHSfI zPP%524;LmpET-D(T&x8577kR>mdVb?@t%C9Rk+&K;Nem-_B3w1x(YZbTsSnZ+SjXu zRmrXy^XHa-6`#VDE7{$!l$i`IEC`-t5wo&x>wJk^kVo1``<0czOgqhqZ@Ez8+}zL* zHZbGLtYv46%~+G$^OYffT`Df8LAT;{UmRB;JiDZFg_Bdy%!a%lX{C2HYO*Cw`aL&8|KLY7rZZ zWrD+bMLK4Ad3lhsb=?GvKGHs=B0~gklLy8XQ4tkY`PW>H4fHn)TMET8J~D0M*k+Xb z=qvC`Xt%=7Cvt1|n#&7jURgn+|A;Mr^kTWXc;K0H$dl@mUo-QSTfag~Oct#bTHEh0 z53jBDWChZ)b9dy+ex$A-lX^MU*7Vm#4%{=Pw9co;J^4!CtGss?&_HM-6!ookWlb2G zx+~!JYJacMR3%C9suzEuJ3u$XalgKn;Iy4IVr6qR2yI}gsa{7q$rN#k7?wvJOVK0= zN_N_%V-fE1TG*wm9|BDG5gHccF@V!!KlGl$S0Ia`L}}n>^1jgZ zygZ>T%XyBYoxfyQ7>mt?5Nm+oBz!jj7GKD0nsiF;Z&S4-;j{nxsrd2TRs}df$+~s3 z-UD3du80yFh!KS z-JjS9sdGBg8q}Iu-ld;(|Mlowz|f{*oJ4)E(a%R00mq~~jwRE}aXf3zN_v)IJ1_>d-+O_ z8qOC*4jxev(ILN6O7|9oMvM zm_M5pm)ld2Mx?@_*>ZZmNXRz1<9t;IH(6|dR~k{XlR+MA1^s!;^acCoUW7aLS2r1YP|cX`p38xCc>ZN1kIn5PIa72 z+wA6K;B1Cg3)e2 zSmTGzlXZpgQRCBRvoPm96C}%pK`#dOPSHJsbPMsCB8&#%Q!DgMlO!kgf<+FQR&D3&t_TfB~(f5X9fk7+U<$GeaSr^`Mbx|R-aNA7$ z^$i<`0ws+;Tt){W*Na#DN;|de#CChvPV-?b^WS+se+|aMDST=-AD>8jBw2ZU%K~U^GDgj?xDieJ|9!?5` z)2>O?c@Px9;_ZtOn{hre<1T1y6=igAp1F(%V~sqcA!Hlw4k~l->-jQyZ!$B1#j-r& zp!ul9997$RyC8p*+0AWDt87!($4YUp_R8uu9d5zwTvG%~LaKyXBasSc6kXd{h=b2o zI+~TheGKMJpK|XSNb;38eNQ1sG|fpI?lvI#-g8Bn4H>7<&u-pl6~|~H<61+2r;b>EMdAk&EgYI%Aj3r6Dx8@Ph z9IbPjpfP?rGaT=;BelrRB$_iEXpx{|-gH*}brjpH>clvG_(fBL&GCLn*8V_ytCfaA ztEiRZ9YHnDR_ox*{i0Zs!MGYb{^-af27&tLEu#shB^ePBrp zcvAp-2FmBY-iJqvta%Z4_orpH=W9wA6I$wJRa*K+%PFS3=e|KSc;PnoS*ONXO_iwv5V!mwV$SbA9KJ!6EtT>>;OIvo*SIbVM3Q@Q zpi?;@Dp&NO9hGfRxtxF~eO9&Q-m7V7ctq|<{EYUv$<3J?fm@rvXJyZ{yK>)~g%&SG znV*(yuI&cazogvcn7O$5F}6P=k2 zRxV{>)WRfNO}2}Z<7?LZX&D`oJ~o$;eQnEWXt35&D@Xqe2r)^e-@}U$uFDaVGoa3P zUhp|yQb2<|1vWI(YEl?cyK}s?=$h|po{VNJzCSx~bNy-Q3eT&mW%T~&k(_k7NymYQ z^RFE%^Pk)d&BD{Dhs$3)uUu-TVbS;T=VC)>?!dWm#)C7Hsdod6nrMpV_4Cif*qF8; z(KbE1sR>@ftjxB>PM=@Javm?ZF68tT+oUbpX<*V4o@RStCx($G+DXY2g~1pOE(nnNqO0i`{`bXZBf(O7_O~ z>qxkbpYxOU*^yQvDb+u3RvT?MBJ!&$O6c>)YGt>~n7*nX z#~=yU?U#BxW|4)u*#LF`$Kx!v=_!ZYT~)94hUMs%p2*1xr+w?QmSMbkt(UE$jw}=uEslY9$6q!31ospX{4BFMvkA~KoMBrAW9|Ehn%y2rMwaq>b5uhaKVqYQ zZ-e4nwHldap+VT=6wv}?_gMtOSu{_RttV^aYhWvP3kKgm3mwk-vNUoTYw`4=U()ds zt{1(E?`v7(8C5m*9vmhKdz@;hr{FOu_p>P*jBHJ+`@$KTBVW*6yh5zNmOWR4) z;fAo(nNhk%%HFY(`L;&Bu_et$K6aD9B^O|pxld&Ty3 z=&iYm$N8q?oDo@^xK2_0X^#g@N%i@XEVRKRI3 zDA#2gpN?O8tmRquLn{%8jTTvMJBY3DS=>D?l5)bLQk0sTvRFLL+vnQ3a(#oCYx_eh zyBl8OnClMr-{{xLZVcW-Wam^Mt)hN0bV5_1cNYj3IA0}8?Auec2jrYKDzn|KY>=aH zn=j~C0NrzGfzV*)JY34rP-w~JAD)=oR(9Q-Dw)Om2C(qaxsIDIOE!F{+_Yz^Schhp z_ESs;A()@@)z?aaH0DpsaM9x2D%*sQruE03Gw=*gO= z=fvejF3rryE@RYf4ISk3m$kk@MQ&H83@lddWa#^QPp#b?wc7+ecgc@80{MV9Q}YQ# zdT#wRSNft~-|QDSQXw}qS_2wgX2o7fl{Lc|$l^24pEa@anWb87RHFIdP>)G{#KG)- z0k}1qa{dODT<4fj*5QhWr@TE^=x6OjL%TJ)PrUSJYfIXU4O_wYw;9?6 zKf3*bwwYoJRCf)coCSrZ>wz0QicsWJN@%0B_>cV#efE@p1)Zk&BRZ__XiJn(Trf}y$M 
zI_v|gXb=XdQSAVXkx^xgV3JOfdpq1z{m7#MZyI-zz&XQ^kKoAQ ze1o1M)%}wo3D<=`*Q{!eDkhBYR^}*Sa=-sp@THa-+vqVo!NXky`Agr6aB9Glz4WFc zc!0hx3%1g2FZ(L`PY(iK#bzLI3{!+@j@Yw@D} zld&Vfo&sE%OR%|)!(68m+@MtYR$A0_{{9emz_cHW*Ui>sJGBPncs2W+lVmr67TDWE zfFW_UracINS)aQad&tdzV~|ZgFL`^pGDXaog{U0jb?r>ng>0G7imH_vy|+&DWlQyX zFI;lZssDP)O2|vvzTb1cEt}6Dya0ryN1;jRFa9eL4)#GIhT{Kl-z#K+C5KLg3|nX= zloI%WzXNko=i%3Miykn~9E}k@V8%l+nfZlN{9Iqm^UP5!x6*vRpSgT$Qgetpov7wH zJRjggdtPWo`h!scX5rdg{LZ|vQywh*a@lCy%ue`DHpF9(H|8{{Xh!%IQjZ-en=TBsjHi6l$+>KRHPAN z28ui+4uO{@j)9m-FY4us)okB?Q+FZF<j%Ecw0#Ak#NqT=Ffyv1bwcI6x@t6u2Rb$-^B}p+q zHd{ZWH1qOf6Y}yHiUYqS92EQ|Hk<|Cs{BtvGNRa`=P^9ykB7-)?1Ra{3OFfBzw?$L z#G_ZuAu)6;bJa($k_5>ysW3J5TXQ6TeCPd}k|VD|e9qLvq^lSJ)iJY>x)g8$|^LEc|bE*=G0K{Ke{`_6&9 z3vlRA^;gUq^jhFKyzbzgf1yNYH4W-HULayQ{XKFCv88}_{BC5D(bXP>s%fU)n&l=b zJKTdnBGQFIUb$5iLM3bH>7Dkz0i$dN~uZf2xj@NXF* zttQlpG0bl>d~0mbz3nJ|xL=}{Ez{h>zw_KKdE5_8v?Av1f>KJ?jsL9GCO*Ehc zIkbR~3$@r1{boM-gKn{zAbtks8U_HWB0wzp=yyg{9vM*c*YdP-Ky3ppFpIB6D($zh zdDjD;aP1LCJa9gjfI9U&HahxK5wiw35-J`o$!EYeAM%56-=e7hV3G!4sKZOL1D0m_ z!Tu~FMgLVCfFV&=k*mT0CZ`7#H^)Xr{4Mqpi2yuHWi4blx zY#9g0Np=JK`&`gM$NU>rGmtc1@o52m$Ce)2iB20ls zPf#|{e7BLgnc6F*K?VQUO0@xdBJ2#Sdjawm#0jU1;O+5Gc0Swk$BMEDr+V^Rp}}nc z6@-J~xa1F$x+ija(p)%>wCqZOwC6vJC4XQAJ9`qKz6R3dMCmkNV%jTR=%4@Q6a539 zxCO`k!E6AZaP46VPJiHX0wbLGgMfWuqx~uG%sf|Zu+WVA^f%u_0lx1fB%!|f#)-^n zIEKhmH1Yu{=h1JNc)&{23fc;9usW7nzILqZIKI9ji^bd@5$JxvDSH^n3nX==;ZGem zTSFLQ3t!Ltk;N;(G>?T5IRmQ6MlMh0%q%8t@Y4nt?jOYDm_Q~M8_rJ&5V}Lm?y=dIEz3l}UP?{P z7*M@OXv6!H(STF1b~6K13((_^hf0APONl3dTr47s*4VRYw64PaMoEVOEL<_6l|lh- z(vsvmgB>Q42}{-JzfsET0i*CdplQHpPE#*A|I0=r9t6dIrRqNx1aR6H^EEcB49bk* zEPrIq9stoFZwpk_VA{i5J@7G(2e;$T;tPO1?yOS)JEVL*ipls-du^sTYM3eqYW(#N zAHfP}QQO;@Iyg+)8kTdOj*gB5oJ8+@L5PZ27i*&>8&Pbj@#%N&YI%JZ0M&Fj@Jt%z ztV|l5=Vr6@E(4zN%Bg0rznLK8X0igcSU|zoGs?j5G?x)NxN}YHk@r6Lln}W*CM~_*Ar=tQexvgZ z(v2pUIJ$5M|5JYU%@=TVs4E%1LH0zZ_rjCxWrv){ogCLN-;5jWC0>@M3?ox@Hs5RY6Axqu4O9LU+@zl(_K?d@$pX|3sxDhJF9`9s0hy@_O(sw_%O0_ATL0El0>5j-K+@UNgOoT7rA z$n`PcnGZ83_;>A=BNs{PzXd)r@Sd^EVlt(YG3CHJs<#>{j+pd_x!}KbKIC&D3Jca;4dGxUE_pc0144Y=*FW88WO^$8vdhrtqy6UVI71&AQ!u#) zdaEe4q3&DE_a2JND^z?s)O1>RH{C4gGZbJ@0+vnO# zI>Fz%1{e{5DLvc@BR*hPV9u893d3JSy}p1Sfe)es(L?%6!6Ng3cxssAwc_Lfj+f?@IzI07SDICQSAHlN*gz;z)pzdw{G< zW7$wlOyFFZyIh5(6>@~SC}|=4%{y6ICXWI#x|Lzzk?*k zgve-K7iaJ8?A7a=mpM*y$>6w=20!(-!>6ganGcp@REjhJ%82MI@zU!vUjs}{kJ)iJ zblbQ#;%Z&%->r)acG^n#icJA2{Z#TT$esUMplqIrj4KD7DY3vzcA+wg&&bAjG>y&5 z{mG~J%)~yRLgmPrMOsFb-RR-6;~ygX_5FiG42G(y2Ws7Ui^a9)e21w=TOi}aR){HB ziTuc({@yjShdT=oc-6H%KZtiiw1xCTx(bmnf&mxEWlDZYcmmMX8K|#NN)J(P5X-+A z)M~PRdYsKP+x4SzU5UyUr8UDn*a>_@=R&6WCEVY)_!PYTA0VG9uNM{T$#la@3rN(OURJ6p+;psMs`6 zan*y-Rgm$CwFk8`Qie(@PS6WNR;GPkv0ZRnsvDE&RnIp+$8MOV-e~Z4=}JY0DE>vZ zmHveJ{7ah@pBq5F0e5?x5h4AjeoO<|@FC1{dr6d}$c7w6=6N^%^5V{T)bo@ETJg?n za*01X>LdSnh^ltS4RhO$!y?03mPO3*rmpJq5Yckek_6L~&80(sWSlOSi<6EG-)x$M z_+U?m?E1p?5K&nV2uIJr{T}V{>QJA|+x*Tf$K!E!t9<24DRB>@_`aE$6V(c4lWF!4 zihT0)TTPBmVK@4t{zLG_=Ekn?~sy+zi^=qqt^|GI5fGTh6gFo8@NMggZB6 zriwGs!NN3@^yjdHc^SJ%Y|8BR)$Bs5so3mwexv&IWsylu!j^80zw=@xY*MIpX~b^U zRQF!bs>-yH%k+D^rGQ}*v+lBE{iJqT?$&UnAD@1r;{XOx0Xl^$?Q%6OZLMa*ZiUw` zt6Hc&Ck0Wf=p#j1uVa{4LS|!buH)UV5;u>>WQY^m=%#1l!Mh5VUu}FFr$M@j1WmS1 zsa2I}mQ^~I{f83+5K9%Z#P@uQ9PTkLMmLii(u`?T>_*#D#bvJV%k0A@p!Wka@o-apD*%+&%Ezh0Slpgu9LkGde#_H0qo= z^}%z$k5YA#8J!#Ol;hT{`pP#x_<5s|;|Uhcw8i7Er=RaVm%onRo?oQ$$Rb~#XEU+O zeYxxXRR{Ae%=%8c(9ET3cUiZ&y<9nc!<>3@A}WVjU7i`cFO;F$*}BAho%AdIWOf*V zKTg}N@-P*3r|I(Y_3;VqasO;o6Wi6m;p~-VLVN>v9xLta*2XD(CbLqa@TTTcnfUC+ zMz2sEwUwL0nAY)(Z*ho@*~`0S{QVoYGoFiD4{ix9rOyID+-vb}TNxctK{m5Yz*-cA zu_^={t+?X3vxFan}!%A%x_M 
zYJRZuu9?pBI`Y*=c=p2>`;g1d9q0XQZv!vKalPU@*J<)$Y$t@dOk?xowTMK-y`B^D z*e184mo*p6y6?3q?@gN?d)7gV_ZEUt?`xE=&AJv;pYj~<1g3K|oDev-fUR#|I1gG< zqYt|3m%2>cc}^H5xa?0LaHHSIip|@1z;xq??szB)ycuxIFj&V|J zk{Nfbf9XDUv|;VhcNc1 zG%)T2zPk18`DK5V*sxFfo@OAwh(o%@WZun$C4I1*-}|u$??h<9jr^uZ?b7R6^$9vT zhS8kX7f(H*ol{tG4l?N!VRxqQISx9XB2=|2#O0(9u+n)@hqp= z`ms`zj+tz^l^)?hZ#z;!Rn|F8jgO;|l#$(Ifqr58_$-!Xn^jMt{uhbGIPvwO>y9dc zk`O?YQ`O#z`CyH6fxj+?L3)ya5O5Znd)Oqy})ooR)AVkUUTjf2YBOVxnsuv;(o1orvEh^0x0f9N@3pF?1`nu< zWk;=APqLJ)A#F6?wVBV@^{cRex|cAFm`wLME`&Ju^R}#Ko4Wi~Wf>&gD48u)Up^C? zs->$_mU+T9#56PawbE}~e$}{d{E`{?hOFAqr;~_h!{TtBTaPWBJZT1vxw#0=EgcG9 z;cU$^U%1v3p~SB+q~VEe*dtVuzWAk1tT&ImUO7I6O6S4jFRMu$A0{pEm9@u60LYHk z&|HdxwF&0Ar;k^J<%|>i78|s@VyS0HuT8AJB@7W)n9jfu=oetQVNIxW_eY0`1RlCK z+=kl}WkW*mZ?fw>Zb=L4pg8+gki`Ldv^4SU!ReBbq^42VioMAoI{lHV(d`sQgRv)9 zGR3toFVVi_xP3k(Zq0bE8Nwb5{}6|0u*G_P8+}G&$24^s8fM;55u@4cBJggeYGhZ) znL1#1Rb9G3^XHcgX5}qZoY<5~yJ7yxjd8c+m=vNQvDa&>?j^gT`R<-kXkkS93ZQd8 z0cS2EXEm2p(KZ{{Q?E)#59F{Lt;Vja4P2NHi4lGi`@r0l&H#~xHf2P{_=wyOTb%7c zfy9On@iq2S5F#J;e3k4mZ5&noBmYw+kzd{=MR5x%Mba7s!WLb#5unTR>)mgSun&90 zcf8m#p{%85L44;OJr$yzmXA+31FW!w@@%sl7wf#*ExeAhqMkK-EoSCVcZGhI{J^V} z^bz{p?QGNqP9Y~Zs(8MxuC9N!7yq7IAYlVP>}sjIkbkPw`E0?ajtCc-r7ADKz+8H2 z-dvn$^yoyFI|x^0QC*>|;#n8-UTbEy+nwoZaBEqzWWPD0xUIB8j8xoODM^}d&`k5V zbUGZu?k`nqBAyI>civUap^&~sZiUlam=>UWcDLFS!O@l$v)7ii*VmcGnarcz8t07u zk90s==N9b*nn+e(s|i1}1$g6=UJ3CJp@y{WocG*+dfQ}gJ#BnP48zFX9EsP$2}oIB z!J1qfG$+ENam_t0Y?y4&`!#9w#9GQus7rU|1AEXp?3`)$0^9agTp&D>_YH90SW9&o zA-2DK%&)C~^l_J%p|j08VSRlZhg+JkyFfPeX8RT{dhZV67sMa4ttS@$X@#KCXMpfYJkRQQOy%*($e~C5^z0lL zq2Khcgyn_Jy`;+A1yY#(qvL(M>t7m_R#iJq-}zdRIs$fIUqk%gc5a-7lr6Y3ZqUWk zFRXh<*qeDILE_eGAC3$)TXwhREaZA|=Ukc+JYA`EySEOjD~Le9w=asQ4jbIl-Sox3 zFL}MVf5uRx(B!46@!h?F1xa*k(M6txMT}lu6H?BshP@`Myj^R_X=a+*^TW?C=vs>< z-Ku=?aDG=5Z`}ka5V3IFCVmXuwRp>0t&O_J$xNtIi&&=*7t;}1_O_!1J;rFDHdSL~ z>RXn_aOWXh`_u7mH!(r1!-Ex)0v}rG5R~q25CJ75q#L9gq#H#- zx;sA7-O}~U2k?7;_dd`42cETXt&i)RGiT49nSJKWy!QK$6JW996Fux-d{mr))_ldk zG)d_cyAm={u2NDmskyaXm0r-~!E5DqNacRaC$y(kKYf`n=^Wv-CHBZOVtmVGq~ycN z*Zc)dubKK@^@f(Cw|x4UaZwl897AseCrRU&Yc78zOD#BEEEVl|7I_};A8Lo-xeQa_ ztDYKG2I`kJ1vBhbHE2Pob-0x6>uVt36Wr#<*u<>tk}l8Rw4f&DI_@Sm8a6a z{boboIhL?uZz`z$kY!CyrtmA}i0(FTFQ7JxrbJvMkuYS{&s)CY*QG7JPY8lGxO>KM z)Ygt1l0VoZL?1)P?HJ`96nomR>D0FqkH>+WtKP(Z9#i%{-DKPDT4ib7ZVTalRSRFi zZ+>oeDX~aqspY{n`KICHXH=Q9&BIKv4Nn%+IIRnB#}OIHSYz$Lq2#1R%e`sNwkZ=j ztktBV%dLsNo?pbH@IF~|km2u%E~k}J*I{)R>C-iXYczBc;+8_5C{UcG$&58Sj+bFY z;x$GsZ(?(lhp*B|y{o>sAa3$2`*vXC1N8*>)s`2ONfHG{=&iq zHuhM&mMFQ}+N5|ft=n**>(Y$gZU?MdHOaIYj#Vn_OiU@K)TC6J@)H zF!_dwP!ByhMu-+|C^N+?$_~MVY_f9WriJ9B9hc%Ov+K$EWYRbH3I74?FefooK;ths^HNmHK9|=vwTdz;W z#CgqrR+mq6*8k844shVhp;v5v-(B+n5>t^-o3?!0U%+(2*G>0Q&f~DgmMrhUw%dJWK{6<; zqfBIwL*!hR?b6?PCRtcLXh{i1L~}}!w5ZrWPZn%iHV0Q99mco<0@GArye<2LfgZwc zkv)%j^#ISUQyFS!UHmX$QoFKbtfN*tibPJ(J(-Y;S|U~i^MEV-df|fUu)iPlg;aT8Zze39`{CA+|H9v>GpvB3`=3aA?$a z2nixqOKh9Gt0el4Vt>X!|Caq(J3M}98V)I#tm!Rf?wreWI18D=A%JZvmD{ZPy%AqgS_lRzf>iMLp@mcyk`^>Bd1#uP1xl&7KX>-HTKdgOfF#HW6p%+Z4$;Q zXvGfdE#7_Fa%QCB)^9J!voAanzD(;jE8Am7FbCLvcb*#;bdZbMv>L6VVqg!H+0`WQ z>(KsX7P)}rMWl1W%&-5asY&o0`AlWEicMDjY))OX=9jBg#l1hS->Q0C z?FzZZ*EC*Jv+}3eMQhh9L-CC?+^vm&ziIP`HqUqP!Q|hfzl~W=x3x>xpD1q(A5{fpp^W1w@F5i`b|Wl z*-=8L^1EO(bG4A&+3eoQ%QO=2`%+dre}XBWt!Qkd;o(uPT|W@aU3>6kf((+jMhZ!* zvBrk0fvtIS%sn1tDrZ(ytU#}ZHb@RtHM;CCO9Zuz7|i7=J9w}lVMM78QJPJ@H?nPY z-jv5r&~_cQmzXh{+mczpjwL51w4&(0z^2wq^Ls#Hy>NtA{-QZ7FD=$A{PEi<8UaaG2vAv z`poCoNe3D|olsgT*8knN#;&=5MJD?EHtl=TD<2qn4g}?y^~lbRAgyxZvTmp0zq?y6!N4 zkMsSYIG*8qYHzC7@Gq~>qo|^+U?KrKsXyr@_%Y|qrjeWeo?AT(=T@tX0n?_B_`g5V 
zjAS0*sImO4ghAzZ#<)>e+3G<4Al<=(w{jml@34-+QL8NAbXcx^6Q_30NMwhbvx+I` zg50ZCVq;IMsqaxmi}T6V!y~?3uJclH6e`mvku;*QVu~z)_jKVzC zVvS{eADmZqk@!W$bPA;I<^9Cf8H|gPaq?=~RE4eN^mL1J{EXv;7I)z-j|v?zuQQ*? zS{3Y^zKyPFDas?lquFJo10dx~;=SM82c?9UyPD!%ZOgqru6W-M6V@-zR&*00lCSEc z%_8d{o1(gCOOomqsnegKUk)J#T+0?-H+xN>_6|G5-F4Sz_7g> zTtwu$wh=9xs2wkvo-TL+prBAUrHtD=Z%z zb){qt{={$F(qGz@fqgF%R z^=_t3zZKK+tUPz&whm|Z5`4QSDm|KAR`H^)`;K@b=v2qU3%8Rt9Q}isJC=8JKqq#Z zdR}BF%A)1^=^}jqXR|)}*xh$Z$h`n}8P&zuyb!7y{KNQrOU_PYa(=teS*O-z8;)VJ zwt)`QqboCAM)BZzQb==)DiQhrG2)-1L_D^9qJ>vM^R~8G3@KN%W~bqu=_Tpw6)PD! z1y{A}Q|w^;I3}FSw}P=(7KNz!P0Xz{kiXG3AGD2xxB2yyH$U)pZxsY3@Lr{iSFP)g z_nOmoWim@eS!t)_3NTI9}|Z1i{BG$JJ()+FlIg&xYGVl5^f2cN9=oXbsp~yq1)2R%FvK zQTkZ&LQcY9P{l@E8T)m@aTa6IoAz3X$jEIB$M~&?P5UXm2+eq{_Sy+KlXrZql+O+Z zL9!$bU!fzb3>C!QgK>(Kv6%ulrnXn|avN*CPRl}a#&v3d*Q-9h=_3AgyX*kLwTb-d zpyQ~xGJSk(Vqml6)GK1D!c>1*^BL<^eiRcK`~C{o!yYdQL!Cys}e+r)a1$16{KhY6rk$ad>@NRB> z1u9)&oTt_bIbjHC7TY7`+8+CL;C39tG17=#$ah)O;k_oKti5v$AOUd3)mjxjhrJ_G;tuJojQ7D`hXHy=?^V;T|-_Tvu5fYWJ!*vPgGZr$)FFixoqe zpeNZeazzK)tphOY!QMA~k!j4&ios)_BhbAojOmT9M%F+r0_ukLhI=Un-Wz^s5l_a~ z*t6Kiow1#%U&g!}C7>ILsr`RnGR9wl?u57c)n5ZNL z1O;td$R;lFzE-!^x|`(9)k4UTHSpi-({Kw-cR=3G;=k7CY4%~*s-8sfG0wN0K09G z!ER>U!4_*0ax8qnJmS%&_FOB^?-V}9W--QBfx#wb*A~@scITr_!KV#a2yyh!adS4U z5cYLr%!bK{dpWGeTXE9oDHiie&H6YcME(SJ1q7D)OKYw~CXaFr%m>9c^L8#OjJG_lt#1T9lkG{*4R^SKYXYH`|LUI-hA1cDS?(sf(d|b$wW7sYVj% zLAGW?tM9X##Y_z92&L6y{Z&aDyCgNAT~B9o3|;0CJkidVWcw*#HZ;seken*A)-`H8 z`kUbGJB5((QlE*ID~U|t8+Jm8|1GqjOX*sD;Bd%uzwoLclx)~~gCLUxwPhZuSx+GD z!HM?Pm#>|uMsA%f?|l2_L}@YOgVjTWzN;firr#=L#scSB#;$9-@Q)93n+!+e^;%mV zCwwMT=AFbU=lDokRr4vdNb|#U$V`Y?gL^p6P=AOlRm}#xb7M;KDRvIR z_WT<-GjElT6olb49<2YCY^Rh?HOUd;G5Wy8ra4c+_m35+{*NBHF&Xd-bu)Y4!34;H zY~kn3C@T}e6tzva-2ct&4?nWYn%Jgh_7zE1+@7u(Tp8o=w*&%C<9}_|-@te<9MBsM z-5D%lvbmql3nTSTH_A-VOW@}?KO+XjM)e+zt0Ig6bU09S{s(B3!tfP~v<6! 
[GIT binary patch: base85-encoded binary data omitted]
zlF|qVFufc#-`9M9+bRJ~yz(rAxq+&kbvuPT@IirII{lN0&=qKYU0DS0ZP6ogX(#h zpXk2~yt{Sl&I2h?VP#7-V96CN!G(~Uann!($;CtTu_aT5z9Qq6>dk5SDyk|C`P~d3 zl?KQobt_l|>nSRtDIv5vJl(T}K~cr0SJ=cx4&21|Tl_Qh66SFc2Qozwv+*KYVq(KX z{QN+X+c4t1{&a!<={5mX@1l{fRe{-ttC7dKufoZ!7vK02BRbbOfjp z3zk=9Gh3!VGGL+Bc>Q>15)n)QpTg+m`lZIG0Tx@X>)sHU0)f=oeT*89{fR<3mHk1Tzw@8ph(; z6#q^d0zQj(M|q~V=z>yoltR}p>>1!=kwOoBgDbvxWijiIMJFelJ5-7OHO^;NlOZpX)DsfVRG3exDn%bkD(w2OTPct z8u*)c8!&k95j)+R@&9&O7O+R5{)qG6H4%aHB8d}rVk z3B~^o7~U;#3sxM4|E>vP0q&)9tV#6WUHsp{{hzNn?_alr+nO>Lu^AU{fqznBa-wC= Hb$tFGzvD?u literal 0 HcmV?d00001 diff --git a/examples/quantization_w4a16/llama3_example.py b/examples/quantization_w4a16/llama3_example.py index 6bd41c58e..b3c93c1be 100644 --- a/examples/quantization_w4a16/llama3_example.py +++ b/examples/quantization_w4a16/llama3_example.py @@ -6,7 +6,7 @@ from llmcompressor.utils.dev import dispatch_for_generation # Select model and load it. -model_id = "meta-llama/Meta-Llama-3-8B-Instruct" +model_id = "meta-llama/Llama-3.3-70B-Instruct" model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype="auto") tokenizer = AutoTokenizer.from_pretrained(model_id) From b336fa2c0e5fcb2664798af5389c2f757f05b3c6 Mon Sep 17 00:00:00 2001 From: Kyle Sayers Date: Sun, 15 Jun 2025 23:33:27 -0400 Subject: [PATCH 23/36] deprecate sequential_targets on modifiers Signed-off-by: Kyle Sayers --- src/llmcompressor/args/dataset_arguments.py | 10 +++ src/llmcompressor/pipelines/registry.py | 8 ++- .../pipelines/sequential/helpers.py | 66 ++++++++++++------- .../pipelines/sequential/pipeline.py | 4 +- 4 files changed, 60 insertions(+), 28 deletions(-) diff --git a/src/llmcompressor/args/dataset_arguments.py b/src/llmcompressor/args/dataset_arguments.py index f60c9560e..949933f97 100644 --- a/src/llmcompressor/args/dataset_arguments.py +++ b/src/llmcompressor/args/dataset_arguments.py @@ -186,3 +186,13 @@ class DatasetArguments(CustomDatasetArguments): "{module}.{method_name} or {function_name}" }, ) + sequential_targets: Optional[List[str]] = field( + default=None, + metadata={ + "help": "List of layer targets for the sequential pipeline. " + "This is typically a single DecoderLayer. 
" + "Not specifying this argument will cause the sequential pipeline to " + "default to using the `no_split_params` specified by the HF model " + "definition" + }, + ) diff --git a/src/llmcompressor/pipelines/registry.py b/src/llmcompressor/pipelines/registry.py index 2ac384866..cc4c29d8c 100644 --- a/src/llmcompressor/pipelines/registry.py +++ b/src/llmcompressor/pipelines/registry.py @@ -17,8 +17,12 @@ __all__ = ["CalibrationPipeline"] -SEQUENTIAL_MODIFIERS = (AWQModifier, GPTQModifier, SparsityModifierBase) -CALIBRATION_MODIFIERS = (SmoothQuantModifier, *SEQUENTIAL_MODIFIERS) +CALIBRATION_MODIFIERS = ( + SmoothQuantModifier, + AWQModifier, + GPTQModifier, + SparsityModifierBase, +) class CalibrationPipeline(ABC, RegistryMixin): diff --git a/src/llmcompressor/pipelines/sequential/helpers.py b/src/llmcompressor/pipelines/sequential/helpers.py index b7937a2fc..17e6724db 100644 --- a/src/llmcompressor/pipelines/sequential/helpers.py +++ b/src/llmcompressor/pipelines/sequential/helpers.py @@ -2,7 +2,7 @@ import inspect from collections import deque from dataclasses import dataclass -from typing import Any, Dict, List, Optional, Set +from typing import TYPE_CHECKING, Any, Dict, List, Optional, Set import torch from compressed_tensors import has_offloaded_params @@ -23,7 +23,10 @@ from .ast_helpers import autowrap_forwards -__all__ = ["trace_subgraphs", "Subgraph", "get_targets_from_modifiers"] +if TYPE_CHECKING: + from llmcompressor.args.dataset_arguments import DatasetArguments + +__all__ = ["trace_subgraphs", "Subgraph", "get_sequential_targets"] @dataclass @@ -416,44 +419,59 @@ def match_modules(model: Module, target_names: List[str]) -> Set[Module]: ) -def get_targets_from_modifiers( - modifiers: List[Modifier], model: PreTrainedModel +def get_sequential_targets( + modifiers: List[Modifier], model: PreTrainedModel, args: "DatasetArguments" ) -> List[str]: """ - Infer sequential targets from modifiers list + Infer sequential targets from modifiers list and dataset args :param model: model being calibrated :param modifiers: list of modifiers being applied during calibration + :param dataset_args: dataset arguments passed by user :return: list of sequential targets """ - # avoid circular import - from llmcompressor.pipelines.registry import SEQUENTIAL_MODIFIERS - - sequential_modifiers = [ - modifier for modifier in modifiers if isinstance(modifier, SEQUENTIAL_MODIFIERS) + modifier_targets = [ + (modifier, modifier.sequential_targets) + for modifier in modifiers + if getattr(modifier, "sequential_targets", None) is not None ] - if len(sequential_modifiers) >= 2: - types = [type(modifier) for modifier in sequential_modifiers] + # deprecation warning + if len(modifier_targets) > 1: logger.warning( + "Passing sequential targets through modifiers is deprecated, " + "please use `oneshot(sequential_targets=...)`" + ) + + # cannot infer from multiple modifiers + if len(modifier_targets) >= 2: + types = [type(modifier) for modifier, _ in modifier_targets] + raise ValueError( "Cannot infer sequential targets from multiple sequential modifiers " - f"({types}). 
Defaulting to {types[0]}" + f"({types})" ) - elif len(sequential_modifiers) <= 0: - types = [type(modifier) for modifier in modifiers] - raise ValueError(f"Cannot infer sequential targets from list of {types}") - modifier = sequential_modifiers[0] + # resolve single modifier + if len(modifier_targets) == 1: + if args.sequential_targets is not None: + raise ValueError( + f"Got sequential targets from both {type(modifier_targets[0][0])} " + "and dataset arguments `sequential_targets`" + ) + + sequential_targets = modifier_targets[0][1] - # infer sequential targets - if modifier.sequential_targets is None: - sequential_targets = get_no_split_params(model) - elif isinstance(modifier.sequential_targets, str): - sequential_targets = [modifier.sequential_targets] + # if no modifiers, use data args else: - sequential_targets = modifier.sequential_targets + sequential_targets = args.sequential_targets # may be `None` - return sequential_targets + # validate and infer + if sequential_targets is None: + return get_no_split_params(model) + elif isinstance(sequential_targets, str): + return [sequential_targets] + else: + return sequential_targets def add_line_numbers(text: str) -> str: diff --git a/src/llmcompressor/pipelines/sequential/pipeline.py b/src/llmcompressor/pipelines/sequential/pipeline.py index 3e0490b70..628fdf4d2 100644 --- a/src/llmcompressor/pipelines/sequential/pipeline.py +++ b/src/llmcompressor/pipelines/sequential/pipeline.py @@ -11,7 +11,7 @@ from llmcompressor.pipelines.cache import IntermediatesCache from llmcompressor.pipelines.registry import CalibrationPipeline from llmcompressor.pipelines.sequential.helpers import ( - get_targets_from_modifiers, + get_sequential_targets, trace_subgraphs, ) from llmcompressor.utils.helpers import DisableQuantization, calibration_forward_context @@ -64,7 +64,7 @@ def __call__( # prepare to trace subgraphs modifiers = session.get_modifiers() - sequential_targets = get_targets_from_modifiers(modifiers, model) + sequential_targets = get_sequential_targets(modifiers, model, dataset_args) ignore = dataset_args.tracing_ignore # trace subgraphs From 34ef39418fa97d3679a54eebeaaf4f74f66d9745 Mon Sep 17 00:00:00 2001 From: Kyle Sayers Date: Sun, 15 Jun 2025 23:38:54 -0400 Subject: [PATCH 24/36] update examples Signed-off-by: Kyle Sayers --- examples/multimodal_vision/idefics3_example.py | 2 +- examples/multimodal_vision/llava_example.py | 2 +- examples/multimodal_vision/mistral3_example.py | 2 +- examples/multimodal_vision/mllama_example.py | 2 +- examples/multimodal_vision/phi3_vision_example.py | 2 +- examples/multimodal_vision/pixtral_example.py | 2 +- examples/multimodal_vision/qwen2_vl_example.py | 2 +- examples/multimodal_vision/qwen_2_5_vl_example.py | 2 +- 8 files changed, 8 insertions(+), 8 deletions(-) diff --git a/examples/multimodal_vision/idefics3_example.py b/examples/multimodal_vision/idefics3_example.py index 27f230569..09722c127 100644 --- a/examples/multimodal_vision/idefics3_example.py +++ b/examples/multimodal_vision/idefics3_example.py @@ -31,7 +31,6 @@ def data_collator(batch): GPTQModifier( targets="Linear", scheme="W4A16", - sequential_targets=["LlamaDecoderLayer"], ignore=["re:.*lm_head", "re:model.vision_model.*", "re:model.connector.*"], ), ] @@ -91,6 +90,7 @@ def tokenize(sample): num_calibration_samples=NUM_CALIBRATION_SAMPLES, trust_remote_code_model=True, data_collator=data_collator, + sequential_targets=["LlamaDecoderLayer"], ) # Confirm generations of the quantized model look sane. 
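The remaining example updates in this patch apply the same change as the idefics3 hunk above: `sequential_targets` comes off the `GPTQModifier` and is passed to `oneshot(...)`, where it lands in the new `DatasetArguments.sequential_targets` field. Applied, the idefics3 call reads roughly as follows; the imports and the arguments ahead of `num_calibration_samples` are assumed from the standard example layout rather than visible in the hunk.

from llmcompressor import oneshot
from llmcompressor.modifiers.quantization import GPTQModifier

# sequential_targets is no longer set on the modifier
recipe = [
    GPTQModifier(
        targets="Linear",
        scheme="W4A16",
        ignore=["re:.*lm_head", "re:model.vision_model.*", "re:model.connector.*"],
    ),
]

# model, ds, data_collator, MAX_SEQUENCE_LENGTH, and NUM_CALIBRATION_SAMPLES are
# prepared exactly as in the example script above
oneshot(
    model=model,
    dataset=ds,
    recipe=recipe,
    max_seq_length=MAX_SEQUENCE_LENGTH,
    num_calibration_samples=NUM_CALIBRATION_SAMPLES,
    trust_remote_code_model=True,
    data_collator=data_collator,
    sequential_targets=["LlamaDecoderLayer"],  # now a oneshot()/dataset argument
)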
diff --git a/examples/multimodal_vision/llava_example.py b/examples/multimodal_vision/llava_example.py index 4b9d1cf9e..984e8a1fd 100644 --- a/examples/multimodal_vision/llava_example.py +++ b/examples/multimodal_vision/llava_example.py @@ -30,7 +30,6 @@ def data_collator(batch): GPTQModifier( targets="Linear", scheme="W4A16", - sequential_targets=["LlamaDecoderLayer"], ignore=["re:.*lm_head", "re:vision_tower.*", "re:multi_modal_projector.*"], ), ] @@ -46,6 +45,7 @@ def data_collator(batch): num_calibration_samples=NUM_CALIBRATION_SAMPLES, trust_remote_code_model=True, data_collator=data_collator, + sequential_targets=["LlamaDecoderLayer"], ) # Confirm generations of the quantized model look sane. diff --git a/examples/multimodal_vision/mistral3_example.py b/examples/multimodal_vision/mistral3_example.py index 251fe4297..fc3657b0e 100644 --- a/examples/multimodal_vision/mistral3_example.py +++ b/examples/multimodal_vision/mistral3_example.py @@ -43,7 +43,6 @@ def data_collator(batch): GPTQModifier( targets="Linear", scheme="W4A16", - sequential_targets=["MistralDecoderLayer"], ignore=["re:.*lm_head", "re:vision_tower.*", "re:multi_modal_projector.*"], ), ] @@ -59,6 +58,7 @@ def data_collator(batch): num_calibration_samples=NUM_CALIBRATION_SAMPLES, trust_remote_code_model=True, data_collator=data_collator, + sequential_targets=["MistralDecoderLayer"], ) # Confirm generations of the quantized model look sane. diff --git a/examples/multimodal_vision/mllama_example.py b/examples/multimodal_vision/mllama_example.py index 3c5236d1c..7d94a677c 100644 --- a/examples/multimodal_vision/mllama_example.py +++ b/examples/multimodal_vision/mllama_example.py @@ -30,7 +30,6 @@ def data_collator(batch): GPTQModifier( targets="Linear", scheme="W4A16", - sequential_targets=["MllamaSelfAttentionDecoderLayer"], ignore=["re:.*lm_head", "re:multi_modal_projector.*", "re:vision_model.*"], ), ] @@ -46,6 +45,7 @@ def data_collator(batch): num_calibration_samples=NUM_CALIBRATION_SAMPLES, trust_remote_code_model=True, data_collator=data_collator, + sequential_targets=["MllamaSelfAttentionDecoderLayer"], ) # Confirm generations of the quantized model look sane. diff --git a/examples/multimodal_vision/phi3_vision_example.py b/examples/multimodal_vision/phi3_vision_example.py index fd274ea12..324df5d31 100644 --- a/examples/multimodal_vision/phi3_vision_example.py +++ b/examples/multimodal_vision/phi3_vision_example.py @@ -75,7 +75,6 @@ def data_collator(batch): recipe = GPTQModifier( targets="Linear", scheme="W4A16", - sequential_targets=["Phi3DecoderLayer"], ignore=["lm_head", "re:model.vision_embed_tokens.*"], ) @@ -88,6 +87,7 @@ def data_collator(batch): num_calibration_samples=NUM_CALIBRATION_SAMPLES, trust_remote_code_model=True, data_collator=data_collator, + sequential_targets=["Phi3DecoderLayer"], ) # Confirm generations of the quantized model look sane. 
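When `sequential_targets` is omitted from both the modifiers and the oneshot call, `get_sequential_targets` above falls back to `get_no_split_params(model)`, i.e. the no-split modules declared by the HF model definition. A rough illustration of that default for a text-only model (the direct attribute read is a sketch of where the fallback comes from, not the helper's exact implementation):

from transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Meta-Llama-3-8B-Instruct", torch_dtype="auto"
)

# HF model definitions declare which submodules must not be split across
# devices; for Llama this is the decoder layer, so text-only examples can
# omit sequential_targets entirely.
print(model._no_split_modules)  # ['LlamaDecoderLayer']

The multimodal examples in this patch keep an explicit target because their model definitions typically list additional no-split modules (vision blocks, cross-attention layers) that should not be used as the sequential unit.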
diff --git a/examples/multimodal_vision/pixtral_example.py b/examples/multimodal_vision/pixtral_example.py index 6414cca0e..b2b4c7440 100644 --- a/examples/multimodal_vision/pixtral_example.py +++ b/examples/multimodal_vision/pixtral_example.py @@ -36,7 +36,6 @@ def data_collator(batch): GPTQModifier( targets="Linear", scheme="W4A16", - sequential_targets=["MistralDecoderLayer"], ignore=["re:.*lm_head", "re:vision_tower.*", "re:multi_modal_projector.*"], ), ] @@ -52,6 +51,7 @@ def data_collator(batch): num_calibration_samples=NUM_CALIBRATION_SAMPLES, trust_remote_code_model=True, data_collator=data_collator, + sequential_targets=["MistralDecoderLayer"], ) # Confirm generations of the quantized model look sane. diff --git a/examples/multimodal_vision/qwen2_vl_example.py b/examples/multimodal_vision/qwen2_vl_example.py index 94c97398d..14033872d 100644 --- a/examples/multimodal_vision/qwen2_vl_example.py +++ b/examples/multimodal_vision/qwen2_vl_example.py @@ -79,7 +79,6 @@ def data_collator(batch): GPTQModifier( targets="Linear", scheme="W4A16", - sequential_targets=["Qwen2VLDecoderLayer"], ignore=["lm_head", "re:visual.*", "re:model.visual.*"], ), ] @@ -94,6 +93,7 @@ def data_collator(batch): num_calibration_samples=NUM_CALIBRATION_SAMPLES, trust_remote_code_model=True, data_collator=data_collator, + sequential_targets=["Qwen2VLDecoderLayer"], ) # Confirm generations of the quantized model look sane. diff --git a/examples/multimodal_vision/qwen_2_5_vl_example.py b/examples/multimodal_vision/qwen_2_5_vl_example.py index 7923d317d..096596d24 100644 --- a/examples/multimodal_vision/qwen_2_5_vl_example.py +++ b/examples/multimodal_vision/qwen_2_5_vl_example.py @@ -73,7 +73,6 @@ def data_collator(batch): GPTQModifier( targets="Linear", scheme="W4A16", - sequential_targets=["Qwen2_5_VLDecoderLayer"], ignore=["lm_head", "re:visual.*", "re:model.visual.*"], ), ] @@ -88,6 +87,7 @@ def data_collator(batch): num_calibration_samples=NUM_CALIBRATION_SAMPLES, trust_remote_code_model=True, data_collator=data_collator, + sequential_targets=["Qwen2_5_VLDecoderLayer"], ) # Confirm generations of the quantized model look sane. 
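For backwards compatibility, `get_sequential_targets` still honors targets set on a modifier, but that path is deprecated and the ambiguous case is rejected outright. A short sketch of both cases, assuming `model` and `ds` are prepared as in the examples above:

from llmcompressor import oneshot
from llmcompressor.modifiers.quantization import GPTQModifier

# deprecated but still resolved: targets are taken from the modifier,
# and a deprecation warning is logged
oneshot(
    model=model,
    dataset=ds,
    recipe=GPTQModifier(
        targets="Linear", scheme="W4A16", sequential_targets=["LlamaDecoderLayer"]
    ),
)

# ambiguous: targets given on the modifier and to oneshot() -> ValueError
oneshot(
    model=model,
    dataset=ds,
    recipe=GPTQModifier(
        targets="Linear", scheme="W4A16", sequential_targets=["LlamaDecoderLayer"]
    ),
    sequential_targets=["LlamaDecoderLayer"],
)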
From 58fe92972a80941446a9cb66a1a6c07a43c8dd0d Mon Sep 17 00:00:00 2001 From: Kyle Sayers Date: Sun, 15 Jun 2025 23:40:00 -0400 Subject: [PATCH 25/36] fix deprecation warning Signed-off-by: Kyle Sayers --- src/llmcompressor/pipelines/sequential/helpers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/llmcompressor/pipelines/sequential/helpers.py b/src/llmcompressor/pipelines/sequential/helpers.py index 17e6724db..9a1751ea5 100644 --- a/src/llmcompressor/pipelines/sequential/helpers.py +++ b/src/llmcompressor/pipelines/sequential/helpers.py @@ -437,7 +437,7 @@ def get_sequential_targets( ] # deprecation warning - if len(modifier_targets) > 1: + if len(modifier_targets) >= 1: logger.warning( "Passing sequential targets through modifiers is deprecated, " "please use `oneshot(sequential_targets=...)`" From 54ef06a95e430760acf831a3185977b7dfa77bae Mon Sep 17 00:00:00 2001 From: Kyle Sayers Date: Mon, 16 Jun 2025 00:01:46 -0400 Subject: [PATCH 26/36] fix layer sequential pipeline Signed-off-by: Kyle Sayers --- src/llmcompressor/pipelines/layer_sequential/pipeline.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/llmcompressor/pipelines/layer_sequential/pipeline.py b/src/llmcompressor/pipelines/layer_sequential/pipeline.py index 2cfda0d0e..6d862a8a7 100644 --- a/src/llmcompressor/pipelines/layer_sequential/pipeline.py +++ b/src/llmcompressor/pipelines/layer_sequential/pipeline.py @@ -16,7 +16,7 @@ to_next_layer_kwargs, ) from llmcompressor.pipelines.registry import CalibrationPipeline -from llmcompressor.pipelines.sequential.helpers import get_targets_from_modifiers +from llmcompressor.pipelines.sequential.helpers import get_sequential_targets from llmcompressor.utils.helpers import DisableQuantization, calibration_forward_context if TYPE_CHECKING: @@ -68,7 +68,7 @@ def __call__( # find layers modifiers = session.get_modifiers() - sequential_targets, _ = get_targets_from_modifiers(modifiers, model) + sequential_targets = get_sequential_targets(modifiers, model, dataset_args) layers = match_modules(model, sequential_targets) LifecycleCallbacks.calibration_epoch_start() From 4bb86e54b14652c0340e8a2fe227b4381355a2ae Mon Sep 17 00:00:00 2001 From: Kyle Sayers Date: Mon, 16 Jun 2025 00:07:36 -0400 Subject: [PATCH 27/36] remove unused import Signed-off-by: Kyle Sayers --- src/llmcompressor/pipelines/sequential/__init__.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/llmcompressor/pipelines/sequential/__init__.py b/src/llmcompressor/pipelines/sequential/__init__.py index d96ee6987..7c726f6c4 100644 --- a/src/llmcompressor/pipelines/sequential/__init__.py +++ b/src/llmcompressor/pipelines/sequential/__init__.py @@ -1,3 +1,2 @@ # flake8: noqa -from .helpers import get_targets_from_modifiers from .pipeline import * From b2367cef328ac3c2ee6e3808236f323a6d9e30d9 Mon Sep 17 00:00:00 2001 From: Kyle Sayers Date: Mon, 16 Jun 2025 10:46:54 -0400 Subject: [PATCH 28/36] dispatch in pipelines Signed-off-by: Kyle Sayers --- examples/quantization_w4a16/llama3_example.py | 2 +- src/llmcompressor/args/model_arguments.py | 5 +++- src/llmcompressor/entrypoints/oneshot.py | 10 ------- src/llmcompressor/pipelines/basic/pipeline.py | 2 ++ .../pipelines/layer_sequential/pipeline.py | 17 ++++------- .../pipelines/sequential/helpers.py | 29 +++++++++++++++++-- .../pipelines/sequential/pipeline.py | 13 ++------- 7 files changed, 43 insertions(+), 35 deletions(-) diff --git a/examples/quantization_w4a16/llama3_example.py b/examples/quantization_w4a16/llama3_example.py 
index b3c93c1be..6bd41c58e 100644 --- a/examples/quantization_w4a16/llama3_example.py +++ b/examples/quantization_w4a16/llama3_example.py @@ -6,7 +6,7 @@ from llmcompressor.utils.dev import dispatch_for_generation # Select model and load it. -model_id = "meta-llama/Llama-3.3-70B-Instruct" +model_id = "meta-llama/Meta-Llama-3-8B-Instruct" model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype="auto") tokenizer = AutoTokenizer.from_pretrained(model_id) diff --git a/src/llmcompressor/args/model_arguments.py b/src/llmcompressor/args/model_arguments.py index 9cf8a687c..ea3c3936a 100644 --- a/src/llmcompressor/args/model_arguments.py +++ b/src/llmcompressor/args/model_arguments.py @@ -82,7 +82,10 @@ class ModelArguments: ) oneshot_device: Optional[str] = field( default="cuda", - metadata={"help": "Device to run oneshot calibration on"}, + metadata={ + "help": "This argument is deprecated and nonfunctional " + "and will be removed in future release" + }, ) model_revision: str = field( default="main", diff --git a/src/llmcompressor/entrypoints/oneshot.py b/src/llmcompressor/entrypoints/oneshot.py index c1dae7933..54a36abfe 100644 --- a/src/llmcompressor/entrypoints/oneshot.py +++ b/src/llmcompressor/entrypoints/oneshot.py @@ -2,8 +2,6 @@ from datetime import datetime from typing import Optional -import torch -from compressed_tensors.utils import offloaded_dispatch from loguru import logger from torch.utils.data import DataLoader from transformers import PreTrainedModel @@ -125,14 +123,6 @@ def __init__( # initialize the model and processor pre_process(model_args) - # offload to cpu if possible - if "cuda" in str(model_args.oneshot_device) and torch.cuda.is_available(): - offloaded_dispatch( - model_args.model, execution_device=model_args.oneshot_device - ) - else: - logger.warning("CUDA is not available! 
Compressing model on CPU instead") - # Set instance attributes self.model = self.model_args.model self.processor = self.model_args.processor diff --git a/src/llmcompressor/pipelines/basic/pipeline.py b/src/llmcompressor/pipelines/basic/pipeline.py index 15b94786a..35c52f166 100644 --- a/src/llmcompressor/pipelines/basic/pipeline.py +++ b/src/llmcompressor/pipelines/basic/pipeline.py @@ -9,6 +9,7 @@ from llmcompressor.modifiers.utils.pytorch_helpers import apply_pad_mask_to_batch from llmcompressor.pipelines.registry import CalibrationPipeline from llmcompressor.pytorch.utils.helpers import tensors_to_device +from llmcompressor.utils.dev import dispatch_for_generation from llmcompressor.utils.helpers import calibration_forward_context if TYPE_CHECKING: @@ -37,6 +38,7 @@ def __call__( :param dataloader: loads data for calibration :param dataset_args: dataset arguments relevant to pipelines """ + dispatch_for_generation(model) model_device = get_execution_device(model) LifecycleCallbacks.calibration_epoch_start() diff --git a/src/llmcompressor/pipelines/layer_sequential/pipeline.py b/src/llmcompressor/pipelines/layer_sequential/pipeline.py index 6d862a8a7..d8ad73a10 100644 --- a/src/llmcompressor/pipelines/layer_sequential/pipeline.py +++ b/src/llmcompressor/pipelines/layer_sequential/pipeline.py @@ -3,7 +3,6 @@ import torch import tqdm from compressed_tensors.utils import disable_offloading -from loguru import logger from torch.utils.data.dataloader import DataLoader from llmcompressor.core import LifecycleCallbacks, active_session @@ -16,7 +15,10 @@ to_next_layer_kwargs, ) from llmcompressor.pipelines.registry import CalibrationPipeline -from llmcompressor.pipelines.sequential.helpers import get_sequential_targets +from llmcompressor.pipelines.sequential.helpers import ( + dispatch_for_sequential, + get_sequential_targets, +) from llmcompressor.utils.helpers import DisableQuantization, calibration_forward_context if TYPE_CHECKING: @@ -56,15 +58,8 @@ def __call__( """ session = active_session() - # check for offloading - if model.device != torch.device("meta"): - logger.warning( - "Attemping to use sequential pipeline with a model which is not " - "offloaded to the cpu. Deploying a model in this way may lead to more " - "memory usage than is required. 
It is recommended to set " - '`oneshot_device="cuda"` or call `force_cpu_offload` on your model ' - "before compressing" - ) + # prepare model for sequential onloading + dispatch_for_sequential(model) # find layers modifiers = session.get_modifiers() diff --git a/src/llmcompressor/pipelines/sequential/helpers.py b/src/llmcompressor/pipelines/sequential/helpers.py index 9a1751ea5..869f60578 100644 --- a/src/llmcompressor/pipelines/sequential/helpers.py +++ b/src/llmcompressor/pipelines/sequential/helpers.py @@ -5,8 +5,9 @@ from typing import TYPE_CHECKING, Any, Dict, List, Optional, Set import torch -from compressed_tensors import has_offloaded_params +from accelerate.hooks import remove_hook_from_module from compressed_tensors.quantization import find_name_or_class_matches +from compressed_tensors.utils import has_offloaded_params, offloaded_dispatch from loguru import logger from torch.fx import Graph, GraphModule, Node from torch.fx.graph import PythonCode @@ -26,7 +27,12 @@ if TYPE_CHECKING: from llmcompressor.args.dataset_arguments import DatasetArguments -__all__ = ["trace_subgraphs", "Subgraph", "get_sequential_targets"] +__all__ = [ + "trace_subgraphs", + "Subgraph", + "get_sequential_targets", + "dispatch_for_sequential", +] @dataclass @@ -503,3 +509,22 @@ def is_ancestor(module: Module) -> bool: is_ancestor(model) return ancestors + + +def dispatch_for_sequential(model: PreTrainedModel) -> PreTrainedModel: + """ + Dispatch a model for sequential calibration using a sequential pipeline. + The model will be offloaded to the CPU and dispatched to CUDA device if available. + Removes any existing hooks. + + :param model: model to dispatch + :return: dispatched model + """ + remove_hook_from_module(model, recurse=True) + + if torch.cuda.is_available(): + offloaded_dispatch(model, execution_device=torch.device("cuda:0")) + else: + logger.warning("CUDA is not available! Compressing model on CPU instead") + + return model diff --git a/src/llmcompressor/pipelines/sequential/pipeline.py b/src/llmcompressor/pipelines/sequential/pipeline.py index 628fdf4d2..a2a714565 100644 --- a/src/llmcompressor/pipelines/sequential/pipeline.py +++ b/src/llmcompressor/pipelines/sequential/pipeline.py @@ -2,7 +2,6 @@ import torch from compressed_tensors.utils import disable_offloading, get_execution_device -from loguru import logger from torch.utils.data.dataloader import DataLoader from tqdm import tqdm @@ -11,6 +10,7 @@ from llmcompressor.pipelines.cache import IntermediatesCache from llmcompressor.pipelines.registry import CalibrationPipeline from llmcompressor.pipelines.sequential.helpers import ( + dispatch_for_sequential, get_sequential_targets, trace_subgraphs, ) @@ -52,15 +52,8 @@ def __call__( """ session = active_session() - # check for offloading - if model.device != torch.device("meta"): - logger.warning( - "Attemping to use sequential pipeline with a model which is not " - "offloaded to the cpu. Deploying a model in this way may lead to more " - "memory usage than is required. 
It is recommended to set " - '`oneshot_device="cuda"` or call `force_cpu_offload` on your model ' - "before compressing" - ) + # prepare model for sequential onloading + dispatch_for_sequential(model) # prepare to trace subgraphs modifiers = session.get_modifiers() From 06bb6611088a0cb44070031f961d7fbd298f0929 Mon Sep 17 00:00:00 2001 From: Kyle Sayers Date: Mon, 16 Jun 2025 11:09:00 -0400 Subject: [PATCH 29/36] add train dispatch Signed-off-by: Kyle Sayers --- src/llmcompressor/entrypoints/train.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/llmcompressor/entrypoints/train.py b/src/llmcompressor/entrypoints/train.py index 4b5d8b73b..0bfb26e53 100644 --- a/src/llmcompressor/entrypoints/train.py +++ b/src/llmcompressor/entrypoints/train.py @@ -8,6 +8,7 @@ from llmcompressor.core.session_functions import active_session from llmcompressor.datasets.utils import get_processed_dataset from llmcompressor.transformers.finetune.trainer import Trainer +from llmcompressor.utils.dev import dispatch_for_generation from .utils import post_process, pre_process @@ -63,6 +64,7 @@ def train(**kwargs) -> PreTrainedModel: ) pre_process(model_args) + dispatch_for_generation(model_args.model) # train is dispatched same as generation processed_dataset = get_processed_dataset( dataset_args=dataset_args, From a64a777e8af4bf6473f4574b4210da47a9cf8e0f Mon Sep 17 00:00:00 2001 From: Kyle Sayers Date: Mon, 16 Jun 2025 12:01:36 -0400 Subject: [PATCH 30/36] use remove_dispatch Signed-off-by: Kyle Sayers --- src/llmcompressor/entrypoints/README.md | 9 ++------- src/llmcompressor/entrypoints/utils.py | 4 ++++ src/llmcompressor/pipelines/sequential/helpers.py | 9 ++++++--- src/llmcompressor/utils/dev.py | 5 +++-- 4 files changed, 15 insertions(+), 12 deletions(-) diff --git a/src/llmcompressor/entrypoints/README.md b/src/llmcompressor/entrypoints/README.md index 5e907b802..0ead2b7ec 100644 --- a/src/llmcompressor/entrypoints/README.md +++ b/src/llmcompressor/entrypoints/README.md @@ -29,9 +29,7 @@ from llmcompressor.modifiers.quantization import QuantizationModifier MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct" # Load the model -model = AutoModelForCausalLM.from_pretrained( - MODEL_ID, device_map="auto", torch_dtype="auto" -) +model = AutoModelForCausalLM.from_pretrained(MODEL_ID, torch_dtype="auto") # Load the tokenizer tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) @@ -204,9 +202,7 @@ MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct" oneshot_output_dir = "./oneshot_model" # Load the model -model = AutoModelForCausalLM.from_pretrained( - MODEL_ID, device_map="auto", torch_dtype="auto" -) +model = AutoModelForCausalLM.from_pretrained(MODEL_ID, torch_dtype="auto") # Load the tokenizer tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) @@ -226,7 +222,6 @@ from llmcompressor import create_session, train # Student model model = AutoModelForCausalLM.from_pretrained( oneshot_output_dir, - device_map="auto", quantization_config=CompressedTensorsConfig(run_compressed=False), ) diff --git a/src/llmcompressor/entrypoints/utils.py b/src/llmcompressor/entrypoints/utils.py index 4bbc31e82..2c77dc73d 100644 --- a/src/llmcompressor/entrypoints/utils.py +++ b/src/llmcompressor/entrypoints/utils.py @@ -3,6 +3,7 @@ from pathlib import PosixPath from typing import Optional, Tuple +from compressed_tensors.utils import remove_dispatch from loguru import logger from torch.nn import Module from transformers import ( @@ -84,6 +85,9 @@ def post_process( Raises: ValueError: If saving fails due to an invalid `output_dir` or other 
issues. """ + # remove any existing dispatches + remove_dispatch(model_args.model) + if model_args is not None and output_dir is not None: if recipe_args is not None and getattr(recipe_args, "stage", None) is not None: output_dir = os.path.join(output_dir, recipe_args.stage) diff --git a/src/llmcompressor/pipelines/sequential/helpers.py b/src/llmcompressor/pipelines/sequential/helpers.py index 869f60578..4f562818a 100644 --- a/src/llmcompressor/pipelines/sequential/helpers.py +++ b/src/llmcompressor/pipelines/sequential/helpers.py @@ -5,9 +5,12 @@ from typing import TYPE_CHECKING, Any, Dict, List, Optional, Set import torch -from accelerate.hooks import remove_hook_from_module from compressed_tensors.quantization import find_name_or_class_matches -from compressed_tensors.utils import has_offloaded_params, offloaded_dispatch +from compressed_tensors.utils import ( + has_offloaded_params, + offloaded_dispatch, + remove_dispatch, +) from loguru import logger from torch.fx import Graph, GraphModule, Node from torch.fx.graph import PythonCode @@ -520,7 +523,7 @@ def dispatch_for_sequential(model: PreTrainedModel) -> PreTrainedModel: :param model: model to dispatch :return: dispatched model """ - remove_hook_from_module(model, recurse=True) + remove_dispatch(model) if torch.cuda.is_available(): offloaded_dispatch(model, execution_device=torch.device("cuda:0")) diff --git a/src/llmcompressor/utils/dev.py b/src/llmcompressor/utils/dev.py index e773b48f1..57ce74fb1 100644 --- a/src/llmcompressor/utils/dev.py +++ b/src/llmcompressor/utils/dev.py @@ -6,8 +6,8 @@ import torch from accelerate import dispatch_model, infer_auto_device_map -from accelerate.hooks import remove_hook_from_module from accelerate.utils import get_balanced_memory +from compressed_tensors.utils import remove_dispatch from huggingface_hub import snapshot_download from safetensors.torch import save_file from transformers import AutoModelForCausalLM, PreTrainedModel @@ -124,7 +124,8 @@ def dispatch_for_generation(model: PreTrainedModel) -> PreTrainedModel: :param model: model to dispatch :return: model which is dispatched """ - remove_hook_from_module(model, recurse=True) + remove_dispatch(model) + max_memory = get_balanced_memory( model, dtype=model.dtype, From 8f71004e6807e8dfed77e9c65ba136b2e6d9751d Mon Sep 17 00:00:00 2001 From: Kyle Sayers Date: Mon, 16 Jun 2025 13:29:22 -0400 Subject: [PATCH 31/36] fix example Signed-off-by: Kyle Sayers --- examples/quantization_2of4_sparse_w4a16/README.md | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/examples/quantization_2of4_sparse_w4a16/README.md b/examples/quantization_2of4_sparse_w4a16/README.md index c011ecf1d..51e04dd98 100644 --- a/examples/quantization_2of4_sparse_w4a16/README.md +++ b/examples/quantization_2of4_sparse_w4a16/README.md @@ -49,9 +49,7 @@ import torch from transformers import AutoModelForCausalLM model_stub = "neuralmagic/Llama-2-7b-ultrachat200k" -model = AutoModelForCausalLM.from_pretrained( - model_stub, torch_dtype=torch.bfloat16, device_map="auto" -) +model = AutoModelForCausalLM.from_pretrained(model_stub, torch_dtype=torch.bfloat16) dataset = "ultrachat-200k" splits = {"calibration": "train_gen[:5%]", "train": "train_gen"} From 7d7b00d09e1db029d4d9c9686fa11d04535fbca4 Mon Sep 17 00:00:00 2001 From: Kyle Sayers Date: Mon, 16 Jun 2025 14:57:39 -0400 Subject: [PATCH 32/36] remove device arg from e2e Signed-off-by: Kyle Sayers --- src/llmcompressor/entrypoints/README.md | 3 --- src/llmcompressor/entrypoints/utils.py | 3 ++- 
tests/e2e/e2e_utils.py | 17 +++++------------ tests/e2e/vLLM/test_vllm.py | 2 -- 4 files changed, 7 insertions(+), 18 deletions(-) diff --git a/src/llmcompressor/entrypoints/README.md b/src/llmcompressor/entrypoints/README.md index 0ead2b7ec..f023d3c02 100644 --- a/src/llmcompressor/entrypoints/README.md +++ b/src/llmcompressor/entrypoints/README.md @@ -112,7 +112,6 @@ output_dir = "./oneshot_model" # The model to train model = AutoModelForCausalLM.from_pretrained( output_dir, - device_map="auto", quantization_config=CompressedTensorsConfig(run_compressed=False), ) @@ -146,7 +145,6 @@ Comparisons are defined in `/src/llmcompressor/modifiers/distillation/utils/pyto # Define the teacher model distill_teacher = AutoModelForCausalLM.from_pretrained( "meta-llama/Meta-Llama-3-8B-Instruct", - device_map="auto", ) # Define the recipe, use knowledge distillation modifier and target the `model.layers` using a regex with @@ -236,7 +234,6 @@ num_calibration_samples = 8 # The number of workers processing datasets in para # Define teacher model distill_teacher = AutoModelForCausalLM.from_pretrained( "meta-llama/Meta-Llama-3-8B-Instruct", - device_map="auto", ) # Define the recipe, use knowledge distillation modifier and target the `model.layers` using a regex with diff --git a/src/llmcompressor/entrypoints/utils.py b/src/llmcompressor/entrypoints/utils.py index 2c77dc73d..418725d47 100644 --- a/src/llmcompressor/entrypoints/utils.py +++ b/src/llmcompressor/entrypoints/utils.py @@ -86,7 +86,8 @@ def post_process( ValueError: If saving fails due to an invalid `output_dir` or other issues. """ # remove any existing dispatches - remove_dispatch(model_args.model) + if model_args is not None and model_args.model is not None: + remove_dispatch(model_args.model) if model_args is not None and output_dir is not None: if recipe_args is not None and getattr(recipe_args, "stage", None) is not None: diff --git a/tests/e2e/e2e_utils.py b/tests/e2e/e2e_utils.py index 2325b7a34..853d2318b 100644 --- a/tests/e2e/e2e_utils.py +++ b/tests/e2e/e2e_utils.py @@ -14,28 +14,21 @@ def _load_model_and_processor( model: str, model_class: str, - device: str, ): pretrained_model_class = getattr(transformers, model_class) - loaded_model = pretrained_model_class.from_pretrained( - model, device_map=device, torch_dtype="auto" - ) + loaded_model = pretrained_model_class.from_pretrained(model, torch_dtype="auto") processor = AutoProcessor.from_pretrained(model) return loaded_model, processor @log_time -def _run_oneshot(device: str, **oneshot_kwargs): - oneshot( - **oneshot_kwargs, - oneshot_device=device, - ) +def _run_oneshot(**oneshot_kwargs): + oneshot(**oneshot_kwargs) def run_oneshot_for_e2e_testing( model: str, model_class: str, - device: str, num_calibration_samples: int, max_seq_length: int, dataset_id: str, @@ -49,7 +42,7 @@ def run_oneshot_for_e2e_testing( oneshot_kwargs = {} loaded_model, processor = _load_model_and_processor( - model=model, model_class=model_class, device=device + model=model, model_class=model_class ) if dataset_id: @@ -86,6 +79,6 @@ def data_collator(batch): # Apply quantization. 
logger.info("ONESHOT KWARGS", oneshot_kwargs) - _run_oneshot(device=device, **oneshot_kwargs) + _run_oneshot(**oneshot_kwargs) return oneshot_kwargs["model"], processor diff --git a/tests/e2e/vLLM/test_vllm.py b/tests/e2e/vLLM/test_vllm.py index 64d8204e5..89ddb5219 100644 --- a/tests/e2e/vLLM/test_vllm.py +++ b/tests/e2e/vLLM/test_vllm.py @@ -88,7 +88,6 @@ def set_up(self, test_data_file: str): logger.info("========== RUNNING ==============") logger.info(self.save_dir) - self.device = "cuda:0" self.prompts = [ "The capital of France is", "The president of the US is", @@ -105,7 +104,6 @@ def test_vllm(self, test_data_file: str): oneshot_model, tokenizer = run_oneshot_for_e2e_testing( model=self.model, model_class=self.model_class, - device=self.device, num_calibration_samples=self.num_calibration_samples, max_seq_length=self.max_seq_length, scheme=self.scheme, From 501056e44a9f02a1178b680d85af84bb911fd24b Mon Sep 17 00:00:00 2001 From: Kyle Sayers Date: Mon, 16 Jun 2025 15:07:30 -0400 Subject: [PATCH 33/36] simplify pipeline inference logic, add comment Signed-off-by: Kyle Sayers --- src/llmcompressor/pipelines/basic/pipeline.py | 2 +- src/llmcompressor/pipelines/registry.py | 49 ++++--------------- 2 files changed, 11 insertions(+), 40 deletions(-) diff --git a/src/llmcompressor/pipelines/basic/pipeline.py b/src/llmcompressor/pipelines/basic/pipeline.py index 35c52f166..dfb99172e 100644 --- a/src/llmcompressor/pipelines/basic/pipeline.py +++ b/src/llmcompressor/pipelines/basic/pipeline.py @@ -38,7 +38,7 @@ def __call__( :param dataloader: loads data for calibration :param dataset_args: dataset arguments relevant to pipelines """ - dispatch_for_generation(model) + dispatch_for_generation(model) # basic dispatch is identical to generation model_device = get_execution_device(model) LifecycleCallbacks.calibration_epoch_start() diff --git a/src/llmcompressor/pipelines/registry.py b/src/llmcompressor/pipelines/registry.py index cc4c29d8c..2c1a54cf5 100644 --- a/src/llmcompressor/pipelines/registry.py +++ b/src/llmcompressor/pipelines/registry.py @@ -7,23 +7,13 @@ from torch.utils.data.dataloader import DataLoader from llmcompressor.modifiers import Modifier -from llmcompressor.modifiers.awq import AWQModifier -from llmcompressor.modifiers.obcq.sgpt_base import SparsityModifierBase -from llmcompressor.modifiers.quantization import GPTQModifier, QuantizationMixin -from llmcompressor.modifiers.smoothquant import SmoothQuantModifier +from llmcompressor.modifiers.quantization import QuantizationModifier if TYPE_CHECKING: from llmcompressor.args.dataset_arguments import DatasetArguments __all__ = ["CalibrationPipeline"] -CALIBRATION_MODIFIERS = ( - SmoothQuantModifier, - AWQModifier, - GPTQModifier, - SparsityModifierBase, -) - class CalibrationPipeline(ABC, RegistryMixin): @staticmethod @@ -48,7 +38,7 @@ def from_modifiers( :return: CalibrationPipeline instance to be called with data (if not datafree) """ user = standardize_lookup_name(user) if user else None - inferred = standardize_lookup_name(cls._validate_infer_pipeline(modifiers)) + inferred = standardize_lookup_name(cls._infer_pipeline(modifiers)) independent = standardize_lookup_name("independent") if user == independent: @@ -64,30 +54,11 @@ def from_modifiers( return cls.load_from_registry(pipeline) @staticmethod - def _validate_infer_pipeline(modifiers: List[Modifier]) -> str: - if any(isinstance(modifier, CALIBRATION_MODIFIERS) for modifier in modifiers): - return "sequential" - - active_qmods = _get_active_quant_modifiers(modifiers) - if 
len(active_qmods) > 1: - raise ValueError( - f"Recipe contains more than one active quantization config " - f"({active_qmods}). These configs may be conflicting, Please modify " - "your recipe to use at most one quantization config" - ) - - if len(active_qmods) == 1: - quant_modifier = active_qmods[0] - config = quant_modifier.resolve_quantization_config() - if config.requires_calibration_data(): - return "sequential" - - return "datafree" - - -def _get_active_quant_modifiers(modifiers: List[Modifier]) -> List[QuantizationMixin]: - return [ - modifier - for modifier in modifiers - if isinstance(modifier, QuantizationMixin) and modifier.has_config() - ] + def _infer_pipeline(modifiers: List[Modifier]) -> str: + # only in the case of weight-only qmod quantization can we skip calibration + if len(modifiers) == 1 and isinstance(modifiers[0], QuantizationModifier): + config = modifiers[0].resolve_quantization_config() + if not config.requires_calibration_data(): + return "datafree" + + return "sequential" From 74aa7c92def694b9de8fc82869217d2d4db752bd Mon Sep 17 00:00:00 2001 From: Kyle Sayers Date: Mon, 16 Jun 2025 15:12:08 -0400 Subject: [PATCH 34/36] update examples imports Signed-off-by: Kyle Sayers --- examples/awq/qwen3_moe_example.py | 2 +- examples/multimodal_audio/whisper_example.py | 2 +- examples/multimodal_vision/gemma3_example.py | 2 +- examples/multimodal_vision/idefics3_example.py | 2 +- examples/multimodal_vision/llava_example.py | 2 +- examples/multimodal_vision/mistral3_example.py | 2 +- examples/multimodal_vision/mllama_example.py | 2 +- examples/multimodal_vision/phi3_vision_example.py | 2 +- examples/multimodal_vision/pixtral_example.py | 2 +- examples/multimodal_vision/qwen2_vl_example.py | 2 +- examples/multimodal_vision/qwen_2_5_vl_example.py | 2 +- examples/quantization_2of4_sparse_w4a16/llama7b_sparse_w4a16.py | 2 +- examples/quantization_kv_cache/gemma2_fp8_kv_example.py | 2 +- examples/quantization_kv_cache/llama3_fp8_kv_example.py | 2 +- examples/quantization_kv_cache/phi3.5_fp8_kv_example.py | 2 +- examples/quantization_w4a16/llama3_example.py | 2 +- examples/quantization_w4a16_fp4/llama3_example.py | 2 +- examples/quantization_w4a4_fp4/llama3_example.py | 2 +- examples/quantization_w8a8_fp8/gemma2_example.py | 2 +- examples/quantization_w8a8_fp8/llama3.2_vision_example.py | 2 +- examples/quantization_w8a8_fp8/llama3_example.py | 2 +- examples/quantization_w8a8_fp8/llava1.5_example.py | 2 +- examples/quantization_w8a8_fp8/qwen2vl_example.py | 2 +- examples/quantization_w8a8_fp8/whisper_example.py | 2 +- examples/quantization_w8a8_int8/gemma2_example.py | 2 +- examples/quantization_w8a8_int8/llama3_example.py | 2 +- examples/quantizing_moe/deepseek_moe_w4a16.py | 2 +- examples/quantizing_moe/deepseek_moe_w8a8_fp8.py | 2 +- examples/quantizing_moe/deepseek_moe_w8a8_int8.py | 2 +- examples/quantizing_moe/mixtral_moe_w8a8_fp8.py | 2 +- examples/quantizing_moe/qwen_moe_w4a16.py | 2 +- examples/sparse_2of4_quantization_fp8/llama3_8b_2of4.py | 2 +- src/llmcompressor/utils/__init__.py | 1 + 33 files changed, 33 insertions(+), 32 deletions(-) diff --git a/examples/awq/qwen3_moe_example.py b/examples/awq/qwen3_moe_example.py index 96baf5995..5fdc231c9 100644 --- a/examples/awq/qwen3_moe_example.py +++ b/examples/awq/qwen3_moe_example.py @@ -3,7 +3,7 @@ from llmcompressor import oneshot from llmcompressor.modifiers.awq import AWQModifier -from llmcompressor.utils.dev import dispatch_for_generation +from llmcompressor.utils import dispatch_for_generation # Select model and load 
it. MODEL_ID = "Qwen/Qwen3-30B-A3B" diff --git a/examples/multimodal_audio/whisper_example.py b/examples/multimodal_audio/whisper_example.py index f286ddc7f..9c2e494a8 100644 --- a/examples/multimodal_audio/whisper_example.py +++ b/examples/multimodal_audio/whisper_example.py @@ -4,7 +4,7 @@ from llmcompressor import oneshot from llmcompressor.modifiers.quantization import GPTQModifier -from llmcompressor.utils.dev import dispatch_for_generation +from llmcompressor.utils import dispatch_for_generation # Select model and load it. MODEL_ID = "openai/whisper-large-v3" diff --git a/examples/multimodal_vision/gemma3_example.py b/examples/multimodal_vision/gemma3_example.py index 9ac1f4ff7..5437ba36c 100644 --- a/examples/multimodal_vision/gemma3_example.py +++ b/examples/multimodal_vision/gemma3_example.py @@ -5,7 +5,7 @@ from llmcompressor import oneshot from llmcompressor.modifiers.quantization import GPTQModifier -from llmcompressor.utils.dev import dispatch_for_generation +from llmcompressor.utils import dispatch_for_generation # Load model. model_id = "google/gemma-3-4b-it" diff --git a/examples/multimodal_vision/idefics3_example.py b/examples/multimodal_vision/idefics3_example.py index 09722c127..1225349c4 100644 --- a/examples/multimodal_vision/idefics3_example.py +++ b/examples/multimodal_vision/idefics3_example.py @@ -6,7 +6,7 @@ from llmcompressor import oneshot from llmcompressor.modifiers.quantization import GPTQModifier -from llmcompressor.utils.dev import dispatch_for_generation +from llmcompressor.utils import dispatch_for_generation # Load model. model_id = "HuggingFaceM4/Idefics3-8B-Llama3" # or "HuggingFaceTB/SmolVLM-Instruct" diff --git a/examples/multimodal_vision/llava_example.py b/examples/multimodal_vision/llava_example.py index 984e8a1fd..0a17d8c50 100644 --- a/examples/multimodal_vision/llava_example.py +++ b/examples/multimodal_vision/llava_example.py @@ -5,7 +5,7 @@ from llmcompressor import oneshot from llmcompressor.modifiers.quantization import GPTQModifier -from llmcompressor.utils.dev import dispatch_for_generation +from llmcompressor.utils import dispatch_for_generation # Load model. model_id = "llava-hf/llava-1.5-7b-hf" diff --git a/examples/multimodal_vision/mistral3_example.py b/examples/multimodal_vision/mistral3_example.py index fc3657b0e..e70ee43ec 100644 --- a/examples/multimodal_vision/mistral3_example.py +++ b/examples/multimodal_vision/mistral3_example.py @@ -8,7 +8,7 @@ from llmcompressor import oneshot from llmcompressor.modifiers.quantization import GPTQModifier -from llmcompressor.utils.dev import dispatch_for_generation +from llmcompressor.utils import dispatch_for_generation # Load model. model_id = "mistralai/Mistral-Small-3.1-24B-Instruct-2503" diff --git a/examples/multimodal_vision/mllama_example.py b/examples/multimodal_vision/mllama_example.py index 7d94a677c..6672aff2e 100644 --- a/examples/multimodal_vision/mllama_example.py +++ b/examples/multimodal_vision/mllama_example.py @@ -5,7 +5,7 @@ from llmcompressor import oneshot from llmcompressor.modifiers.quantization import GPTQModifier -from llmcompressor.utils.dev import dispatch_for_generation +from llmcompressor.utils import dispatch_for_generation # Load model. 
model_id = "meta-llama/Llama-3.2-11B-Vision-Instruct" diff --git a/examples/multimodal_vision/phi3_vision_example.py b/examples/multimodal_vision/phi3_vision_example.py index 324df5d31..fa4b0feab 100644 --- a/examples/multimodal_vision/phi3_vision_example.py +++ b/examples/multimodal_vision/phi3_vision_example.py @@ -7,7 +7,7 @@ from llmcompressor import oneshot from llmcompressor.modifiers.quantization import GPTQModifier -from llmcompressor.utils.dev import dispatch_for_generation +from llmcompressor.utils import dispatch_for_generation # Load model. model_id = "microsoft/Phi-3-vision-128k-instruct" diff --git a/examples/multimodal_vision/pixtral_example.py b/examples/multimodal_vision/pixtral_example.py index b2b4c7440..a0ed50ef4 100644 --- a/examples/multimodal_vision/pixtral_example.py +++ b/examples/multimodal_vision/pixtral_example.py @@ -5,7 +5,7 @@ from llmcompressor import oneshot from llmcompressor.modifiers.quantization import GPTQModifier -from llmcompressor.utils.dev import dispatch_for_generation +from llmcompressor.utils import dispatch_for_generation # Load model. model_id = "mgoin/pixtral-12b" diff --git a/examples/multimodal_vision/qwen2_vl_example.py b/examples/multimodal_vision/qwen2_vl_example.py index 14033872d..8cccf768e 100644 --- a/examples/multimodal_vision/qwen2_vl_example.py +++ b/examples/multimodal_vision/qwen2_vl_example.py @@ -8,7 +8,7 @@ from llmcompressor import oneshot from llmcompressor.modifiers.quantization import GPTQModifier -from llmcompressor.utils.dev import dispatch_for_generation +from llmcompressor.utils import dispatch_for_generation # Load model. model_id = "Qwen/Qwen2-VL-2B-Instruct" diff --git a/examples/multimodal_vision/qwen_2_5_vl_example.py b/examples/multimodal_vision/qwen_2_5_vl_example.py index 096596d24..10a0edeec 100644 --- a/examples/multimodal_vision/qwen_2_5_vl_example.py +++ b/examples/multimodal_vision/qwen_2_5_vl_example.py @@ -8,7 +8,7 @@ from llmcompressor.modifiers.quantization import GPTQModifier from llmcompressor.transformers import oneshot -from llmcompressor.utils.dev import dispatch_for_generation +from llmcompressor.utils import dispatch_for_generation # Load model. model_id = "Qwen/Qwen2.5-VL-7B-Instruct" diff --git a/examples/quantization_2of4_sparse_w4a16/llama7b_sparse_w4a16.py b/examples/quantization_2of4_sparse_w4a16/llama7b_sparse_w4a16.py index 4bf505047..0b83d7384 100644 --- a/examples/quantization_2of4_sparse_w4a16/llama7b_sparse_w4a16.py +++ b/examples/quantization_2of4_sparse_w4a16/llama7b_sparse_w4a16.py @@ -3,7 +3,7 @@ from transformers import AutoModelForCausalLM, AutoTokenizer from llmcompressor import oneshot, train -from llmcompressor.utils.dev import dispatch_for_generation +from llmcompressor.utils import dispatch_for_generation # load the model in as bfloat16 to save on memory and compute model_stub = "neuralmagic/Llama-2-7b-ultrachat200k" diff --git a/examples/quantization_kv_cache/gemma2_fp8_kv_example.py b/examples/quantization_kv_cache/gemma2_fp8_kv_example.py index 6246d41f7..f753d71dd 100644 --- a/examples/quantization_kv_cache/gemma2_fp8_kv_example.py +++ b/examples/quantization_kv_cache/gemma2_fp8_kv_example.py @@ -2,7 +2,7 @@ from transformers import AutoModelForCausalLM, AutoTokenizer from llmcompressor import oneshot -from llmcompressor.utils.dev import dispatch_for_generation +from llmcompressor.utils import dispatch_for_generation # Select model and load it. 
MODEL_ID = "google/gemma-2-9b-it" diff --git a/examples/quantization_kv_cache/llama3_fp8_kv_example.py b/examples/quantization_kv_cache/llama3_fp8_kv_example.py index 3ee8c38db..339c353fa 100644 --- a/examples/quantization_kv_cache/llama3_fp8_kv_example.py +++ b/examples/quantization_kv_cache/llama3_fp8_kv_example.py @@ -3,7 +3,7 @@ from transformers import AutoModelForCausalLM, AutoTokenizer from llmcompressor import oneshot -from llmcompressor.utils.dev import dispatch_for_generation +from llmcompressor.utils import dispatch_for_generation # Select model and load it. MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct" diff --git a/examples/quantization_kv_cache/phi3.5_fp8_kv_example.py b/examples/quantization_kv_cache/phi3.5_fp8_kv_example.py index 39d832830..0d16e1b22 100644 --- a/examples/quantization_kv_cache/phi3.5_fp8_kv_example.py +++ b/examples/quantization_kv_cache/phi3.5_fp8_kv_example.py @@ -2,7 +2,7 @@ from transformers import AutoModelForCausalLM, AutoTokenizer from llmcompressor import oneshot -from llmcompressor.utils.dev import dispatch_for_generation +from llmcompressor.utils import dispatch_for_generation # Select model and load it. # Phi-3.5 is a special case for KV cache quantization because it has diff --git a/examples/quantization_w4a16/llama3_example.py b/examples/quantization_w4a16/llama3_example.py index 6bd41c58e..89c9d353e 100644 --- a/examples/quantization_w4a16/llama3_example.py +++ b/examples/quantization_w4a16/llama3_example.py @@ -3,7 +3,7 @@ from llmcompressor.modifiers.quantization import GPTQModifier from llmcompressor.transformers import oneshot -from llmcompressor.utils.dev import dispatch_for_generation +from llmcompressor.utils import dispatch_for_generation # Select model and load it. model_id = "meta-llama/Meta-Llama-3-8B-Instruct" diff --git a/examples/quantization_w4a16_fp4/llama3_example.py b/examples/quantization_w4a16_fp4/llama3_example.py index da526cad7..d35de8d30 100644 --- a/examples/quantization_w4a16_fp4/llama3_example.py +++ b/examples/quantization_w4a16_fp4/llama3_example.py @@ -2,7 +2,7 @@ from llmcompressor import oneshot from llmcompressor.modifiers.quantization import QuantizationModifier -from llmcompressor.utils.dev import dispatch_for_generation +from llmcompressor.utils import dispatch_for_generation MODEL_ID = "meta-llama/Llama-3.1-8B-Instruct" diff --git a/examples/quantization_w4a4_fp4/llama3_example.py b/examples/quantization_w4a4_fp4/llama3_example.py index f209a581b..95d01657b 100644 --- a/examples/quantization_w4a4_fp4/llama3_example.py +++ b/examples/quantization_w4a4_fp4/llama3_example.py @@ -3,7 +3,7 @@ from llmcompressor import oneshot from llmcompressor.modifiers.quantization import QuantizationModifier -from llmcompressor.utils.dev import dispatch_for_generation +from llmcompressor.utils import dispatch_for_generation MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct" diff --git a/examples/quantization_w8a8_fp8/gemma2_example.py b/examples/quantization_w8a8_fp8/gemma2_example.py index 5c41a4d35..1b56512b4 100644 --- a/examples/quantization_w8a8_fp8/gemma2_example.py +++ b/examples/quantization_w8a8_fp8/gemma2_example.py @@ -2,7 +2,7 @@ from llmcompressor import oneshot from llmcompressor.modifiers.quantization import QuantizationModifier -from llmcompressor.utils.dev import dispatch_for_generation +from llmcompressor.utils import dispatch_for_generation MODEL_ID = "google/gemma-2-27b-it" diff --git a/examples/quantization_w8a8_fp8/llama3.2_vision_example.py b/examples/quantization_w8a8_fp8/llama3.2_vision_example.py 
index 1c21c23d0..6a1454cd0 100644 --- a/examples/quantization_w8a8_fp8/llama3.2_vision_example.py +++ b/examples/quantization_w8a8_fp8/llama3.2_vision_example.py @@ -2,7 +2,7 @@ from llmcompressor import oneshot from llmcompressor.modifiers.quantization import QuantizationModifier -from llmcompressor.utils.dev import dispatch_for_generation +from llmcompressor.utils import dispatch_for_generation MODEL_ID = "meta-llama/Llama-3.2-11B-Vision-Instruct" diff --git a/examples/quantization_w8a8_fp8/llama3_example.py b/examples/quantization_w8a8_fp8/llama3_example.py index 346012e4e..39c196752 100644 --- a/examples/quantization_w8a8_fp8/llama3_example.py +++ b/examples/quantization_w8a8_fp8/llama3_example.py @@ -2,7 +2,7 @@ from llmcompressor import oneshot from llmcompressor.modifiers.quantization import QuantizationModifier -from llmcompressor.utils.dev import dispatch_for_generation +from llmcompressor.utils import dispatch_for_generation MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct" diff --git a/examples/quantization_w8a8_fp8/llava1.5_example.py b/examples/quantization_w8a8_fp8/llava1.5_example.py index 41a02b156..a03188a61 100644 --- a/examples/quantization_w8a8_fp8/llava1.5_example.py +++ b/examples/quantization_w8a8_fp8/llava1.5_example.py @@ -2,7 +2,7 @@ from llmcompressor import oneshot from llmcompressor.modifiers.quantization import QuantizationModifier -from llmcompressor.utils.dev import dispatch_for_generation +from llmcompressor.utils import dispatch_for_generation MODEL_ID = "llava-hf/llava-1.5-7b-hf" diff --git a/examples/quantization_w8a8_fp8/qwen2vl_example.py b/examples/quantization_w8a8_fp8/qwen2vl_example.py index f52fb5c9e..ebadbe973 100644 --- a/examples/quantization_w8a8_fp8/qwen2vl_example.py +++ b/examples/quantization_w8a8_fp8/qwen2vl_example.py @@ -2,7 +2,7 @@ from llmcompressor import oneshot from llmcompressor.modifiers.quantization import QuantizationModifier -from llmcompressor.utils.dev import dispatch_for_generation +from llmcompressor.utils import dispatch_for_generation MODEL_ID = "Qwen/Qwen2-VL-7B-Instruct" diff --git a/examples/quantization_w8a8_fp8/whisper_example.py b/examples/quantization_w8a8_fp8/whisper_example.py index 403de54a8..2cbbebe7d 100644 --- a/examples/quantization_w8a8_fp8/whisper_example.py +++ b/examples/quantization_w8a8_fp8/whisper_example.py @@ -3,7 +3,7 @@ from llmcompressor import oneshot from llmcompressor.modifiers.quantization import QuantizationModifier -from llmcompressor.utils.dev import dispatch_for_generation +from llmcompressor.utils import dispatch_for_generation MODEL_ID = "openai/whisper-large-v2" diff --git a/examples/quantization_w8a8_int8/gemma2_example.py b/examples/quantization_w8a8_int8/gemma2_example.py index 13c900d4c..d332532b0 100644 --- a/examples/quantization_w8a8_int8/gemma2_example.py +++ b/examples/quantization_w8a8_int8/gemma2_example.py @@ -3,7 +3,7 @@ from llmcompressor import oneshot from llmcompressor.modifiers.quantization import GPTQModifier -from llmcompressor.utils.dev import dispatch_for_generation +from llmcompressor.utils import dispatch_for_generation # 1) Select model and load it. 
MODEL_ID = "google/gemma-2-2b-it" diff --git a/examples/quantization_w8a8_int8/llama3_example.py b/examples/quantization_w8a8_int8/llama3_example.py index 66487dba4..feab87455 100644 --- a/examples/quantization_w8a8_int8/llama3_example.py +++ b/examples/quantization_w8a8_int8/llama3_example.py @@ -4,7 +4,7 @@ from llmcompressor import oneshot from llmcompressor.modifiers.quantization import GPTQModifier from llmcompressor.modifiers.smoothquant import SmoothQuantModifier -from llmcompressor.utils.dev import dispatch_for_generation +from llmcompressor.utils import dispatch_for_generation # Select model and load it. MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct" diff --git a/examples/quantizing_moe/deepseek_moe_w4a16.py b/examples/quantizing_moe/deepseek_moe_w4a16.py index 5de7911fb..9880e9248 100644 --- a/examples/quantizing_moe/deepseek_moe_w4a16.py +++ b/examples/quantizing_moe/deepseek_moe_w4a16.py @@ -4,7 +4,7 @@ from transformers import AutoModelForCausalLM, AutoTokenizer, __version__ from llmcompressor import oneshot -from llmcompressor.utils.dev import dispatch_for_generation +from llmcompressor.utils import dispatch_for_generation # NOTE: transformers 4.49.0 has an attribute error with DeepSeek. # Please consider either downgrading your transformers version to a diff --git a/examples/quantizing_moe/deepseek_moe_w8a8_fp8.py b/examples/quantizing_moe/deepseek_moe_w8a8_fp8.py index e247c77fb..0bc9c24df 100644 --- a/examples/quantizing_moe/deepseek_moe_w8a8_fp8.py +++ b/examples/quantizing_moe/deepseek_moe_w8a8_fp8.py @@ -4,7 +4,7 @@ from llmcompressor import oneshot from llmcompressor.modifiers.quantization import QuantizationModifier -from llmcompressor.utils.dev import dispatch_for_generation +from llmcompressor.utils import dispatch_for_generation # NOTE: transformers 4.49.0 has an attribute error with DeepSeek. # Please consider either downgrading your transformers version to a diff --git a/examples/quantizing_moe/deepseek_moe_w8a8_int8.py b/examples/quantizing_moe/deepseek_moe_w8a8_int8.py index 8648dbbf8..3ec506c34 100644 --- a/examples/quantizing_moe/deepseek_moe_w8a8_int8.py +++ b/examples/quantizing_moe/deepseek_moe_w8a8_int8.py @@ -5,7 +5,7 @@ from llmcompressor import oneshot from llmcompressor.modifiers.quantization import GPTQModifier -from llmcompressor.utils.dev import dispatch_for_generation +from llmcompressor.utils import dispatch_for_generation # NOTE: transformers 4.49.0 has an attribute error with DeepSeek. 
# Please consider either downgrading your transformers version to a diff --git a/examples/quantizing_moe/mixtral_moe_w8a8_fp8.py b/examples/quantizing_moe/mixtral_moe_w8a8_fp8.py index e45217203..a17bf873d 100644 --- a/examples/quantizing_moe/mixtral_moe_w8a8_fp8.py +++ b/examples/quantizing_moe/mixtral_moe_w8a8_fp8.py @@ -5,7 +5,7 @@ from llmcompressor import oneshot from llmcompressor.modifiers.quantization import QuantizationModifier -from llmcompressor.utils.dev import dispatch_for_generation +from llmcompressor.utils import dispatch_for_generation MODEL_ID = "mistralai/Mixtral-8x7B-Instruct-v0.1" diff --git a/examples/quantizing_moe/qwen_moe_w4a16.py b/examples/quantizing_moe/qwen_moe_w4a16.py index 51b8821b9..40a78a9b7 100644 --- a/examples/quantizing_moe/qwen_moe_w4a16.py +++ b/examples/quantizing_moe/qwen_moe_w4a16.py @@ -4,7 +4,7 @@ from llmcompressor.modifiers.quantization import GPTQModifier from llmcompressor.transformers import oneshot -from llmcompressor.utils.dev import dispatch_for_generation +from llmcompressor.utils import dispatch_for_generation # select a Mixture of Experts model for quantization MODEL_ID = "Qwen/Qwen1.5-MoE-A2.7B-Chat" diff --git a/examples/sparse_2of4_quantization_fp8/llama3_8b_2of4.py b/examples/sparse_2of4_quantization_fp8/llama3_8b_2of4.py index 616db364e..590b74611 100644 --- a/examples/sparse_2of4_quantization_fp8/llama3_8b_2of4.py +++ b/examples/sparse_2of4_quantization_fp8/llama3_8b_2of4.py @@ -6,7 +6,7 @@ from llmcompressor import oneshot from llmcompressor.modifiers.obcq import SparseGPTModifier from llmcompressor.modifiers.quantization import QuantizationModifier -from llmcompressor.utils.dev import dispatch_for_generation +from llmcompressor.utils import dispatch_for_generation # Configuration MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct" diff --git a/src/llmcompressor/utils/__init__.py b/src/llmcompressor/utils/__init__.py index 98d5e1c65..c4fb71cdc 100644 --- a/src/llmcompressor/utils/__init__.py +++ b/src/llmcompressor/utils/__init__.py @@ -4,4 +4,5 @@ # flake8: noqa +from .dev import * from .helpers import * From e4487e24a0d0dbd669a0b32d179a65faf5ce3c6a Mon Sep 17 00:00:00 2001 From: Kyle Sayers Date: Mon, 16 Jun 2025 15:17:20 -0400 Subject: [PATCH 35/36] fix call Signed-off-by: Kyle Sayers --- tests/llmcompressor/transformers/tracing/test_models.py | 1 - tests/lmeval/test_lmeval.py | 2 -- 2 files changed, 3 deletions(-) diff --git a/tests/llmcompressor/transformers/tracing/test_models.py b/tests/llmcompressor/transformers/tracing/test_models.py index 327f3d55d..135928902 100644 --- a/tests/llmcompressor/transformers/tracing/test_models.py +++ b/tests/llmcompressor/transformers/tracing/test_models.py @@ -136,7 +136,6 @@ def test_model_trace(model_id, model_class, targets, modality, backends): modality=modality, trust_remote_code=True, skip_weights=True, - device_map="cpu", ) target_modules = get_target_modules(model, targets) diff --git a/tests/lmeval/test_lmeval.py b/tests/lmeval/test_lmeval.py index d4d6e6056..51aa50665 100644 --- a/tests/lmeval/test_lmeval.py +++ b/tests/lmeval/test_lmeval.py @@ -90,7 +90,6 @@ def set_up(self, test_data_file: str): logger.info("========== RUNNING ==============") logger.info(self.scheme) - self.device = "cuda:0" self.num_calibration_samples = 512 self.max_seq_length = 2048 @@ -103,7 +102,6 @@ def test_lm_eval(self, test_data_file: str): oneshot_model, processor = run_oneshot_for_e2e_testing( model=self.model, model_class=self.model_class, - device=self.device, 
num_calibration_samples=self.num_calibration_samples, max_seq_length=self.max_seq_length, scheme=self.scheme, From f134e56e087bba14eaea590d72b2f3653e32b9fe Mon Sep 17 00:00:00 2001 From: Kyle Sayers Date: Mon, 16 Jun 2025 18:00:15 -0400 Subject: [PATCH 36/36] wip: run compression in parallel Signed-off-by: Kyle Sayers --- .../modifiers/quantization/gptq/base.py | 75 ++++++++++++------- .../quantization/gptq/gptq_quantize.py | 21 +++++- 2 files changed, 67 insertions(+), 29 deletions(-) diff --git a/src/llmcompressor/modifiers/quantization/gptq/base.py b/src/llmcompressor/modifiers/quantization/gptq/base.py index 7ae61f3e2..fb8baf0a7 100644 --- a/src/llmcompressor/modifiers/quantization/gptq/base.py +++ b/src/llmcompressor/modifiers/quantization/gptq/base.py @@ -1,5 +1,6 @@ import contextlib import warnings +from concurrent.futures import ThreadPoolExecutor, as_completed from typing import Dict, List, Optional, Tuple, Union import torch @@ -22,12 +23,12 @@ from llmcompressor.modifiers import Modifier from llmcompressor.modifiers.quantization.gptq.gptq_quantize import ( accumulate_hessian, + initialize_linalg, make_empty_hessian, quantize_weight, ) from llmcompressor.modifiers.quantization.quantization import QuantizationMixin from llmcompressor.sentinel import Sentinel -from llmcompressor.utils.metric_logging import CompressionLogger __all__ = ["GPTQModifier"] @@ -252,34 +253,54 @@ def compress_modules(self): """ Quantize modules which have been calibrated """ - for module in list(self._num_samples.keys()): - name = self._module_names[module] - num_samples = self._num_samples[module] - quant_args = getattr_chain(module, "quantization_scheme.weights") - - logger.info(f"Quantizing {name} using {num_samples} samples") - with torch.no_grad(), align_module_device( - module - ), self._maybe_onload_hessian(module), CompressionLogger( - module - ) as comp_logger: - loss, quantized_weight, scale, zero_point, g_idx = quantize_weight( - module=module, - quant_args=quant_args, - hessians_dict=self._hessians, - blocksize=self.block_size, - percdamp=self.dampening_frac, - ) - comp_logger.set_loss(loss) + import time + + start_time = time.time() + + futures = [] + with ThreadPoolExecutor() as executor: + for module in list(self._num_samples.keys()): + initialize_linalg(get_execution_device(module)) + future = executor.submit(self._compress_module, module) + futures.append(future) + + for future in as_completed(futures, timeout=300): # no timeout + name, num_samples, loss = future.result() + logger.info(f"Quantized {name}") + logger.info(f" num_samples={num_samples}") + logger.info(f" loss={loss:.2f}") + + logger.info( + f"Quantized {len(futures)} modules in {time.time() - start_time: .1f}s" + ) + + def _compress_module(self, module: torch.nn.Module) -> Tuple[str, int, float]: + name = self._module_names[module] + num_samples = self._num_samples[module] + quant_args = getattr_chain(module, "quantization_scheme.weights") + + with torch.no_grad(), align_module_device(module), self._maybe_onload_hessian( + module + ): + logger.info(f"Quantizing {name}...") + loss, quantized_weight, scale, zero_point, g_idx = quantize_weight( + module=module, + quant_args=quant_args, + hessians_dict=self._hessians, + blocksize=self.block_size, + percdamp=self.dampening_frac, + ) + + update_offload_parameter(module, "weight", quantized_weight) + update_offload_parameter(module, "weight_scale", scale) + update_offload_parameter(module, "weight_zero_point", zero_point) + if g_idx is not None: + update_offload_parameter(module, 
"weight_g_idx", g_idx) - update_offload_parameter(module, "weight", quantized_weight) - update_offload_parameter(module, "weight_scale", scale) - update_offload_parameter(module, "weight_zero_point", zero_point) - if g_idx is not None: - update_offload_parameter(module, "weight_g_idx", g_idx) + # self._hessians[module] already deleted by quantize_weight + del self._num_samples[module] - # self._hessians[module] already deleted by quantize_weight - del self._num_samples[module] + return name, num_samples, loss def on_end(self, state: State, event: Event, **kwargs): """ diff --git a/src/llmcompressor/modifiers/quantization/gptq/gptq_quantize.py b/src/llmcompressor/modifiers/quantization/gptq/gptq_quantize.py index 4392ed8cf..b3fc63fab 100644 --- a/src/llmcompressor/modifiers/quantization/gptq/gptq_quantize.py +++ b/src/llmcompressor/modifiers/quantization/gptq/gptq_quantize.py @@ -17,8 +17,25 @@ from llmcompressor.pytorch.utils.helpers import tensor_sparsity GPTQ_PRECISION = torch.float32 - -__all__ = ["make_empty_hessian", "accumulate_hessian", "quantize_weight"] +INITIALIZED_DEVICES = set() + +__all__ = [ + "initialize_linalg", + "make_empty_hessian", + "accumulate_hessian", + "quantize_weight", +] + + +def initialize_linalg(device: torch.device): + # pre-load torch.linalg module to avoid loading the module in threads, + # which can cause lazy loading assertion errors + # https://github.com/pytorch/pytorch/blob/main/aten/src/ATen/native/cuda/LinearAlgebraStubs.cpp#L50 # noqa: E501 + # https://github.com/pytorch/ignite/issues/3004 + if device not in INITIALIZED_DEVICES: + _input = torch.ones((1, 1), device=device) + _ = torch.cholesky_inverse(torch.linalg.cholesky(_input)) + INITIALIZED_DEVICES.add(device) def make_empty_hessian(