From 1aea4ddd2339e096c704e066e40640b59eadde2d Mon Sep 17 00:00:00 2001 From: Kyle Sayers Date: Tue, 3 Jun 2025 17:31:24 -0400 Subject: [PATCH 01/36] wip: alignment context Signed-off-by: Kyle Sayers --- examples/quantization_w4a16/llama3_example.py | 5 ++- src/llmcompressor/pipelines/basic/pipeline.py | 2 + src/llmcompressor/pipelines/registry.py | 4 +- .../pipelines/sequential/helpers.py | 35 ++++++++++++++- .../pipelines/sequential/pipeline.py | 45 ++++++++++--------- 5 files changed, 67 insertions(+), 24 deletions(-) diff --git a/examples/quantization_w4a16/llama3_example.py b/examples/quantization_w4a16/llama3_example.py index 09bf63fb8..1016a8081 100644 --- a/examples/quantization_w4a16/llama3_example.py +++ b/examples/quantization_w4a16/llama3_example.py @@ -1,3 +1,5 @@ +import torch +from compressed_tensors import force_cpu_offload from datasets import load_dataset from transformers import AutoModelForCausalLM, AutoTokenizer @@ -9,9 +11,10 @@ model = AutoModelForCausalLM.from_pretrained( MODEL_ID, - device_map="auto", + # device_map="auto", torch_dtype="auto", ) +force_cpu_offload(model, execution_device=torch.device("cuda")) tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) # Select calibration dataset. diff --git a/src/llmcompressor/pipelines/basic/pipeline.py b/src/llmcompressor/pipelines/basic/pipeline.py index 15b94786a..431dc1965 100644 --- a/src/llmcompressor/pipelines/basic/pipeline.py +++ b/src/llmcompressor/pipelines/basic/pipeline.py @@ -37,6 +37,8 @@ def __call__( :param dataloader: loads data for calibration :param dataset_args: dataset arguments relevant to pipelines """ + # TODO: warn about cpu offloading + model_device = get_execution_device(model) LifecycleCallbacks.calibration_epoch_start() diff --git a/src/llmcompressor/pipelines/registry.py b/src/llmcompressor/pipelines/registry.py index 77d6e79ab..0a4cbb645 100644 --- a/src/llmcompressor/pipelines/registry.py +++ b/src/llmcompressor/pipelines/registry.py @@ -75,12 +75,12 @@ def _validate_infer_pipeline(modifiers: List[Modifier]) -> str: quant_modifier = active_qmods[0] config = quant_modifier.resolve_quantization_config() if config.requires_calibration_data(): - return "basic" + return "sequential" else: return "datafree" if any(isinstance(modifier, SmoothQuantModifier) for modifier in modifiers): - return "basic" + return "sequential" return "datafree" diff --git a/src/llmcompressor/pipelines/sequential/helpers.py b/src/llmcompressor/pipelines/sequential/helpers.py index b7937a2fc..38f1cdadf 100644 --- a/src/llmcompressor/pipelines/sequential/helpers.py +++ b/src/llmcompressor/pipelines/sequential/helpers.py @@ -5,6 +5,7 @@ from typing import Any, Dict, List, Optional, Set import torch +from accelerate.hooks import AlignDevicesHook from compressed_tensors import has_offloaded_params from compressed_tensors.quantization import find_name_or_class_matches from loguru import logger @@ -23,7 +24,12 @@ from .ast_helpers import autowrap_forwards -__all__ = ["trace_subgraphs", "Subgraph", "get_targets_from_modifiers"] +__all__ = [ + "trace_subgraphs", + "Subgraph", + "get_targets_from_modifiers", + "keep_onload_context", +] @dataclass @@ -485,3 +491,30 @@ def is_ancestor(module: Module) -> bool: is_ancestor(model) return ancestors + + +@contextlib.contextmanager +def keep_onload_context(): + original_pre_forward = AlignDevicesHook.pre_forward + onloaded_modules = dict() + + # onload once and disable any future onloading/offloading steps + def keep_onload_pre_forward(self: AlignDevicesHook, module, *args, **kwargs): + 
ret = original_pre_forward(self, module, *args, **kwargs) + if module not in onloaded_modules: + onloaded_modules[module] = (self, self.offload) + self.offload = False + return ret + + # use the patched pre_forward function within the context + with patch_attr(AlignDevicesHook, "pre_forward", keep_onload_pre_forward): + yield + + # manually offload all modules that were onloaded + for module, (hook, offload) in onloaded_modules.items(): + hook.offload = offload + hook.post_forward(module, None) + + +# def is_cpu_offloaded(): +# diff --git a/src/llmcompressor/pipelines/sequential/pipeline.py b/src/llmcompressor/pipelines/sequential/pipeline.py index 22c47d894..fcb91d803 100644 --- a/src/llmcompressor/pipelines/sequential/pipeline.py +++ b/src/llmcompressor/pipelines/sequential/pipeline.py @@ -1,9 +1,9 @@ from typing import TYPE_CHECKING import torch -import tqdm from compressed_tensors.utils import get_execution_device from torch.utils.data.dataloader import DataLoader +from tqdm import tqdm from llmcompressor.core import LifecycleCallbacks, active_session from llmcompressor.modifiers.utils.hooks import HooksMixin @@ -11,6 +11,7 @@ from llmcompressor.pipelines.registry import CalibrationPipeline from llmcompressor.pipelines.sequential.helpers import ( get_targets_from_modifiers, + keep_onload_context, trace_subgraphs, ) from llmcompressor.utils.helpers import DisableQuantization, calibration_forward_context @@ -51,6 +52,8 @@ def __call__( """ session = active_session() + # TODO: warn about not cpu offloading + # prepare to trace subgraphs modifiers = session.get_modifiers() sequential_targets = get_targets_from_modifiers(modifiers, model) @@ -59,37 +62,39 @@ def __call__( # trace subgraphs sample_input = next(iter(dataloader)) subgraphs = trace_subgraphs(model, sample_input, sequential_targets, ignore) + num_subgraphs = len(subgraphs) LifecycleCallbacks.calibration_epoch_start() with calibration_forward_context(model), DisableQuantization(model): # prepare intermediates cache model_device = get_execution_device(model) - intermediates = IntermediatesCache.from_dataloader(dataloader, model_device) + activations = IntermediatesCache.from_dataloader(dataloader, model_device) - num_subgraphs = len(subgraphs) for subgraph_index, subgraph in enumerate(subgraphs): # prepare tqdm description texts calib_desc = f"({subgraph_index + 1}/{num_subgraphs}): Calibrating" prop_desc = f"({subgraph_index + 1}/{num_subgraphs}): Propagating" - # do a preliminary pass to trigger modifier hooks - for batch_idx in tqdm.tqdm(range(len(dataloader)), desc=calib_desc): - inputs = intermediates.fetch(batch_idx, subgraph.input_names) - subgraph.forward(model, **inputs) - - LifecycleCallbacks.sequential_epoch_end() - - # this pass does not trigger modifier hooks - # and is only used for capturing outputs from newly compressed modules - with HooksMixin.disable_hooks(): - for batch_idx in tqdm.tqdm(range(len(dataloader)), desc=prop_desc): - inputs = intermediates.fetch(batch_idx, subgraph.input_names) - output = subgraph.forward(model, **inputs) - - if subgraph_index < num_subgraphs - 1: - intermediates.update(batch_idx, output) - intermediates.delete(batch_idx, subgraph.consumed_names) + # reduce memory movement by keeping modules onloaded + with keep_onload_context(): + # do a preliminary pass to trigger modifier hooks + for batch_idx in tqdm(range(len(dataloader)), desc=calib_desc): + inputs = activations.fetch(batch_idx, subgraph.input_names) + subgraph.forward(model, **inputs) + + 
LifecycleCallbacks.sequential_epoch_end() + + # this pass does not trigger modifier hooks + # and is only used for capturing outputs of newly compressed modules + with HooksMixin.disable_hooks(): + for batch_idx in tqdm(range(len(dataloader)), desc=prop_desc): + inputs = activations.fetch(batch_idx, subgraph.input_names) + output = subgraph.forward(model, **inputs) + + if subgraph_index < num_subgraphs - 1: + activations.update(batch_idx, output) + activations.delete(batch_idx, subgraph.consumed_names) # redundant, finish any remaining compression LifecycleCallbacks.calibration_epoch_end() From 6705bf4e5e8c3c05407e5a8b4ad6d38100f22d90 Mon Sep 17 00:00:00 2001 From: Brian Dellabetta Date: Thu, 5 Jun 2025 22:19:31 +0000 Subject: [PATCH 02/36] touchups based on remaining steps Signed-off-by: Brian Dellabetta --- examples/quantization_w4a16/llama3_example.py | 5 +- src/llmcompressor/entrypoints/oneshot.py | 38 ++++++++++++- src/llmcompressor/modifiers/awq/base.py | 1 - src/llmcompressor/pipelines/basic/pipeline.py | 2 - .../pipelines/layer_sequential/pipeline.py | 54 +++++++++++-------- .../pipelines/sequential/pipeline.py | 2 - 6 files changed, 70 insertions(+), 32 deletions(-) diff --git a/examples/quantization_w4a16/llama3_example.py b/examples/quantization_w4a16/llama3_example.py index 1016a8081..6ac0328d7 100644 --- a/examples/quantization_w4a16/llama3_example.py +++ b/examples/quantization_w4a16/llama3_example.py @@ -1,5 +1,4 @@ import torch -from compressed_tensors import force_cpu_offload from datasets import load_dataset from transformers import AutoModelForCausalLM, AutoTokenizer @@ -11,10 +10,9 @@ model = AutoModelForCausalLM.from_pretrained( MODEL_ID, - # device_map="auto", + device_map="cpu", torch_dtype="auto", ) -force_cpu_offload(model, execution_device=torch.device("cuda")) tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) # Select calibration dataset. @@ -67,6 +65,7 @@ def tokenize(sample): recipe=recipe, max_seq_length=MAX_SEQUENCE_LENGTH, num_calibration_samples=NUM_CALIBRATION_SAMPLES, + oneshot_device=torch.device("cuda") if torch.cuda.is_available() else None, ) # Confirm generations of the quantized model look sane. 
diff --git a/src/llmcompressor/entrypoints/oneshot.py b/src/llmcompressor/entrypoints/oneshot.py index bedca7392..730c280f9 100644 --- a/src/llmcompressor/entrypoints/oneshot.py +++ b/src/llmcompressor/entrypoints/oneshot.py @@ -2,6 +2,9 @@ from datetime import datetime from typing import Optional +import torch +from compressed_tensors import force_cpu_offload +from compressed_tensors.utils import get_execution_device from loguru import logger from torch.utils.data import DataLoader from transformers import PreTrainedModel @@ -10,7 +13,11 @@ from llmcompressor.core.session_functions import active_session from llmcompressor.datasets import get_calibration_dataloader from llmcompressor.entrypoints.utils import post_process, pre_process -from llmcompressor.pipelines.registry import CalibrationPipeline +from llmcompressor.pipelines import ( + CalibrationPipeline, + LayerSequentialPipeline, + SequentialPipeline, +) __all__ = ["Oneshot", "oneshot"] @@ -186,6 +193,35 @@ def apply_recipe_modifiers( user_pipeline = self.dataset_args.pipeline modifiers = session.get_modifiers() pipeline = CalibrationPipeline.from_modifiers(modifiers, user=user_pipeline) + + model_exec_device = get_execution_device(self.model) + + # Sequential pipelines onload models layer by layer to minimize GPU memory usage + if isinstance(pipeline, (SequentialPipeline, LayerSequentialPipeline)): + # unless pure cpu run, throw warning if model lives on oneshot_device + if ( + model_exec_device + == self.model_args.oneshot_device + != torch.device("cpu") + ): + logger.warning( + f"Model device {model_exec_device} is the same as oneshot" + " execution device. If you encounter OOM errors, consider" + " loading the model up on CPU, so that more memory is available" + " for the oneshot algorithm to run on GPU. 
Example available at" + " examples/quantization_w4a16/llama3_example.py" + ) + + # set cpu offload for model + elif ( + model_exec_device + == torch.device("cpu") + != self.model_args.oneshot_device + ): + force_cpu_offload( + self.model, execution_devce=self.model_args.oneshot_device + ) + pipeline(self.model, calibration_dataloader, self.dataset_args) session.finalize() diff --git a/src/llmcompressor/modifiers/awq/base.py b/src/llmcompressor/modifiers/awq/base.py index f95aaaea8..e5e02b62f 100644 --- a/src/llmcompressor/modifiers/awq/base.py +++ b/src/llmcompressor/modifiers/awq/base.py @@ -34,7 +34,6 @@ __all__ = ["AWQModifier"] -# TODO (Brian INFERENG-531) Add support for offloaded models class AWQModifier(Modifier, QuantizationMixin): """ Implements the AWQ (Activation-Weighted Quantization) algorithm, diff --git a/src/llmcompressor/pipelines/basic/pipeline.py b/src/llmcompressor/pipelines/basic/pipeline.py index 431dc1965..15b94786a 100644 --- a/src/llmcompressor/pipelines/basic/pipeline.py +++ b/src/llmcompressor/pipelines/basic/pipeline.py @@ -37,8 +37,6 @@ def __call__( :param dataloader: loads data for calibration :param dataset_args: dataset arguments relevant to pipelines """ - # TODO: warn about cpu offloading - model_device = get_execution_device(model) LifecycleCallbacks.calibration_epoch_start() diff --git a/src/llmcompressor/pipelines/layer_sequential/pipeline.py b/src/llmcompressor/pipelines/layer_sequential/pipeline.py index 9cb2f3708..d4b79f188 100644 --- a/src/llmcompressor/pipelines/layer_sequential/pipeline.py +++ b/src/llmcompressor/pipelines/layer_sequential/pipeline.py @@ -14,7 +14,10 @@ to_next_layer_kwargs, ) from llmcompressor.pipelines.registry import CalibrationPipeline -from llmcompressor.pipelines.sequential.helpers import get_targets_from_modifiers +from llmcompressor.pipelines.sequential.helpers import ( + get_targets_from_modifiers, + keep_onload_context, +) from llmcompressor.utils.helpers import DisableQuantization, calibration_forward_context if TYPE_CHECKING: @@ -73,29 +76,34 @@ def __call__( calib_desc = f"({layer_index + 1}/{num_layers}): Calibrating" prop_desc = f"({layer_index + 1}/{num_layers}): Propagating" - # do a preliminary pass to trigger modifier hooks - for batch_idx in tqdm.tqdm(range(len(dataloader)), desc=calib_desc): - inputs = intermediates.fetch(batch_idx) - layer(**inputs) - - LifecycleCallbacks.sequential_epoch_end() - - # this pass does not trigger modifier hooks - # and is only used for capturing outputs from newly compressed modules - with HooksMixin.disable_hooks(): - for batch_idx in tqdm.tqdm(range(len(dataloader)), desc=prop_desc): + # reduce memory movement by keeping modules onloaded + with keep_onload_context(): + # do a preliminary pass to trigger modifier hooks + for batch_idx in tqdm.tqdm(range(len(dataloader)), desc=calib_desc): inputs = intermediates.fetch(batch_idx) - output = layer(**inputs) - - if layer_index < num_layers - 1: - next_layer = layers[layer_index + 1] - output = to_next_layer_kwargs(output, next_layer) - output = maybe_inject_pos_embeddings( - output, next_layer, inputs - ) - - intermediates.delete(batch_idx) - intermediates.update(batch_idx, output) + layer(**inputs) + + LifecycleCallbacks.sequential_epoch_end() + + # this pass does not trigger modifier hooks + # and is only used for capturing outputs from + # newly compressed modules + with HooksMixin.disable_hooks(): + for batch_idx in tqdm.tqdm( + range(len(dataloader)), desc=prop_desc + ): + inputs = intermediates.fetch(batch_idx) + output = 
layer(**inputs) + + if layer_index < num_layers - 1: + next_layer = layers[layer_index + 1] + output = to_next_layer_kwargs(output, next_layer) + output = maybe_inject_pos_embeddings( + output, next_layer, inputs + ) + + intermediates.delete(batch_idx) + intermediates.update(batch_idx, output) # redundant, finish any remaining compression LifecycleCallbacks.calibration_epoch_end() diff --git a/src/llmcompressor/pipelines/sequential/pipeline.py b/src/llmcompressor/pipelines/sequential/pipeline.py index fcb91d803..c043d2c8a 100644 --- a/src/llmcompressor/pipelines/sequential/pipeline.py +++ b/src/llmcompressor/pipelines/sequential/pipeline.py @@ -52,8 +52,6 @@ def __call__( """ session = active_session() - # TODO: warn about not cpu offloading - # prepare to trace subgraphs modifiers = session.get_modifiers() sequential_targets = get_targets_from_modifiers(modifiers, model) From cf1f87d4422c2333faa13e84a36412a374c3af7a Mon Sep 17 00:00:00 2001 From: Kyle Sayers Date: Thu, 5 Jun 2025 21:40:22 -0400 Subject: [PATCH 03/36] implement oneshot_device, pipeline warnings Signed-off-by: Kyle Sayers --- src/llmcompressor/args/model_arguments.py | 2 +- src/llmcompressor/entrypoints/oneshot.py | 38 +------------------ src/llmcompressor/entrypoints/utils.py | 12 ++++++ .../pipelines/layer_sequential/pipeline.py | 11 ++++++ src/llmcompressor/pipelines/registry.py | 8 +--- .../pipelines/sequential/pipeline.py | 11 ++++++ 6 files changed, 38 insertions(+), 44 deletions(-) diff --git a/src/llmcompressor/args/model_arguments.py b/src/llmcompressor/args/model_arguments.py index 870f6d772..9cf8a687c 100644 --- a/src/llmcompressor/args/model_arguments.py +++ b/src/llmcompressor/args/model_arguments.py @@ -81,7 +81,7 @@ class ModelArguments: metadata={"help": "Whether to compress sparse models during save"}, ) oneshot_device: Optional[str] = field( - default="cuda:0", + default="cuda", metadata={"help": "Device to run oneshot calibration on"}, ) model_revision: str = field( diff --git a/src/llmcompressor/entrypoints/oneshot.py b/src/llmcompressor/entrypoints/oneshot.py index 730c280f9..54a36abfe 100644 --- a/src/llmcompressor/entrypoints/oneshot.py +++ b/src/llmcompressor/entrypoints/oneshot.py @@ -2,9 +2,6 @@ from datetime import datetime from typing import Optional -import torch -from compressed_tensors import force_cpu_offload -from compressed_tensors.utils import get_execution_device from loguru import logger from torch.utils.data import DataLoader from transformers import PreTrainedModel @@ -13,11 +10,7 @@ from llmcompressor.core.session_functions import active_session from llmcompressor.datasets import get_calibration_dataloader from llmcompressor.entrypoints.utils import post_process, pre_process -from llmcompressor.pipelines import ( - CalibrationPipeline, - LayerSequentialPipeline, - SequentialPipeline, -) +from llmcompressor.pipelines import CalibrationPipeline __all__ = ["Oneshot", "oneshot"] @@ -193,35 +186,6 @@ def apply_recipe_modifiers( user_pipeline = self.dataset_args.pipeline modifiers = session.get_modifiers() pipeline = CalibrationPipeline.from_modifiers(modifiers, user=user_pipeline) - - model_exec_device = get_execution_device(self.model) - - # Sequential pipelines onload models layer by layer to minimize GPU memory usage - if isinstance(pipeline, (SequentialPipeline, LayerSequentialPipeline)): - # unless pure cpu run, throw warning if model lives on oneshot_device - if ( - model_exec_device - == self.model_args.oneshot_device - != torch.device("cpu") - ): - logger.warning( - f"Model device 
{model_exec_device} is the same as oneshot" - " execution device. If you encounter OOM errors, consider" - " loading the model up on CPU, so that more memory is available" - " for the oneshot algorithm to run on GPU. Example available at" - " examples/quantization_w4a16/llama3_example.py" - ) - - # set cpu offload for model - elif ( - model_exec_device - == torch.device("cpu") - != self.model_args.oneshot_device - ): - force_cpu_offload( - self.model, execution_devce=self.model_args.oneshot_device - ) - pipeline(self.model, calibration_dataloader, self.dataset_args) session.finalize() diff --git a/src/llmcompressor/entrypoints/utils.py b/src/llmcompressor/entrypoints/utils.py index 0186628f0..f63734985 100644 --- a/src/llmcompressor/entrypoints/utils.py +++ b/src/llmcompressor/entrypoints/utils.py @@ -3,6 +3,8 @@ from pathlib import PosixPath from typing import Optional, Tuple +import torch +from compressed_tensors.utils import force_cpu_offload from loguru import logger from torch.nn import Module from transformers import ( @@ -62,6 +64,16 @@ def pre_process(model_args: "ModelArguments"): # untie tie_word_embeddings weights patch_tied_tensors_bug(model_args.model) + # offload to cpu if possible + if "cuda" in str(model_args.oneshot_device) and torch.cuda.is_available(): + # TODO: consider renaming function to something like "offload_dispatch_model" + # TODO: modify function to remove any hooks if they already exist (making sure + # to move to cpu when removing hook + force_cpu_offload(model_args.model, model_args.oneshot_device) + + else: + logger.warning("CUDA is not available! Compressing model on CPU instead") + # wrap model.save_pretrained modify_save_pretrained(model_args.model) diff --git a/src/llmcompressor/pipelines/layer_sequential/pipeline.py b/src/llmcompressor/pipelines/layer_sequential/pipeline.py index d4b79f188..3130f75d9 100644 --- a/src/llmcompressor/pipelines/layer_sequential/pipeline.py +++ b/src/llmcompressor/pipelines/layer_sequential/pipeline.py @@ -2,6 +2,7 @@ import torch import tqdm +from loguru import logger from torch.utils.data.dataloader import DataLoader from llmcompressor.core import LifecycleCallbacks, active_session @@ -57,6 +58,16 @@ def __call__( """ session = active_session() + # check for offloading + if model.device != torch.device("meta"): + logger.warning( + "Attemping to use sequential pipeline with a model which is not " + "offloaded to the cpu. Deploying a model in this way may lead to more " + "memory usage than is required. 
It is recommended to set " + '`oneshot_device="cuda"` or call `force_cpu_offload` on your model ' + "before compressing" + ) + # find layers modifiers = session.get_modifiers() sequential_targets, _ = get_targets_from_modifiers(modifiers, model) diff --git a/src/llmcompressor/pipelines/registry.py b/src/llmcompressor/pipelines/registry.py index 0a4cbb645..f472c0f0d 100644 --- a/src/llmcompressor/pipelines/registry.py +++ b/src/llmcompressor/pipelines/registry.py @@ -18,6 +18,7 @@ __all__ = ["CalibrationPipeline"] SEQUENTIAL_MODIFIERS = (AWQModifier, GPTQModifier, SparsityModifierBase) +NEED_DATA = (SmoothQuantModifier, *SEQUENTIAL_MODIFIERS) class CalibrationPipeline(ABC, RegistryMixin): @@ -60,7 +61,7 @@ def from_modifiers( @staticmethod def _validate_infer_pipeline(modifiers: List[Modifier]) -> str: - if any(isinstance(modifier, SEQUENTIAL_MODIFIERS) for modifier in modifiers): + if any(isinstance(modifier, NEED_DATA) for modifier in modifiers): return "sequential" active_qmods = _get_active_quant_modifiers(modifiers) @@ -76,11 +77,6 @@ def _validate_infer_pipeline(modifiers: List[Modifier]) -> str: config = quant_modifier.resolve_quantization_config() if config.requires_calibration_data(): return "sequential" - else: - return "datafree" - - if any(isinstance(modifier, SmoothQuantModifier) for modifier in modifiers): - return "sequential" return "datafree" diff --git a/src/llmcompressor/pipelines/sequential/pipeline.py b/src/llmcompressor/pipelines/sequential/pipeline.py index c043d2c8a..4af40f772 100644 --- a/src/llmcompressor/pipelines/sequential/pipeline.py +++ b/src/llmcompressor/pipelines/sequential/pipeline.py @@ -2,6 +2,7 @@ import torch from compressed_tensors.utils import get_execution_device +from loguru import logger from torch.utils.data.dataloader import DataLoader from tqdm import tqdm @@ -52,6 +53,16 @@ def __call__( """ session = active_session() + # check for offloading + if model.device != torch.device("meta"): + logger.warning( + "Attemping to use sequential pipeline with a model which is not " + "offloaded to the cpu. Deploying a model in this way may lead to more " + "memory usage than is required. It is recommended to set " + '`oneshot_device="cuda"` or call `force_cpu_offload` on your model ' + "before compressing" + ) + # prepare to trace subgraphs modifiers = session.get_modifiers() sequential_targets = get_targets_from_modifiers(modifiers, model) From 97c8d303fd4b40030bf0e41741f041849862e4de Mon Sep 17 00:00:00 2001 From: Kyle Sayers Date: Thu, 5 Jun 2025 21:42:19 -0400 Subject: [PATCH 04/36] simplify example Signed-off-by: Kyle Sayers --- examples/quantization_w4a16/llama3_example.py | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/examples/quantization_w4a16/llama3_example.py b/examples/quantization_w4a16/llama3_example.py index 6ac0328d7..df5a8f826 100644 --- a/examples/quantization_w4a16/llama3_example.py +++ b/examples/quantization_w4a16/llama3_example.py @@ -1,4 +1,3 @@ -import torch from datasets import load_dataset from transformers import AutoModelForCausalLM, AutoTokenizer @@ -8,11 +7,7 @@ # Select model and load it. MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct" -model = AutoModelForCausalLM.from_pretrained( - MODEL_ID, - device_map="cpu", - torch_dtype="auto", -) +model = AutoModelForCausalLM.from_pretrained(MODEL_ID, torch_dtype="auto") tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) # Select calibration dataset. 
@@ -65,7 +60,6 @@ def tokenize(sample): recipe=recipe, max_seq_length=MAX_SEQUENCE_LENGTH, num_calibration_samples=NUM_CALIBRATION_SAMPLES, - oneshot_device=torch.device("cuda") if torch.cuda.is_available() else None, ) # Confirm generations of the quantized model look sane. From ecfe15d85c01bef92f6396d5c51cf99b3ff4509e Mon Sep 17 00:00:00 2001 From: Kyle Sayers Date: Thu, 5 Jun 2025 21:46:13 -0400 Subject: [PATCH 05/36] move offloading outside of preprocess, which is shared with train Signed-off-by: Kyle Sayers --- src/llmcompressor/entrypoints/oneshot.py | 11 +++++++++++ src/llmcompressor/entrypoints/utils.py | 12 ------------ 2 files changed, 11 insertions(+), 12 deletions(-) diff --git a/src/llmcompressor/entrypoints/oneshot.py b/src/llmcompressor/entrypoints/oneshot.py index 54a36abfe..9659b7d7e 100644 --- a/src/llmcompressor/entrypoints/oneshot.py +++ b/src/llmcompressor/entrypoints/oneshot.py @@ -2,6 +2,8 @@ from datetime import datetime from typing import Optional +import torch +from compressed_tensors.utils import force_cpu_offload from loguru import logger from torch.utils.data import DataLoader from transformers import PreTrainedModel @@ -123,6 +125,15 @@ def __init__( # initialize the model and processor pre_process(model_args) + # offload to cpu if possible + if "cuda" in str(model_args.oneshot_device) and torch.cuda.is_available(): + # TODO: consider renaming function similar to "offload_dispatch_model" + # TODO: modify function to remove any hooks if they already exist (making + # sure to move to cpu when removing hook + force_cpu_offload(model_args.model, model_args.oneshot_device) + else: + logger.warning("CUDA is not available! Compressing model on CPU instead") + # Set instance attributes self.model = self.model_args.model self.processor = self.model_args.processor diff --git a/src/llmcompressor/entrypoints/utils.py b/src/llmcompressor/entrypoints/utils.py index f63734985..0186628f0 100644 --- a/src/llmcompressor/entrypoints/utils.py +++ b/src/llmcompressor/entrypoints/utils.py @@ -3,8 +3,6 @@ from pathlib import PosixPath from typing import Optional, Tuple -import torch -from compressed_tensors.utils import force_cpu_offload from loguru import logger from torch.nn import Module from transformers import ( @@ -64,16 +62,6 @@ def pre_process(model_args: "ModelArguments"): # untie tie_word_embeddings weights patch_tied_tensors_bug(model_args.model) - # offload to cpu if possible - if "cuda" in str(model_args.oneshot_device) and torch.cuda.is_available(): - # TODO: consider renaming function to something like "offload_dispatch_model" - # TODO: modify function to remove any hooks if they already exist (making sure - # to move to cpu when removing hook - force_cpu_offload(model_args.model, model_args.oneshot_device) - - else: - logger.warning("CUDA is not available! 
Compressing model on CPU instead") - # wrap model.save_pretrained modify_save_pretrained(model_args.model) From 6f8624457d445af435292be474bd0cfdcc8b9167 Mon Sep 17 00:00:00 2001 From: Kyle Sayers Date: Thu, 5 Jun 2025 21:48:59 -0400 Subject: [PATCH 06/36] cleanup Signed-off-by: Kyle Sayers --- src/llmcompressor/pipelines/sequential/helpers.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/src/llmcompressor/pipelines/sequential/helpers.py b/src/llmcompressor/pipelines/sequential/helpers.py index 38f1cdadf..ee282f73a 100644 --- a/src/llmcompressor/pipelines/sequential/helpers.py +++ b/src/llmcompressor/pipelines/sequential/helpers.py @@ -514,7 +514,3 @@ def keep_onload_pre_forward(self: AlignDevicesHook, module, *args, **kwargs): for module, (hook, offload) in onloaded_modules.items(): hook.offload = offload hook.post_forward(module, None) - - -# def is_cpu_offloaded(): -# From 929f678e371e9dbcdfc675407874203a70c1e393 Mon Sep 17 00:00:00 2001 From: Kyle Sayers Date: Thu, 5 Jun 2025 22:43:57 -0400 Subject: [PATCH 07/36] update examples, remove offload devicemap utils Signed-off-by: Kyle Sayers --- examples/awq/llama_example.py | 10 +- examples/awq/qwen3_moe_example.py | 11 +- examples/big_models_with_accelerate/README.md | 95 ------------- .../cpu_offloading_fp8.py | 26 ---- .../mult_gpus_int8_device_map.py | 81 ----------- .../multi_gpu_int8.py | 78 ----------- .../fp8_compressed_inference.py | 6 +- examples/multimodal_audio/whisper_example.py | 13 +- examples/multimodal_vision/gemma3_example.py | 4 +- .../multimodal_vision/idefics3_example.py | 4 +- examples/multimodal_vision/llava_example.py | 4 +- .../multimodal_vision/mistral3_example.py | 4 +- examples/multimodal_vision/mllama_example.py | 4 +- .../multimodal_vision/phi3_vision_example.py | 1 - examples/multimodal_vision/pixtral_example.py | 4 +- .../multimodal_vision/qwen2_vl_example.py | 6 +- .../multimodal_vision/qwen_2_5_vl_example.py | 6 +- .../llama7b_sparse_w4a16.py | 4 +- .../gemma2_fp8_kv_example.py | 12 +- .../llama3_fp8_kv_example.py | 8 +- .../phi3.5_fp8_kv_example.py | 6 +- examples/quantization_w4a16/llama3_example.py | 9 +- .../quantization_w4a16_fp4/llama3_example.py | 4 +- .../quantization_w4a4_fp4/llama3_example.py | 11 +- .../quantization_w8a8_fp8/gemma2_example.py | 4 +- .../llama3.2_vision_example.py | 4 +- .../quantization_w8a8_fp8/llama3_example.py | 4 +- .../quantization_w8a8_fp8/llava1.5_example.py | 4 +- .../quantization_w8a8_fp8/qwen2vl_example.py | 4 +- .../quantization_w8a8_fp8/whisper_example.py | 4 +- .../quantization_w8a8_int8/gemma2_example.py | 6 +- .../quantization_w8a8_int8/llama3_example.py | 6 +- examples/quantizing_moe/deepseek_moe_w4a16.py | 13 +- .../quantizing_moe/deepseek_moe_w8a8_fp8.py | 2 +- .../quantizing_moe/deepseek_moe_w8a8_int8.py | 13 +- .../quantizing_moe/mixtral_moe_w8a8_fp8.py | 10 +- examples/quantizing_moe/qwen_moe_w4a16.py | 13 +- .../llama3_8b_2of4.py | 4 +- .../transformers/compression/helpers.py | 132 +----------------- 39 files changed, 55 insertions(+), 579 deletions(-) delete mode 100644 examples/big_models_with_accelerate/README.md delete mode 100644 examples/big_models_with_accelerate/cpu_offloading_fp8.py delete mode 100644 examples/big_models_with_accelerate/mult_gpus_int8_device_map.py delete mode 100644 examples/big_models_with_accelerate/multi_gpu_int8.py diff --git a/examples/awq/llama_example.py b/examples/awq/llama_example.py index 9d2c724d7..7706db7e6 100644 --- a/examples/awq/llama_example.py +++ b/examples/awq/llama_example.py @@ -5,12 +5,10 @@ from 
llmcompressor.modifiers.awq import AWQModifier # Select model and load it. -MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct" +model_id = "meta-llama/Meta-Llama-3-8B-Instruct" -model = AutoModelForCausalLM.from_pretrained( - MODEL_ID, device_map="auto", torch_dtype="auto" -) -tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True) +model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype="auto") +tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True) # Select calibration dataset. DATASET_ID = "mit-han-lab/pile-val-backup" @@ -72,6 +70,6 @@ def tokenize(sample): print("==========================================\n\n") # Save to disk compressed. -SAVE_DIR = MODEL_ID.split("/")[-1] + "-awq-asym" +SAVE_DIR = model_id.split("/")[-1] + "-awq-asym" model.save_pretrained(SAVE_DIR, save_compressed=True) tokenizer.save_pretrained(SAVE_DIR) diff --git a/examples/awq/qwen3_moe_example.py b/examples/awq/qwen3_moe_example.py index b8f4a4ec1..5775284a1 100644 --- a/examples/awq/qwen3_moe_example.py +++ b/examples/awq/qwen3_moe_example.py @@ -5,12 +5,9 @@ from llmcompressor.modifiers.awq import AWQModifier # Select model and load it. -MODEL_ID = "Qwen/Qwen3-30B-A3B" - -model = AutoModelForCausalLM.from_pretrained( - MODEL_ID, device_map="auto", torch_dtype="auto" -) -tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True) +model_id = "Qwen/Qwen3-30B-A3B" +model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype="auto") +tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True) # Select calibration dataset. DATASET_ID = "mit-han-lab/pile-val-backup" @@ -77,6 +74,6 @@ def tokenize(sample): print("==========================================\n\n") # Save to disk compressed. -SAVE_DIR = MODEL_ID.split("/")[-1] + "-awq-sym" +SAVE_DIR = model_id.split("/")[-1] + "-awq-sym" model.save_pretrained(SAVE_DIR, save_compressed=True) tokenizer.save_pretrained(SAVE_DIR) diff --git a/examples/big_models_with_accelerate/README.md b/examples/big_models_with_accelerate/README.md deleted file mode 100644 index 801f46a2f..000000000 --- a/examples/big_models_with_accelerate/README.md +++ /dev/null @@ -1,95 +0,0 @@ -# Quantizing Big Models with HF Accelerate - -`llmcompressor` integrates with `accelerate` to support quantizing large models such as Llama 70B and 405B, or quantizing any model with limited GPU resources. - -## Overview - -[`accelerate`]((https://huggingface.co/docs/accelerate/en/index)) is a highly useful library in the Hugging Face ecosystem that supports for working with large models, including: -- Offloading parameters to CPU -- Sharding models across multiple GPUs with pipeline-parallelism - - -### Using `device_map` - -To enable `accelerate` features with `llmcompressor`, simple insert `device_map` in `from_pretrained` during model load. - -```python -from transformers import AutoModelForCausalLM -MODEL_ID = "meta-llama/Meta-Llama-3-70B-Instruct" - -# device_map="auto" triggers usage of accelerate -# if > 1 GPU, the model will be sharded across the GPUs -# if not enough GPU memory to fit the model, parameters are offloaded to the CPU -model = AutoModelForCausalLM.from_pretrained( - MODEL_ID, device_map="auto", torch_dtype="auto") -``` - -`llmcompressor` is designed to respect the `device_map`, so calls to `oneshot` -will work properly out of the box for basic quantization with `QuantizationModifier`, -even for CPU offloaded models. 
- -To enable CPU offloading for second-order quantization methods such as GPTQ, we need to -allocate additional memory upfront when computing the device map. Not doing so risks -potentially going out-of-memory. - -```python -from llmcompressor.transformers.compression.helpers import calculate_offload_device_map -from transformers import AutoModelForCausalLM -MODEL_ID = "meta-llama/Meta-Llama-3-70B-Instruct" - -# Load model, reserving memory in the device map for sequential GPTQ (adjust num_gpus as needed) -device_map = calculate_offload_device_map(MODEL_ID, reserve_for_hessians=True, num_gpus=1) -model = AutoModelForCausalLM.from_pretrained( - MODEL_ID, - device_map=device_map, - torch_dtype="auto", -) -``` - -### Practical Advice - -When working with `accelerate`, it is important to keep in mind that CPU offloading and naive pipeline-parallelism will slow down forward passes through the model. As a result, we need to take care to ensure that the quantization methods used fit well with the offloading scheme as methods that require many forward passes though the model will be slowed down. If more gpu memory is not available, consider reducing the precision of the loaded model to a lower-width dtype such as `torch.bfloat16`. - -## Examples - -We will show working examples for each use case: -- **CPU Offloading**: Quantize `Llama-70B` to `FP8` using `PTQ` with a single GPU -- **Multi-GPU**: Quantize `Llama-70B` to `INT8` using `GPTQ` and `SmoothQuant` with 2 GPUs - -### Installation - -Install `llmcompressor`: - -```bash -pip install llmcompressor -``` - -### CPU Offloading: `FP8` Quantization with `PTQ` - -CPU offloading is slow. As a result, we recommend using this feature only with data-free quantization methods. For example, when quantizing a model to `fp8`, we typically use simple `PTQ` to statically quantize the weights and use dynamic quantization for the activations. These methods do not require calibration data. - -- `cpu_offloading_fp8.py` demonstrates quantizing the weights and activations of `Llama-70B` to `fp8` on a single GPU: - -```bash -export CUDA_VISIBLE_DEVICES=0 -python cpu_offloading_fp8.py -``` - -The resulting model `./Meta-Llama-3-70B-Instruct-FP8-Dynamic` is ready to run with `vllm`! - -### Multi-GPU: `INT8` Quantization with `GPTQ` - -For quantization methods that require calibration data (e.g. `GPTQ`), CPU offloading is too slow. For these methods, `llmcompressor` can use `accelerate` multi-GPU to quantize models that are larger than a single GPU. For example, when quantizing a model to `int8`, we typically use `GPTQ` to statically quantize the weights, which requires calibration data. - -- `multi_gpu_int8.py` demonstrates quantizing the weights and activations of `Llama-70B` to `int8` on 2 A100s: - -```python -export CUDA_VISIBLE_DEVICES=0,1 -python multi_gpu_int8.py -``` - -The resulting model `./Meta-Llama-3-70B-Instruct-INT8-Dynamic` is quantized and ready to run with `vllm`! - -## Questions or Feature Request? 
- -Please open up an issue on `vllm-project/llm-compressor` diff --git a/examples/big_models_with_accelerate/cpu_offloading_fp8.py b/examples/big_models_with_accelerate/cpu_offloading_fp8.py deleted file mode 100644 index 248759ba4..000000000 --- a/examples/big_models_with_accelerate/cpu_offloading_fp8.py +++ /dev/null @@ -1,26 +0,0 @@ -from transformers import AutoModelForCausalLM, AutoTokenizer - -from llmcompressor import oneshot -from llmcompressor.modifiers.quantization import QuantizationModifier - -MODEL_ID = "meta-llama/Meta-Llama-3-70B-Instruct" -OUTPUT_DIR = MODEL_ID.split("/")[1] + "-FP8-Dynamic" - -# Load model -# Note: device_map="auto" will offload to CPU if not enough space on GPU. -model = AutoModelForCausalLM.from_pretrained( - MODEL_ID, device_map="auto", torch_dtype="auto", trust_remote_code=True -) - -# Configure the quantization scheme and algorithm (PTQ + FP8_DYNAMIC). -recipe = QuantizationModifier( - targets="Linear", scheme="FP8_DYNAMIC", ignore=["lm_head"] -) - -# Apply quantization and save in `compressed-tensors` format. -oneshot( - model=model, - recipe=recipe, - tokenizer=AutoTokenizer.from_pretrained(MODEL_ID), - output_dir=OUTPUT_DIR, -) diff --git a/examples/big_models_with_accelerate/mult_gpus_int8_device_map.py b/examples/big_models_with_accelerate/mult_gpus_int8_device_map.py deleted file mode 100644 index be9ecd86a..000000000 --- a/examples/big_models_with_accelerate/mult_gpus_int8_device_map.py +++ /dev/null @@ -1,81 +0,0 @@ -import torch -from datasets import load_dataset -from transformers import AutoModelForCausalLM, AutoTokenizer - -from llmcompressor import oneshot -from llmcompressor.modifiers.quantization import GPTQModifier -from llmcompressor.modifiers.smoothquant import SmoothQuantModifier -from llmcompressor.transformers.compression.helpers import calculate_offload_device_map - -MODEL_ID = "meta-llama/Meta-Llama-3-70B-Instruct" - -# adjust based off number of desired GPUs -# reserve_for_hessians=True reserves memory which is required by -# GPTQModifier and SparseGPTModifier -device_map = calculate_offload_device_map( - MODEL_ID, num_gpus=1, reserve_for_hessians=True, torch_dtype=torch.bfloat16 -) - -model = AutoModelForCausalLM.from_pretrained( - MODEL_ID, device_map=device_map, torch_dtype=torch.bfloat16 -) -tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) - -# Select calibration dataset. -DATASET_ID = "HuggingFaceH4/ultrachat_200k" -DATASET_SPLIT = "train_sft" -NUM_CALIBRATION_SAMPLES = 512 -MAX_SEQUENCE_LENGTH = 2048 - - -# Load dataset and preprocess. -ds = load_dataset(DATASET_ID, split=f"{DATASET_SPLIT}[:{NUM_CALIBRATION_SAMPLES}]") -ds = ds.shuffle(seed=42) - - -def preprocess(example): - return { - "text": tokenizer.apply_chat_template( - example["messages"], - tokenize=False, - ) - } - - -ds = ds.map(preprocess) - - -# Tokenize inputs. 
-def tokenize(sample): - return tokenizer( - sample["text"], - padding=False, - max_length=MAX_SEQUENCE_LENGTH, - truncation=True, - add_special_tokens=False, - ) - - -ds = ds.map(tokenize, remove_columns=ds.column_names) - -# define a llmcompressor recipe for W8A8 quantization -recipe = [ - SmoothQuantModifier(smoothing_strength=0.8), - GPTQModifier( - targets="Linear", - scheme="W8A8", - ignore=["lm_head"], - ), -] - -SAVE_DIR = MODEL_ID.split("/")[1] + "-INT8" - -oneshot( - model=model, - dataset=ds, - recipe=recipe, - max_seq_length=MAX_SEQUENCE_LENGTH, - num_calibration_samples=NUM_CALIBRATION_SAMPLES, - save_compressed=True, - output_dir=SAVE_DIR, -) diff --git a/examples/big_models_with_accelerate/multi_gpu_int8.py b/examples/big_models_with_accelerate/multi_gpu_int8.py deleted file mode 100644 index a8023456a..000000000 --- a/examples/big_models_with_accelerate/multi_gpu_int8.py +++ /dev/null @@ -1,78 +0,0 @@ -from datasets import load_dataset -from transformers import AutoModelForCausalLM, AutoTokenizer - -from llmcompressor import oneshot -from llmcompressor.modifiers.quantization import GPTQModifier - -MODEL_ID = "meta-llama/Meta-Llama-3-70B-Instruct" -SAVE_DIR = MODEL_ID.split("/")[1] + "-W8A8-Dynamic" - -# 1) Load model (device_map="auto" with shard the model over multiple GPUs!). -model = AutoModelForCausalLM.from_pretrained( - MODEL_ID, - device_map="auto", - torch_dtype="auto", - trust_remote_code=True, -) -tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) - -# 2) Prepare calibration dataset (in this case, we use ultrachat). -DATASET_ID = "HuggingFaceH4/ultrachat_200k" -DATASET_SPLIT = "train_sft" - -# Select number of samples. 512 samples is a good place to start. -# Increasing the number of samples can improve accuracy. -NUM_CALIBRATION_SAMPLES = 512 -MAX_SEQUENCE_LENGTH = 1024 - -# Load dataset and preprocess. -ds = load_dataset(DATASET_ID, split=f"{DATASET_SPLIT}[:{NUM_CALIBRATION_SAMPLES}]") -ds = ds.shuffle(seed=42) - - -def preprocess(example): - return { - "text": tokenizer.apply_chat_template( - example["messages"], - tokenize=False, - ) - } - - -ds = ds.map(preprocess) - - -# Tokenize inputs. -def tokenize(sample): - return tokenizer( - sample["text"], - padding=False, - max_length=MAX_SEQUENCE_LENGTH, - truncation=True, - add_special_tokens=False, - ) - - -ds = ds.map(tokenize, remove_columns=ds.column_names) - -# 3) Configure algorithms. In this case, we: -# * quantize the weights to int8 with GPTQ (static per channel) -# * quantize the activations to int8 (dynamic per token) -recipe = [ - GPTQModifier( - targets="Linear", scheme="W8A8", ignore=["lm_head"], dampening_frac=0.1 - ), -] - -# 4) Apply algorithms and save in `compressed-tensors` format. 
-# if you encounter GPU out-of-memory issues, consider using an explicit -# device map (see multi_gpus_int8_device_map.py) -oneshot( - model=model, - tokenizer=tokenizer, - dataset=ds, - recipe=recipe, - max_seq_length=MAX_SEQUENCE_LENGTH, - num_calibration_samples=NUM_CALIBRATION_SAMPLES, - output_dir=SAVE_DIR, -) diff --git a/examples/compressed_inference/fp8_compressed_inference.py b/examples/compressed_inference/fp8_compressed_inference.py index f0d0381d2..57debe2fd 100644 --- a/examples/compressed_inference/fp8_compressed_inference.py +++ b/examples/compressed_inference/fp8_compressed_inference.py @@ -19,11 +19,7 @@ "def fibonacci(n):", ] -compressed_model = AutoModelForCausalLM.from_pretrained( - MODEL_STUB, - torch_dtype="auto", - device_map="cuda:0", -) +compressed_model = AutoModelForCausalLM.from_pretrained(MODEL_STUB, torch_dtype="auto") # tokenize the sample data tokenizer = AutoTokenizer.from_pretrained(MODEL_STUB) diff --git a/examples/multimodal_audio/whisper_example.py b/examples/multimodal_audio/whisper_example.py index e5a292504..f19b0016a 100644 --- a/examples/multimodal_audio/whisper_example.py +++ b/examples/multimodal_audio/whisper_example.py @@ -6,15 +6,10 @@ from llmcompressor.modifiers.quantization import GPTQModifier # Select model and load it. -MODEL_ID = "openai/whisper-large-v3" - -model = WhisperForConditionalGeneration.from_pretrained( - MODEL_ID, - device_map="auto", - torch_dtype="auto", -) +model_id = "openai/whisper-large-v3" +model = WhisperForConditionalGeneration.from_pretrained(model_id, torch_dtype="auto") model.config.forced_decoder_ids = None -processor = WhisperProcessor.from_pretrained(MODEL_ID) +processor = WhisperProcessor.from_pretrained(model_id) # Configure processor the dataset task. processor.tokenizer.set_prefix_tokens(language="en", task="transcribe") @@ -106,6 +101,6 @@ def data_collator(batch): # and it was a great thing for what it was at the time but it's not a passive house # Save to disk compressed. -SAVE_DIR = MODEL_ID.split("/")[1] + "-W4A16-G128" +SAVE_DIR = model_id.split("/")[1] + "-W4A16-G128" model.save_pretrained(SAVE_DIR, save_compressed=True) processor.save_pretrained(SAVE_DIR) diff --git a/examples/multimodal_vision/gemma3_example.py b/examples/multimodal_vision/gemma3_example.py index 9ac9e0dd9..3310d82d4 100644 --- a/examples/multimodal_vision/gemma3_example.py +++ b/examples/multimodal_vision/gemma3_example.py @@ -8,9 +8,7 @@ # Load model. model_id = "google/gemma-3-4b-it" -model = Gemma3ForConditionalGeneration.from_pretrained( - model_id, device_map="auto", torch_dtype="auto" -) +model = Gemma3ForConditionalGeneration.from_pretrained(model_id, torch_dtype="auto") processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=True) # Oneshot arguments diff --git a/examples/multimodal_vision/idefics3_example.py b/examples/multimodal_vision/idefics3_example.py index a8157393d..71434868e 100644 --- a/examples/multimodal_vision/idefics3_example.py +++ b/examples/multimodal_vision/idefics3_example.py @@ -9,9 +9,7 @@ # Load model. 
model_id = "HuggingFaceM4/Idefics3-8B-Llama3" # or "HuggingFaceTB/SmolVLM-Instruct" -model = Idefics3ForConditionalGeneration.from_pretrained( - model_id, device_map="auto", torch_dtype="auto" -) +model = Idefics3ForConditionalGeneration.from_pretrained(model_id, torch_dtype="auto") processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=True) # Oneshot arguments diff --git a/examples/multimodal_vision/llava_example.py b/examples/multimodal_vision/llava_example.py index cbd0bd5d2..c5c370096 100644 --- a/examples/multimodal_vision/llava_example.py +++ b/examples/multimodal_vision/llava_example.py @@ -8,9 +8,7 @@ # Load model. model_id = "llava-hf/llava-1.5-7b-hf" -model = LlavaForConditionalGeneration.from_pretrained( - model_id, device_map="auto", torch_dtype="auto" -) +model = LlavaForConditionalGeneration.from_pretrained(model_id, torch_dtype="auto") processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=True) # Oneshot arguments diff --git a/examples/multimodal_vision/mistral3_example.py b/examples/multimodal_vision/mistral3_example.py index 3a45855a0..5ad1820f3 100644 --- a/examples/multimodal_vision/mistral3_example.py +++ b/examples/multimodal_vision/mistral3_example.py @@ -11,9 +11,7 @@ # Load model. model_id = "mistralai/Mistral-Small-3.1-24B-Instruct-2503" -model = Mistral3ForConditionalGeneration.from_pretrained( - model_id, device_map="auto", torch_dtype="auto" -) +model = Mistral3ForConditionalGeneration.from_pretrained(model_id, torch_dtype="auto") processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=True) # Use a custom calibration chat template, rather than the overly-verbose default diff --git a/examples/multimodal_vision/mllama_example.py b/examples/multimodal_vision/mllama_example.py index d4ddb28d6..9812bcf44 100644 --- a/examples/multimodal_vision/mllama_example.py +++ b/examples/multimodal_vision/mllama_example.py @@ -8,9 +8,7 @@ # Load model. model_id = "meta-llama/Llama-3.2-11B-Vision-Instruct" -model = MllamaForConditionalGeneration.from_pretrained( - model_id, device_map="auto", torch_dtype="auto" -) +model = MllamaForConditionalGeneration.from_pretrained(model_id, torch_dtype="auto") processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=True) # Oneshot arguments diff --git a/examples/multimodal_vision/phi3_vision_example.py b/examples/multimodal_vision/phi3_vision_example.py index df61b664b..537ff4dc4 100644 --- a/examples/multimodal_vision/phi3_vision_example.py +++ b/examples/multimodal_vision/phi3_vision_example.py @@ -12,7 +12,6 @@ model_id = "microsoft/Phi-3-vision-128k-instruct" model = AutoModelForCausalLM.from_pretrained( model_id, - device_map="auto", torch_dtype="auto", trust_remote_code=True, _attn_implementation="eager", diff --git a/examples/multimodal_vision/pixtral_example.py b/examples/multimodal_vision/pixtral_example.py index 940caa6ca..996eea885 100644 --- a/examples/multimodal_vision/pixtral_example.py +++ b/examples/multimodal_vision/pixtral_example.py @@ -8,9 +8,7 @@ # Load model. 
model_id = "mgoin/pixtral-12b" -model = LlavaForConditionalGeneration.from_pretrained( - model_id, device_map="auto", torch_dtype="auto" -) +model = LlavaForConditionalGeneration.from_pretrained(model_id, torch_dtype="auto") processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=True) # Oneshot arguments diff --git a/examples/multimodal_vision/qwen2_vl_example.py b/examples/multimodal_vision/qwen2_vl_example.py index 713035eee..cb64e3eb9 100644 --- a/examples/multimodal_vision/qwen2_vl_example.py +++ b/examples/multimodal_vision/qwen2_vl_example.py @@ -11,11 +11,7 @@ # Load model. model_id = "Qwen/Qwen2-VL-2B-Instruct" -model = Qwen2VLForConditionalGeneration.from_pretrained( - model_id, - device_map="auto", - torch_dtype="auto", -) +model = Qwen2VLForConditionalGeneration.from_pretrained(model_id, torch_dtype="auto") processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=True) # Oneshot arguments diff --git a/examples/multimodal_vision/qwen_2_5_vl_example.py b/examples/multimodal_vision/qwen_2_5_vl_example.py index 068229a12..83eea10dd 100644 --- a/examples/multimodal_vision/qwen_2_5_vl_example.py +++ b/examples/multimodal_vision/qwen_2_5_vl_example.py @@ -11,11 +11,7 @@ # Load model. model_id = "Qwen/Qwen2.5-VL-7B-Instruct" -model = Qwen2_5_VLForConditionalGeneration.from_pretrained( - model_id, - device_map="auto", - torch_dtype="auto", -) +model = Qwen2_5_VLForConditionalGeneration.from_pretrained(model_id, torch_dtype="auto") processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=True) # Oneshot arguments diff --git a/examples/quantization_2of4_sparse_w4a16/llama7b_sparse_w4a16.py b/examples/quantization_2of4_sparse_w4a16/llama7b_sparse_w4a16.py index 51bdad4d5..e63e9cd2d 100644 --- a/examples/quantization_2of4_sparse_w4a16/llama7b_sparse_w4a16.py +++ b/examples/quantization_2of4_sparse_w4a16/llama7b_sparse_w4a16.py @@ -6,9 +6,7 @@ # load the model in as bfloat16 to save on memory and compute model_stub = "neuralmagic/Llama-2-7b-ultrachat200k" -model = AutoModelForCausalLM.from_pretrained( - model_stub, torch_dtype=torch.bfloat16, device_map="auto" -) +model = AutoModelForCausalLM.from_pretrained(model_stub, torch_dtype=torch.bfloat16) tokenizer = AutoTokenizer.from_pretrained(model_stub) # uses LLM Compressor's built-in preprocessing for ultra chat diff --git a/examples/quantization_kv_cache/gemma2_fp8_kv_example.py b/examples/quantization_kv_cache/gemma2_fp8_kv_example.py index fba2dcce6..840e10a41 100644 --- a/examples/quantization_kv_cache/gemma2_fp8_kv_example.py +++ b/examples/quantization_kv_cache/gemma2_fp8_kv_example.py @@ -4,13 +4,9 @@ from llmcompressor import oneshot # Select model and load it. -MODEL_ID = "google/gemma-2-9b-it" -model = AutoModelForCausalLM.from_pretrained( - MODEL_ID, - device_map="auto", - torch_dtype="auto", -) -tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) +model_id = "google/gemma-2-9b-it" +model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype="auto") +tokenizer = AutoTokenizer.from_pretrained(model_id) # Select calibration dataset. DATASET_ID = "HuggingFaceH4/ultrachat_200k" @@ -98,6 +94,6 @@ def process_and_tokenize(example): print("==========================================\n\n") # Save to disk compressed. 
-SAVE_DIR = MODEL_ID.split("/")[1] + "-FP8-KV" +SAVE_DIR = model_id.split("/")[1] + "-FP8-KV" model.save_pretrained(SAVE_DIR, save_compressed=True) tokenizer.save_pretrained(SAVE_DIR) diff --git a/examples/quantization_kv_cache/llama3_fp8_kv_example.py b/examples/quantization_kv_cache/llama3_fp8_kv_example.py index 4bbecaae0..df866e117 100644 --- a/examples/quantization_kv_cache/llama3_fp8_kv_example.py +++ b/examples/quantization_kv_cache/llama3_fp8_kv_example.py @@ -5,13 +5,13 @@ from llmcompressor import oneshot # Select model and load it. -MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct" +model_id = "meta-llama/Meta-Llama-3-8B-Instruct" model = AutoModelForCausalLM.from_pretrained( - MODEL_ID, + model_id, device_map="auto", torch_dtype="auto", ) -tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) +tokenizer = AutoTokenizer.from_pretrained(model_id) # Select calibration dataset. DATASET_ID = "HuggingFaceH4/ultrachat_200k" @@ -96,6 +96,6 @@ def process_and_tokenize(example): print("==========================================\n\n") # Save to disk compressed. -SAVE_DIR = MODEL_ID.split("/")[1] + "-FP8-KV" +SAVE_DIR = model_id.split("/")[1] + "-FP8-KV" model.save_pretrained(SAVE_DIR, save_compressed=True) tokenizer.save_pretrained(SAVE_DIR) diff --git a/examples/quantization_kv_cache/phi3.5_fp8_kv_example.py b/examples/quantization_kv_cache/phi3.5_fp8_kv_example.py index 576092cf6..f22e0ea02 100644 --- a/examples/quantization_kv_cache/phi3.5_fp8_kv_example.py +++ b/examples/quantization_kv_cache/phi3.5_fp8_kv_example.py @@ -7,11 +7,7 @@ # Phi-3.5 is a special case for KV cache quantization because it has # fused QKV linear layers. MODEL_ID = "microsoft/Phi-3.5-mini-instruct" -model = AutoModelForCausalLM.from_pretrained( - MODEL_ID, - device_map="auto", - torch_dtype="auto", -) +model = AutoModelForCausalLM.from_pretrained(MODEL_ID, torch_dtype="auto") tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) # Select calibration dataset. diff --git a/examples/quantization_w4a16/llama3_example.py b/examples/quantization_w4a16/llama3_example.py index df5a8f826..7d7bb0448 100644 --- a/examples/quantization_w4a16/llama3_example.py +++ b/examples/quantization_w4a16/llama3_example.py @@ -5,10 +5,9 @@ from llmcompressor.transformers import oneshot # Select model and load it. -MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct" - -model = AutoModelForCausalLM.from_pretrained(MODEL_ID, torch_dtype="auto") -tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) +model_id = "meta-llama/Meta-Llama-3-8B-Instruct" +model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype="auto") +tokenizer = AutoTokenizer.from_pretrained(model_id) # Select calibration dataset. DATASET_ID = "HuggingFaceH4/ultrachat_200k" @@ -71,6 +70,6 @@ def tokenize(sample): print("==========================================\n\n") # Save to disk compressed. -SAVE_DIR = MODEL_ID.split("/")[1] + "-W4A16-G128" +SAVE_DIR = model_id.split("/")[1] + "-W4A16-G128" model.save_pretrained(SAVE_DIR, save_compressed=True) tokenizer.save_pretrained(SAVE_DIR) diff --git a/examples/quantization_w4a16_fp4/llama3_example.py b/examples/quantization_w4a16_fp4/llama3_example.py index d8573d271..4bd0f16b0 100644 --- a/examples/quantization_w4a16_fp4/llama3_example.py +++ b/examples/quantization_w4a16_fp4/llama3_example.py @@ -6,9 +6,7 @@ MODEL_ID = "meta-llama/Llama-3.1-8B-Instruct" # Load model. 
-model = AutoModelForCausalLM.from_pretrained( - MODEL_ID, device_map="auto", torch_dtype="auto" -) +model = AutoModelForCausalLM.from_pretrained(MODEL_ID, torch_dtype="auto") tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) # Configure the quantization algorithm and scheme. diff --git a/examples/quantization_w4a4_fp4/llama3_example.py b/examples/quantization_w4a4_fp4/llama3_example.py index f9d8f35dc..edff1a04c 100644 --- a/examples/quantization_w4a4_fp4/llama3_example.py +++ b/examples/quantization_w4a4_fp4/llama3_example.py @@ -4,13 +4,10 @@ from llmcompressor import oneshot from llmcompressor.modifiers.quantization import QuantizationModifier -MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct" - # Load model. -model = AutoModelForCausalLM.from_pretrained( - MODEL_ID, device_map="auto", torch_dtype="auto" -) -tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) +model_id = "meta-llama/Meta-Llama-3-8B-Instruct" +model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype="auto") +tokenizer = AutoTokenizer.from_pretrained(model_id) DATASET_ID = "HuggingFaceH4/ultrachat_200k" @@ -76,6 +73,6 @@ def tokenize(sample): # Save to disk in compressed-tensors format. -SAVE_DIR = MODEL_ID.split("/")[1] + "-NVFP4" +SAVE_DIR = model_id.split("/")[1] + "-NVFP4" model.save_pretrained(SAVE_DIR, save_compressed=True) tokenizer.save_pretrained(SAVE_DIR) diff --git a/examples/quantization_w8a8_fp8/gemma2_example.py b/examples/quantization_w8a8_fp8/gemma2_example.py index 77664f2d5..ed1efe2af 100644 --- a/examples/quantization_w8a8_fp8/gemma2_example.py +++ b/examples/quantization_w8a8_fp8/gemma2_example.py @@ -6,9 +6,7 @@ MODEL_ID = "google/gemma-2-27b-it" # 1) Load model. -model = AutoModelForCausalLM.from_pretrained( - MODEL_ID, device_map="auto", torch_dtype="auto" -) +model = AutoModelForCausalLM.from_pretrained(MODEL_ID, torch_dtype="auto") tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) # 2) Configure the quantization algorithm and scheme. diff --git a/examples/quantization_w8a8_fp8/llama3.2_vision_example.py b/examples/quantization_w8a8_fp8/llama3.2_vision_example.py index c99d0bfcc..e4d8bebac 100644 --- a/examples/quantization_w8a8_fp8/llama3.2_vision_example.py +++ b/examples/quantization_w8a8_fp8/llama3.2_vision_example.py @@ -6,9 +6,7 @@ MODEL_ID = "meta-llama/Llama-3.2-11B-Vision-Instruct" # Load model. -model = MllamaForConditionalGeneration.from_pretrained( - MODEL_ID, device_map="auto", torch_dtype="auto" -) +model = MllamaForConditionalGeneration.from_pretrained(MODEL_ID, torch_dtype="auto") processor = AutoProcessor.from_pretrained(MODEL_ID) # Configure the quantization algorithm and scheme. diff --git a/examples/quantization_w8a8_fp8/llama3_example.py b/examples/quantization_w8a8_fp8/llama3_example.py index a66200239..5227eabb2 100644 --- a/examples/quantization_w8a8_fp8/llama3_example.py +++ b/examples/quantization_w8a8_fp8/llama3_example.py @@ -6,9 +6,7 @@ MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct" # Load model. -model = AutoModelForCausalLM.from_pretrained( - MODEL_ID, device_map="auto", torch_dtype="auto" -) +model = AutoModelForCausalLM.from_pretrained(MODEL_ID, torch_dtype="auto") tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) # Configure the quantization algorithm and scheme. 
diff --git a/examples/quantization_w8a8_fp8/llava1.5_example.py b/examples/quantization_w8a8_fp8/llava1.5_example.py index 31cb4cb94..c05d94a80 100644 --- a/examples/quantization_w8a8_fp8/llava1.5_example.py +++ b/examples/quantization_w8a8_fp8/llava1.5_example.py @@ -6,9 +6,7 @@ MODEL_ID = "llava-hf/llava-1.5-7b-hf" # Load model. -model = LlavaForConditionalGeneration.from_pretrained( - MODEL_ID, device_map="auto", torch_dtype="auto" -) +model = LlavaForConditionalGeneration.from_pretrained(MODEL_ID, torch_dtype="auto") processor = AutoProcessor.from_pretrained(MODEL_ID) # Configure the quantization algorithm and scheme. diff --git a/examples/quantization_w8a8_fp8/qwen2vl_example.py b/examples/quantization_w8a8_fp8/qwen2vl_example.py index 564fc6644..c364fbb1e 100644 --- a/examples/quantization_w8a8_fp8/qwen2vl_example.py +++ b/examples/quantization_w8a8_fp8/qwen2vl_example.py @@ -6,9 +6,7 @@ MODEL_ID = "Qwen/Qwen2-VL-7B-Instruct" # Load model. -model = Qwen2VLForConditionalGeneration.from_pretrained( - MODEL_ID, device_map="auto", torch_dtype="auto" -) +model = Qwen2VLForConditionalGeneration.from_pretrained(MODEL_ID, torch_dtype="auto") processor = AutoProcessor.from_pretrained(MODEL_ID) # Configure the quantization algorithm and scheme. diff --git a/examples/quantization_w8a8_fp8/whisper_example.py b/examples/quantization_w8a8_fp8/whisper_example.py index 5efd08a57..7f504a41b 100644 --- a/examples/quantization_w8a8_fp8/whisper_example.py +++ b/examples/quantization_w8a8_fp8/whisper_example.py @@ -7,9 +7,7 @@ MODEL_ID = "openai/whisper-large-v2" # Load model. -model = WhisperForConditionalGeneration.from_pretrained( - MODEL_ID, device_map="auto", torch_dtype="auto" -) +model = WhisperForConditionalGeneration.from_pretrained(MODEL_ID, torch_dtype="auto") model.config.forced_decoder_ids = None processor = AutoProcessor.from_pretrained(MODEL_ID) processor.tokenizer.set_prefix_tokens(language="en", task="transcribe") diff --git a/examples/quantization_w8a8_int8/gemma2_example.py b/examples/quantization_w8a8_int8/gemma2_example.py index 0573b3249..ac7ff5f49 100644 --- a/examples/quantization_w8a8_int8/gemma2_example.py +++ b/examples/quantization_w8a8_int8/gemma2_example.py @@ -6,11 +6,7 @@ # 1) Select model and load it. MODEL_ID = "google/gemma-2-2b-it" -model = AutoModelForCausalLM.from_pretrained( - MODEL_ID, - device_map="auto", - torch_dtype="auto", -) +model = AutoModelForCausalLM.from_pretrained(MODEL_ID, torch_dtype="auto") tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) # 2) Prepare calibration dataset. diff --git a/examples/quantization_w8a8_int8/llama3_example.py b/examples/quantization_w8a8_int8/llama3_example.py index c475e9089..d3067de6f 100644 --- a/examples/quantization_w8a8_int8/llama3_example.py +++ b/examples/quantization_w8a8_int8/llama3_example.py @@ -7,11 +7,7 @@ # Select model and load it. MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct" -model = AutoModelForCausalLM.from_pretrained( - MODEL_ID, - device_map="auto", - torch_dtype="auto", -) +model = AutoModelForCausalLM.from_pretrained(MODEL_ID, torch_dtype="auto") tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) # Select calibration dataset. 
diff --git a/examples/quantizing_moe/deepseek_moe_w4a16.py b/examples/quantizing_moe/deepseek_moe_w4a16.py index 077bf6a1f..4c56a2c19 100644 --- a/examples/quantizing_moe/deepseek_moe_w4a16.py +++ b/examples/quantizing_moe/deepseek_moe_w4a16.py @@ -4,7 +4,6 @@ from transformers import AutoModelForCausalLM, AutoTokenizer, __version__ from llmcompressor import oneshot -from llmcompressor.transformers.compression.helpers import calculate_offload_device_map # NOTE: transformers 4.49.0 has an attribute error with DeepSeek. # Please consider either downgrading your transformers version to a @@ -13,18 +12,8 @@ # select a Mixture of Experts model for quantization MODEL_ID = "deepseek-ai/DeepSeek-V2.5" -# adjust based off number of desired GPUs -# if not enough memory is available, some layers will automatically be offlaoded to cpu -device_map = calculate_offload_device_map( - MODEL_ID, - reserve_for_hessians=True, - num_gpus=2, - torch_dtype=torch.bfloat16, - trust_remote_code=True, -) - model = AutoModelForCausalLM.from_pretrained( - MODEL_ID, device_map=device_map, torch_dtype=torch.bfloat16, trust_remote_code=True + MODEL_ID, torch_dtype=torch.bfloat16, trust_remote_code=True ) tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) diff --git a/examples/quantizing_moe/deepseek_moe_w8a8_fp8.py b/examples/quantizing_moe/deepseek_moe_w8a8_fp8.py index ac9ec8b19..261ac93f2 100644 --- a/examples/quantizing_moe/deepseek_moe_w8a8_fp8.py +++ b/examples/quantizing_moe/deepseek_moe_w8a8_fp8.py @@ -13,7 +13,7 @@ MODEL_ID = "deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct" model = AutoModelForCausalLM.from_pretrained( - MODEL_ID, device_map="auto", torch_dtype="auto", trust_remote_code=True + MODEL_ID, device_map="auto", trust_remote_code=True ) tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) diff --git a/examples/quantizing_moe/deepseek_moe_w8a8_int8.py b/examples/quantizing_moe/deepseek_moe_w8a8_int8.py index da5856fc5..1b8d80a66 100644 --- a/examples/quantizing_moe/deepseek_moe_w8a8_int8.py +++ b/examples/quantizing_moe/deepseek_moe_w8a8_int8.py @@ -5,7 +5,6 @@ from llmcompressor import oneshot from llmcompressor.modifiers.quantization import GPTQModifier -from llmcompressor.transformers.compression.helpers import calculate_offload_device_map # NOTE: transformers 4.49.0 has an attribute error with DeepSeek. 
# Please consider either downgrading your transformers version to a @@ -14,18 +13,8 @@ # select a Mixture of Experts model for quantization MODEL_ID = "deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct" -# adjust based off number of desired GPUs -# if not enough memory is available, some layers will automatically be offlaoded to cpu -device_map = calculate_offload_device_map( - MODEL_ID, - reserve_for_hessians=True, - num_gpus=2, - torch_dtype=torch.bfloat16, - trust_remote_code=True, -) - model = AutoModelForCausalLM.from_pretrained( - MODEL_ID, device_map=device_map, torch_dtype=torch.bfloat16, trust_remote_code=True + MODEL_ID, torch_dtype=torch.bfloat16, trust_remote_code=True ) tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) diff --git a/examples/quantizing_moe/mixtral_moe_w8a8_fp8.py b/examples/quantizing_moe/mixtral_moe_w8a8_fp8.py index 01489e50e..3dc821ce3 100644 --- a/examples/quantizing_moe/mixtral_moe_w8a8_fp8.py +++ b/examples/quantizing_moe/mixtral_moe_w8a8_fp8.py @@ -5,19 +5,11 @@ from llmcompressor import oneshot from llmcompressor.modifiers.quantization import QuantizationModifier -from llmcompressor.transformers.compression.helpers import calculate_offload_device_map MODEL_ID = "mistralai/Mixtral-8x7B-Instruct-v0.1" NUM_GPUS = 2 -# Adjust based off number of desired GPUs -device_map = calculate_offload_device_map( - MODEL_ID, reserve_for_hessians=True, num_gpus=NUM_GPUS, torch_dtype="auto" -) - -model = AutoModelForCausalLM.from_pretrained( - MODEL_ID, device_map=device_map, torch_dtype="auto" -) +model = AutoModelForCausalLM.from_pretrained(MODEL_ID, torch_dtype="auto") tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) diff --git a/examples/quantizing_moe/qwen_moe_w4a16.py b/examples/quantizing_moe/qwen_moe_w4a16.py index df98d0513..ebb4a5615 100644 --- a/examples/quantizing_moe/qwen_moe_w4a16.py +++ b/examples/quantizing_moe/qwen_moe_w4a16.py @@ -4,23 +4,12 @@ from llmcompressor.modifiers.quantization import GPTQModifier from llmcompressor.transformers import oneshot -from llmcompressor.transformers.compression.helpers import calculate_offload_device_map # select a Mixture of Experts model for quantization MODEL_ID = "Qwen/Qwen1.5-MoE-A2.7B-Chat" -# adjust based off number of desired GPUs -# if not enough memory is available, some layers will automatically be offloaded to cpu -device_map = calculate_offload_device_map( - MODEL_ID, - reserve_for_hessians=True, - num_gpus=2, - torch_dtype=torch.bfloat16, - trust_remote_code=True, -) - model = AutoModelForCausalLM.from_pretrained( - MODEL_ID, device_map=device_map, torch_dtype=torch.bfloat16, trust_remote_code=True + MODEL_ID, torch_dtype=torch.bfloat16, trust_remote_code=True ) tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) diff --git a/examples/sparse_2of4_quantization_fp8/llama3_8b_2of4.py b/examples/sparse_2of4_quantization_fp8/llama3_8b_2of4.py index 9fc681ecf..3952d0a90 100644 --- a/examples/sparse_2of4_quantization_fp8/llama3_8b_2of4.py +++ b/examples/sparse_2of4_quantization_fp8/llama3_8b_2of4.py @@ -75,9 +75,7 @@ def get_recipe(fp8_enabled): args = parse_args() # Load model and tokenizer -model = AutoModelForCausalLM.from_pretrained( - MODEL_ID, device_map="auto", torch_dtype="auto" -) +model = AutoModelForCausalLM.from_pretrained(MODEL_ID, torch_dtype="auto") tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) # Load and preprocess dataset diff --git a/src/llmcompressor/transformers/compression/helpers.py b/src/llmcompressor/transformers/compression/helpers.py index 179d5bc11..d02a08809 100644 --- 
a/src/llmcompressor/transformers/compression/helpers.py +++ b/src/llmcompressor/transformers/compression/helpers.py @@ -1,27 +1,20 @@ from collections import defaultdict -from typing import Dict, List, Optional, Tuple, Type, Union +from typing import Dict, List, Optional, Tuple -import psutil import torch -from accelerate import infer_auto_device_map, init_empty_weights from accelerate.accelerator import get_state_dict_offloaded_model from compressed_tensors.quantization.utils import iter_named_leaf_modules, module_type from compressed_tensors.utils import align_module_device from torch.nn.modules import Linear from tqdm import tqdm -from transformers import AutoModelForCausalLM from llmcompressor.pytorch.utils import get_linear_layers from llmcompressor.pytorch.utils.helpers import tensor_sparsity -from llmcompressor.utils.pytorch import get_layers, get_no_split_params __ALL__ = [ "tensor_follows_mask_structure", "infer_sparsity_structure_from_stage_modifiers", "infer_sparsity_structure_from_model", - "hessian_memory_requirements", - "custom_offload_device_map", - "calculate_offload_device_map", "infer_sparse_targets_and_ignores", "is_sparse_compression_target", ] @@ -111,36 +104,6 @@ def infer_sparsity_structure_from_model(model: torch.nn.Module) -> Optional[str] return None -def hessian_memory_requirements(model: torch.nn.Module) -> int: - """ - Determines the number of bytes needed to store Hessian data for a single - transformer layer in model. This is used for reserving memory for GPTQ - quantization - - :param model: model to calculate requirements for - :return: number of bytes required to reserve for GPTQ on a single layer - """ - transformer_layers = get_layers(get_no_split_params(model), model) - total_hessian_elems = {} - max_column_size = {} - for no_split_name, no_split_layer in transformer_layers.items(): - total_hessian_elems[no_split_name] = 0 - max_column_size[no_split_name] = 0 - for _name, module in no_split_layer.named_modules(): - if isinstance(module, Linear) and hasattr(module, "weight"): - column_size = module.weight.shape[1] - total_hessian_elems[no_split_name] += column_size * column_size - if column_size > max_column_size[no_split_name]: - # max extra memory for inverse calculation - max_column_size[no_split_name] = column_size - - max_total_hessian_elems = max(total_hessian_elems.values()) - overall_max_column_size = max(max_column_size.values()) - bytes_per_weight = 32 // 8 # hessians are float32 - inverse_reserved = overall_max_column_size * overall_max_column_size - return (max_total_hessian_elems + inverse_reserved) * bytes_per_weight - - def quantization_memory_requirement(model: torch.nn.Module) -> int: """ Determines the max number of bytes needed to store quantization scale and zp data @@ -168,99 +131,6 @@ def quantization_memory_requirement(model: torch.nn.Module) -> int: return total_elements * bytes_ratio -def custom_offload_device_map( - model_stub: str, - max_memory_per_gpu: Union[str, int], - num_gpus: int = 1, - model_cls: Type = AutoModelForCausalLM, - **model_kwargs, -) -> Dict[Union[int, str], Union[int, str]]: - """ - Calculates the optimal gpu mappings for model_stub stored as torch_dtype, where - each GPU is restricted to allocating a specific amount of memory. 
- - :param model_stub: local path or HF stub to calculate mapping for - :param max_memory_per_gpu: Max memory to allocate on each GPU, as either a string - such as "10GB" or an integer number of bytes - :param num_gpus: number of gpus to utilize - :param model_cls: model class to use when initializing model structure, - default is AutoModelForCausalLM - :param model_kwargs: keyword arguments to pass to model initializer - :return: memory mapping for layers of model_stub to be passed to from_pretrained() - """ - max_cpu_memory = psutil.virtual_memory().available - memory_limits = {device: max_memory_per_gpu for device in range(num_gpus)} - memory_limits["cpu"] = max_cpu_memory - - device_map = {} - with init_empty_weights(): - dummy_model = model_cls.from_pretrained(model_stub, **model_kwargs) - device_map = infer_auto_device_map( - dummy_model, - max_memory=memory_limits, - no_split_module_classes=dummy_model._no_split_modules, - ) - del dummy_model - - return device_map - - -def calculate_offload_device_map( - model_stub: str, - reserve_for_hessians=False, - num_gpus: int = 1, - torch_dtype: torch.dtype = torch.float16, - model_cls: Type = AutoModelForCausalLM, - **model_kwargs, -) -> Dict[Union[int, str], Union[int, str]]: - """ - Calculates the optimal gpu mappings for model_stub stored as torch_dtype. Takes - into account extra memory required for quantization and (optionally) GPTQ hessians - - :param model_stub: local path or HF stub to calculate mapping for - :param reserve_for_hessians: whether to reserve memory for GPTQ - :param num_gpus: number of gpus to utilize - :param model_cls: model class to use when initializing model structure, - default is AutoModelForCausalLM - :param model_kwargs: keyword arguments to pass to model initializer - :return: memory mapping for layers of model_stub to be passed to from_pretrained() - """ - max_cpu_memory = psutil.virtual_memory().available - max_gpu_memory = torch.cuda.mem_get_info(0)[0] - available_gpus = torch.cuda.device_count() - if available_gpus < num_gpus: - raise ValueError( - f"Requested {num_gpus} GPUs but only {available_gpus} are available." 
- ) - max_gpu_memory = [max_gpu_memory] * num_gpus - - device_map = {} - with init_empty_weights(): - dummy_model = model_cls.from_pretrained( - model_stub, torch_dtype=torch_dtype, **model_kwargs - ) - - reserved_memory = 0 - if reserve_for_hessians: - reserved_memory = hessian_memory_requirements(dummy_model) - reserved_memory += quantization_memory_requirement(dummy_model) - - memory_limits = { - idx: (max_memory - reserved_memory) - for idx, max_memory in enumerate(max_gpu_memory) - } - memory_limits["cpu"] = max_cpu_memory - - device_map = infer_auto_device_map( - dummy_model, - max_memory=memory_limits, - no_split_module_classes=dummy_model._no_split_modules, - ) - del dummy_model - - return device_map - - def infer_sparse_targets_and_ignores( model: torch.nn.Module, sparsity_structure: str, From a275f5343d7cb522935ebc923b7dffc6decf58b9 Mon Sep 17 00:00:00 2001 From: Kyle Sayers Date: Thu, 5 Jun 2025 23:15:58 -0400 Subject: [PATCH 08/36] update examples to load before generating Signed-off-by: Kyle Sayers --- examples/awq/llama_example.py | 13 ++++++++----- examples/awq/qwen3_moe_example.py | 13 ++++++++----- examples/multimodal_audio/whisper_example.py | 14 ++++++++------ examples/multimodal_vision/gemma3_example.py | 13 ++++++++----- examples/multimodal_vision/idefics3_example.py | 13 ++++++++----- examples/multimodal_vision/llava_example.py | 13 ++++++++----- examples/multimodal_vision/mistral3_example.py | 13 ++++++++----- examples/multimodal_vision/mllama_example.py | 13 ++++++++----- .../multimodal_vision/phi3_vision_example.py | 13 ++++++++----- examples/multimodal_vision/pixtral_example.py | 13 ++++++++----- examples/multimodal_vision/qwen2_vl_example.py | 14 ++++++++------ .../multimodal_vision/qwen_2_5_vl_example.py | 14 ++++++++------ .../gemma2_fp8_kv_example.py | 13 ++++++++----- .../llama3_fp8_kv_example.py | 13 ++++++++----- .../phi3.5_fp8_kv_example.py | 13 ++++++++----- examples/quantization_w4a16/llama3_example.py | 13 ++++++++----- .../quantization_w4a16_fp4/llama3_example.py | 15 +++++++++------ examples/quantization_w4a4_fp4/llama3_example.py | 15 +++++++++------ examples/quantization_w8a8_fp8/gemma2_example.py | 16 +++++++++------- .../llama3.2_vision_example.py | 11 ++++++++--- examples/quantization_w8a8_fp8/llama3_example.py | 13 ++++++++----- .../quantization_w8a8_fp8/llava1.5_example.py | 10 ++++++++-- .../quantization_w8a8_fp8/qwen2vl_example.py | 10 ++++++++-- .../quantization_w8a8_fp8/whisper_example.py | 13 ++++++++----- .../quantization_w8a8_int8/gemma2_example.py | 6 +++++- .../quantization_w8a8_int8/llama3_example.py | 13 ++++++++----- examples/quantizing_moe/deepseek_moe_w8a8_fp8.py | 3 +++ .../quantizing_moe/deepseek_moe_w8a8_int8.py | 3 +++ examples/quantizing_moe/mixtral_moe_w8a8_fp8.py | 3 +++ examples/quantizing_moe/qwen_moe_w4a16.py | 3 +++ .../llama3_8b_2of4.py | 11 +++++++---- 31 files changed, 227 insertions(+), 129 deletions(-) diff --git a/examples/awq/llama_example.py b/examples/awq/llama_example.py index 7706db7e6..d456f2b3b 100644 --- a/examples/awq/llama_example.py +++ b/examples/awq/llama_example.py @@ -61,6 +61,14 @@ def tokenize(sample): num_calibration_samples=NUM_CALIBRATION_SAMPLES, ) +# Save to disk compressed. +SAVE_DIR = model_id.split("/")[-1] + "-awq-asym" +model.save_pretrained(SAVE_DIR, save_compressed=True) +tokenizer.save_pretrained(SAVE_DIR) + +# Load model after saving +model = AutoModelForCausalLM.from_pretrained(SAVE_DIR, device_map="auto") + # Confirm generations of the quantized model look sane. 
print("\n\n") print("========== SAMPLE GENERATION ==============") @@ -68,8 +76,3 @@ def tokenize(sample): output = model.generate(input_ids, max_new_tokens=100) print(tokenizer.decode(output[0])) print("==========================================\n\n") - -# Save to disk compressed. -SAVE_DIR = model_id.split("/")[-1] + "-awq-asym" -model.save_pretrained(SAVE_DIR, save_compressed=True) -tokenizer.save_pretrained(SAVE_DIR) diff --git a/examples/awq/qwen3_moe_example.py b/examples/awq/qwen3_moe_example.py index 5775284a1..fa171f0af 100644 --- a/examples/awq/qwen3_moe_example.py +++ b/examples/awq/qwen3_moe_example.py @@ -65,6 +65,14 @@ def tokenize(sample): num_calibration_samples=NUM_CALIBRATION_SAMPLES, ) +# Save to disk compressed. +SAVE_DIR = model_id.split("/")[-1] + "-awq-sym" +model.save_pretrained(SAVE_DIR, save_compressed=True) +tokenizer.save_pretrained(SAVE_DIR) + +# Load model after saving +model = AutoModelForCausalLM.from_pretrained(SAVE_DIR, device_map="auto") + # Confirm generations of the quantized model look sane. print("\n\n") print("========== SAMPLE GENERATION ==============") @@ -72,8 +80,3 @@ def tokenize(sample): output = model.generate(input_ids, max_new_tokens=100) print(tokenizer.decode(output[0])) print("==========================================\n\n") - -# Save to disk compressed. -SAVE_DIR = model_id.split("/")[-1] + "-awq-sym" -model.save_pretrained(SAVE_DIR, save_compressed=True) -tokenizer.save_pretrained(SAVE_DIR) diff --git a/examples/multimodal_audio/whisper_example.py b/examples/multimodal_audio/whisper_example.py index f19b0016a..4992f78d1 100644 --- a/examples/multimodal_audio/whisper_example.py +++ b/examples/multimodal_audio/whisper_example.py @@ -83,6 +83,14 @@ def data_collator(batch): data_collator=data_collator, ) +# Save to disk compressed. +SAVE_DIR = model_id.split("/")[1] + "-W4A16-G128" +model.save_pretrained(SAVE_DIR, save_compressed=True) +processor.save_pretrained(SAVE_DIR) + +# Load model after saving +model = WhisperForConditionalGeneration.from_pretrained(SAVE_DIR, device_map="auto") + # Confirm generations of the quantized model look sane. print("\n\n") print("========== SAMPLE GENERATION ==============") @@ -92,15 +100,9 @@ def data_collator(batch): "input_features": torch.tensor(sample_features).to(model.device), "decoder_input_ids": torch.tensor(sample_decoder_ids).to(model.device), } - output = model.generate(**sample_input, language="en") print(processor.batch_decode(output, skip_special_tokens=True)) print("==========================================\n\n") # that's where you have a lot of windows in the south no actually that's passive solar # and passive solar is something that was developed and designed in the 1960s and 70s # and it was a great thing for what it was at the time but it's not a passive house - -# Save to disk compressed. -SAVE_DIR = model_id.split("/")[1] + "-W4A16-G128" -model.save_pretrained(SAVE_DIR, save_compressed=True) -processor.save_pretrained(SAVE_DIR) diff --git a/examples/multimodal_vision/gemma3_example.py b/examples/multimodal_vision/gemma3_example.py index 3310d82d4..8f4db44c5 100644 --- a/examples/multimodal_vision/gemma3_example.py +++ b/examples/multimodal_vision/gemma3_example.py @@ -46,6 +46,14 @@ def data_collator(batch): data_collator=data_collator, ) +# Save to disk compressed. 
+SAVE_DIR = model_id.split("/")[1] + "-W4A16-G128" +model.save_pretrained(SAVE_DIR, save_compressed=True) +processor.save_pretrained(SAVE_DIR) + +# Load model after saving +model = Gemma3ForConditionalGeneration.from_pretrained(SAVE_DIR, device_map="auto") + # Confirm generations of the quantized model look sane. print("========== SAMPLE GENERATION ==============") messages = [ @@ -65,8 +73,3 @@ def data_collator(batch): output = model.generate(**inputs, max_new_tokens=100) print(processor.decode(output[0], skip_special_tokens=True)) print("==========================================") - -# Save to disk compressed. -SAVE_DIR = model_id.split("/")[1] + "-W4A16-G128" -model.save_pretrained(SAVE_DIR, save_compressed=True) -processor.save_pretrained(SAVE_DIR) diff --git a/examples/multimodal_vision/idefics3_example.py b/examples/multimodal_vision/idefics3_example.py index 71434868e..fcc4559d8 100644 --- a/examples/multimodal_vision/idefics3_example.py +++ b/examples/multimodal_vision/idefics3_example.py @@ -92,6 +92,14 @@ def tokenize(sample): data_collator=data_collator, ) +# Save to disk compressed. +SAVE_DIR = model_id.split("/")[1] + "-W4A16-G128" +model.save_pretrained(SAVE_DIR, save_compressed=True) +processor.save_pretrained(SAVE_DIR) + +# Load model after saving +model = Idefics3ForConditionalGeneration.from_pretrained(SAVE_DIR, device_map="auto") + # Confirm generations of the quantized model look sane. print("========== SAMPLE GENERATION ==============") messages = [ @@ -111,8 +119,3 @@ def tokenize(sample): output = model.generate(**inputs, max_new_tokens=100) print(processor.decode(output[0], skip_special_tokens=True)) print("==========================================") - -# Save to disk compressed. -SAVE_DIR = model_id.split("/")[1] + "-W4A16-G128" -model.save_pretrained(SAVE_DIR, save_compressed=True) -processor.save_pretrained(SAVE_DIR) diff --git a/examples/multimodal_vision/llava_example.py b/examples/multimodal_vision/llava_example.py index c5c370096..abb781d48 100644 --- a/examples/multimodal_vision/llava_example.py +++ b/examples/multimodal_vision/llava_example.py @@ -47,6 +47,14 @@ def data_collator(batch): data_collator=data_collator, ) +# Save to disk compressed. +SAVE_DIR = model_id.split("/")[1] + "-W4A16-G128" +model.save_pretrained(SAVE_DIR, save_compressed=True) +processor.save_pretrained(SAVE_DIR) + +# Load model after saving +model = LlavaForConditionalGeneration.from_pretrained(SAVE_DIR, device_map="auto") + # Confirm generations of the quantized model look sane. print("========== SAMPLE GENERATION ==============") messages = [ @@ -66,8 +74,3 @@ def data_collator(batch): output = model.generate(**inputs, max_new_tokens=100) print(processor.decode(output[0], skip_special_tokens=True)) print("==========================================") - -# Save to disk compressed. -SAVE_DIR = model_id.split("/")[1] + "-W4A16-G128" -model.save_pretrained(SAVE_DIR, save_compressed=True) -processor.save_pretrained(SAVE_DIR) diff --git a/examples/multimodal_vision/mistral3_example.py b/examples/multimodal_vision/mistral3_example.py index 5ad1820f3..50b7e7dd5 100644 --- a/examples/multimodal_vision/mistral3_example.py +++ b/examples/multimodal_vision/mistral3_example.py @@ -60,6 +60,14 @@ def data_collator(batch): data_collator=data_collator, ) +# Save to disk compressed. 
+SAVE_DIR = model_id.split("/")[1] + "-W4A16-G128" +model.save_pretrained(SAVE_DIR, save_compressed=True) +processor.save_pretrained(SAVE_DIR) + +# Load model after saving +model = Mistral3ForConditionalGeneration.from_pretrained(SAVE_DIR, device_map="auto") + # Confirm generations of the quantized model look sane. print("========== SAMPLE GENERATION ==============") messages = [ @@ -80,8 +88,3 @@ def data_collator(batch): output = model.generate(**inputs, max_new_tokens=100) print(processor.decode(output[0], skip_special_tokens=True)) print("==========================================") - -# Save to disk compressed. -SAVE_DIR = model_id.split("/")[1] + "-W4A16-G128" -model.save_pretrained(SAVE_DIR, save_compressed=True) -processor.save_pretrained(SAVE_DIR) diff --git a/examples/multimodal_vision/mllama_example.py b/examples/multimodal_vision/mllama_example.py index 9812bcf44..7cd45c85a 100644 --- a/examples/multimodal_vision/mllama_example.py +++ b/examples/multimodal_vision/mllama_example.py @@ -47,6 +47,14 @@ def data_collator(batch): data_collator=data_collator, ) +# Save to disk compressed. +SAVE_DIR = model_id.split("/")[1] + "-W4A16-G128" +model.save_pretrained(SAVE_DIR, save_compressed=True) +processor.save_pretrained(SAVE_DIR) + +# Load model after saving +model = MllamaForConditionalGeneration.from_pretrained(SAVE_DIR, device_map="auto") + # Confirm generations of the quantized model look sane. print("========== SAMPLE GENERATION ==============") messages = [ @@ -66,8 +74,3 @@ def data_collator(batch): output = model.generate(**inputs, max_new_tokens=100) print(processor.decode(output[0], skip_special_tokens=True)) print("==========================================") - -# Save to disk compressed. -SAVE_DIR = model_id.split("/")[1] + "-W4A16-G128" -model.save_pretrained(SAVE_DIR, save_compressed=True) -processor.save_pretrained(SAVE_DIR) diff --git a/examples/multimodal_vision/phi3_vision_example.py b/examples/multimodal_vision/phi3_vision_example.py index 537ff4dc4..772001cb1 100644 --- a/examples/multimodal_vision/phi3_vision_example.py +++ b/examples/multimodal_vision/phi3_vision_example.py @@ -78,6 +78,14 @@ def data_collator(batch): ignore=["lm_head", "re:model.vision_embed_tokens.*"], ) +# Save to disk compressed. +SAVE_DIR = model_id.split("/")[1] + "-W4A16-G128" +model.save_pretrained(SAVE_DIR, save_compressed=True) +processor.save_pretrained(SAVE_DIR) + +# Load model after saving +model = AutoModelForCausalLM.from_pretrained(SAVE_DIR, device_map="auto") + # Perform oneshot oneshot( model=model, @@ -95,8 +103,3 @@ def data_collator(batch): output = model.generate(input_ids, max_new_tokens=20) print(processor.decode(output[0])) print("==========================================") - -# Save to disk compressed. -SAVE_DIR = model_id.split("/")[1] + "-W4A16-G128" -model.save_pretrained(SAVE_DIR, save_compressed=True) -processor.save_pretrained(SAVE_DIR) diff --git a/examples/multimodal_vision/pixtral_example.py b/examples/multimodal_vision/pixtral_example.py index 996eea885..71c1a0770 100644 --- a/examples/multimodal_vision/pixtral_example.py +++ b/examples/multimodal_vision/pixtral_example.py @@ -53,6 +53,14 @@ def data_collator(batch): data_collator=data_collator, ) +# Save to disk compressed. 
+SAVE_DIR = model_id.split("/")[1] + "-W4A16-G128" +model.save_pretrained(SAVE_DIR, save_compressed=True) +processor.save_pretrained(SAVE_DIR) + +# Load model after saving +model = LlavaForConditionalGeneration.from_pretrained(SAVE_DIR, device_map="auto") + # Confirm generations of the quantized model look sane. print("========== SAMPLE GENERATION ==============") messages = [ @@ -72,8 +80,3 @@ def data_collator(batch): output = model.generate(**inputs, max_new_tokens=100) print(processor.decode(output[0], skip_special_tokens=True)) print("==========================================") - -# Save to disk compressed. -SAVE_DIR = model_id.split("/")[1] + "-W4A16-G128" -model.save_pretrained(SAVE_DIR, save_compressed=True) -processor.save_pretrained(SAVE_DIR) diff --git a/examples/multimodal_vision/qwen2_vl_example.py b/examples/multimodal_vision/qwen2_vl_example.py index cb64e3eb9..27e0954d3 100644 --- a/examples/multimodal_vision/qwen2_vl_example.py +++ b/examples/multimodal_vision/qwen2_vl_example.py @@ -95,6 +95,14 @@ def data_collator(batch): data_collator=data_collator, ) +# Save to disk compressed. +SAVE_DIR = model_id.split("/")[1] + "-W4A16-G128" +model.save_pretrained(SAVE_DIR, save_compressed=True) +processor.save_pretrained(SAVE_DIR) + +# Load model after saving +model = Qwen2VLForConditionalGeneration.from_pretrained(SAVE_DIR, device_map="auto") + # Confirm generations of the quantized model look sane. print("========== SAMPLE GENERATION ==============") messages = [ @@ -123,9 +131,3 @@ def data_collator(batch): output = model.generate(**inputs, max_new_tokens=100) print(processor.decode(output[0], skip_special_tokens=True)) print("==========================================") - - -# Save to disk compressed. -SAVE_DIR = model_id.split("/")[1] + "-W4A16-G128" -model.save_pretrained(SAVE_DIR, save_compressed=True) -processor.save_pretrained(SAVE_DIR) diff --git a/examples/multimodal_vision/qwen_2_5_vl_example.py b/examples/multimodal_vision/qwen_2_5_vl_example.py index 83eea10dd..68798cf00 100644 --- a/examples/multimodal_vision/qwen_2_5_vl_example.py +++ b/examples/multimodal_vision/qwen_2_5_vl_example.py @@ -89,6 +89,14 @@ def data_collator(batch): data_collator=data_collator, ) +# Save to disk compressed. +SAVE_DIR = model_id.split("/")[1] + "-W4A16-G128" +model.save_pretrained(SAVE_DIR, save_compressed=True) +processor.save_pretrained(SAVE_DIR) + +# Load model after saving +model = Qwen2_5_VLForConditionalGeneration.from_pretrained(SAVE_DIR, device_map="auto") + # Confirm generations of the quantized model look sane. print("========== SAMPLE GENERATION ==============") messages = [ @@ -117,9 +125,3 @@ def data_collator(batch): output = model.generate(**inputs, max_new_tokens=100) print(processor.decode(output[0], skip_special_tokens=True)) print("==========================================") - - -# Save to disk compressed. -SAVE_DIR = model_id.split("/")[1] + "-W4A16-G128" -model.save_pretrained(SAVE_DIR, save_compressed=True) -processor.save_pretrained(SAVE_DIR) diff --git a/examples/quantization_kv_cache/gemma2_fp8_kv_example.py b/examples/quantization_kv_cache/gemma2_fp8_kv_example.py index 840e10a41..cf5501bc9 100644 --- a/examples/quantization_kv_cache/gemma2_fp8_kv_example.py +++ b/examples/quantization_kv_cache/gemma2_fp8_kv_example.py @@ -77,6 +77,14 @@ def process_and_tokenize(example): num_calibration_samples=NUM_CALIBRATION_SAMPLES, ) +# Save to disk compressed. 
+SAVE_DIR = model_id.split("/")[1] + "-FP8-KV" +model.save_pretrained(SAVE_DIR, save_compressed=True) +tokenizer.save_pretrained(SAVE_DIR) + +# Load model after saving +model = AutoModelForCausalLM.from_pretrained(SAVE_DIR, device_map="auto") + print( "Note: Inference with the quantized kv_cache is not supported. ", "Please use vLLM for inference with the quantized kv_cache.", @@ -92,8 +100,3 @@ def process_and_tokenize(example): output = model.generate(input_ids, max_new_tokens=100) print(tokenizer.decode(output[0])) print("==========================================\n\n") - -# Save to disk compressed. -SAVE_DIR = model_id.split("/")[1] + "-FP8-KV" -model.save_pretrained(SAVE_DIR, save_compressed=True) -tokenizer.save_pretrained(SAVE_DIR) diff --git a/examples/quantization_kv_cache/llama3_fp8_kv_example.py b/examples/quantization_kv_cache/llama3_fp8_kv_example.py index df866e117..0cb52e0fd 100644 --- a/examples/quantization_kv_cache/llama3_fp8_kv_example.py +++ b/examples/quantization_kv_cache/llama3_fp8_kv_example.py @@ -82,6 +82,14 @@ def process_and_tokenize(example): num_calibration_samples=NUM_CALIBRATION_SAMPLES, ) +# Save to disk compressed. +SAVE_DIR = model_id.split("/")[1] + "-FP8-KV" +model.save_pretrained(SAVE_DIR, save_compressed=True) +tokenizer.save_pretrained(SAVE_DIR) + +# Load model after saving +model = AutoModelForCausalLM.from_pretrained(SAVE_DIR, device_map="auto") + logger.info( "Running sample generation. ", "Note: Inference with the quantized kv_cache is not supported. ", @@ -94,8 +102,3 @@ def process_and_tokenize(example): output = model.generate(input_ids, max_new_tokens=100) print(tokenizer.decode(output[0])) print("==========================================\n\n") - -# Save to disk compressed. -SAVE_DIR = model_id.split("/")[1] + "-FP8-KV" -model.save_pretrained(SAVE_DIR, save_compressed=True) -tokenizer.save_pretrained(SAVE_DIR) diff --git a/examples/quantization_kv_cache/phi3.5_fp8_kv_example.py b/examples/quantization_kv_cache/phi3.5_fp8_kv_example.py index f22e0ea02..20e7fac4c 100644 --- a/examples/quantization_kv_cache/phi3.5_fp8_kv_example.py +++ b/examples/quantization_kv_cache/phi3.5_fp8_kv_example.py @@ -79,6 +79,14 @@ def process_and_tokenize(example): num_calibration_samples=NUM_CALIBRATION_SAMPLES, ) +# Save to disk compressed. +SAVE_DIR = MODEL_ID.split("/")[1] + "-FP8-KV" +model.save_pretrained(SAVE_DIR, save_compressed=True) +tokenizer.save_pretrained(SAVE_DIR) + +# Load model after saving +model = AutoModelForCausalLM.from_pretrained(SAVE_DIR, device_map="auto") + print( "Note: Inference with the quantized kv_cache is not supported. ", "Please use vLLM for inference with the quantized kv_cache.", @@ -90,8 +98,3 @@ def process_and_tokenize(example): output = model.generate(input_ids, max_new_tokens=100) print(tokenizer.decode(output[0])) print("==========================================\n\n") - -# Save to disk compressed. -SAVE_DIR = MODEL_ID.split("/")[1] + "-FP8-KV" -model.save_pretrained(SAVE_DIR, save_compressed=True) -tokenizer.save_pretrained(SAVE_DIR) diff --git a/examples/quantization_w4a16/llama3_example.py b/examples/quantization_w4a16/llama3_example.py index 7d7bb0448..9b0b0ffaa 100644 --- a/examples/quantization_w4a16/llama3_example.py +++ b/examples/quantization_w4a16/llama3_example.py @@ -61,6 +61,14 @@ def tokenize(sample): num_calibration_samples=NUM_CALIBRATION_SAMPLES, ) +# Save to disk compressed. 
+SAVE_DIR = model_id.split("/")[1] + "-W4A16-G128" +model.save_pretrained(SAVE_DIR, save_compressed=True) +tokenizer.save_pretrained(SAVE_DIR) + +# Load model after saving +model = AutoModelForCausalLM.from_pretrained(SAVE_DIR, device_map="auto") + # Confirm generations of the quantized model look sane. print("\n\n") print("========== SAMPLE GENERATION ==============") @@ -68,8 +76,3 @@ def tokenize(sample): output = model.generate(input_ids, max_new_tokens=100) print(tokenizer.decode(output[0])) print("==========================================\n\n") - -# Save to disk compressed. -SAVE_DIR = model_id.split("/")[1] + "-W4A16-G128" -model.save_pretrained(SAVE_DIR, save_compressed=True) -tokenizer.save_pretrained(SAVE_DIR) diff --git a/examples/quantization_w4a16_fp4/llama3_example.py b/examples/quantization_w4a16_fp4/llama3_example.py index 4bd0f16b0..378f4f988 100644 --- a/examples/quantization_w4a16_fp4/llama3_example.py +++ b/examples/quantization_w4a16_fp4/llama3_example.py @@ -17,15 +17,18 @@ # Apply quantization. oneshot(model=model, recipe=recipe) +# Save to disk in compressed-tensors format. +SAVE_DIR = MODEL_ID.split("/")[1] + "-NVFP4A16" +model.save_pretrained(SAVE_DIR, save_compressed=True) +tokenizer.save_pretrained(SAVE_DIR) + +# Load model after saving +model = AutoModelForCausalLM.from_pretrained(SAVE_DIR, device_map="auto") + +# Validate model generations print("\n\n") print("========== SAMPLE GENERATION ==============") input_ids = tokenizer("Hello my name is", return_tensors="pt").input_ids.to("cuda") output = model.generate(input_ids, max_new_tokens=100) print(tokenizer.decode(output[0])) print("==========================================\n\n") - - -# Save to disk in compressed-tensors format. -SAVE_DIR = MODEL_ID.split("/")[1] + "-NVFP4A16" -model.save_pretrained(SAVE_DIR, save_compressed=True) -tokenizer.save_pretrained(SAVE_DIR) diff --git a/examples/quantization_w4a4_fp4/llama3_example.py b/examples/quantization_w4a4_fp4/llama3_example.py index edff1a04c..d84b749c0 100644 --- a/examples/quantization_w4a4_fp4/llama3_example.py +++ b/examples/quantization_w4a4_fp4/llama3_example.py @@ -64,15 +64,18 @@ def tokenize(sample): num_calibration_samples=NUM_CALIBRATION_SAMPLES, ) +# Save to disk in compressed-tensors format. +SAVE_DIR = model_id.split("/")[1] + "-NVFP4" +model.save_pretrained(SAVE_DIR, save_compressed=True) +tokenizer.save_pretrained(SAVE_DIR) + +# Load model after saving +model = AutoModelForCausalLM.from_pretrained(SAVE_DIR, device_map="auto") + +# Validate model generations print("\n\n") print("========== SAMPLE GENERATION ==============") input_ids = tokenizer("Hello my name is", return_tensors="pt").input_ids.to("cuda") output = model.generate(input_ids, max_new_tokens=100) print(tokenizer.decode(output[0])) print("==========================================\n\n") - - -# Save to disk in compressed-tensors format. -SAVE_DIR = model_id.split("/")[1] + "-NVFP4" -model.save_pretrained(SAVE_DIR, save_compressed=True) -tokenizer.save_pretrained(SAVE_DIR) diff --git a/examples/quantization_w8a8_fp8/gemma2_example.py b/examples/quantization_w8a8_fp8/gemma2_example.py index ed1efe2af..713ce6609 100644 --- a/examples/quantization_w8a8_fp8/gemma2_example.py +++ b/examples/quantization_w8a8_fp8/gemma2_example.py @@ -18,13 +18,15 @@ ) # 3) Apply quantization and save in compressed-tensors format. 
-OUTPUT_DIR = MODEL_ID.split("/")[1] + "-FP8-Dynamic" -oneshot( - model=model, - recipe=recipe, - tokenizer=tokenizer, - output_dir=OUTPUT_DIR, -) +oneshot(model=model, recipe=recipe, tokenizer=tokenizer) + +# Save to disk in compressed-tensors format. +SAVE_DIR = MODEL_ID.split("/")[1] + "-FP8-Dynamic" +model.save_pretrained(SAVE_DIR, save_compressed=True) +tokenizer.save_pretrained(SAVE_DIR) + +# Load model after saving +model = AutoModelForCausalLM.from_pretrained(SAVE_DIR, device_map="auto") # Confirm generations of the quantized model look sane. # NOTE: transformers 4.49.0 results in a generation error with gemma2. diff --git a/examples/quantization_w8a8_fp8/llama3.2_vision_example.py b/examples/quantization_w8a8_fp8/llama3.2_vision_example.py index e4d8bebac..d3a63eb0c 100644 --- a/examples/quantization_w8a8_fp8/llama3.2_vision_example.py +++ b/examples/quantization_w8a8_fp8/llama3.2_vision_example.py @@ -19,15 +19,20 @@ ignore=["re:.*lm_head", "re:multi_modal_projector.*", "re:vision_model.*"], ) -# Apply quantization and save to disk in compressed-tensors format. -SAVE_DIR = MODEL_ID.split("/")[1] + "-FP8-Dynamic" +# Apply quantization. oneshot( model=model, recipe=recipe, - output_dir=SAVE_DIR, ) + +# Save to disk in compressed-tensors format. +SAVE_DIR = MODEL_ID.split("/")[1] + "-FP8-Dynamic" +model.save_pretrained(SAVE_DIR) processor.save_pretrained(SAVE_DIR) +# Load model after saving +model = MllamaForConditionalGeneration.from_pretrained(SAVE_DIR, device_map="auto") + # Confirm generations of the quantized model look sane. print("========== SAMPLE GENERATION ==============") input_ids = processor(text="Hello my name is", return_tensors="pt").input_ids.to("cuda") diff --git a/examples/quantization_w8a8_fp8/llama3_example.py b/examples/quantization_w8a8_fp8/llama3_example.py index 5227eabb2..badd6ac5e 100644 --- a/examples/quantization_w8a8_fp8/llama3_example.py +++ b/examples/quantization_w8a8_fp8/llama3_example.py @@ -20,14 +20,17 @@ # Apply quantization. oneshot(model=model, recipe=recipe) +# Save to disk in compressed-tensors format. +SAVE_DIR = MODEL_ID.split("/")[1] + "-FP8-Dynamic" +model.save_pretrained(SAVE_DIR) +tokenizer.save_pretrained(SAVE_DIR) + +# Load model after saving +model = AutoModelForCausalLM.from_pretrained(SAVE_DIR, device_map="auto") + # Confirm generations of the quantized model look sane. print("========== SAMPLE GENERATION ==============") input_ids = tokenizer("Hello my name is", return_tensors="pt").input_ids.to("cuda") output = model.generate(input_ids, max_new_tokens=20) print(tokenizer.decode(output[0])) print("==========================================") - -# Save to disk in compressed-tensors format. -SAVE_DIR = MODEL_ID.split("/")[1] + "-FP8-Dynamic" -model.save_pretrained(SAVE_DIR) -tokenizer.save_pretrained(SAVE_DIR) diff --git a/examples/quantization_w8a8_fp8/llava1.5_example.py b/examples/quantization_w8a8_fp8/llava1.5_example.py index c05d94a80..06d1483e1 100644 --- a/examples/quantization_w8a8_fp8/llava1.5_example.py +++ b/examples/quantization_w8a8_fp8/llava1.5_example.py @@ -19,11 +19,17 @@ ignore=["re:.*lm_head", "re:multi_modal_projector.*", "re:vision_tower.*"], ) -# Apply quantization and save to disk in compressed-tensors format. +# Apply quantization. +oneshot(model=model, recipe=recipe) + +# Save to disk in compressed-tensors format. 
SAVE_DIR = MODEL_ID.split("/")[1] + "-FP8-Dynamic" -oneshot(model=model, recipe=recipe, output_dir=SAVE_DIR) +model.save_pretrained(SAVE_DIR) processor.save_pretrained(SAVE_DIR) +# Load model after saving +model = LlavaForConditionalGeneration.from_pretrained(SAVE_DIR, device_map="auto") + # Confirm generations of the quantized model look sane. print("========== SAMPLE GENERATION ==============") input_ids = processor(text="Hello my name is", return_tensors="pt").input_ids.to("cuda") diff --git a/examples/quantization_w8a8_fp8/qwen2vl_example.py b/examples/quantization_w8a8_fp8/qwen2vl_example.py index c364fbb1e..1cf73f527 100644 --- a/examples/quantization_w8a8_fp8/qwen2vl_example.py +++ b/examples/quantization_w8a8_fp8/qwen2vl_example.py @@ -19,11 +19,17 @@ ignore=["re:.*lm_head", "re:visual.*"], ) -# Apply quantization and save to disk in compressed-tensors format. +# Apply quantization. +oneshot(model=model, recipe=recipe) + +# Save to disk in compressed-tensors format. SAVE_DIR = MODEL_ID.split("/")[1] + "-FP8-Dynamic" -oneshot(model=model, recipe=recipe, output_dir=SAVE_DIR) +model.save_pretrained(SAVE_DIR) processor.save_pretrained(SAVE_DIR) +# Load model after saving +model = Qwen2VLForConditionalGeneration.from_pretrained(SAVE_DIR, device_map="auto") + # Confirm generations of the quantized model look sane. print("========== SAMPLE GENERATION ==============") input_ids = processor(text="Hello my name is", return_tensors="pt").input_ids.to("cuda") diff --git a/examples/quantization_w8a8_fp8/whisper_example.py b/examples/quantization_w8a8_fp8/whisper_example.py index 7f504a41b..51ac804b7 100644 --- a/examples/quantization_w8a8_fp8/whisper_example.py +++ b/examples/quantization_w8a8_fp8/whisper_example.py @@ -23,6 +23,14 @@ # Apply quantization. oneshot(model=model, recipe=recipe) +# Save to disk in compressed-tensors format. +SAVE_DIR = MODEL_ID.split("/")[1] + "-FP8-Dynamic" +model.save_pretrained(SAVE_DIR, save_compressed=True) +processor.save_pretrained(SAVE_DIR) + +# Load model after saving +model = WhisperForConditionalGeneration.from_pretrained(SAVE_DIR, device_map="auto") + # Confirm generations of the quantized model look sane. print("========== SAMPLE GENERATION ==============") ds = load_dataset( @@ -37,8 +45,3 @@ print(processor.batch_decode(output_ids, skip_special_tokens=False)[0]) # Mr. Quilter is the apostle of the middle classes and we are glad to welcome his gospel print("==========================================") - -# Save to disk in compressed-tensors format. -SAVE_DIR = MODEL_ID.split("/")[1] + "-FP8-Dynamic" -model.save_pretrained(SAVE_DIR, save_compressed=True) -processor.save_pretrained(SAVE_DIR) diff --git a/examples/quantization_w8a8_int8/gemma2_example.py b/examples/quantization_w8a8_int8/gemma2_example.py index ac7ff5f49..a0ebe0079 100644 --- a/examples/quantization_w8a8_int8/gemma2_example.py +++ b/examples/quantization_w8a8_int8/gemma2_example.py @@ -54,15 +54,19 @@ def tokenize(sample): recipe = GPTQModifier(targets="Linear", scheme="W8A8", ignore=["lm_head"]) # 4) Apply quantization and save to disk compressed. +SAVE_DIR = MODEL_ID.split("/")[1] + "-INT8" oneshot( model=model, dataset=ds, recipe=recipe, max_seq_length=MAX_SEQUENCE_LENGTH, num_calibration_samples=NUM_CALIBRATION_SAMPLES, - output_dir=MODEL_ID.split("/")[1] + "-INT8", + output_dir=SAVE_DIR, ) +# Load model after saving +model = AutoModelForCausalLM.from_pretrained(SAVE_DIR, device_map="auto") + # Confirm generations of the quantized model look sane. 
# NOTE: transformers 4.49.0 results in a generation error with gemma2. # Consider either downgrading your transformers version to a previous version diff --git a/examples/quantization_w8a8_int8/llama3_example.py b/examples/quantization_w8a8_int8/llama3_example.py index d3067de6f..1894932f1 100644 --- a/examples/quantization_w8a8_int8/llama3_example.py +++ b/examples/quantization_w8a8_int8/llama3_example.py @@ -67,6 +67,14 @@ def tokenize(sample): num_calibration_samples=NUM_CALIBRATION_SAMPLES, ) +# Save to disk compressed. +SAVE_DIR = MODEL_ID.split("/")[1] + "-W8A8-Dynamic-Per-Token" +model.save_pretrained(SAVE_DIR, save_compressed=True) +tokenizer.save_pretrained(SAVE_DIR) + +# Load model after saving +model = AutoModelForCausalLM.from_pretrained(SAVE_DIR, device_map="auto") + # Confirm generations of the quantized model look sane. print("\n\n") print("========== SAMPLE GENERATION ==============") @@ -74,8 +82,3 @@ def tokenize(sample): output = model.generate(input_ids, max_new_tokens=100) print(tokenizer.decode(output[0])) print("==========================================\n\n") - -# Save to disk compressed. -SAVE_DIR = MODEL_ID.split("/")[1] + "-W8A8-Dynamic-Per-Token" -model.save_pretrained(SAVE_DIR, save_compressed=True) -tokenizer.save_pretrained(SAVE_DIR) diff --git a/examples/quantizing_moe/deepseek_moe_w8a8_fp8.py b/examples/quantizing_moe/deepseek_moe_w8a8_fp8.py index 261ac93f2..69dae05ad 100644 --- a/examples/quantizing_moe/deepseek_moe_w8a8_fp8.py +++ b/examples/quantizing_moe/deepseek_moe_w8a8_fp8.py @@ -79,6 +79,9 @@ def tokenize(sample): output_dir=SAVE_DIR, ) +# Load model after saving +model = AutoModelForCausalLM.from_pretrained(SAVE_DIR, device_map="auto") + # Confirm generations of the quantized model look sane. # Generation is broken for deepseek models when using the latest transformers package if Version(__version__) < Version("4.48"): diff --git a/examples/quantizing_moe/deepseek_moe_w8a8_int8.py b/examples/quantizing_moe/deepseek_moe_w8a8_int8.py index 1b8d80a66..aada68b31 100644 --- a/examples/quantizing_moe/deepseek_moe_w8a8_int8.py +++ b/examples/quantizing_moe/deepseek_moe_w8a8_int8.py @@ -80,6 +80,9 @@ def tokenize(sample): output_dir=SAVE_DIR, ) +# Load model after saving +model = AutoModelForCausalLM.from_pretrained(SAVE_DIR, device_map="auto") + # Confirm generations of the quantized model look sane. # Generation is broken for deepseek models when using the latest transformers package if Version(__version__) < Version("4.48"): diff --git a/examples/quantizing_moe/mixtral_moe_w8a8_fp8.py b/examples/quantizing_moe/mixtral_moe_w8a8_fp8.py index 3dc821ce3..15e3e67eb 100644 --- a/examples/quantizing_moe/mixtral_moe_w8a8_fp8.py +++ b/examples/quantizing_moe/mixtral_moe_w8a8_fp8.py @@ -43,6 +43,9 @@ output_dir=SAVE_DIR, ) +# Load model after saving +model = AutoModelForCausalLM.from_pretrained(SAVE_DIR, device_map="auto") + # Confirm generations of the quantized model look sane. # Generation is broken for deepseek models when using the latest transformers package if Version(__version__) < Version("4.48"): diff --git a/examples/quantizing_moe/qwen_moe_w4a16.py b/examples/quantizing_moe/qwen_moe_w4a16.py index ebb4a5615..d025004f0 100644 --- a/examples/quantizing_moe/qwen_moe_w4a16.py +++ b/examples/quantizing_moe/qwen_moe_w4a16.py @@ -73,6 +73,9 @@ def tokenize(sample): output_dir=SAVE_DIR, ) +# Load model after saving +model = AutoModelForCausalLM.from_pretrained(SAVE_DIR, device_map="auto") + # Confirm generations of the quantized model look sane. 
print("========== SAMPLE GENERATION ==============") input_ids = tokenizer("Hello my name is", return_tensors="pt").input_ids.to("cuda") diff --git a/examples/sparse_2of4_quantization_fp8/llama3_8b_2of4.py b/examples/sparse_2of4_quantization_fp8/llama3_8b_2of4.py index 3952d0a90..d51ad7670 100644 --- a/examples/sparse_2of4_quantization_fp8/llama3_8b_2of4.py +++ b/examples/sparse_2of4_quantization_fp8/llama3_8b_2of4.py @@ -97,13 +97,16 @@ def get_recipe(fp8_enabled): num_calibration_samples=NUM_CALIBRATION_SAMPLES, ) +# Save compressed model and tokenizer +model.save_pretrained(save_dir) +tokenizer.save_pretrained(save_dir) + +# Load model after saving +model = AutoModelForCausalLM.from_pretrained(save_dir, device_map="auto") + # Validate the compressed model print("\n========== SAMPLE GENERATION ==============") input_ids = tokenizer("Hello my name is", return_tensors="pt").input_ids.to("cuda") output = model.generate(input_ids, max_new_tokens=100) print(tokenizer.decode(output[0])) print("==========================================\n") - -# Save compressed model and tokenizer -model.save_pretrained(save_dir) -tokenizer.save_pretrained(save_dir) From 9d6c227e3e75f8e2736828246da5dd767b74bfe1 Mon Sep 17 00:00:00 2001 From: Kyle Sayers Date: Mon, 9 Jun 2025 20:42:44 -0400 Subject: [PATCH 09/36] remove hooks Signed-off-by: Kyle Sayers --- src/llmcompressor/entrypoints/oneshot.py | 4 ++++ src/llmcompressor/entrypoints/utils.py | 4 ++++ 2 files changed, 8 insertions(+) diff --git a/src/llmcompressor/entrypoints/oneshot.py b/src/llmcompressor/entrypoints/oneshot.py index 9659b7d7e..f8f4496d2 100644 --- a/src/llmcompressor/entrypoints/oneshot.py +++ b/src/llmcompressor/entrypoints/oneshot.py @@ -14,6 +14,8 @@ from llmcompressor.entrypoints.utils import post_process, pre_process from llmcompressor.pipelines import CalibrationPipeline +from accelerate.hooks import remove_hook_from_module + __all__ = ["Oneshot", "oneshot"] @@ -130,6 +132,8 @@ def __init__( # TODO: consider renaming function similar to "offload_dispatch_model" # TODO: modify function to remove any hooks if they already exist (making # sure to move to cpu when removing hook + # TODO: remove hook in util + remove_hook_from_module(model_args.model, recurse=True) force_cpu_offload(model_args.model, model_args.oneshot_device) else: logger.warning("CUDA is not available! Compressing model on CPU instead") diff --git a/src/llmcompressor/entrypoints/utils.py b/src/llmcompressor/entrypoints/utils.py index 0186628f0..23758b5f3 100644 --- a/src/llmcompressor/entrypoints/utils.py +++ b/src/llmcompressor/entrypoints/utils.py @@ -27,6 +27,7 @@ ) from llmcompressor.typing import Processor from llmcompressor.utils.fsdp.helpers import is_fsdp_model +from accelerate.hooks import remove_hook_from_module def pre_process(model_args: "ModelArguments"): @@ -105,6 +106,9 @@ def post_process( "Ex. 
`oneshot(..., output_dir=...)`" ) + # Remove any existing hooks (maybe added by oneshot sequential onloading) + remove_hook_from_module(model_args.model, recurse=True) + # Reset the one-time-use session upon completion if recipe_args is not None and recipe_args.clear_sparse_session: reset_session() From 8351ac9fa41d3f39137a9e6af458ead1765ab8a4 Mon Sep 17 00:00:00 2001 From: Kyle Sayers Date: Mon, 9 Jun 2025 20:47:22 -0400 Subject: [PATCH 10/36] name change Signed-off-by: Kyle Sayers --- src/llmcompressor/pipelines/registry.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/llmcompressor/pipelines/registry.py b/src/llmcompressor/pipelines/registry.py index f472c0f0d..2ac384866 100644 --- a/src/llmcompressor/pipelines/registry.py +++ b/src/llmcompressor/pipelines/registry.py @@ -18,7 +18,7 @@ __all__ = ["CalibrationPipeline"] SEQUENTIAL_MODIFIERS = (AWQModifier, GPTQModifier, SparsityModifierBase) -NEED_DATA = (SmoothQuantModifier, *SEQUENTIAL_MODIFIERS) +CALIBRATION_MODIFIERS = (SmoothQuantModifier, *SEQUENTIAL_MODIFIERS) class CalibrationPipeline(ABC, RegistryMixin): @@ -61,7 +61,7 @@ def from_modifiers( @staticmethod def _validate_infer_pipeline(modifiers: List[Modifier]) -> str: - if any(isinstance(modifier, NEED_DATA) for modifier in modifiers): + if any(isinstance(modifier, CALIBRATION_MODIFIERS) for modifier in modifiers): return "sequential" active_qmods = _get_active_quant_modifiers(modifiers) From ad71c5bf2178d18f0278d06cb7e494e6aadcfffa Mon Sep 17 00:00:00 2001 From: Kyle Sayers Date: Thu, 12 Jun 2025 14:01:41 -0400 Subject: [PATCH 11/36] cleanup and nits Signed-off-by: Kyle Sayers --- src/llmcompressor/entrypoints/oneshot.py | 4 --- .../pipelines/layer_sequential/pipeline.py | 4 +-- .../pipelines/sequential/helpers.py | 8 +++-- .../pipelines/sequential/pipeline.py | 4 +-- src/llmcompressor/utils/module.py | 29 ------------------- 5 files changed, 10 insertions(+), 39 deletions(-) delete mode 100644 src/llmcompressor/utils/module.py diff --git a/src/llmcompressor/entrypoints/oneshot.py b/src/llmcompressor/entrypoints/oneshot.py index 5575f5327..0a7cff81e 100644 --- a/src/llmcompressor/entrypoints/oneshot.py +++ b/src/llmcompressor/entrypoints/oneshot.py @@ -128,10 +128,6 @@ def __init__( # offload to cpu if possible if "cuda" in str(model_args.oneshot_device) and torch.cuda.is_available(): - # TODO: consider renaming function similar to "offload_dispatch_model" - # TODO: modify function to remove any hooks if they already exist (making - # sure to move to cpu when removing hook - # TODO: remove hook in util remove_hook_from_module(model_args.model, recurse=True) force_cpu_offload(model_args.model, model_args.oneshot_device) else: diff --git a/src/llmcompressor/pipelines/layer_sequential/pipeline.py b/src/llmcompressor/pipelines/layer_sequential/pipeline.py index 3130f75d9..3a0cd8cb6 100644 --- a/src/llmcompressor/pipelines/layer_sequential/pipeline.py +++ b/src/llmcompressor/pipelines/layer_sequential/pipeline.py @@ -16,8 +16,8 @@ ) from llmcompressor.pipelines.registry import CalibrationPipeline from llmcompressor.pipelines.sequential.helpers import ( + disable_offloading, get_targets_from_modifiers, - keep_onload_context, ) from llmcompressor.utils.helpers import DisableQuantization, calibration_forward_context @@ -88,7 +88,7 @@ def __call__( prop_desc = f"({layer_index + 1}/{num_layers}): Propagating" # reduce memory movement by keeping modules onloaded - with keep_onload_context(): + with disable_offloading(): # do a preliminary pass to trigger 
modifier hooks for batch_idx in tqdm.tqdm(range(len(dataloader)), desc=calib_desc): inputs = intermediates.fetch(batch_idx) diff --git a/src/llmcompressor/pipelines/sequential/helpers.py b/src/llmcompressor/pipelines/sequential/helpers.py index ee282f73a..6cb63acdd 100644 --- a/src/llmcompressor/pipelines/sequential/helpers.py +++ b/src/llmcompressor/pipelines/sequential/helpers.py @@ -28,7 +28,7 @@ "trace_subgraphs", "Subgraph", "get_targets_from_modifiers", - "keep_onload_context", + "disable_offloading", ] @@ -494,7 +494,11 @@ def is_ancestor(module: Module) -> bool: @contextlib.contextmanager -def keep_onload_context(): +def disable_offloading(): + """ + Keep modules onloaded and disable offloading until this context exits. + Affects modules which have been hooked with accelerate's `AlignDevicesHook` + """ original_pre_forward = AlignDevicesHook.pre_forward onloaded_modules = dict() diff --git a/src/llmcompressor/pipelines/sequential/pipeline.py b/src/llmcompressor/pipelines/sequential/pipeline.py index 4af40f772..ab794daa4 100644 --- a/src/llmcompressor/pipelines/sequential/pipeline.py +++ b/src/llmcompressor/pipelines/sequential/pipeline.py @@ -11,8 +11,8 @@ from llmcompressor.pipelines.cache import IntermediatesCache from llmcompressor.pipelines.registry import CalibrationPipeline from llmcompressor.pipelines.sequential.helpers import ( + disable_offloading, get_targets_from_modifiers, - keep_onload_context, trace_subgraphs, ) from llmcompressor.utils.helpers import DisableQuantization, calibration_forward_context @@ -86,7 +86,7 @@ def __call__( prop_desc = f"({subgraph_index + 1}/{num_subgraphs}): Propagating" # reduce memory movement by keeping modules onloaded - with keep_onload_context(): + with disable_offloading(): # do a preliminary pass to trigger modifier hooks for batch_idx in tqdm(range(len(dataloader)), desc=calib_desc): inputs = activations.fetch(batch_idx, subgraph.input_names) diff --git a/src/llmcompressor/utils/module.py b/src/llmcompressor/utils/module.py deleted file mode 100644 index 0867b3955..000000000 --- a/src/llmcompressor/utils/module.py +++ /dev/null @@ -1,29 +0,0 @@ -from typing import Callable, Union - -import tqdm -from torch.nn import Module - - -def module_bfs( - module: Module, - func: Callable[[Module], Module], - pre: bool = True, - progress: Union[bool, tqdm.tqdm] = False, -) -> Module: - if progress is True: - total = len(list(module.modules())) - progress = tqdm.tqdm(total=total) - - if pre: - module = func(module) - - for name, child in list(module.named_children()): - module.add_module(name, module_bfs(child, func, pre, progress)) - - if not pre: - module = func(module) - - if isinstance(progress, tqdm.tqdm): - progress.update(1) - - return module From 819df1ccb3a408c1f78a4e090341b7b9d6c0f92b Mon Sep 17 00:00:00 2001 From: Kyle Sayers Date: Thu, 12 Jun 2025 16:12:53 -0400 Subject: [PATCH 12/36] rename function Signed-off-by: Kyle Sayers --- src/llmcompressor/entrypoints/oneshot.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/llmcompressor/entrypoints/oneshot.py b/src/llmcompressor/entrypoints/oneshot.py index 0a7cff81e..8bcf061f5 100644 --- a/src/llmcompressor/entrypoints/oneshot.py +++ b/src/llmcompressor/entrypoints/oneshot.py @@ -4,7 +4,7 @@ import torch from accelerate.hooks import remove_hook_from_module -from compressed_tensors.utils import force_cpu_offload +from compressed_tensors.utils import offloaded_dispatch from loguru import logger from torch.utils.data import DataLoader from transformers import 
PreTrainedModel @@ -129,7 +129,7 @@ def __init__( # offload to cpu if possible if "cuda" in str(model_args.oneshot_device) and torch.cuda.is_available(): remove_hook_from_module(model_args.model, recurse=True) - force_cpu_offload(model_args.model, model_args.oneshot_device) + offloaded_dispatch(model_args.model, model_args.oneshot_device) else: logger.warning("CUDA is not available! Compressing model on CPU instead") From 7dd71b94cb5b55895f1f3c97e4f2566470007a2a Mon Sep 17 00:00:00 2001 From: Kyle Sayers Date: Thu, 12 Jun 2025 16:51:37 -0400 Subject: [PATCH 13/36] add dispatch utility Signed-off-by: Kyle Sayers --- examples/quantization_w4a16/llama3_example.py | 20 +++++++++---------- src/llmcompressor/utils/dev.py | 17 ++++++++++++++-- 2 files changed, 25 insertions(+), 12 deletions(-) diff --git a/examples/quantization_w4a16/llama3_example.py b/examples/quantization_w4a16/llama3_example.py index 9b0b0ffaa..d0f3485d4 100644 --- a/examples/quantization_w4a16/llama3_example.py +++ b/examples/quantization_w4a16/llama3_example.py @@ -3,6 +3,7 @@ from llmcompressor.modifiers.quantization import GPTQModifier from llmcompressor.transformers import oneshot +from llmcompressor.utils.dev import dispatch_for_generation # Select model and load it. model_id = "meta-llama/Meta-Llama-3-8B-Instruct" @@ -61,18 +62,17 @@ def tokenize(sample): num_calibration_samples=NUM_CALIBRATION_SAMPLES, ) -# Save to disk compressed. -SAVE_DIR = model_id.split("/")[1] + "-W4A16-G128" -model.save_pretrained(SAVE_DIR, save_compressed=True) -tokenizer.save_pretrained(SAVE_DIR) - -# Load model after saving -model = AutoModelForCausalLM.from_pretrained(SAVE_DIR, device_map="auto") - # Confirm generations of the quantized model look sane. print("\n\n") print("========== SAMPLE GENERATION ==============") -input_ids = tokenizer("Hello my name is", return_tensors="pt").input_ids.to("cuda") -output = model.generate(input_ids, max_new_tokens=100) +dispatch_for_generation(model) +sample = tokenizer("Hello my name is", return_tensors="pt") +sample = {key: value.to("cuda") for key, value in sample.items()} +output = model.generate(**sample, max_new_tokens=100) print(tokenizer.decode(output[0])) print("==========================================\n\n") + +# Save to disk compressed. 
+SAVE_DIR = model_id.split("/")[-1] + "-W4A16-G128" +model.save_pretrained(SAVE_DIR, save_compressed=True) +tokenizer.save_pretrained(SAVE_DIR) \ No newline at end of file diff --git a/src/llmcompressor/utils/dev.py b/src/llmcompressor/utils/dev.py index 4af08448b..b1e5c014d 100644 --- a/src/llmcompressor/utils/dev.py +++ b/src/llmcompressor/utils/dev.py @@ -2,7 +2,7 @@ import logging import os import tempfile -from typing import Type +from typing import Type, Dict, Any, Union import torch from huggingface_hub import snapshot_download @@ -10,10 +10,12 @@ from transformers import AutoModelForCausalLM, PreTrainedModel from transformers.modeling_utils import TORCH_INIT_FUNCTIONS from transformers.utils import SAFE_WEIGHTS_INDEX_NAME, WEIGHTS_INDEX_NAME +from accelerate import dispatch_model, infer_auto_device_map +from accelerate.utils import get_balanced_memory from llmcompressor.utils.helpers import patch_attr -__all__ = ["skip_weights_download", "patch_transformers_logger_level"] +__all__ = ["skip_weights_download", "patch_transformers_logger_level", "dispatch_for_generation"] @contextlib.contextmanager @@ -106,3 +108,14 @@ def patch_transformers_logger_level(level: int = logging.ERROR): transformers_logger.setLevel(level=level) yield transformers_logger.setLevel(level=restore_log_level) + + +def dispatch_for_generation(model: PreTrainedModel) -> PreTrainedModel: + max_memory = get_balanced_memory( + model, + dtype=model.dtype, + no_split_module_classes=model._get_no_split_modules("auto") + ) + device_map = infer_auto_device_map(model, dtype=model.dtype, max_memory=max_memory) + + return dispatch_model(model, device_map=device_map) \ No newline at end of file From 8ba0f2cf2d582ed6442b242f01755aa0c006e6c0 Mon Sep 17 00:00:00 2001 From: Kyle Sayers Date: Thu, 12 Jun 2025 17:07:28 -0400 Subject: [PATCH 14/36] apply style Signed-off-by: Kyle Sayers --- examples/quantization_w4a16/llama3_example.py | 2 +- src/llmcompressor/utils/dev.py | 16 ++++++++++------ 2 files changed, 11 insertions(+), 7 deletions(-) diff --git a/examples/quantization_w4a16/llama3_example.py b/examples/quantization_w4a16/llama3_example.py index d0f3485d4..d487a911b 100644 --- a/examples/quantization_w4a16/llama3_example.py +++ b/examples/quantization_w4a16/llama3_example.py @@ -75,4 +75,4 @@ def tokenize(sample): # Save to disk compressed. 
SAVE_DIR = model_id.split("/")[-1] + "-W4A16-G128" model.save_pretrained(SAVE_DIR, save_compressed=True) -tokenizer.save_pretrained(SAVE_DIR) \ No newline at end of file +tokenizer.save_pretrained(SAVE_DIR) diff --git a/src/llmcompressor/utils/dev.py b/src/llmcompressor/utils/dev.py index b1e5c014d..9c4bbbe5e 100644 --- a/src/llmcompressor/utils/dev.py +++ b/src/llmcompressor/utils/dev.py @@ -2,20 +2,24 @@ import logging import os import tempfile -from typing import Type, Dict, Any, Union +from typing import Type import torch +from accelerate import dispatch_model, infer_auto_device_map +from accelerate.utils import get_balanced_memory from huggingface_hub import snapshot_download from safetensors.torch import save_file from transformers import AutoModelForCausalLM, PreTrainedModel from transformers.modeling_utils import TORCH_INIT_FUNCTIONS from transformers.utils import SAFE_WEIGHTS_INDEX_NAME, WEIGHTS_INDEX_NAME -from accelerate import dispatch_model, infer_auto_device_map -from accelerate.utils import get_balanced_memory from llmcompressor.utils.helpers import patch_attr -__all__ = ["skip_weights_download", "patch_transformers_logger_level", "dispatch_for_generation"] +__all__ = [ + "skip_weights_download", + "patch_transformers_logger_level", + "dispatch_for_generation", +] @contextlib.contextmanager @@ -114,8 +118,8 @@ def dispatch_for_generation(model: PreTrainedModel) -> PreTrainedModel: max_memory = get_balanced_memory( model, dtype=model.dtype, - no_split_module_classes=model._get_no_split_modules("auto") + no_split_module_classes=model._get_no_split_modules("auto"), ) device_map = infer_auto_device_map(model, dtype=model.dtype, max_memory=max_memory) - return dispatch_model(model, device_map=device_map) \ No newline at end of file + return dispatch_model(model, device_map=device_map) From fbf2a6d1b035aa98d117f635e745ddb19f8543b8 Mon Sep 17 00:00:00 2001 From: Kyle Sayers Date: Fri, 13 Jun 2025 09:56:33 -0400 Subject: [PATCH 15/36] update examples Signed-off-by: Kyle Sayers --- examples/awq/llama_example.py | 21 ++++++++-------- examples/awq/qwen3_moe_example.py | 24 ++++++++++-------- .../fp8_compressed_inference.py | 6 ++++- examples/multimodal_audio/whisper_example.py | 22 ++++++++-------- examples/multimodal_vision/gemma3_example.py | 15 ++++++----- .../multimodal_vision/idefics3_example.py | 15 ++++++----- examples/multimodal_vision/llava_example.py | 15 ++++++----- .../multimodal_vision/mistral3_example.py | 15 ++++++----- examples/multimodal_vision/mllama_example.py | 15 ++++++----- .../multimodal_vision/phi3_vision_example.py | 15 ++++++----- examples/multimodal_vision/pixtral_example.py | 15 ++++++----- .../multimodal_vision/qwen2_vl_example.py | 16 ++++++------ .../multimodal_vision/qwen_2_5_vl_example.py | 16 ++++++------ .../llama7b_sparse_w4a16.py | 10 +++++--- .../gemma2_fp8_kv_example.py | 21 ++++++++-------- .../llama3_fp8_kv_example.py | 25 ++++++++----------- .../phi3.5_fp8_kv_example.py | 15 ++++++----- .../quantization_w4a16_fp4/llama3_example.py | 17 ++++++------- .../quantization_w4a4_fp4/llama3_example.py | 24 +++++++++--------- .../quantization_w8a8_fp8/gemma2_example.py | 21 +++++++++------- .../llama3.2_vision_example.py | 17 ++++++------- .../quantization_w8a8_fp8/llama3_example.py | 15 ++++++----- .../quantization_w8a8_fp8/llava1.5_example.py | 17 ++++++------- .../quantization_w8a8_fp8/qwen2vl_example.py | 17 ++++++------- .../quantization_w8a8_fp8/whisper_example.py | 15 ++++++----- .../quantization_w8a8_int8/gemma2_example.py | 14 ++++++----- 
.../quantization_w8a8_int8/llama3_example.py | 15 ++++++----- examples/quantizing_moe/deepseek_moe_w4a16.py | 13 +++++++++- .../quantizing_moe/deepseek_moe_w8a8_fp8.py | 17 ++++++------- .../quantizing_moe/deepseek_moe_w8a8_int8.py | 15 ++++++----- .../quantizing_moe/mixtral_moe_w8a8_fp8.py | 16 ++++++------ examples/quantizing_moe/qwen_moe_w4a16.py | 15 ++++++----- .../llama3_8b_2of4.py | 13 +++++----- examples/trl_mixin/ex_trl_constant.py | 4 +-- 34 files changed, 270 insertions(+), 276 deletions(-) diff --git a/examples/awq/llama_example.py b/examples/awq/llama_example.py index d456f2b3b..9d2c724d7 100644 --- a/examples/awq/llama_example.py +++ b/examples/awq/llama_example.py @@ -5,10 +5,12 @@ from llmcompressor.modifiers.awq import AWQModifier # Select model and load it. -model_id = "meta-llama/Meta-Llama-3-8B-Instruct" +MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct" -model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype="auto") -tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True) +model = AutoModelForCausalLM.from_pretrained( + MODEL_ID, device_map="auto", torch_dtype="auto" +) +tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True) # Select calibration dataset. DATASET_ID = "mit-han-lab/pile-val-backup" @@ -61,14 +63,6 @@ def tokenize(sample): num_calibration_samples=NUM_CALIBRATION_SAMPLES, ) -# Save to disk compressed. -SAVE_DIR = model_id.split("/")[-1] + "-awq-asym" -model.save_pretrained(SAVE_DIR, save_compressed=True) -tokenizer.save_pretrained(SAVE_DIR) - -# Load model after saving -model = AutoModelForCausalLM.from_pretrained(SAVE_DIR, device_map="auto") - # Confirm generations of the quantized model look sane. print("\n\n") print("========== SAMPLE GENERATION ==============") @@ -76,3 +70,8 @@ def tokenize(sample): output = model.generate(input_ids, max_new_tokens=100) print(tokenizer.decode(output[0])) print("==========================================\n\n") + +# Save to disk compressed. +SAVE_DIR = MODEL_ID.split("/")[-1] + "-awq-asym" +model.save_pretrained(SAVE_DIR, save_compressed=True) +tokenizer.save_pretrained(SAVE_DIR) diff --git a/examples/awq/qwen3_moe_example.py b/examples/awq/qwen3_moe_example.py index fa171f0af..3c16d2f43 100644 --- a/examples/awq/qwen3_moe_example.py +++ b/examples/awq/qwen3_moe_example.py @@ -3,11 +3,15 @@ from llmcompressor import oneshot from llmcompressor.modifiers.awq import AWQModifier +from llmcompressor.utils.dev import dispatch_for_generation # Select model and load it. -model_id = "Qwen/Qwen3-30B-A3B" -model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype="auto") -tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True) +MODEL_ID = "Qwen/Qwen3-30B-A3B" + +model = AutoModelForCausalLM.from_pretrained( + MODEL_ID, device_map="auto", torch_dtype="auto" +) +tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True) # Select calibration dataset. DATASET_ID = "mit-han-lab/pile-val-backup" @@ -65,18 +69,16 @@ def tokenize(sample): num_calibration_samples=NUM_CALIBRATION_SAMPLES, ) -# Save to disk compressed. -SAVE_DIR = model_id.split("/")[-1] + "-awq-sym" -model.save_pretrained(SAVE_DIR, save_compressed=True) -tokenizer.save_pretrained(SAVE_DIR) - -# Load model after saving -model = AutoModelForCausalLM.from_pretrained(SAVE_DIR, device_map="auto") - # Confirm generations of the quantized model look sane. 
print("\n\n") print("========== SAMPLE GENERATION ==============") +dispatch_for_generation(model) input_ids = tokenizer("Hello my name is", return_tensors="pt").input_ids.to("cuda") output = model.generate(input_ids, max_new_tokens=100) print(tokenizer.decode(output[0])) print("==========================================\n\n") + +# Save to disk compressed. +SAVE_DIR = MODEL_ID.split("/")[-1] + "-awq-sym" +model.save_pretrained(SAVE_DIR, save_compressed=True) +tokenizer.save_pretrained(SAVE_DIR) diff --git a/examples/compressed_inference/fp8_compressed_inference.py b/examples/compressed_inference/fp8_compressed_inference.py index 57debe2fd..f0d0381d2 100644 --- a/examples/compressed_inference/fp8_compressed_inference.py +++ b/examples/compressed_inference/fp8_compressed_inference.py @@ -19,7 +19,11 @@ "def fibonacci(n):", ] -compressed_model = AutoModelForCausalLM.from_pretrained(MODEL_STUB, torch_dtype="auto") +compressed_model = AutoModelForCausalLM.from_pretrained( + MODEL_STUB, + torch_dtype="auto", + device_map="cuda:0", +) # tokenize the sample data tokenizer = AutoTokenizer.from_pretrained(MODEL_STUB) diff --git a/examples/multimodal_audio/whisper_example.py b/examples/multimodal_audio/whisper_example.py index 4992f78d1..8a6f5d748 100644 --- a/examples/multimodal_audio/whisper_example.py +++ b/examples/multimodal_audio/whisper_example.py @@ -4,12 +4,14 @@ from llmcompressor import oneshot from llmcompressor.modifiers.quantization import GPTQModifier +from llmcompressor.utils.dev import dispatch_for_generation # Select model and load it. -model_id = "openai/whisper-large-v3" -model = WhisperForConditionalGeneration.from_pretrained(model_id, torch_dtype="auto") +MODEL_ID = "openai/whisper-large-v3" + +model = WhisperForConditionalGeneration.from_pretrained(MODEL_ID, torch_dtype="auto") model.config.forced_decoder_ids = None -processor = WhisperProcessor.from_pretrained(model_id) +processor = WhisperProcessor.from_pretrained(MODEL_ID) # Configure processor the dataset task. processor.tokenizer.set_prefix_tokens(language="en", task="transcribe") @@ -83,17 +85,10 @@ def data_collator(batch): data_collator=data_collator, ) -# Save to disk compressed. -SAVE_DIR = model_id.split("/")[1] + "-W4A16-G128" -model.save_pretrained(SAVE_DIR, save_compressed=True) -processor.save_pretrained(SAVE_DIR) - -# Load model after saving -model = WhisperForConditionalGeneration.from_pretrained(SAVE_DIR, device_map="auto") - # Confirm generations of the quantized model look sane. print("\n\n") print("========== SAMPLE GENERATION ==============") +dispatch_for_generation(model) sample_features = next(iter(ds))["input_features"] sample_decoder_ids = [processor.tokenizer.prefix_tokens] sample_input = { @@ -106,3 +101,8 @@ def data_collator(batch): # that's where you have a lot of windows in the south no actually that's passive solar # and passive solar is something that was developed and designed in the 1960s and 70s # and it was a great thing for what it was at the time but it's not a passive house + +# Save to disk compressed. 
+SAVE_DIR = MODEL_ID.split("/")[1] + "-W4A16-G128" +model.save_pretrained(SAVE_DIR, save_compressed=True) +processor.save_pretrained(SAVE_DIR) diff --git a/examples/multimodal_vision/gemma3_example.py b/examples/multimodal_vision/gemma3_example.py index 50c606377..2b9676cdd 100644 --- a/examples/multimodal_vision/gemma3_example.py +++ b/examples/multimodal_vision/gemma3_example.py @@ -5,6 +5,7 @@ from llmcompressor import oneshot from llmcompressor.modifiers.quantization import GPTQModifier +from llmcompressor.utils.dev import dispatch_for_generation # Load model. model_id = "google/gemma-3-4b-it" @@ -46,16 +47,9 @@ def data_collator(batch): data_collator=data_collator, ) -# Save to disk compressed. -SAVE_DIR = model_id.split("/")[1] + "-W4A16-G128" -model.save_pretrained(SAVE_DIR, save_compressed=True) -processor.save_pretrained(SAVE_DIR) - -# Load model after saving -model = Gemma3ForConditionalGeneration.from_pretrained(SAVE_DIR, device_map="auto") - # Confirm generations of the quantized model look sane. print("========== SAMPLE GENERATION ==============") +dispatch_for_generation(model) messages = [ { "role": "user", @@ -74,3 +68,8 @@ def data_collator(batch): output = model.generate(**inputs, max_new_tokens=100, disable_compile=True) print(processor.decode(output[0], skip_special_tokens=True)) print("==========================================") + +# Save to disk compressed. +SAVE_DIR = model_id.split("/")[1] + "-W4A16-G128" +model.save_pretrained(SAVE_DIR, save_compressed=True) +processor.save_pretrained(SAVE_DIR) diff --git a/examples/multimodal_vision/idefics3_example.py b/examples/multimodal_vision/idefics3_example.py index fcc4559d8..ede27ac5b 100644 --- a/examples/multimodal_vision/idefics3_example.py +++ b/examples/multimodal_vision/idefics3_example.py @@ -6,6 +6,7 @@ from llmcompressor import oneshot from llmcompressor.modifiers.quantization import GPTQModifier +from llmcompressor.utils.dev import dispatch_for_generation # Load model. model_id = "HuggingFaceM4/Idefics3-8B-Llama3" # or "HuggingFaceTB/SmolVLM-Instruct" @@ -92,16 +93,9 @@ def tokenize(sample): data_collator=data_collator, ) -# Save to disk compressed. -SAVE_DIR = model_id.split("/")[1] + "-W4A16-G128" -model.save_pretrained(SAVE_DIR, save_compressed=True) -processor.save_pretrained(SAVE_DIR) - -# Load model after saving -model = Idefics3ForConditionalGeneration.from_pretrained(SAVE_DIR, device_map="auto") - # Confirm generations of the quantized model look sane. print("========== SAMPLE GENERATION ==============") +dispatch_for_generation(model) messages = [ { "role": "user", @@ -119,3 +113,8 @@ def tokenize(sample): output = model.generate(**inputs, max_new_tokens=100) print(processor.decode(output[0], skip_special_tokens=True)) print("==========================================") + +# Save to disk compressed. +SAVE_DIR = model_id.split("/")[1] + "-W4A16-G128" +model.save_pretrained(SAVE_DIR, save_compressed=True) +processor.save_pretrained(SAVE_DIR) diff --git a/examples/multimodal_vision/llava_example.py b/examples/multimodal_vision/llava_example.py index abb781d48..a2bf9b020 100644 --- a/examples/multimodal_vision/llava_example.py +++ b/examples/multimodal_vision/llava_example.py @@ -5,6 +5,7 @@ from llmcompressor import oneshot from llmcompressor.modifiers.quantization import GPTQModifier +from llmcompressor.utils.dev import dispatch_for_generation # Load model. model_id = "llava-hf/llava-1.5-7b-hf" @@ -47,16 +48,9 @@ def data_collator(batch): data_collator=data_collator, ) -# Save to disk compressed. 
-SAVE_DIR = model_id.split("/")[1] + "-W4A16-G128" -model.save_pretrained(SAVE_DIR, save_compressed=True) -processor.save_pretrained(SAVE_DIR) - -# Load model after saving -model = LlavaForConditionalGeneration.from_pretrained(SAVE_DIR, device_map="auto") - # Confirm generations of the quantized model look sane. print("========== SAMPLE GENERATION ==============") +dispatch_for_generation(model) messages = [ { "role": "user", @@ -74,3 +68,8 @@ def data_collator(batch): output = model.generate(**inputs, max_new_tokens=100) print(processor.decode(output[0], skip_special_tokens=True)) print("==========================================") + +# Save to disk compressed. +SAVE_DIR = model_id.split("/")[1] + "-W4A16-G128" +model.save_pretrained(SAVE_DIR, save_compressed=True) +processor.save_pretrained(SAVE_DIR) diff --git a/examples/multimodal_vision/mistral3_example.py b/examples/multimodal_vision/mistral3_example.py index 50b7e7dd5..9413359e9 100644 --- a/examples/multimodal_vision/mistral3_example.py +++ b/examples/multimodal_vision/mistral3_example.py @@ -8,6 +8,7 @@ from llmcompressor import oneshot from llmcompressor.modifiers.quantization import GPTQModifier +from llmcompressor.utils.dev import dispatch_for_generation # Load model. model_id = "mistralai/Mistral-Small-3.1-24B-Instruct-2503" @@ -60,16 +61,9 @@ def data_collator(batch): data_collator=data_collator, ) -# Save to disk compressed. -SAVE_DIR = model_id.split("/")[1] + "-W4A16-G128" -model.save_pretrained(SAVE_DIR, save_compressed=True) -processor.save_pretrained(SAVE_DIR) - -# Load model after saving -model = Mistral3ForConditionalGeneration.from_pretrained(SAVE_DIR, device_map="auto") - # Confirm generations of the quantized model look sane. print("========== SAMPLE GENERATION ==============") +dispatch_for_generation(model) messages = [ { "role": "user", @@ -88,3 +82,8 @@ def data_collator(batch): output = model.generate(**inputs, max_new_tokens=100) print(processor.decode(output[0], skip_special_tokens=True)) print("==========================================") + +# Save to disk compressed. +SAVE_DIR = model_id.split("/")[1] + "-W4A16-G128" +model.save_pretrained(SAVE_DIR, save_compressed=True) +processor.save_pretrained(SAVE_DIR) diff --git a/examples/multimodal_vision/mllama_example.py b/examples/multimodal_vision/mllama_example.py index 7cd45c85a..2d92319f7 100644 --- a/examples/multimodal_vision/mllama_example.py +++ b/examples/multimodal_vision/mllama_example.py @@ -5,6 +5,7 @@ from llmcompressor import oneshot from llmcompressor.modifiers.quantization import GPTQModifier +from llmcompressor.utils.dev import dispatch_for_generation # Load model. model_id = "meta-llama/Llama-3.2-11B-Vision-Instruct" @@ -47,16 +48,9 @@ def data_collator(batch): data_collator=data_collator, ) -# Save to disk compressed. -SAVE_DIR = model_id.split("/")[1] + "-W4A16-G128" -model.save_pretrained(SAVE_DIR, save_compressed=True) -processor.save_pretrained(SAVE_DIR) - -# Load model after saving -model = MllamaForConditionalGeneration.from_pretrained(SAVE_DIR, device_map="auto") - # Confirm generations of the quantized model look sane. print("========== SAMPLE GENERATION ==============") +dispatch_for_generation(model) messages = [ { "role": "user", @@ -74,3 +68,8 @@ def data_collator(batch): output = model.generate(**inputs, max_new_tokens=100) print(processor.decode(output[0], skip_special_tokens=True)) print("==========================================") + +# Save to disk compressed. 
+SAVE_DIR = model_id.split("/")[1] + "-W4A16-G128" +model.save_pretrained(SAVE_DIR, save_compressed=True) +processor.save_pretrained(SAVE_DIR) diff --git a/examples/multimodal_vision/phi3_vision_example.py b/examples/multimodal_vision/phi3_vision_example.py index 772001cb1..2b6f66714 100644 --- a/examples/multimodal_vision/phi3_vision_example.py +++ b/examples/multimodal_vision/phi3_vision_example.py @@ -7,6 +7,7 @@ from llmcompressor import oneshot from llmcompressor.modifiers.quantization import GPTQModifier +from llmcompressor.utils.dev import dispatch_for_generation # Load model. model_id = "microsoft/Phi-3-vision-128k-instruct" @@ -78,14 +79,6 @@ def data_collator(batch): ignore=["lm_head", "re:model.vision_embed_tokens.*"], ) -# Save to disk compressed. -SAVE_DIR = model_id.split("/")[1] + "-W4A16-G128" -model.save_pretrained(SAVE_DIR, save_compressed=True) -processor.save_pretrained(SAVE_DIR) - -# Load model after saving -model = AutoModelForCausalLM.from_pretrained(SAVE_DIR, device_map="auto") - # Perform oneshot oneshot( model=model, @@ -99,7 +92,13 @@ def data_collator(batch): # Confirm generations of the quantized model look sane. print("========== SAMPLE GENERATION ==============") +dispatch_for_generation(model) input_ids = processor(text="Hello my name is", return_tensors="pt").input_ids.to("cuda") output = model.generate(input_ids, max_new_tokens=20) print(processor.decode(output[0])) print("==========================================") + +# Save to disk compressed. +SAVE_DIR = model_id.split("/")[1] + "-W4A16-G128" +model.save_pretrained(SAVE_DIR, save_compressed=True) +processor.save_pretrained(SAVE_DIR) diff --git a/examples/multimodal_vision/pixtral_example.py b/examples/multimodal_vision/pixtral_example.py index 71c1a0770..035af6061 100644 --- a/examples/multimodal_vision/pixtral_example.py +++ b/examples/multimodal_vision/pixtral_example.py @@ -5,6 +5,7 @@ from llmcompressor import oneshot from llmcompressor.modifiers.quantization import GPTQModifier +from llmcompressor.utils.dev import dispatch_for_generation # Load model. model_id = "mgoin/pixtral-12b" @@ -53,16 +54,9 @@ def data_collator(batch): data_collator=data_collator, ) -# Save to disk compressed. -SAVE_DIR = model_id.split("/")[1] + "-W4A16-G128" -model.save_pretrained(SAVE_DIR, save_compressed=True) -processor.save_pretrained(SAVE_DIR) - -# Load model after saving -model = LlavaForConditionalGeneration.from_pretrained(SAVE_DIR, device_map="auto") - # Confirm generations of the quantized model look sane. print("========== SAMPLE GENERATION ==============") +dispatch_for_generation(model) messages = [ { "role": "user", @@ -80,3 +74,8 @@ def data_collator(batch): output = model.generate(**inputs, max_new_tokens=100) print(processor.decode(output[0], skip_special_tokens=True)) print("==========================================") + +# Save to disk compressed. +SAVE_DIR = model_id.split("/")[1] + "-W4A16-G128" +model.save_pretrained(SAVE_DIR, save_compressed=True) +processor.save_pretrained(SAVE_DIR) diff --git a/examples/multimodal_vision/qwen2_vl_example.py b/examples/multimodal_vision/qwen2_vl_example.py index 27e0954d3..b072b6ff9 100644 --- a/examples/multimodal_vision/qwen2_vl_example.py +++ b/examples/multimodal_vision/qwen2_vl_example.py @@ -8,6 +8,7 @@ from llmcompressor import oneshot from llmcompressor.modifiers.quantization import GPTQModifier +from llmcompressor.utils.dev import dispatch_for_generation # Load model. 
model_id = "Qwen/Qwen2-VL-2B-Instruct" @@ -95,16 +96,9 @@ def data_collator(batch): data_collator=data_collator, ) -# Save to disk compressed. -SAVE_DIR = model_id.split("/")[1] + "-W4A16-G128" -model.save_pretrained(SAVE_DIR, save_compressed=True) -processor.save_pretrained(SAVE_DIR) - -# Load model after saving -model = Qwen2VLForConditionalGeneration.from_pretrained(SAVE_DIR, device_map="auto") - # Confirm generations of the quantized model look sane. print("========== SAMPLE GENERATION ==============") +dispatch_for_generation(model) messages = [ { "role": "user", @@ -131,3 +125,9 @@ def data_collator(batch): output = model.generate(**inputs, max_new_tokens=100) print(processor.decode(output[0], skip_special_tokens=True)) print("==========================================") + + +# Save to disk compressed. +SAVE_DIR = model_id.split("/")[1] + "-W4A16-G128" +model.save_pretrained(SAVE_DIR, save_compressed=True) +processor.save_pretrained(SAVE_DIR) diff --git a/examples/multimodal_vision/qwen_2_5_vl_example.py b/examples/multimodal_vision/qwen_2_5_vl_example.py index 68798cf00..8dffa5216 100644 --- a/examples/multimodal_vision/qwen_2_5_vl_example.py +++ b/examples/multimodal_vision/qwen_2_5_vl_example.py @@ -8,6 +8,7 @@ from llmcompressor.modifiers.quantization import GPTQModifier from llmcompressor.transformers import oneshot +from llmcompressor.utils.dev import dispatch_for_generation # Load model. model_id = "Qwen/Qwen2.5-VL-7B-Instruct" @@ -89,16 +90,9 @@ def data_collator(batch): data_collator=data_collator, ) -# Save to disk compressed. -SAVE_DIR = model_id.split("/")[1] + "-W4A16-G128" -model.save_pretrained(SAVE_DIR, save_compressed=True) -processor.save_pretrained(SAVE_DIR) - -# Load model after saving -model = Qwen2_5_VLForConditionalGeneration.from_pretrained(SAVE_DIR, device_map="auto") - # Confirm generations of the quantized model look sane. print("========== SAMPLE GENERATION ==============") +dispatch_for_generation(model) messages = [ { "role": "user", @@ -125,3 +119,9 @@ def data_collator(batch): output = model.generate(**inputs, max_new_tokens=100) print(processor.decode(output[0], skip_special_tokens=True)) print("==========================================") + + +# Save to disk compressed. +SAVE_DIR = model_id.split("/")[1] + "-W4A16-G128" +model.save_pretrained(SAVE_DIR, save_compressed=True) +processor.save_pretrained(SAVE_DIR) diff --git a/examples/quantization_2of4_sparse_w4a16/llama7b_sparse_w4a16.py b/examples/quantization_2of4_sparse_w4a16/llama7b_sparse_w4a16.py index 76a5f2972..6ed01e7d1 100644 --- a/examples/quantization_2of4_sparse_w4a16/llama7b_sparse_w4a16.py +++ b/examples/quantization_2of4_sparse_w4a16/llama7b_sparse_w4a16.py @@ -6,7 +6,9 @@ # load the model in as bfloat16 to save on memory and compute model_stub = "neuralmagic/Llama-2-7b-ultrachat200k" -model = AutoModelForCausalLM.from_pretrained(model_stub, torch_dtype=torch.bfloat16) +model = AutoModelForCausalLM.from_pretrained( + model_stub, torch_dtype=torch.bfloat16, device_map="auto" +) tokenizer = AutoTokenizer.from_pretrained(model_stub) # uses LLM Compressor's built-in preprocessing for ultra chat @@ -88,8 +90,8 @@ tokenizer.save_pretrained(f"{output_dir}/quantization_stage") logger.info( - "llmcompressor does not currently support running " + "llmcompressor does not currently support running ", "compressed models in the marlin24 format. " - "The model produced from this example can be " - "run on vLLM with dtype=torch.float16." 
+ "The model produced from this example can be ", + "run on vLLM with dtype=torch.float16.", ) diff --git a/examples/quantization_kv_cache/gemma2_fp8_kv_example.py b/examples/quantization_kv_cache/gemma2_fp8_kv_example.py index cf5501bc9..44691914a 100644 --- a/examples/quantization_kv_cache/gemma2_fp8_kv_example.py +++ b/examples/quantization_kv_cache/gemma2_fp8_kv_example.py @@ -2,11 +2,12 @@ from transformers import AutoModelForCausalLM, AutoTokenizer from llmcompressor import oneshot +from llmcompressor.utils.dev import dispatch_for_generation # Select model and load it. -model_id = "google/gemma-2-9b-it" -model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype="auto") -tokenizer = AutoTokenizer.from_pretrained(model_id) +MODEL_ID = "google/gemma-2-9b-it" +model = AutoModelForCausalLM.from_pretrained(MODEL_ID, torch_dtype="auto") +tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) # Select calibration dataset. DATASET_ID = "HuggingFaceH4/ultrachat_200k" @@ -77,14 +78,6 @@ def process_and_tokenize(example): num_calibration_samples=NUM_CALIBRATION_SAMPLES, ) -# Save to disk compressed. -SAVE_DIR = model_id.split("/")[1] + "-FP8-KV" -model.save_pretrained(SAVE_DIR, save_compressed=True) -tokenizer.save_pretrained(SAVE_DIR) - -# Load model after saving -model = AutoModelForCausalLM.from_pretrained(SAVE_DIR, device_map="auto") - print( "Note: Inference with the quantized kv_cache is not supported. ", "Please use vLLM for inference with the quantized kv_cache.", @@ -95,8 +88,14 @@ def process_and_tokenize(example): # Consider either downgrading your transformers version to a previous version # or use vLLM for sample generation. print("\n\n") +dispatch_for_generation(model) print("========== SAMPLE GENERATION ==============") input_ids = tokenizer("Hello my name is", return_tensors="pt").input_ids.to("cuda") output = model.generate(input_ids, max_new_tokens=100) print(tokenizer.decode(output[0])) print("==========================================\n\n") + +# Save to disk compressed. +SAVE_DIR = MODEL_ID.split("/")[1] + "-FP8-KV" +model.save_pretrained(SAVE_DIR, save_compressed=True) +tokenizer.save_pretrained(SAVE_DIR) diff --git a/examples/quantization_kv_cache/llama3_fp8_kv_example.py b/examples/quantization_kv_cache/llama3_fp8_kv_example.py index 0cb52e0fd..6aaa809bb 100644 --- a/examples/quantization_kv_cache/llama3_fp8_kv_example.py +++ b/examples/quantization_kv_cache/llama3_fp8_kv_example.py @@ -3,15 +3,12 @@ from transformers import AutoModelForCausalLM, AutoTokenizer from llmcompressor import oneshot +from llmcompressor.utils.dev import dispatch_for_generation # Select model and load it. -model_id = "meta-llama/Meta-Llama-3-8B-Instruct" -model = AutoModelForCausalLM.from_pretrained( - model_id, - device_map="auto", - torch_dtype="auto", -) -tokenizer = AutoTokenizer.from_pretrained(model_id) +MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct" +model = AutoModelForCausalLM.from_pretrained(MODEL_ID, torch_dtype="auto") +tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) # Select calibration dataset. DATASET_ID = "HuggingFaceH4/ultrachat_200k" @@ -82,14 +79,6 @@ def process_and_tokenize(example): num_calibration_samples=NUM_CALIBRATION_SAMPLES, ) -# Save to disk compressed. -SAVE_DIR = model_id.split("/")[1] + "-FP8-KV" -model.save_pretrained(SAVE_DIR, save_compressed=True) -tokenizer.save_pretrained(SAVE_DIR) - -# Load model after saving -model = AutoModelForCausalLM.from_pretrained(SAVE_DIR, device_map="auto") - logger.info( "Running sample generation. 
", "Note: Inference with the quantized kv_cache is not supported. ", @@ -98,7 +87,13 @@ def process_and_tokenize(example): # Confirm generations of the quantized model look sane. print("\n\n") print("========== SAMPLE GENERATION ==============") +dispatch_for_generation(model) input_ids = tokenizer("Hello my name is", return_tensors="pt").input_ids.to("cuda") output = model.generate(input_ids, max_new_tokens=100) print(tokenizer.decode(output[0])) print("==========================================\n\n") + +# Save to disk compressed. +SAVE_DIR = MODEL_ID.split("/")[1] + "-FP8-KV" +model.save_pretrained(SAVE_DIR, save_compressed=True) +tokenizer.save_pretrained(SAVE_DIR) diff --git a/examples/quantization_kv_cache/phi3.5_fp8_kv_example.py b/examples/quantization_kv_cache/phi3.5_fp8_kv_example.py index 20e7fac4c..112f0d0b9 100644 --- a/examples/quantization_kv_cache/phi3.5_fp8_kv_example.py +++ b/examples/quantization_kv_cache/phi3.5_fp8_kv_example.py @@ -2,6 +2,7 @@ from transformers import AutoModelForCausalLM, AutoTokenizer from llmcompressor import oneshot +from llmcompressor.utils.dev import dispatch_for_generation # Select model and load it. # Phi-3.5 is a special case for KV cache quantization because it has @@ -79,14 +80,6 @@ def process_and_tokenize(example): num_calibration_samples=NUM_CALIBRATION_SAMPLES, ) -# Save to disk compressed. -SAVE_DIR = MODEL_ID.split("/")[1] + "-FP8-KV" -model.save_pretrained(SAVE_DIR, save_compressed=True) -tokenizer.save_pretrained(SAVE_DIR) - -# Load model after saving -model = AutoModelForCausalLM.from_pretrained(SAVE_DIR, device_map="auto") - print( "Note: Inference with the quantized kv_cache is not supported. ", "Please use vLLM for inference with the quantized kv_cache.", @@ -94,7 +87,13 @@ def process_and_tokenize(example): # Confirm generations of the quantized model look sane. print("\n\n") print("========== SAMPLE GENERATION ==============") +dispatch_for_generation(model) input_ids = tokenizer("Hello my name is", return_tensors="pt").input_ids.to("cuda") output = model.generate(input_ids, max_new_tokens=100) print(tokenizer.decode(output[0])) print("==========================================\n\n") + +# Save to disk compressed. +SAVE_DIR = MODEL_ID.split("/")[1] + "-FP8-KV" +model.save_pretrained(SAVE_DIR, save_compressed=True) +tokenizer.save_pretrained(SAVE_DIR) diff --git a/examples/quantization_w4a16_fp4/llama3_example.py b/examples/quantization_w4a16_fp4/llama3_example.py index 378f4f988..b6048e086 100644 --- a/examples/quantization_w4a16_fp4/llama3_example.py +++ b/examples/quantization_w4a16_fp4/llama3_example.py @@ -2,6 +2,7 @@ from llmcompressor import oneshot from llmcompressor.modifiers.quantization import QuantizationModifier +from llmcompressor.utils.dev import dispatch_for_generation MODEL_ID = "meta-llama/Llama-3.1-8B-Instruct" @@ -17,18 +18,16 @@ # Apply quantization. oneshot(model=model, recipe=recipe) -# Save to disk in compressed-tensors format. 
-SAVE_DIR = MODEL_ID.split("/")[1] + "-NVFP4A16" -model.save_pretrained(SAVE_DIR, save_compressed=True) -tokenizer.save_pretrained(SAVE_DIR) - -# Load model after saving -model = AutoModelForCausalLM.from_pretrained(SAVE_DIR, device_map="auto") - -# Validate model generations print("\n\n") print("========== SAMPLE GENERATION ==============") +dispatch_for_generation(model) input_ids = tokenizer("Hello my name is", return_tensors="pt").input_ids.to("cuda") output = model.generate(input_ids, max_new_tokens=100) print(tokenizer.decode(output[0])) print("==========================================\n\n") + + +# Save to disk in compressed-tensors format. +SAVE_DIR = MODEL_ID.split("/")[1] + "-NVFP4A16" +model.save_pretrained(SAVE_DIR, save_compressed=True) +tokenizer.save_pretrained(SAVE_DIR) diff --git a/examples/quantization_w4a4_fp4/llama3_example.py b/examples/quantization_w4a4_fp4/llama3_example.py index d84b749c0..0bd484f9c 100644 --- a/examples/quantization_w4a4_fp4/llama3_example.py +++ b/examples/quantization_w4a4_fp4/llama3_example.py @@ -3,11 +3,13 @@ from llmcompressor import oneshot from llmcompressor.modifiers.quantization import QuantizationModifier +from llmcompressor.utils.dev import dispatch_for_generation + +MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct" # Load model. -model_id = "meta-llama/Meta-Llama-3-8B-Instruct" -model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype="auto") -tokenizer = AutoTokenizer.from_pretrained(model_id) +model = AutoModelForCausalLM.from_pretrained(MODEL_ID, torch_dtype="auto") +tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) DATASET_ID = "HuggingFaceH4/ultrachat_200k" @@ -64,18 +66,16 @@ def tokenize(sample): num_calibration_samples=NUM_CALIBRATION_SAMPLES, ) -# Save to disk in compressed-tensors format. -SAVE_DIR = model_id.split("/")[1] + "-NVFP4" -model.save_pretrained(SAVE_DIR, save_compressed=True) -tokenizer.save_pretrained(SAVE_DIR) - -# Load model after saving -model = AutoModelForCausalLM.from_pretrained(SAVE_DIR, device_map="auto") - -# Validate model generations print("\n\n") print("========== SAMPLE GENERATION ==============") +dispatch_for_generation(model) input_ids = tokenizer("Hello my name is", return_tensors="pt").input_ids.to("cuda") output = model.generate(input_ids, max_new_tokens=100) print(tokenizer.decode(output[0])) print("==========================================\n\n") + + +# Save to disk in compressed-tensors format. +SAVE_DIR = MODEL_ID.split("/")[1] + "-NVFP4" +model.save_pretrained(SAVE_DIR, save_compressed=True) +tokenizer.save_pretrained(SAVE_DIR) diff --git a/examples/quantization_w8a8_fp8/gemma2_example.py b/examples/quantization_w8a8_fp8/gemma2_example.py index 374add135..9509d1505 100644 --- a/examples/quantization_w8a8_fp8/gemma2_example.py +++ b/examples/quantization_w8a8_fp8/gemma2_example.py @@ -2,6 +2,7 @@ from llmcompressor import oneshot from llmcompressor.modifiers.quantization import QuantizationModifier +from llmcompressor.utils.dev import dispatch_for_generation MODEL_ID = "google/gemma-2-27b-it" @@ -18,22 +19,24 @@ ) # 3) Apply quantization and save in compressed-tensors format. -oneshot(model=model, recipe=recipe, tokenizer=tokenizer) - -# 4) Save to disk in compressed-tensors format. 
-SAVE_DIR = MODEL_ID.split("/")[1] + "-FP8-Dynamic" -model.save_pretrained(SAVE_DIR, save_compressed=True) -tokenizer.save_pretrained(SAVE_DIR) - -# Load model after saving -model = AutoModelForCausalLM.from_pretrained(SAVE_DIR, device_map="auto") +oneshot( + model=model, + recipe=recipe, + tokenizer=tokenizer, +) # Confirm generations of the quantized model look sane. # NOTE: transformers 4.49.0 results in a generation error with gemma2. # Consider either downgrading your transformers version to a previous version # or use vLLM for sample generation. print("========== SAMPLE GENERATION ==============") +dispatch_for_generation(model) input_ids = tokenizer("Hello my name is", return_tensors="pt").input_ids.to("cuda") output = model.generate(input_ids, max_new_tokens=20) print(tokenizer.decode(output[0])) print("==========================================") + +# 4) Save to disk in compressed-tensors format. +SAVE_DIR = MODEL_ID.split("/")[1] + "-FP8-Dynamic" +model.save_pretrained(SAVE_DIR, save_compressed=True) +tokenizer.save_pretrained(SAVE_DIR) diff --git a/examples/quantization_w8a8_fp8/llama3.2_vision_example.py b/examples/quantization_w8a8_fp8/llama3.2_vision_example.py index 105e05483..a79214d36 100644 --- a/examples/quantization_w8a8_fp8/llama3.2_vision_example.py +++ b/examples/quantization_w8a8_fp8/llama3.2_vision_example.py @@ -2,6 +2,7 @@ from llmcompressor import oneshot from llmcompressor.modifiers.quantization import QuantizationModifier +from llmcompressor.utils.dev import dispatch_for_generation MODEL_ID = "meta-llama/Llama-3.2-11B-Vision-Instruct" @@ -19,20 +20,18 @@ ignore=["re:.*lm_head", "re:multi_modal_projector.*", "re:vision_model.*"], ) -# Apply quantization. +# Apply quantization and save to disk in compressed-tensors format. oneshot(model=model, recipe=recipe) -# Save to disk in compressed-tensors format. -SAVE_DIR = MODEL_ID.split("/")[1] + "-FP8-Dynamic" -model.save_pretrained(SAVE_DIR) -processor.save_pretrained(SAVE_DIR) - -# Load model after saving -model = MllamaForConditionalGeneration.from_pretrained(SAVE_DIR, device_map="auto") - # Confirm generations of the quantized model look sane. print("========== SAMPLE GENERATION ==============") +dispatch_for_generation(model) input_ids = processor(text="Hello my name is", return_tensors="pt").input_ids.to("cuda") output = model.generate(input_ids, max_new_tokens=20) print(processor.decode(output[0])) print("==========================================") + +# Save to disk in compressed-tensors format. +SAVE_DIR = MODEL_ID.split("/")[1] + "-FP8-Dynamic" +model.save_pretrained(SAVE_DIR, save_compressed=True) +processor.save_pretrained(SAVE_DIR) diff --git a/examples/quantization_w8a8_fp8/llama3_example.py b/examples/quantization_w8a8_fp8/llama3_example.py index badd6ac5e..440f0b584 100644 --- a/examples/quantization_w8a8_fp8/llama3_example.py +++ b/examples/quantization_w8a8_fp8/llama3_example.py @@ -2,6 +2,7 @@ from llmcompressor import oneshot from llmcompressor.modifiers.quantization import QuantizationModifier +from llmcompressor.utils.dev import dispatch_for_generation MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct" @@ -20,17 +21,15 @@ # Apply quantization. oneshot(model=model, recipe=recipe) -# Save to disk in compressed-tensors format. 
-SAVE_DIR = MODEL_ID.split("/")[1] + "-FP8-Dynamic" -model.save_pretrained(SAVE_DIR) -tokenizer.save_pretrained(SAVE_DIR) - -# Load model after saving -model = AutoModelForCausalLM.from_pretrained(SAVE_DIR, device_map="auto") - # Confirm generations of the quantized model look sane. print("========== SAMPLE GENERATION ==============") +dispatch_for_generation(model) input_ids = tokenizer("Hello my name is", return_tensors="pt").input_ids.to("cuda") output = model.generate(input_ids, max_new_tokens=20) print(tokenizer.decode(output[0])) print("==========================================") + +# Save to disk in compressed-tensors format. +SAVE_DIR = MODEL_ID.split("/")[1] + "-FP8-Dynamic" +model.save_pretrained(SAVE_DIR) +tokenizer.save_pretrained(SAVE_DIR) diff --git a/examples/quantization_w8a8_fp8/llava1.5_example.py b/examples/quantization_w8a8_fp8/llava1.5_example.py index 06d1483e1..9c1731f03 100644 --- a/examples/quantization_w8a8_fp8/llava1.5_example.py +++ b/examples/quantization_w8a8_fp8/llava1.5_example.py @@ -2,6 +2,7 @@ from llmcompressor import oneshot from llmcompressor.modifiers.quantization import QuantizationModifier +from llmcompressor.utils.dev import dispatch_for_generation MODEL_ID = "llava-hf/llava-1.5-7b-hf" @@ -19,20 +20,18 @@ ignore=["re:.*lm_head", "re:multi_modal_projector.*", "re:vision_tower.*"], ) -# Apply quantization. +# Apply quantization and save to disk in compressed-tensors format. oneshot(model=model, recipe=recipe) -# Save to disk in compressed-tensors format. -SAVE_DIR = MODEL_ID.split("/")[1] + "-FP8-Dynamic" -model.save_pretrained(SAVE_DIR) -processor.save_pretrained(SAVE_DIR) - -# Load model after saving -model = LlavaForConditionalGeneration.from_pretrained(SAVE_DIR, device_map="auto") - # Confirm generations of the quantized model look sane. print("========== SAMPLE GENERATION ==============") +dispatch_for_generation(model) input_ids = processor(text="Hello my name is", return_tensors="pt").input_ids.to("cuda") output = model.generate(input_ids, max_new_tokens=20) print(processor.decode(output[0])) print("==========================================") + +# Save to disk in compressed-tensors format. +SAVE_DIR = MODEL_ID.split("/")[1] + "-FP8-Dynamic" +model.save_pretrained(SAVE_DIR, save_compressed=True) +processor.save_pretrained(SAVE_DIR) diff --git a/examples/quantization_w8a8_fp8/qwen2vl_example.py b/examples/quantization_w8a8_fp8/qwen2vl_example.py index 1cf73f527..2e7d02803 100644 --- a/examples/quantization_w8a8_fp8/qwen2vl_example.py +++ b/examples/quantization_w8a8_fp8/qwen2vl_example.py @@ -2,6 +2,7 @@ from llmcompressor import oneshot from llmcompressor.modifiers.quantization import QuantizationModifier +from llmcompressor.utils.dev import dispatch_for_generation MODEL_ID = "Qwen/Qwen2-VL-7B-Instruct" @@ -19,20 +20,18 @@ ignore=["re:.*lm_head", "re:visual.*"], ) -# Apply quantization. +# Apply quantization and save to disk in compressed-tensors format. oneshot(model=model, recipe=recipe) -# Save to disk in compressed-tensors format. -SAVE_DIR = MODEL_ID.split("/")[1] + "-FP8-Dynamic" -model.save_pretrained(SAVE_DIR) -processor.save_pretrained(SAVE_DIR) - -# Load model after saving -model = Qwen2VLForConditionalGeneration.from_pretrained(SAVE_DIR, device_map="auto") - # Confirm generations of the quantized model look sane. 
print("========== SAMPLE GENERATION ==============") +dispatch_for_generation(model) input_ids = processor(text="Hello my name is", return_tensors="pt").input_ids.to("cuda") output = model.generate(input_ids, max_new_tokens=20) print(processor.decode(output[0])) print("==========================================") + +# Save to disk in compressed-tensors format. +SAVE_DIR = MODEL_ID.split("/")[1] + "-FP8-Dynamic" +model.save_pretrained(SAVE_DIR, save_compressed=True) +processor.save_pretrained(SAVE_DIR) diff --git a/examples/quantization_w8a8_fp8/whisper_example.py b/examples/quantization_w8a8_fp8/whisper_example.py index 51ac804b7..b9fbb9d24 100644 --- a/examples/quantization_w8a8_fp8/whisper_example.py +++ b/examples/quantization_w8a8_fp8/whisper_example.py @@ -3,6 +3,7 @@ from llmcompressor import oneshot from llmcompressor.modifiers.quantization import QuantizationModifier +from llmcompressor.utils.dev import dispatch_for_generation MODEL_ID = "openai/whisper-large-v2" @@ -23,16 +24,9 @@ # Apply quantization. oneshot(model=model, recipe=recipe) -# Save to disk in compressed-tensors format. -SAVE_DIR = MODEL_ID.split("/")[1] + "-FP8-Dynamic" -model.save_pretrained(SAVE_DIR, save_compressed=True) -processor.save_pretrained(SAVE_DIR) - -# Load model after saving -model = WhisperForConditionalGeneration.from_pretrained(SAVE_DIR, device_map="auto") - # Confirm generations of the quantized model look sane. print("========== SAMPLE GENERATION ==============") +dispatch_for_generation(model) ds = load_dataset( "hf-internal-testing/librispeech_asr_dummy", "clean", split="validation[:1]" ) @@ -45,3 +39,8 @@ print(processor.batch_decode(output_ids, skip_special_tokens=False)[0]) # Mr. Quilter is the apostle of the middle classes and we are glad to welcome his gospel print("==========================================") + +# Save to disk in compressed-tensors format. +SAVE_DIR = MODEL_ID.split("/")[1] + "-FP8-Dynamic" +model.save_pretrained(SAVE_DIR, save_compressed=True) +processor.save_pretrained(SAVE_DIR) diff --git a/examples/quantization_w8a8_int8/gemma2_example.py b/examples/quantization_w8a8_int8/gemma2_example.py index a0ebe0079..1a11effa3 100644 --- a/examples/quantization_w8a8_int8/gemma2_example.py +++ b/examples/quantization_w8a8_int8/gemma2_example.py @@ -3,6 +3,7 @@ from llmcompressor import oneshot from llmcompressor.modifiers.quantization import GPTQModifier +from llmcompressor.utils.dev import dispatch_for_generation # 1) Select model and load it. MODEL_ID = "google/gemma-2-2b-it" @@ -53,26 +54,27 @@ def tokenize(sample): # * quantize the activations to int8 (dynamic per token) recipe = GPTQModifier(targets="Linear", scheme="W8A8", ignore=["lm_head"]) -# 4) Apply quantization and save to disk compressed. -SAVE_DIR = MODEL_ID.split("/")[1] + "-INT8" +# 4) Apply quantization oneshot( model=model, dataset=ds, recipe=recipe, max_seq_length=MAX_SEQUENCE_LENGTH, num_calibration_samples=NUM_CALIBRATION_SAMPLES, - output_dir=SAVE_DIR, ) -# Load model after saving -model = AutoModelForCausalLM.from_pretrained(SAVE_DIR, device_map="auto") - # Confirm generations of the quantized model look sane. # NOTE: transformers 4.49.0 results in a generation error with gemma2. # Consider either downgrading your transformers version to a previous version # or use vLLM for sample generation. 
print("========== SAMPLE GENERATION ==============") +dispatch_for_generation(model) input_ids = tokenizer("Hello my name is", return_tensors="pt").input_ids.to("cuda") output = model.generate(input_ids, max_new_tokens=20) print(tokenizer.decode(output[0])) print("==========================================") + +# 5) Save to disk in compressed-tensors format. +SAVE_DIR = MODEL_ID.split("/")[1] + "-INT8" +model.save_pretrained(SAVE_DIR, save_compressed=True) +tokenizer.save_pretrained(SAVE_DIR) diff --git a/examples/quantization_w8a8_int8/llama3_example.py b/examples/quantization_w8a8_int8/llama3_example.py index 1894932f1..6fa738656 100644 --- a/examples/quantization_w8a8_int8/llama3_example.py +++ b/examples/quantization_w8a8_int8/llama3_example.py @@ -4,6 +4,7 @@ from llmcompressor import oneshot from llmcompressor.modifiers.quantization import GPTQModifier from llmcompressor.modifiers.smoothquant import SmoothQuantModifier +from llmcompressor.utils.dev import dispatch_for_generation # Select model and load it. MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct" @@ -67,18 +68,16 @@ def tokenize(sample): num_calibration_samples=NUM_CALIBRATION_SAMPLES, ) -# Save to disk compressed. -SAVE_DIR = MODEL_ID.split("/")[1] + "-W8A8-Dynamic-Per-Token" -model.save_pretrained(SAVE_DIR, save_compressed=True) -tokenizer.save_pretrained(SAVE_DIR) - -# Load model after saving -model = AutoModelForCausalLM.from_pretrained(SAVE_DIR, device_map="auto") - # Confirm generations of the quantized model look sane. print("\n\n") print("========== SAMPLE GENERATION ==============") +dispatch_for_generation(model) input_ids = tokenizer("Hello my name is", return_tensors="pt").input_ids.to("cuda") output = model.generate(input_ids, max_new_tokens=100) print(tokenizer.decode(output[0])) print("==========================================\n\n") + +# Save to disk compressed. +SAVE_DIR = MODEL_ID.split("/")[1] + "-W8A8-Dynamic-Per-Token" +model.save_pretrained(SAVE_DIR, save_compressed=True) +tokenizer.save_pretrained(SAVE_DIR) diff --git a/examples/quantizing_moe/deepseek_moe_w4a16.py b/examples/quantizing_moe/deepseek_moe_w4a16.py index ccba53d2f..6d311858d 100644 --- a/examples/quantizing_moe/deepseek_moe_w4a16.py +++ b/examples/quantizing_moe/deepseek_moe_w4a16.py @@ -4,6 +4,7 @@ from transformers import AutoModelForCausalLM, AutoTokenizer, __version__ from llmcompressor import oneshot +from llmcompressor.transformers.compression.helpers import calculate_offload_device_map # NOTE: transformers 4.49.0 has an attribute error with DeepSeek. 
# Please consider either downgrading your transformers version to a @@ -12,8 +13,18 @@ # select a Mixture of Experts model for quantization MODEL_ID = "deepseek-ai/DeepSeek-V2.5" +# adjust based off number of desired GPUs +# if not enough memory is available, some layers will automatically be offlaoded to cpu +device_map = calculate_offload_device_map( + MODEL_ID, + reserve_for_hessians=True, + num_gpus=2, + torch_dtype=torch.bfloat16, + trust_remote_code=True, +) + model = AutoModelForCausalLM.from_pretrained( - MODEL_ID, torch_dtype=torch.bfloat16, trust_remote_code=True + MODEL_ID, device_map=device_map, torch_dtype=torch.bfloat16, trust_remote_code=True ) tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) diff --git a/examples/quantizing_moe/deepseek_moe_w8a8_fp8.py b/examples/quantizing_moe/deepseek_moe_w8a8_fp8.py index 5cdfe995d..03eaafc69 100644 --- a/examples/quantizing_moe/deepseek_moe_w8a8_fp8.py +++ b/examples/quantizing_moe/deepseek_moe_w8a8_fp8.py @@ -4,6 +4,7 @@ from llmcompressor import oneshot from llmcompressor.modifiers.quantization import QuantizationModifier +from llmcompressor.utils.dev import dispatch_for_generation # NOTE: transformers 4.49.0 has an attribute error with DeepSeek. # Please consider either downgrading your transformers version to a @@ -13,7 +14,7 @@ MODEL_ID = "deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct" model = AutoModelForCausalLM.from_pretrained( - MODEL_ID, device_map="auto", trust_remote_code=True + MODEL_ID, torch_dtype="auto", trust_remote_code=True ) tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) @@ -75,18 +76,11 @@ def tokenize(sample): trust_remote_code_model=True, ) -# Save to disk in compressed-tensors format. -SAVE_DIR = MODEL_ID.split("/")[1] + "-FP8" -model.save_pretrained(SAVE_DIR, save_compressed=True) -tokenizer.save_pretrained(SAVE_DIR) - -# Load model after saving -model = AutoModelForCausalLM.from_pretrained(SAVE_DIR, device_map="auto") - # Confirm generations of the quantized model look sane. # Generation is broken for deepseek models when using the latest transformers package if Version(__version__) < Version("4.48"): print("========== SAMPLE GENERATION ==============") + dispatch_for_generation(model) SAMPLE_INPUT = ["I love quantization because"] tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) inputs = tokenizer(SAMPLE_INPUT, return_tensors="pt", padding=True).to(model.device) @@ -98,3 +92,8 @@ def tokenize(sample): "WARNING: cannot perform sample generation of " "deepseek models with transformers >= 4.48" ) + +# Save to disk in compressed-tensors format. +SAVE_DIR = MODEL_ID.split("/")[1] + "-FP8" +model.save_pretrained(SAVE_DIR, save_compressed=True) +tokenizer.save_pretrained(SAVE_DIR) diff --git a/examples/quantizing_moe/deepseek_moe_w8a8_int8.py b/examples/quantizing_moe/deepseek_moe_w8a8_int8.py index abddef9dc..d4249c278 100644 --- a/examples/quantizing_moe/deepseek_moe_w8a8_int8.py +++ b/examples/quantizing_moe/deepseek_moe_w8a8_int8.py @@ -5,6 +5,7 @@ from llmcompressor import oneshot from llmcompressor.modifiers.quantization import GPTQModifier +from llmcompressor.utils.dev import dispatch_for_generation # NOTE: transformers 4.49.0 has an attribute error with DeepSeek. # Please consider either downgrading your transformers version to a @@ -76,18 +77,11 @@ def tokenize(sample): trust_remote_code_model=True, ) -# Save to disk in compressed-tensors format. 
-SAVE_DIR = MODEL_ID.split("/")[1] + "-W8A8" -model.save_pretrained(SAVE_DIR, save_compressed=True) -tokenizer.save_pretrained(SAVE_DIR) - -# Load model after saving -model = AutoModelForCausalLM.from_pretrained(SAVE_DIR, device_map="auto") - # Confirm generations of the quantized model look sane. # Generation is broken for deepseek models when using the latest transformers package if Version(__version__) < Version("4.48"): print("========== SAMPLE GENERATION ==============") + dispatch_for_generation(model) SAMPLE_INPUT = ["I love quantization because"] tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) inputs = tokenizer(SAMPLE_INPUT, return_tensors="pt", padding=True).to(model.device) @@ -100,3 +94,8 @@ def tokenize(sample): "WARNING: cannot perform sample generation of " "deepseek models with transformers >= 4.48" ) + +# Save to disk in compressed-tensors format. +SAVE_DIR = MODEL_ID.split("/")[1] + "-W8A8" +model.save_pretrained(SAVE_DIR, save_compressed=True) +tokenizer.save_pretrained(SAVE_DIR) diff --git a/examples/quantizing_moe/mixtral_moe_w8a8_fp8.py b/examples/quantizing_moe/mixtral_moe_w8a8_fp8.py index df0876088..3361e86bc 100644 --- a/examples/quantizing_moe/mixtral_moe_w8a8_fp8.py +++ b/examples/quantizing_moe/mixtral_moe_w8a8_fp8.py @@ -5,9 +5,9 @@ from llmcompressor import oneshot from llmcompressor.modifiers.quantization import QuantizationModifier +from llmcompressor.utils.dev import dispatch_for_generation MODEL_ID = "mistralai/Mixtral-8x7B-Instruct-v0.1" -NUM_GPUS = 2 model = AutoModelForCausalLM.from_pretrained(MODEL_ID, torch_dtype="auto") tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) @@ -37,18 +37,11 @@ num_calibration_samples=NUM_CALIBRATION_SAMPLES, ) -# Save to disk in compressed-tensors format. -SAVE_DIR = f"{MODEL_ID.split('/')[-1]}-FP8" -model.save_pretrained(SAVE_DIR, save_compressed=True) -tokenizer.save_pretrained(SAVE_DIR) - -# Load model after saving -model = AutoModelForCausalLM.from_pretrained(SAVE_DIR, device_map="auto") - # Confirm generations of the quantized model look sane. # Generation is broken for deepseek models when using the latest transformers package if Version(__version__) < Version("4.48"): print("========== SAMPLE GENERATION ==============") + dispatch_for_generation(model) input_ids = tokenizer("Hello my name is", return_tensors="pt").input_ids.to("cuda") output = model.generate(input_ids, max_new_tokens=20) print(tokenizer.decode(output[0])) @@ -58,3 +51,8 @@ "WARNING: cannot perform sample generation of " "deepseek models with transformers >= 4.48" ) + +# Save to disk in compressed-tensors format. +SAVE_DIR = f"{MODEL_ID.split('/')[-1]}-FP8" +model.save_pretrained(SAVE_DIR, save_compressed=True) +tokenizer.save_pretrained(SAVE_DIR) diff --git a/examples/quantizing_moe/qwen_moe_w4a16.py b/examples/quantizing_moe/qwen_moe_w4a16.py index 18a6c74ff..15b26f656 100644 --- a/examples/quantizing_moe/qwen_moe_w4a16.py +++ b/examples/quantizing_moe/qwen_moe_w4a16.py @@ -4,6 +4,7 @@ from llmcompressor.modifiers.quantization import GPTQModifier from llmcompressor.transformers import oneshot +from llmcompressor.utils.dev import dispatch_for_generation # select a Mixture of Experts model for quantization MODEL_ID = "Qwen/Qwen1.5-MoE-A2.7B-Chat" @@ -69,17 +70,15 @@ def tokenize(sample): trust_remote_code_model=True, ) -# Save to disk in compressed-tensors format. 
-SAVE_DIR = MODEL_ID.split("/")[1] + "-quantized.w4a16" -model.save_pretrained(SAVE_DIR, save_compressed=True) -tokenizer.save_pretrained(SAVE_DIR) - -# Load model after saving -model = AutoModelForCausalLM.from_pretrained(SAVE_DIR, device_map="auto") - # Confirm generations of the quantized model look sane. print("========== SAMPLE GENERATION ==============") +dispatch_for_generation(model) input_ids = tokenizer("Hello my name is", return_tensors="pt").input_ids.to("cuda") output = model.generate(input_ids, max_new_tokens=20) print(tokenizer.decode(output[0])) print("==========================================") + +# Save to disk in compressed-tensors format. +SAVE_DIR = MODEL_ID.split("/")[1] + "-quantized.w4a16" +model.save_pretrained(SAVE_DIR, save_compressed=True) +tokenizer.save_pretrained(SAVE_DIR) diff --git a/examples/sparse_2of4_quantization_fp8/llama3_8b_2of4.py b/examples/sparse_2of4_quantization_fp8/llama3_8b_2of4.py index d51ad7670..5f941e87c 100644 --- a/examples/sparse_2of4_quantization_fp8/llama3_8b_2of4.py +++ b/examples/sparse_2of4_quantization_fp8/llama3_8b_2of4.py @@ -6,6 +6,7 @@ from llmcompressor import oneshot from llmcompressor.modifiers.obcq import SparseGPTModifier from llmcompressor.modifiers.quantization import QuantizationModifier +from llmcompressor.utils.dev import dispatch_for_generation # Configuration MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct" @@ -97,16 +98,14 @@ def get_recipe(fp8_enabled): num_calibration_samples=NUM_CALIBRATION_SAMPLES, ) -# Save compressed model and tokenizer -model.save_pretrained(save_dir) -tokenizer.save_pretrained(save_dir) - -# Load model after saving -model = AutoModelForCausalLM.from_pretrained(save_dir, device_map="auto") - # Validate the compressed model print("\n========== SAMPLE GENERATION ==============") +dispatch_for_generation(model) input_ids = tokenizer("Hello my name is", return_tensors="pt").input_ids.to("cuda") output = model.generate(input_ids, max_new_tokens=100) print(tokenizer.decode(output[0])) print("==========================================\n") + +# Save compressed model and tokenizer +model.save_pretrained(save_dir) +tokenizer.save_pretrained(save_dir) diff --git a/examples/trl_mixin/ex_trl_constant.py b/examples/trl_mixin/ex_trl_constant.py index c26e9f41d..ff8a370c9 100644 --- a/examples/trl_mixin/ex_trl_constant.py +++ b/examples/trl_mixin/ex_trl_constant.py @@ -7,9 +7,7 @@ model_path = "neuralmagic/Llama-2-7b-pruned50-retrained" output_dir = "./output_trl_sft_test_7b_gsm8k_sft_data" -model = AutoModelForCausalLM.from_pretrained( - model_path, torch_dtype="auto", device_map="auto" -) +model = AutoModelForCausalLM.from_pretrained(model_path, torch_dtype="auto") tokenizer = AutoTokenizer.from_pretrained(model_path) tokenizer.pad_token = tokenizer.eos_token From 91b349b3fd50ab06f1c66f72ab772cd04ca6147d Mon Sep 17 00:00:00 2001 From: Kyle Sayers Date: Fri, 13 Jun 2025 10:04:47 -0400 Subject: [PATCH 16/36] update examples 2 Signed-off-by: Kyle Sayers --- examples/awq/README.md | 6 +----- examples/awq/llama_example.py | 4 +--- examples/awq/qwen3_moe_example.py | 4 +--- examples/multimodal_audio/README.md | 6 +----- examples/multimodal_vision/README.md | 6 +----- .../llama7b_sparse_w4a16.py | 7 ++++--- examples/quantization_kv_cache/README.md | 6 +----- examples/quantization_w4a16/README.md | 4 +--- examples/quantization_w8a8_fp8/README.md | 3 +-- examples/quantization_w8a8_int8/README.md | 4 +--- examples/quantizing_moe/deepseek_moe_w4a16.py | 15 +++------------ 11 files changed, 16 insertions(+), 49 
deletions(-) diff --git a/examples/awq/README.md b/examples/awq/README.md index 0a837d6f3..fd4cb4b62 100644 --- a/examples/awq/README.md +++ b/examples/awq/README.md @@ -18,11 +18,7 @@ recipe = [ To use your own model, start with an existing example change the `model_id` to match your own model stub. ```python model_id = "path/to/your/model" -model = AutoModelForCausalLM.from_pretrained( - model_id, - device_map="auto", - torch_dtype="auto", -) +model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype="auto") ``` ## Adding Mappings ## diff --git a/examples/awq/llama_example.py b/examples/awq/llama_example.py index 9d2c724d7..0db10c478 100644 --- a/examples/awq/llama_example.py +++ b/examples/awq/llama_example.py @@ -7,9 +7,7 @@ # Select model and load it. MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct" -model = AutoModelForCausalLM.from_pretrained( - MODEL_ID, device_map="auto", torch_dtype="auto" -) +model = AutoModelForCausalLM.from_pretrained(MODEL_ID, torch_dtype="auto") tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True) # Select calibration dataset. diff --git a/examples/awq/qwen3_moe_example.py b/examples/awq/qwen3_moe_example.py index 3c16d2f43..5634621f6 100644 --- a/examples/awq/qwen3_moe_example.py +++ b/examples/awq/qwen3_moe_example.py @@ -8,9 +8,7 @@ # Select model and load it. MODEL_ID = "Qwen/Qwen3-30B-A3B" -model = AutoModelForCausalLM.from_pretrained( - MODEL_ID, device_map="auto", torch_dtype="auto" -) +model = AutoModelForCausalLM.from_pretrained(MODEL_ID, torch_dtype="auto") tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True) # Select calibration dataset. diff --git a/examples/multimodal_audio/README.md b/examples/multimodal_audio/README.md index d3d0631f9..e7ecca950 100644 --- a/examples/multimodal_audio/README.md +++ b/examples/multimodal_audio/README.md @@ -21,11 +21,7 @@ This directory contains example scripts for quantizing a variety of audio langua To use your own multimodal modal, start with an existing example change the `model_id` to match your own model stub. ```python3 model_id = "path/to/your/model" -model = AutoModelForCausalLM.from_pretrained( - model_id, - device_map="auto", - torch_dtype="auto", -) +model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype="auto") ``` ## Customizing GPTQModifier Parameters ## diff --git a/examples/multimodal_vision/README.md b/examples/multimodal_vision/README.md index 9d6d12295..c0d0808b4 100644 --- a/examples/multimodal_vision/README.md +++ b/examples/multimodal_vision/README.md @@ -25,11 +25,7 @@ This directory contains example scripts for quantizing a variety of vision-langu To use your own multimodal modal, start with an existing example change the `model_id` to match your own model stub. 
```python3 model_id = "path/to/your/model" -model = AutoModelForCausalLM.from_pretrained( - model_id, - device_map="auto", - torch_dtype="auto", -) +model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype="auto") ``` ## Customizing GPTQModifier Parameters ## diff --git a/examples/quantization_2of4_sparse_w4a16/llama7b_sparse_w4a16.py b/examples/quantization_2of4_sparse_w4a16/llama7b_sparse_w4a16.py index 6ed01e7d1..4bf505047 100644 --- a/examples/quantization_2of4_sparse_w4a16/llama7b_sparse_w4a16.py +++ b/examples/quantization_2of4_sparse_w4a16/llama7b_sparse_w4a16.py @@ -3,12 +3,11 @@ from transformers import AutoModelForCausalLM, AutoTokenizer from llmcompressor import oneshot, train +from llmcompressor.utils.dev import dispatch_for_generation # load the model in as bfloat16 to save on memory and compute model_stub = "neuralmagic/Llama-2-7b-ultrachat200k" -model = AutoModelForCausalLM.from_pretrained( - model_stub, torch_dtype=torch.bfloat16, device_map="auto" -) +model = AutoModelForCausalLM.from_pretrained(model_stub, torch_dtype=torch.bfloat16) tokenizer = AutoTokenizer.from_pretrained(model_stub) # uses LLM Compressor's built-in preprocessing for ultra chat @@ -71,6 +70,7 @@ ) # Sparse finetune +dispatch_for_generation(model) finetune_applied_model = train( model=oneshot_applied_model, **oneshot_kwargs, @@ -79,6 +79,7 @@ ) # Oneshot quantization +model.to("cpu") quantized_model = oneshot( model=finetune_applied_model, **oneshot_kwargs, diff --git a/examples/quantization_kv_cache/README.md b/examples/quantization_kv_cache/README.md index 8ec73dee3..826bc6322 100644 --- a/examples/quantization_kv_cache/README.md +++ b/examples/quantization_kv_cache/README.md @@ -39,11 +39,7 @@ Load the model using `AutoModelForCausalLM`: from transformers import AutoModelForCausalLM, AutoTokenizer MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct" -model = AutoModelForCausalLM.from_pretrained( - MODEL_ID, - device_map="auto", - torch_dtype="auto", -) +model = AutoModelForCausalLM.from_pretrained(MODEL_ID, torch_dtype="auto") tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) ``` diff --git a/examples/quantization_w4a16/README.md b/examples/quantization_w4a16/README.md index 27edb92a6..5a3a07d67 100644 --- a/examples/quantization_w4a16/README.md +++ b/examples/quantization_w4a16/README.md @@ -40,9 +40,7 @@ Load the model using `AutoModelForCausalLM` for handling quantized saving and lo from transformers import AutoTokenizer, AutoModelForCausalLM MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct" -model = AutoModelForCausalLM.from_pretrained( - MODEL_ID, device_map="auto", torch_dtype="auto", -) +model = AutoModelForCausalLM.from_pretrained(MODEL_ID, torch_dtype="auto") tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) ``` diff --git a/examples/quantization_w8a8_fp8/README.md b/examples/quantization_w8a8_fp8/README.md index 2b817ba1e..f0a7e8d6c 100644 --- a/examples/quantization_w8a8_fp8/README.md +++ b/examples/quantization_w8a8_fp8/README.md @@ -38,8 +38,7 @@ from transformers import AutoTokenizer, AutoModelForCausalLM MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct" -model = AutoModelForCausalLM.from_pretrained( - MODEL_ID, device_map="auto", torch_dtype="auto") +model = AutoModelForCausalLM.from_pretrained(MODEL_ID, torch_dtype="auto") tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) ``` diff --git a/examples/quantization_w8a8_int8/README.md b/examples/quantization_w8a8_int8/README.md index bda9d2d46..830ec2ac9 100644 --- a/examples/quantization_w8a8_int8/README.md +++ 
b/examples/quantization_w8a8_int8/README.md @@ -38,9 +38,7 @@ Load the model using `AutoModelForCausalLM` for handling quantized saving and lo from transformers import AutoTokenizer, AutoModelForCausalLM MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct" -model = AutoModelForCausalLM.from_pretrained( - MODEL_ID, device_map="auto", torch_dtype="auto", -) +model = AutoModelForCausalLM.from_pretrained(MODEL_ID, torch_dtype="auto") tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) ``` diff --git a/examples/quantizing_moe/deepseek_moe_w4a16.py b/examples/quantizing_moe/deepseek_moe_w4a16.py index 6d311858d..5d08dc703 100644 --- a/examples/quantizing_moe/deepseek_moe_w4a16.py +++ b/examples/quantizing_moe/deepseek_moe_w4a16.py @@ -4,7 +4,7 @@ from transformers import AutoModelForCausalLM, AutoTokenizer, __version__ from llmcompressor import oneshot -from llmcompressor.transformers.compression.helpers import calculate_offload_device_map +from llmcompressor.utils.dev import dispatch_for_generation # NOTE: transformers 4.49.0 has an attribute error with DeepSeek. # Please consider either downgrading your transformers version to a @@ -13,18 +13,8 @@ # select a Mixture of Experts model for quantization MODEL_ID = "deepseek-ai/DeepSeek-V2.5" -# adjust based off number of desired GPUs -# if not enough memory is available, some layers will automatically be offlaoded to cpu -device_map = calculate_offload_device_map( - MODEL_ID, - reserve_for_hessians=True, - num_gpus=2, - torch_dtype=torch.bfloat16, - trust_remote_code=True, -) - model = AutoModelForCausalLM.from_pretrained( - MODEL_ID, device_map=device_map, torch_dtype=torch.bfloat16, trust_remote_code=True + MODEL_ID, torch_dtype=torch.bfloat16, trust_remote_code=True ) tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) @@ -84,6 +74,7 @@ def tokenize(sample): # Generation is broken for deepseek models when using the latest transformers package if Version(__version__) < Version("4.48"): print("========== SAMPLE GENERATION ==============") + dispatch_for_generation(model) input_ids = tokenizer("Hello my name is", return_tensors="pt").input_ids.to("cuda") output = model.generate(input_ids, max_new_tokens=20) print(tokenizer.decode(output[0])) From 8e58e35b08ed96cdfa833f5b6df29f2e4822bcff Mon Sep 17 00:00:00 2001 From: Kyle Sayers Date: Fri, 13 Jun 2025 12:27:43 -0400 Subject: [PATCH 17/36] remove fallback_to_cpu, use ct utils Signed-off-by: Kyle Sayers --- src/llmcompressor/entrypoints/utils.py | 15 ++------ .../pipelines/layer_sequential/pipeline.py | 6 ++-- .../pipelines/sequential/helpers.py | 35 +------------------ .../pipelines/sequential/pipeline.py | 3 +- .../pytorch/model_load/helpers.py | 17 --------- 5 files changed, 6 insertions(+), 70 deletions(-) diff --git a/src/llmcompressor/entrypoints/utils.py b/src/llmcompressor/entrypoints/utils.py index 7667418e2..f648fa771 100644 --- a/src/llmcompressor/entrypoints/utils.py +++ b/src/llmcompressor/entrypoints/utils.py @@ -17,7 +17,7 @@ from llmcompressor.args import ModelArguments, RecipeArguments, TrainingArguments from llmcompressor.core import reset_session -from llmcompressor.pytorch.model_load.helpers import fallback_to_cpu, parse_dtype +from llmcompressor.pytorch.model_load.helpers import parse_dtype from llmcompressor.transformers.sparsification.compressed_tensors_utils import ( modify_save_pretrained, patch_tied_tensors_bug, @@ -197,20 +197,12 @@ def initialize_model_from_path( else model_args.model_name_or_path ) - # Fallback to CPU if GPU requested and not available - 
model_args.oneshot_device = fallback_to_cpu(model_args.oneshot_device) - - device_map = model_args.oneshot_device - if training_args is not None and training_args.do_train: - device_map = "auto" - model_kwargs = { "config": config, "cache_dir": model_args.cache_dir, "revision": model_args.model_revision, "use_auth_token": True if model_args.use_auth_token else None, "torch_dtype": parse_dtype(model_args.precision), - "device_map": device_map, "trust_remote_code": model_args.trust_remote_code_model, } @@ -220,10 +212,7 @@ def initialize_model_from_path( run_compressed=False ) - model = AutoModelForCausalLM.from_pretrained( - model_path, - **model_kwargs, - ) + model = AutoModelForCausalLM.from_pretrained(model_path, **model_kwargs) if "sequence_length" in model_kwargs: model.seqlen = model_kwargs["sequence_length"] diff --git a/src/llmcompressor/pipelines/layer_sequential/pipeline.py b/src/llmcompressor/pipelines/layer_sequential/pipeline.py index 3a0cd8cb6..2cfda0d0e 100644 --- a/src/llmcompressor/pipelines/layer_sequential/pipeline.py +++ b/src/llmcompressor/pipelines/layer_sequential/pipeline.py @@ -2,6 +2,7 @@ import torch import tqdm +from compressed_tensors.utils import disable_offloading from loguru import logger from torch.utils.data.dataloader import DataLoader @@ -15,10 +16,7 @@ to_next_layer_kwargs, ) from llmcompressor.pipelines.registry import CalibrationPipeline -from llmcompressor.pipelines.sequential.helpers import ( - disable_offloading, - get_targets_from_modifiers, -) +from llmcompressor.pipelines.sequential.helpers import get_targets_from_modifiers from llmcompressor.utils.helpers import DisableQuantization, calibration_forward_context if TYPE_CHECKING: diff --git a/src/llmcompressor/pipelines/sequential/helpers.py b/src/llmcompressor/pipelines/sequential/helpers.py index 6cb63acdd..b7937a2fc 100644 --- a/src/llmcompressor/pipelines/sequential/helpers.py +++ b/src/llmcompressor/pipelines/sequential/helpers.py @@ -5,7 +5,6 @@ from typing import Any, Dict, List, Optional, Set import torch -from accelerate.hooks import AlignDevicesHook from compressed_tensors import has_offloaded_params from compressed_tensors.quantization import find_name_or_class_matches from loguru import logger @@ -24,12 +23,7 @@ from .ast_helpers import autowrap_forwards -__all__ = [ - "trace_subgraphs", - "Subgraph", - "get_targets_from_modifiers", - "disable_offloading", -] +__all__ = ["trace_subgraphs", "Subgraph", "get_targets_from_modifiers"] @dataclass @@ -491,30 +485,3 @@ def is_ancestor(module: Module) -> bool: is_ancestor(model) return ancestors - - -@contextlib.contextmanager -def disable_offloading(): - """ - Keep modules onloaded and disable offloading until this context exits. 
- Affects modules which have been hooked with accelerate's `AlignDevicesHook` - """ - original_pre_forward = AlignDevicesHook.pre_forward - onloaded_modules = dict() - - # onload once and disable any future onloading/offloading steps - def keep_onload_pre_forward(self: AlignDevicesHook, module, *args, **kwargs): - ret = original_pre_forward(self, module, *args, **kwargs) - if module not in onloaded_modules: - onloaded_modules[module] = (self, self.offload) - self.offload = False - return ret - - # use the patched pre_forward function within the context - with patch_attr(AlignDevicesHook, "pre_forward", keep_onload_pre_forward): - yield - - # manually offload all modules that were onloaded - for module, (hook, offload) in onloaded_modules.items(): - hook.offload = offload - hook.post_forward(module, None) diff --git a/src/llmcompressor/pipelines/sequential/pipeline.py b/src/llmcompressor/pipelines/sequential/pipeline.py index ab794daa4..3e0490b70 100644 --- a/src/llmcompressor/pipelines/sequential/pipeline.py +++ b/src/llmcompressor/pipelines/sequential/pipeline.py @@ -1,7 +1,7 @@ from typing import TYPE_CHECKING import torch -from compressed_tensors.utils import get_execution_device +from compressed_tensors.utils import disable_offloading, get_execution_device from loguru import logger from torch.utils.data.dataloader import DataLoader from tqdm import tqdm @@ -11,7 +11,6 @@ from llmcompressor.pipelines.cache import IntermediatesCache from llmcompressor.pipelines.registry import CalibrationPipeline from llmcompressor.pipelines.sequential.helpers import ( - disable_offloading, get_targets_from_modifiers, trace_subgraphs, ) diff --git a/src/llmcompressor/pytorch/model_load/helpers.py b/src/llmcompressor/pytorch/model_load/helpers.py index 0ffbd053e..de4b061ec 100644 --- a/src/llmcompressor/pytorch/model_load/helpers.py +++ b/src/llmcompressor/pytorch/model_load/helpers.py @@ -15,7 +15,6 @@ __all__ = [ "copy_python_files_from_model_cache", - "fallback_to_cpu", "parse_dtype", "get_session_model", "get_completed_stages", @@ -71,22 +70,6 @@ def save_checkpoint( compressor.decompress_model(model) -def fallback_to_cpu(device: str) -> str: - """ - Takes in a device string and forces it to cpu if cuda is not available - - :param device: device id to check - :return: device modified for CUDA status - """ - if "cuda" in device and not torch.cuda.is_available(): - logger.warning( - f"Requested {device} but CUDA is not available, falling back to CPU" - ) - return "cpu" - - return device - - def parse_dtype(dtype_arg: Union[str, torch.dtype]) -> torch.dtype: """ :param dtype_arg: dtype or string to parse From 96631d16b0121f7f9d1269d6b2145789ddcb18cb Mon Sep 17 00:00:00 2001 From: Kyle Sayers Date: Sun, 15 Jun 2025 01:06:09 -0400 Subject: [PATCH 18/36] remove hook from module within utils function Signed-off-by: Kyle Sayers --- src/llmcompressor/entrypoints/oneshot.py | 6 +++--- src/llmcompressor/entrypoints/utils.py | 4 ---- 2 files changed, 3 insertions(+), 7 deletions(-) diff --git a/src/llmcompressor/entrypoints/oneshot.py b/src/llmcompressor/entrypoints/oneshot.py index 8bcf061f5..c1dae7933 100644 --- a/src/llmcompressor/entrypoints/oneshot.py +++ b/src/llmcompressor/entrypoints/oneshot.py @@ -3,7 +3,6 @@ from typing import Optional import torch -from accelerate.hooks import remove_hook_from_module from compressed_tensors.utils import offloaded_dispatch from loguru import logger from torch.utils.data import DataLoader @@ -128,8 +127,9 @@ def __init__( # offload to cpu if possible if "cuda" in 
str(model_args.oneshot_device) and torch.cuda.is_available(): - remove_hook_from_module(model_args.model, recurse=True) - offloaded_dispatch(model_args.model, model_args.oneshot_device) + offloaded_dispatch( + model_args.model, execution_device=model_args.oneshot_device + ) else: logger.warning("CUDA is not available! Compressing model on CPU instead") diff --git a/src/llmcompressor/entrypoints/utils.py b/src/llmcompressor/entrypoints/utils.py index f648fa771..4bbc31e82 100644 --- a/src/llmcompressor/entrypoints/utils.py +++ b/src/llmcompressor/entrypoints/utils.py @@ -3,7 +3,6 @@ from pathlib import PosixPath from typing import Optional, Tuple -from accelerate.hooks import remove_hook_from_module from loguru import logger from torch.nn import Module from transformers import ( @@ -106,9 +105,6 @@ def post_process( "Ex. `oneshot(..., output_dir=...)`" ) - # Remove any existing hooks (maybe added by oneshot sequential onloading) - remove_hook_from_module(model_args.model, recurse=True) - # Reset the one-time-use session upon completion if recipe_args is not None and recipe_args.clear_sparse_session: reset_session() From 96476fe07a10738832c8a86f06c54743a1c9f774 Mon Sep 17 00:00:00 2001 From: Kyle Sayers Date: Sun, 15 Jun 2025 01:09:46 -0400 Subject: [PATCH 19/36] remove unused util Signed-off-by: Kyle Sayers --- .../transformers/compression/helpers.py | 27 ------------------- 1 file changed, 27 deletions(-) diff --git a/src/llmcompressor/transformers/compression/helpers.py b/src/llmcompressor/transformers/compression/helpers.py index d02a08809..acb71b986 100644 --- a/src/llmcompressor/transformers/compression/helpers.py +++ b/src/llmcompressor/transformers/compression/helpers.py @@ -104,33 +104,6 @@ def infer_sparsity_structure_from_model(model: torch.nn.Module) -> Optional[str] return None -def quantization_memory_requirement(model: torch.nn.Module) -> int: - """ - Determines the max number of bytes needed to store quantization scale and zp data - - :param model: model to calculate requirements for - :return: number of bytes required to reserve for quantization - """ - - total_elements = 0 - for _, module in model.named_modules(): - if isinstance(module, Linear): - for param in module.parameters(): - # assume the max of group 128 and static scale/zp - # TODO: base this on the recipe instead instead of assuming max - - # potentially just bias term - max_quant_shape = param.shape[0] // 128 - - if len(param.size()) > 1: # weights - max_quant_shape *= param.shape[1] - - total_elements += max_quant_shape * 4 - - bytes_ratio = 32 // 16 # assuming float16 - return total_elements * bytes_ratio - - def infer_sparse_targets_and_ignores( model: torch.nn.Module, sparsity_structure: str, From cb965c91fd0309ad8f5a590f0a767d04255c6539 Mon Sep 17 00:00:00 2001 From: Kyle Sayers Date: Sun, 15 Jun 2025 22:45:46 -0400 Subject: [PATCH 20/36] docstring Signed-off-by: Kyle Sayers --- src/llmcompressor/utils/dev.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/src/llmcompressor/utils/dev.py b/src/llmcompressor/utils/dev.py index 1c67f6678..e773b48f1 100644 --- a/src/llmcompressor/utils/dev.py +++ b/src/llmcompressor/utils/dev.py @@ -116,6 +116,14 @@ def patch_transformers_logger_level(level: int = logging.ERROR): def dispatch_for_generation(model: PreTrainedModel) -> PreTrainedModel: + """ + Dispatch a model autoregressive generation. This means that modules are dispatched + evenly across avaiable devices and kept onloaded if possible. Removes any HF hooks + that may have existed previously. 
+ + :param model: model to dispatch + :return: model which is dispatched + """ remove_hook_from_module(model, recurse=True) max_memory = get_balanced_memory( model, From 8769b8591174fbb68677e57e8ca1ce743bdb1e24 Mon Sep 17 00:00:00 2001 From: Kyle Sayers Date: Sun, 15 Jun 2025 22:51:18 -0400 Subject: [PATCH 21/36] remove big model example tests Signed-off-by: Kyle Sayers --- .../test_big_models_with_accelerate.py | 74 ------------------- 1 file changed, 74 deletions(-) delete mode 100644 tests/examples/test_big_models_with_accelerate.py diff --git a/tests/examples/test_big_models_with_accelerate.py b/tests/examples/test_big_models_with_accelerate.py deleted file mode 100644 index 019017bdd..000000000 --- a/tests/examples/test_big_models_with_accelerate.py +++ /dev/null @@ -1,74 +0,0 @@ -from pathlib import Path - -import pytest - -from tests.examples.utils import ( - ReadMe, - copy_and_run_script, - gen_cmd_fail_message, - requires_gpu_count, -) - - -@pytest.fixture -def example_dir() -> str: - return "examples/big_models_with_accelerate" - - -@pytest.mark.example -class TestBigModelsWithAccelerate: - """ - Tests for examples in the "big_models_with_accelerate" example folder. - """ - - def test_readme_has_install_command(self, example_dir: str): - """ - Test that the README has a valid install command. - """ - readme_path = Path.cwd() / example_dir / "README.md" - readme = ReadMe(readme_path) - - code = readme.get_code_block_content(position=1, lang="shell") - assert "pip install" in code - - assert code.startswith("pip install llmcompressor") - - @pytest.mark.parametrize( - ("script_filename", "visible_gpus"), - [ - pytest.param("cpu_offloading_fp8.py", "0", id="cpu_offloading"), - pytest.param( - "multi_gpu_int8.py", - "", - id="multi_gpu_int8", - marks=[ - requires_gpu_count(2), - pytest.mark.multi_gpu, - ], - ), - pytest.param( - "mult_gpus_int8_device_map.py", - "0", - id="mult_gpus_int8_device_map", - ), - ], - ) - @requires_gpu_count(1) - def test_example_scripts( - self, - example_dir: str, - visible_gpus: str, - script_filename: str, - tmp_path: Path, - monkeypatch: pytest.MonkeyPatch, - ): - """ - Test for the example scripts in the folder. 
- """ - - if visible_gpus: - monkeypatch.setenv("CUDA_VISIBLE_DEVICES", visible_gpus) - - command, result = copy_and_run_script(tmp_path, example_dir, script_filename) - - assert result.returncode == 0, gen_cmd_fail_message(command, result) From a389d1488cfb29e549a1f92c4b65d917d763fcad Mon Sep 17 00:00:00 2001 From: Kyle Sayers Date: Sun, 15 Jun 2025 23:16:52 -0400 Subject: [PATCH 22/36] big modeling example readme Signed-off-by: Kyle Sayers --- .../cpu_offloading_fp8.py | 26 ------ .../mult_gpus_int8_device_map.py | 81 ------------------ .../multi_gpu_int8.py | 78 ----------------- .../README.md | 12 +++ .../assets/sequential_onloading.png | Bin 0 -> 71199 bytes examples/quantization_w4a16/llama3_example.py | 2 +- 6 files changed, 13 insertions(+), 186 deletions(-) delete mode 100644 examples/big_models_with_accelerate/cpu_offloading_fp8.py delete mode 100644 examples/big_models_with_accelerate/mult_gpus_int8_device_map.py delete mode 100644 examples/big_models_with_accelerate/multi_gpu_int8.py create mode 100644 examples/big_models_with_sequential_onloading/README.md create mode 100644 examples/big_models_with_sequential_onloading/assets/sequential_onloading.png diff --git a/examples/big_models_with_accelerate/cpu_offloading_fp8.py b/examples/big_models_with_accelerate/cpu_offloading_fp8.py deleted file mode 100644 index ded5ff8d6..000000000 --- a/examples/big_models_with_accelerate/cpu_offloading_fp8.py +++ /dev/null @@ -1,26 +0,0 @@ -from transformers import AutoModelForCausalLM, AutoTokenizer - -from llmcompressor import oneshot -from llmcompressor.modifiers.quantization import QuantizationModifier - -MODEL_ID = "meta-llama/Meta-Llama-3-70B-Instruct" -OUTPUT_DIR = MODEL_ID.rstrip("/").split("/")[-1] + "-FP8-Dynamic" - -# Load model -# Note: device_map="auto" will offload to CPU if not enough space on GPU. -model = AutoModelForCausalLM.from_pretrained( - MODEL_ID, device_map="auto", torch_dtype="auto", trust_remote_code=True -) - -# Configure the quantization scheme and algorithm (PTQ + FP8_DYNAMIC). -recipe = QuantizationModifier( - targets="Linear", scheme="FP8_DYNAMIC", ignore=["lm_head"] -) - -# Apply quantization and save in `compressed-tensors` format. -oneshot( - model=model, - recipe=recipe, - tokenizer=AutoTokenizer.from_pretrained(MODEL_ID), - output_dir=OUTPUT_DIR, -) diff --git a/examples/big_models_with_accelerate/mult_gpus_int8_device_map.py b/examples/big_models_with_accelerate/mult_gpus_int8_device_map.py deleted file mode 100644 index d98051d21..000000000 --- a/examples/big_models_with_accelerate/mult_gpus_int8_device_map.py +++ /dev/null @@ -1,81 +0,0 @@ -import torch -from datasets import load_dataset -from transformers import AutoModelForCausalLM, AutoTokenizer - -from llmcompressor import oneshot -from llmcompressor.modifiers.quantization import GPTQModifier -from llmcompressor.modifiers.smoothquant import SmoothQuantModifier -from llmcompressor.transformers.compression.helpers import calculate_offload_device_map - -MODEL_ID = "meta-llama/Meta-Llama-3-70B-Instruct" - -# adjust based off number of desired GPUs -# reserve_for_hessians=True reserves memory which is required by -# GPTQModifier and SparseGPTModifier -device_map = calculate_offload_device_map( - MODEL_ID, num_gpus=1, reserve_for_hessians=True, torch_dtype=torch.bfloat16 -) - -model = AutoModelForCausalLM.from_pretrained( - MODEL_ID, device_map=device_map, torch_dtype=torch.bfloat16 -) -tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) - -# Select calibration dataset. 
-DATASET_ID = "HuggingFaceH4/ultrachat_200k" -DATASET_SPLIT = "train_sft" -NUM_CALIBRATION_SAMPLES = 512 -MAX_SEQUENCE_LENGTH = 2048 - - -# Load dataset and preprocess. -ds = load_dataset(DATASET_ID, split=f"{DATASET_SPLIT}[:{NUM_CALIBRATION_SAMPLES}]") -ds = ds.shuffle(seed=42) - - -def preprocess(example): - return { - "text": tokenizer.apply_chat_template( - example["messages"], - tokenize=False, - ) - } - - -ds = ds.map(preprocess) - - -# Tokenize inputs. -def tokenize(sample): - return tokenizer( - sample["text"], - padding=False, - max_length=MAX_SEQUENCE_LENGTH, - truncation=True, - add_special_tokens=False, - ) - - -ds = ds.map(tokenize, remove_columns=ds.column_names) - -# define a llmcompressor recipe for W8A8 quantization -recipe = [ - SmoothQuantModifier(smoothing_strength=0.8), - GPTQModifier( - targets="Linear", - scheme="W8A8", - ignore=["lm_head"], - ), -] - -SAVE_DIR = MODEL_ID.rstrip("/").split("/")[-1] + "-INT8" - -oneshot( - model=model, - dataset=ds, - recipe=recipe, - max_seq_length=MAX_SEQUENCE_LENGTH, - num_calibration_samples=NUM_CALIBRATION_SAMPLES, - save_compressed=True, - output_dir=SAVE_DIR, -) diff --git a/examples/big_models_with_accelerate/multi_gpu_int8.py b/examples/big_models_with_accelerate/multi_gpu_int8.py deleted file mode 100644 index 9c1679eab..000000000 --- a/examples/big_models_with_accelerate/multi_gpu_int8.py +++ /dev/null @@ -1,78 +0,0 @@ -from datasets import load_dataset -from transformers import AutoModelForCausalLM, AutoTokenizer - -from llmcompressor import oneshot -from llmcompressor.modifiers.quantization import GPTQModifier - -MODEL_ID = "meta-llama/Meta-Llama-3-70B-Instruct" -SAVE_DIR = MODEL_ID.rstrip("/").split("/")[-1] + "-W8A8-Dynamic" - -# 1) Load model (device_map="auto" with shard the model over multiple GPUs!). -model = AutoModelForCausalLM.from_pretrained( - MODEL_ID, - device_map="auto", - torch_dtype="auto", - trust_remote_code=True, -) -tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) - -# 2) Prepare calibration dataset (in this case, we use ultrachat). -DATASET_ID = "HuggingFaceH4/ultrachat_200k" -DATASET_SPLIT = "train_sft" - -# Select number of samples. 512 samples is a good place to start. -# Increasing the number of samples can improve accuracy. -NUM_CALIBRATION_SAMPLES = 512 -MAX_SEQUENCE_LENGTH = 1024 - -# Load dataset and preprocess. -ds = load_dataset(DATASET_ID, split=f"{DATASET_SPLIT}[:{NUM_CALIBRATION_SAMPLES}]") -ds = ds.shuffle(seed=42) - - -def preprocess(example): - return { - "text": tokenizer.apply_chat_template( - example["messages"], - tokenize=False, - ) - } - - -ds = ds.map(preprocess) - - -# Tokenize inputs. -def tokenize(sample): - return tokenizer( - sample["text"], - padding=False, - max_length=MAX_SEQUENCE_LENGTH, - truncation=True, - add_special_tokens=False, - ) - - -ds = ds.map(tokenize, remove_columns=ds.column_names) - -# 3) Configure algorithms. In this case, we: -# * quantize the weights to int8 with GPTQ (static per channel) -# * quantize the activations to int8 (dynamic per token) -recipe = [ - GPTQModifier( - targets="Linear", scheme="W8A8", ignore=["lm_head"], dampening_frac=0.1 - ), -] - -# 4) Apply algorithms and save in `compressed-tensors` format. 
-# if you encounter GPU out-of-memory issues, consider using an explicit -# device map (see multi_gpus_int8_device_map.py) -oneshot( - model=model, - tokenizer=tokenizer, - dataset=ds, - recipe=recipe, - max_seq_length=MAX_SEQUENCE_LENGTH, - num_calibration_samples=NUM_CALIBRATION_SAMPLES, - output_dir=SAVE_DIR, -) diff --git a/examples/big_models_with_sequential_onloading/README.md b/examples/big_models_with_sequential_onloading/README.md new file mode 100644 index 000000000..f10e1e394 --- /dev/null +++ b/examples/big_models_with_sequential_onloading/README.md @@ -0,0 +1,12 @@ +## Big Modeling with Sequential Onloading ## +### What is Sequential Onloading? ### +Sequential onloading is a memory-efficient approach for compressing large language models (LLMs) using only a single GPU. Instead of loading the entire model into memory—which can easily require hundreds of gigabytes—this method loads and compresses one layer at a time. The outputs are offloaded before the next layer is processed, dramatically reducing peak memory usage while maintaining high compression fidelity. + +
+<p align="center">
+    <img alt="Sequential onloading" src="assets/sequential_onloading.png"/>
+</p>
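+
+A minimal sketch of this flow is shown below. The model stub, recipe, and dataset alias are illustrative placeholders rather than part of this example folder, and the `pipeline="basic"` escape hatch is covered in the section that follows.
+
+```python
+from transformers import AutoModelForCausalLM, AutoTokenizer
+
+from llmcompressor import oneshot
+from llmcompressor.modifiers.quantization import GPTQModifier
+from llmcompressor.utils.dev import dispatch_for_generation
+
+MODEL_ID = "meta-llama/Meta-Llama-3-70B-Instruct"  # illustrative model stub
+
+# Load without a device_map: weights stay in CPU memory, and each layer is
+# onloaded to the GPU only while it is being calibrated and compressed.
+model = AutoModelForCausalLM.from_pretrained(MODEL_ID, torch_dtype="auto")
+tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
+
+# Example W4A16 recipe; swap in whatever modifier your workflow needs.
+recipe = GPTQModifier(targets="Linear", scheme="W4A16", ignore=["lm_head"])
+
+# The sequential pipeline is the default; pass pipeline="basic" to disable it
+# (see "Using Sequential Onloading" below).
+oneshot(
+    model=model,
+    dataset="open_platypus",  # illustrative calibration dataset alias
+    recipe=recipe,
+    max_seq_length=2048,
+    num_calibration_samples=512,
+)
+
+# Dispatch across available GPUs only when generating samples afterwards.
+dispatch_for_generation(model)
+```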
+ +For more information, see the [RedHat AI blog post](https://developers.redhat.com/articles/2025/05/09/llm-compressor-optimize-llms-low-latency-deployments#generalizing_to_multimodal_and_moe_architectures) or the [LLM Compressor Office Hours Recording](https://www.youtube.com/watch?v=GrhuqQDmBk8). + +### Using Sequential Onloading ### +Sequential onloading is enabled by default within LLM Compressor. To disable sequential onloading, add the `pipeline="basic"` argument to the LLM Compressor `oneshot` function call. \ No newline at end of file diff --git a/examples/big_models_with_sequential_onloading/assets/sequential_onloading.png b/examples/big_models_with_sequential_onloading/assets/sequential_onloading.png new file mode 100644 index 0000000000000000000000000000000000000000..a499cb66a98205f3a13ff7b54fc9622b3d903015 GIT binary patch literal 71199 zcmeFXWmwc*_b5DsNQsmnjdVzNkBa2bNVhO_m$XQWG}57TBi-FC-Ce@a-Ml|;^?si7 zf6sM3pAU!Y8fMRqwbovH)!u6b$;*mAK_x^5fk00rB}5fLAOsrV!-(Nuhai0n&2eL@o`fkIC0iwAba5~Zes zdmY5~B4=v2!EnB zK|be=1p&}cfeS@b<~$T|3Y9J%5q2|SYEmnOBOLrsIL>d6#f~1?(bMwt!#A@}-8IxV zAYmH0O)KP{4&B{rl>1WDfIw7)D-Pea!XF>PfexVF$}vwt3aBo@rk9G;I!{VM(fN^W zE3l38Y|5~+^A9>a6ED=5+rvQQ`Nx-~5D=#;`49u9;cF89FL=Dtz6L1mcbL~3>oQoc z1)mydnjmauFrH3tEp8d!$#8f`L=6oQx8|u`q4CMTjNLdgzDeD>SfplAW#bKaQn0OF z`cuuQpVd0dF3Xp(gui{CdMiQ!(;VUBih+Jp12#t)?spUhw=os+&DJD1n)dhWHMaF| z!YbWQ(F3NCSI{M`4BJ1-DERZLnIaqq!~`F^9Ad)WXfZ`T?szgq94iwou_&x$GBTHG~xj7g?=~k1}TZcszDL z#j*n5bsC_QK&Vhq-=;Y zNR{OKiX_6_ygJ-4f(;x2~f)uO*BY6UNb#yR! zcq_@tm}S2T>%1HY3st=gQZ+noOM7>>(^j%A2`LjTUL`F-Fq zn77;S{Ie%@Px#}YB`m}C`-8xz`Jm2lwEWWh#bcix9)E=RKsYfrUdmu$6G?OmBwwBwaeyi|xjElm4RP(sAMT(0(lqXXPN%VAY!&c-< zNQ4#JblNN>-p0fBkJFF1Jm?nZzNai7+c)(!VpcAx_|`UW41#=o^uzD48sxn6Q;$nlP^{8rjdkb;s)zc4X!Y`0t zzcxWeU;{(27~A1&UrhzFVEc-amq@Tj`WsSUi+cCJ#v{+{mdb#i{xU}H7V)MVWsP_w zbWU`SQoEaRjq>yN?HmlVFB<9T^4z_z+J-+kP$UG&zC|1M`=wY;T=?2K?erJQiJx`Z5Q6x}E$#tacD&+X;BloP8O2 zapjsHT`0R_G$Ix^=O4P>;Cd1?g`F?yqHun~YVlrryM)z=!;UhF+~LdK%H8^^)wDX6 zAH_OIlp@OH&B;eMw)i-iBuTB;bKp4{L0Jo0VXCu82`Yq0gq{H0w<}Rck{ub28Oa&% zGo&+QN8CyYr=xdXW6)%WQ?F6j5x6|Qq`SnvM7hLy8)fkJFilj!H<}>YD7rA3ruUH) z?K;l8V6!);-#eAca*uP@$N0BRw@J3mxwyFOx#TLIbCt{pR@hYtnHi7o<$K6as#K|N z=4IzGj>YC#tBMqPD&2C0h?Qksjw+g5mS>g|&PdJ(&A2`B_m4u7dy~Vany#3xvl`in z#_wnKcJn9gFxr;U$?XZ+iIvRftO^>{sA4|xGR5QY4aWf;CjSYOLb^i1LTU|x@!@{|@;ofX7pKJMlEcsaS)iz-`_sv+z@bZ=+EP8uq#B_9PcznWf zTxKk}NHf<>w8|%ltuS5?%4|RW)Hpl)tWGBQR1WIVh zX2!M6#m$Dn7DHIfA#6Hl*~)py)5EQ9X*ZV97p=c-&cLm0>DqrnARXJGzh2s_o~l)lfKYl&8gp@_(cca+5fc>;FCXvF3u`vPpq6iJ_x2$OWaloEesH|xmZ zNN?n?VaC4aYl{PGsu3Ixzhpfe*UG(L92ns2OGh{U`wg{t$Vlt$NXYRSx z>KtB8MYV_RxlL@%WR0_nnrntj!Fl!IguZxL?2jQ=Y(WIUO;>I=&jY*j-t(Ew)7YwX z=g@PE{p|Oaou50cNUnc&%?0e2O=q{ot|x9>mstm-#*-};_wIW)`8ECYO1#GjC813$ zi#n!g<^$u|ur85I1`8y*31Xt5qr611KpVl3Le4|3#kU|$BG4k5!&$*&=UDqd_kvT< z;q5QS6gIi${N+<$7vBL4foDaer1&~mcjT?YuknUCF-Y*at;VYbB2*$GyZz}fJAyTy zIr7ZVxSE|TKR19lK#clVr-n<)42Ig-R%p6*1Ko(oFq>Z9IeoJHMDd+EA|)$A2dl{Cj5$eh zno-(g+I>074C>5XiR=Eb71cwxQI{>)s2ieIi$;t6_a>W)AJS`-8uYG_%TXoltEO3U z8+rXjFPjQV3#`~wwO)9}lO#0IUu(-M51u78&SlKayR6lh*T2EGlnh-P!;PXJ%`TTF z$>EY7q{`*248p52_KU^(*|?WEFX)Z&Z+zMP10o(J|BMJ zOV=`T(^sRfSN)Fhgl}G~1@yBaSTr_Nt2h_Bz! ziZ`Fmsu-*ITtTvqwH^X{Imx7ZUPF<;VrB$h-#Zn9;TL;Vy{cXw5xWm@O#H(0)#~b^ z>|&EB@eTK+s=De|%NJJkQ%4`nS0*@`et6|hL#qoNW@C3FcS*)e#>H#$m%>TyPTj! z{G8qO+C`MAY)6ur0Ey?-PH{_9ieMX`{Z7Hq^oG68CdO^&UEy;5_2rdH)r3z|-L2=3 zfri4nPhEbU!72EOGhLNr4QtL@?d#2bO*+fkd|q5fX4e(I<~&8&I{Vcx>JLw#djqjm zs8$}eLi-rYvOU@Zbw=l3_TJspXX8jBeO*t5~aOf$dsr zEta2%oiDg0o}Z1@MV=$==ZpLhNxFfR@IcKqfsP)}@*V{#TB|J1l|QklCjg)? 
z%t4%FFqToNkV00Kny?|8H5N&2zmsx;DC=X9MQjRF*rI9!ax1+AW(oQ2=UJx zS>XHOFC6$htoi+o5a|zk4E(|dJ|Dip|DBCM^9|wOHYYF!dZqM6QWE%9dT(cBWMyw+ z?NIVzv=it+v5`=>2Z8XYA3kuBic|*x{0UQKH3u~r2>*L)OJ;pTYXc+Z50*9$a6p0| z_<^RSk%K<@2TKbpd;SkX6#vZN2igzaEEMGbOmQ$5qEM5OCx2sYXGG4;%*xD4A&g2+ zPA+I?2<2B475lRs_$5SP;^1Jz&%)yD?9A-U!E9}3%)-XU$H&6T&ce>l1k7Nvcd>HN z|G;ErPx%|jUpS&h_V4XXZ5&Lkt;iqX>Kj-)ItWovJRth_?>C)BA58y)$;$rEwtx+? zJdCigF|)G#3maG}_|VHQZ~DQ=LS59<65tttLztJHRq&tr|G$y{VEnI@YX7y8mHWR} z{@2L=T=~x4$nK4`C4kaF_&>+>XYqdz{#hu<^04#&;>2$@|I-WbSr}E2<=-GVl!u*~1@P0`QOS_xD5Fd%(hGtO5iAgCs>?DSv?5Nq(G!`*tP(hT9XT z_X@1{784F3jqckglNa`UZn~pq?j^b71%Sc2~Ol@2Q z_fzYF>Ai)N>G*jRA&#Nz>sJ9=MF+}57PoZ^Rol*tDlc{QS&_jYc+8I=L~;<^L-P^U zfq<_h+*TCV-=2R4m$1Nb2ju@_D(y#B`VTnrp#K$wek;dsP=BG~CV^TrmSn(ie^C7C z_eB}{7wbP=@;30uQJ;9=9{q=!hki^NL4?1c{viO|fAhiPb( z5sC^5Siwi~$jD%FiEng%L5Mv!muR23E_ZXLk60*&?6LnyuJ;k;K^sFdCF6YMu9WR3 ziJx+3Z?tqOYb7_*zmdkPhYN8=M+&ukf3 z?6)d;m(na5$l7c%z`8#6;)jDR{7}-VAGtGh6aI!GpM{vkC9s&!mri4J1(M(WgdLd4 zs$ZV*Z3r>`&2PQyjRBR=7|ot23nC#J&_m`RTGh54ODNxw|6pLJSPJ9Xny*tO7*EQ3 z{q5jkQW6n9hwZaFxbKL6CfQMb6i}_H;5`jN^HxKPI`yiiVNRlZ{8tc+UN_g+V;L*b zJdx#k|AySDHKW!wH#aZu2=oGCaGWHM`G1z;Ns&oa0>-hlKK1ufr7 zzW9Tx8v)K&l9W8^5h4XLIfUYy4eH-DN7TTY#mejrzlX+4FS|cN2PTICB(a%eepccJ z8b+gI%D8{mH~=t4zPEE>b_)Yw&*CE~{<7{Z1VG##B0e0P0MM3`7}omd+{nW%0kSP{ zsD+R-#6kLzq_jn!{uttK5CB|n#`j$44$v@|q{2o1i|92rI1V>b*d>+&z{6K1>n{~?+>P}WEatvj(&8hYyY0t4-pa7Xem7sV$iMM1xsEEdtm1eur?{WP zx?h57^1G$I{c;HrZRo9l2gZjUNS;O{@3`Z94D{yMOM(1@=|lzA_q%@0EfbO6vnoGk%)P4yKSY@ z-R;`u#tuxJHgXiXJBRj6g19dEsmS#Vy70E6bKF#;+dVThKV+6FblMw)xGe|l%FAgE z#oHh7KAgU!=P`ob0x2nm%r(2CNR#$uDdvfj`*S5NTWgE}*Pd)uqeStc#U$*xHR{F8 zDli#`sN5#UU-dR-X~UpRv;|dns4D$ua^XE%q0q2*}Uc1 zAD=<@FquTkjw%Lba%HU2K$J_{hRDoPJn}ntVcjAH)_c`CUNYaOeL#qWq9FYda`!oO zZ&w^}+!zrFPt1ex$uO?=Rz$x2u3e)QOxL+~8@3t0TV-L5`~DLk*wd6BS$jyO6+eM# z00i+rS7u6qo?KHq-ySJETPzwbrZ!T<|BVPSD8VbVt~K@{CA?F;)D{z+&`A%PHU(dQ z*`r_UsJ0vb8J5ndDzh~OkY)D@kUF%y3>5ELz>#l5+n*Uh$fe@kc~<1p^;Q+R0SaLh z5HqzOkG<_)0GbdRla*o%5D}yqET5u=6&S^Oxjy*}+VS}rv_dT3uwBAD^5ylG>S(=b zdij2zXq@u>LGY#PA^YfXQC);(OIE2@!!PkNyFT@DB5{6v=5iz5 zv*Y5dH*b%-`^u;xjk;IC>KD80Wma_?5tLS2viQX6-7k}qF5gv{j=OV}ZES~>MW>Yv z6|f!S-2q3Kd}f)DxcId)Ylx5so8cwETm8!@w@W}(qOE6@jyaLnG~(Rf3*?Dw3ku3d z5)JOfB12QStaEFYmr*~bH$qfMtpuxUq`(d;gbjW7WxHi5G+r>BW^OONgQR&g4> zw$Ki#8Tj?s6_?cvUzsPQpY3kU&wi*_gU@ExXZM^jDAsBdj<;?sPgZvnUSG(Pt?G5? 
z{VpUE);o@HDs>Kq-)Xr|G}9?J52)vHO;L4nF6OV$v>NMVwox+;oo#aKq>$OV`LUyV z{Hss*17o&aCEVH~)9tjh=P6!({FAyO{mNMb68o8Dp#hC1$3Yk)qEZzKRS$=!nBMR!6=$IMXc4M>xWUVY|K`h3}w)gi_^E*G@op5Yu$E}G7c{z&+ zOxGVY#b=qFoo|RUelg0cOIZ}z9UmvIbyF=9>l?2><`HIZLBBLfsVcsADtEa*U3!Ak zwF4$!1jP4}M%EKkO%kNv8U^ykMYLmrjCsRqukF3D-4D90psPV6ndAK%7D->3k%Eg{DH>@h9+uffWA003Sty*4Ja^2T>)`@np9=2Eu^!WCO z%};XDYV(Z_ouQD#$ZPE97Vf9e`P7~(Eju-q z# z%6Hq}Qm2wYc`w{AXf)eNFiIwXZGrX{Fm!ZBYnpoiQ`hehZ!*?5-E4C@T}cgX~)-thAJ2BNRE3g1*iZ50TVH7UdeV}?^RDA zU=(pb5`CSh=6yOBGV5Zclgm72;9VtnSM%)Mdw5E!+`VDrfrJ9HA|Vg^{Q=KBb1qlg z>EzS|^L?_|?NW`@+_qwjGxImbw8Hu3D$Ir0_xOxOYb!BDg^RggFWdG7T+-$DQVbcJ zV8{NbI3Y3G?z7J+UnGWV8~ODRcx`_@Qe?XM@N;@UKCF`k;-Y==Cd8fTZqNAm1okc{ z43h>0cLz5Ra3|oUp2{7-O!eMjfxry#ewZ|x8dXV8cV{h{=qq1)UTy0I$gt5W=`3_L z=8{_89d~|d?y8kzBkWnwx!=5MfeyWbA?*#nie^<1=HG`{|6H#fH%-Qa9!942JHSfI zPP%524;LmpET-D(T&x8577kR>mdVb?@t%C9Rk+&K;Nem-_B3w1x(YZbTsSnZ+SjXu zRmrXy^XHa-6`#VDE7{$!l$i`IEC`-t5wo&x>wJk^kVo1``<0czOgqhqZ@Ez8+}zL* zHZbGLtYv46%~+G$^OYffT`Df8LAT;{UmRB;JiDZFg_Bdy%!a%lX{C2HYO*Cw`aL&8|KLY7rZZ zWrD+bMLK4Ad3lhsb=?GvKGHs=B0~gklLy8XQ4tkY`PW>H4fHn)TMET8J~D0M*k+Xb z=qvC`Xt%=7Cvt1|n#&7jURgn+|A;Mr^kTWXc;K0H$dl@mUo-QSTfag~Oct#bTHEh0 z53jBDWChZ)b9dy+ex$A-lX^MU*7Vm#4%{=Pw9co;J^4!CtGss?&_HM-6!ookWlb2G zx+~!JYJacMR3%C9suzEuJ3u$XalgKn;Iy4IVr6qR2yI}gsa{7q$rN#k7?wvJOVK0= zN_N_%V-fE1TG*wm9|BDG5gHccF@V!!KlGl$S0Ia`L}}n>^1jgZ zygZ>T%XyBYoxfyQ7>mt?5Nm+oBz!jj7GKD0nsiF;Z&S4-;j{nxsrd2TRs}df$+~s3 z-UD3du80yFh!KS z-JjS9sdGBg8q}Iu-ld;(|Mlowz|f{*oJ4)E(a%R00mq~~jwRE}aXf3zN_v)IJ1_>d-+O_ z8qOC*4jxev(ILN6O7|9oMvM zm_M5pm)ld2Mx?@_*>ZZmNXRz1<9t;IH(6|dR~k{XlR+MA1^s!;^acCoUW7aLS2r1YP|cX`p38xCc>ZN1kIn5PIa72 z+wA6K;B1Cg3)e2 zSmTGzlXZpgQRCBRvoPm96C}%pK`#dOPSHJsbPMsCB8&#%Q!DgMlO!kgf<+FQR&D3&t_TfB~(f5X9fk7+U<$GeaSr^`Mbx|R-aNA7$ z^$i<`0ws+;Tt){W*Na#DN;|de#CChvPV-?b^WS+se+|aMDST=-AD>8jBw2ZU%K~U^GDgj?xDieJ|9!?5` z)2>O?c@Px9;_ZtOn{hre<1T1y6=igAp1F(%V~sqcA!Hlw4k~l->-jQyZ!$B1#j-r& zp!ul9997$RyC8p*+0AWDt87!($4YUp_R8uu9d5zwTvG%~LaKyXBasSc6kXd{h=b2o zI+~TheGKMJpK|XSNb;38eNQ1sG|fpI?lvI#-g8Bn4H>7<&u-pl6~|~H<61+2r;b>EMdAk&EgYI%Aj3r6Dx8@Ph z9IbPjpfP?rGaT=;BelrRB$_iEXpx{|-gH*}brjpH>clvG_(fBL&GCLn*8V_ytCfaA ztEiRZ9YHnDR_ox*{i0Zs!MGYb{^-af27&tLEu#shB^ePBrp zcvAp-2FmBY-iJqvta%Z4_orpH=W9wA6I$wJRa*K+%PFS3=e|KSc;PnoS*ONXO_iwv5V!mwV$SbA9KJ!6EtT>>;OIvo*SIbVM3Q@Q zpi?;@Dp&NO9hGfRxtxF~eO9&Q-m7V7ctq|<{EYUv$<3J?fm@rvXJyZ{yK>)~g%&SG znV*(yuI&cazogvcn7O$5F}6P=k2 zRxV{>)WRfNO}2}Z<7?LZX&D`oJ~o$;eQnEWXt35&D@Xqe2r)^e-@}U$uFDaVGoa3P zUhp|yQb2<|1vWI(YEl?cyK}s?=$h|po{VNJzCSx~bNy-Q3eT&mW%T~&k(_k7NymYQ z^RFE%^Pk)d&BD{Dhs$3)uUu-TVbS;T=VC)>?!dWm#)C7Hsdod6nrMpV_4Cif*qF8; z(KbE1sR>@ftjxB>PM=@Javm?ZF68tT+oUbpX<*V4o@RStCx($G+DXY2g~1pOE(nnNqO0i`{`bXZBf(O7_O~ z>qxkbpYxOU*^yQvDb+u3RvT?MBJ!&$O6c>)YGt>~n7*nX z#~=yU?U#BxW|4)u*#LF`$Kx!v=_!ZYT~)94hUMs%p2*1xr+w?QmSMbkt(UE$jw}=uEslY9$6q!31ospX{4BFMvkA~KoMBrAW9|Ehn%y2rMwaq>b5uhaKVqYQ zZ-e4nwHldap+VT=6wv}?_gMtOSu{_RttV^aYhWvP3kKgm3mwk-vNUoTYw`4=U()ds zt{1(E?`v7(8C5m*9vmhKdz@;hr{FOu_p>P*jBHJ+`@$KTBVW*6yh5zNmOWR4) z;fAo(nNhk%%HFY(`L;&Bu_et$K6aD9B^O|pxld&Ty3 z=&iYm$N8q?oDo@^xK2_0X^#g@N%i@XEVRKRI3 zDA#2gpN?O8tmRquLn{%8jTTvMJBY3DS=>D?l5)bLQk0sTvRFLL+vnQ3a(#oCYx_eh zyBl8OnClMr-{{xLZVcW-Wam^Mt)hN0bV5_1cNYj3IA0}8?Auec2jrYKDzn|KY>=aH zn=j~C0NrzGfzV*)JY34rP-w~JAD)=oR(9Q-Dw)Om2C(qaxsIDIOE!F{+_Yz^Schhp z_ESs;A()@@)z?aaH0DpsaM9x2D%*sQruE03Gw=*gO= z=fvejF3rryE@RYf4ISk3m$kk@MQ&H83@lddWa#^QPp#b?wc7+ecgc@80{MV9Q}YQ# zdT#wRSNft~-|QDSQXw}qS_2wgX2o7fl{Lc|$l^24pEa@anWb87RHFIdP>)G{#KG)- z0k}1qa{dODT<4fj*5QhWr@TE^=x6OjL%TJ)PrUSJYfIXU4O_wYw;9?6 zKf3*bwwYoJRCf)coCSrZ>wz0QicsWJN@%0B_>cV#efE@p1)Zk&BRZ__XiJn(Trf}y$M 
zI_v|gXb=XdQSAVXkx^xgV3JOfdpq1z{m7#MZyI-zz&XQ^kKoAQ ze1o1M)%}wo3D<=`*Q{!eDkhBYR^}*Sa=-sp@THa-+vqVo!NXky`Agr6aB9Glz4WFc zc!0hx3%1g2FZ(L`PY(iK#bzLI3{!+@j@Yw@D} zld&Vfo&sE%OR%|)!(68m+@MtYR$A0_{{9emz_cHW*Ui>sJGBPncs2W+lVmr67TDWE zfFW_UracINS)aQad&tdzV~|ZgFL`^pGDXaog{U0jb?r>ng>0G7imH_vy|+&DWlQyX zFI;lZssDP)O2|vvzTb1cEt}6Dya0ryN1;jRFa9eL4)#GIhT{Kl-z#K+C5KLg3|nX= zloI%WzXNko=i%3Miykn~9E}k@V8%l+nfZlN{9Iqm^UP5!x6*vRpSgT$Qgetpov7wH zJRjggdtPWo`h!scX5rdg{LZ|vQywh*a@lCy%ue`DHpF9(H|8{{Xh!%IQjZ-en=TBsjHi6l$+>KRHPAN z28ui+4uO{@j)9m-FY4us)okB?Q+FZF<j%Ecw0#Ak#NqT=Ffyv1bwcI6x@t6u2Rb$-^B}p+q zHd{ZWH1qOf6Y}yHiUYqS92EQ|Hk<|Cs{BtvGNRa`=P^9ykB7-)?1Ra{3OFfBzw?$L z#G_ZuAu)6;bJa($k_5>ysW3J5TXQ6TeCPd}k|VD|e9qLvq^lSJ)iJY>x)g8$|^LEc|bE*=G0K{Ke{`_6&9 z3vlRA^;gUq^jhFKyzbzgf1yNYH4W-HULayQ{XKFCv88}_{BC5D(bXP>s%fU)n&l=b zJKTdnBGQFIUb$5iLM3bH>7Dkz0i$dN~uZf2xj@NXF* zttQlpG0bl>d~0mbz3nJ|xL=}{Ez{h>zw_KKdE5_8v?Av1f>KJ?jsL9GCO*Ehc zIkbR~3$@r1{boM-gKn{zAbtks8U_HWB0wzp=yyg{9vM*c*YdP-Ky3ppFpIB6D($zh zdDjD;aP1LCJa9gjfI9U&HahxK5wiw35-J`o$!EYeAM%56-=e7hV3G!4sKZOL1D0m_ z!Tu~FMgLVCfFV&=k*mT0CZ`7#H^)Xr{4Mqpi2yuHWi4blx zY#9g0Np=JK`&`gM$NU>rGmtc1@o52m$Ce)2iB20ls zPf#|{e7BLgnc6F*K?VQUO0@xdBJ2#Sdjawm#0jU1;O+5Gc0Swk$BMEDr+V^Rp}}nc z6@-J~xa1F$x+ija(p)%>wCqZOwC6vJC4XQAJ9`qKz6R3dMCmkNV%jTR=%4@Q6a539 zxCO`k!E6AZaP46VPJiHX0wbLGgMfWuqx~uG%sf|Zu+WVA^f%u_0lx1fB%!|f#)-^n zIEKhmH1Yu{=h1JNc)&{23fc;9usW7nzILqZIKI9ji^bd@5$JxvDSH^n3nX==;ZGem zTSFLQ3t!Ltk;N;(G>?T5IRmQ6MlMh0%q%8t@Y4nt?jOYDm_Q~M8_rJ&5V}Lm?y=dIEz3l}UP?{P z7*M@OXv6!H(STF1b~6K13((_^hf0APONl3dTr47s*4VRYw64PaMoEVOEL<_6l|lh- z(vsvmgB>Q42}{-JzfsET0i*CdplQHpPE#*A|I0=r9t6dIrRqNx1aR6H^EEcB49bk* zEPrIq9stoFZwpk_VA{i5J@7G(2e;$T;tPO1?yOS)JEVL*ipls-du^sTYM3eqYW(#N zAHfP}QQO;@Iyg+)8kTdOj*gB5oJ8+@L5PZ27i*&>8&Pbj@#%N&YI%JZ0M&Fj@Jt%z ztV|l5=Vr6@E(4zN%Bg0rznLK8X0igcSU|zoGs?j5G?x)NxN}YHk@r6Lln}W*CM~_*Ar=tQexvgZ z(v2pUIJ$5M|5JYU%@=TVs4E%1LH0zZ_rjCxWrv){ogCLN-;5jWC0>@M3?ox@Hs5RY6Axqu4O9LU+@zl(_K?d@$pX|3sxDhJF9`9s0hy@_O(sw_%O0_ATL0El0>5j-K+@UNgOoT7rA z$n`PcnGZ83_;>A=BNs{PzXd)r@Sd^EVlt(YG3CHJs<#>{j+pd_x!}KbKIC&D3Jca;4dGxUE_pc0144Y=*FW88WO^$8vdhrtqy6UVI71&AQ!u#) zdaEe4q3&DE_a2JND^z?s)O1>RH{C4gGZbJ@0+vnO# zI>Fz%1{e{5DLvc@BR*hPV9u893d3JSy}p1Sfe)es(L?%6!6Ng3cxssAwc_Lfj+f?@IzI07SDICQSAHlN*gz;z)pzdw{G< zW7$wlOyFFZyIh5(6>@~SC}|=4%{y6ICXWI#x|Lzzk?*k zgve-K7iaJ8?A7a=mpM*y$>6w=20!(-!>6ganGcp@REjhJ%82MI@zU!vUjs}{kJ)iJ zblbQ#;%Z&%->r)acG^n#icJA2{Z#TT$esUMplqIrj4KD7DY3vzcA+wg&&bAjG>y&5 z{mG~J%)~yRLgmPrMOsFb-RR-6;~ygX_5FiG42G(y2Ws7Ui^a9)e21w=TOi}aR){HB ziTuc({@yjShdT=oc-6H%KZtiiw1xCTx(bmnf&mxEWlDZYcmmMX8K|#NN)J(P5X-+A z)M~PRdYsKP+x4SzU5UyUr8UDn*a>_@=R&6WCEVY)_!PYTA0VG9uNM{T$#la@3rN(OURJ6p+;psMs`6 zan*y-Rgm$CwFk8`Qie(@PS6WNR;GPkv0ZRnsvDE&RnIp+$8MOV-e~Z4=}JY0DE>vZ zmHveJ{7ah@pBq5F0e5?x5h4AjeoO<|@FC1{dr6d}$c7w6=6N^%^5V{T)bo@ETJg?n za*01X>LdSnh^ltS4RhO$!y?03mPO3*rmpJq5Yckek_6L~&80(sWSlOSi<6EG-)x$M z_+U?m?E1p?5K&nV2uIJr{T}V{>QJA|+x*Tf$K!E!t9<24DRB>@_`aE$6V(c4lWF!4 zihT0)TTPBmVK@4t{zLG_=Ekn?~sy+zi^=qqt^|GI5fGTh6gFo8@NMggZB6 zriwGs!NN3@^yjdHc^SJ%Y|8BR)$Bs5so3mwexv&IWsylu!j^80zw=@xY*MIpX~b^U zRQF!bs>-yH%k+D^rGQ}*v+lBE{iJqT?$&UnAD@1r;{XOx0Xl^$?Q%6OZLMa*ZiUw` zt6Hc&Ck0Wf=p#j1uVa{4LS|!buH)UV5;u>>WQY^m=%#1l!Mh5VUu}FFr$M@j1WmS1 zsa2I}mQ^~I{f83+5K9%Z#P@uQ9PTkLMmLii(u`?T>_*#D#bvJV%k0A@p!Wka@o-apD*%+&%Ezh0Slpgu9LkGde#_H0qo= z^}%z$k5YA#8J!#Ol;hT{`pP#x_<5s|;|Uhcw8i7Er=RaVm%onRo?oQ$$Rb~#XEU+O zeYxxXRR{Ae%=%8c(9ET3cUiZ&y<9nc!<>3@A}WVjU7i`cFO;F$*}BAho%AdIWOf*V zKTg}N@-P*3r|I(Y_3;VqasO;o6Wi6m;p~-VLVN>v9xLta*2XD(CbLqa@TTTcnfUC+ zMz2sEwUwL0nAY)(Z*ho@*~`0S{QVoYGoFiD4{ix9rOyID+-vb}TNxctK{m5Yz*-cA zu_^={t+?X3vxFan}!%A%x_M 
zYJRZuu9?pBI`Y*=c=p2>`;g1d9q0XQZv!vKalPU@*J<)$Y$t@dOk?xowTMK-y`B^D z*e184mo*p6y6?3q?@gN?d)7gV_ZEUt?`xE=&AJv;pYj~<1g3K|oDev-fUR#|I1gG< zqYt|3m%2>cc}^H5xa?0LaHHSIip|@1z;xq??szB)ycuxIFj&V|J zk{Nfbf9XDUv|;VhcNc1 zG%)T2zPk18`DK5V*sxFfo@OAwh(o%@WZun$C4I1*-}|u$??h<9jr^uZ?b7R6^$9vT zhS8kX7f(H*ol{tG4l?N!VRxqQISx9XB2=|2#O0(9u+n)@hqp= z`ms`zj+tz^l^)?hZ#z;!Rn|F8jgO;|l#$(Ifqr58_$-!Xn^jMt{uhbGIPvwO>y9dc zk`O?YQ`O#z`CyH6fxj+?L3)ya5O5Znd)Oqy})ooR)AVkUUTjf2YBOVxnsuv;(o1orvEh^0x0f9N@3pF?1`nu< zWk;=APqLJ)A#F6?wVBV@^{cRex|cAFm`wLME`&Ju^R}#Ko4Wi~Wf>&gD48u)Up^C? zs->$_mU+T9#56PawbE}~e$}{d{E`{?hOFAqr;~_h!{TtBTaPWBJZT1vxw#0=EgcG9 z;cU$^U%1v3p~SB+q~VEe*dtVuzWAk1tT&ImUO7I6O6S4jFRMu$A0{pEm9@u60LYHk z&|HdxwF&0Ar;k^J<%|>i78|s@VyS0HuT8AJB@7W)n9jfu=oetQVNIxW_eY0`1RlCK z+=kl}WkW*mZ?fw>Zb=L4pg8+gki`Ldv^4SU!ReBbq^42VioMAoI{lHV(d`sQgRv)9 zGR3toFVVi_xP3k(Zq0bE8Nwb5{}6|0u*G_P8+}G&$24^s8fM;55u@4cBJggeYGhZ) znL1#1Rb9G3^XHcgX5}qZoY<5~yJ7yxjd8c+m=vNQvDa&>?j^gT`R<-kXkkS93ZQd8 z0cS2EXEm2p(KZ{{Q?E)#59F{Lt;Vja4P2NHi4lGi`@r0l&H#~xHf2P{_=wyOTb%7c zfy9On@iq2S5F#J;e3k4mZ5&noBmYw+kzd{=MR5x%Mba7s!WLb#5unTR>)mgSun&90 zcf8m#p{%85L44;OJr$yzmXA+31FW!w@@%sl7wf#*ExeAhqMkK-EoSCVcZGhI{J^V} z^bz{p?QGNqP9Y~Zs(8MxuC9N!7yq7IAYlVP>}sjIkbkPw`E0?ajtCc-r7ADKz+8H2 z-dvn$^yoyFI|x^0QC*>|;#n8-UTbEy+nwoZaBEqzWWPD0xUIB8j8xoODM^}d&`k5V zbUGZu?k`nqBAyI>civUap^&~sZiUlam=>UWcDLFS!O@l$v)7ii*VmcGnarcz8t07u zk90s==N9b*nn+e(s|i1}1$g6=UJ3CJp@y{WocG*+dfQ}gJ#BnP48zFX9EsP$2}oIB z!J1qfG$+ENam_t0Y?y4&`!#9w#9GQus7rU|1AEXp?3`)$0^9agTp&D>_YH90SW9&o zA-2DK%&)C~^l_J%p|j08VSRlZhg+JkyFfPeX8RT{dhZV67sMa4ttS@$X@#KCXMpfYJkRQQOy%*($e~C5^z0lL zq2Khcgyn_Jy`;+A1yY#(qvL(M>t7m_R#iJq-}zdRIs$fIUqk%gc5a-7lr6Y3ZqUWk zFRXh<*qeDILE_eGAC3$)TXwhREaZA|=Ukc+JYA`EySEOjD~Le9w=asQ4jbIl-Sox3 zFL}MVf5uRx(B!46@!h?F1xa*k(M6txMT}lu6H?BshP@`Myj^R_X=a+*^TW?C=vs>< z-Ku=?aDG=5Z`}ka5V3IFCVmXuwRp>0t&O_J$xNtIi&&=*7t;}1_O_!1J;rFDHdSL~ z>RXn_aOWXh`_u7mH!(r1!-Ex)0v}rG5R~q25CJ75q#L9gq#H#- zx;sA7-O}~U2k?7;_dd`42cETXt&i)RGiT49nSJKWy!QK$6JW996Fux-d{mr))_ldk zG)d_cyAm={u2NDmskyaXm0r-~!E5DqNacRaC$y(kKYf`n=^Wv-CHBZOVtmVGq~ycN z*Zc)dubKK@^@f(Cw|x4UaZwl897AseCrRU&Yc78zOD#BEEEVl|7I_};A8Lo-xeQa_ ztDYKG2I`kJ1vBhbHE2Pob-0x6>uVt36Wr#<*u<>tk}l8Rw4f&DI_@Sm8a6a z{boboIhL?uZz`z$kY!CyrtmA}i0(FTFQ7JxrbJvMkuYS{&s)CY*QG7JPY8lGxO>KM z)Ygt1l0VoZL?1)P?HJ`96nomR>D0FqkH>+WtKP(Z9#i%{-DKPDT4ib7ZVTalRSRFi zZ+>oeDX~aqspY{n`KICHXH=Q9&BIKv4Nn%+IIRnB#}OIHSYz$Lq2#1R%e`sNwkZ=j ztktBV%dLsNo?pbH@IF~|km2u%E~k}J*I{)R>C-iXYczBc;+8_5C{UcG$&58Sj+bFY z;x$GsZ(?(lhp*B|y{o>sAa3$2`*vXC1N8*>)s`2ONfHG{=&iq zHuhM&mMFQ}+N5|ft=n**>(Y$gZU?MdHOaIYj#Vn_OiU@K)TC6J@)H zF!_dwP!ByhMu-+|C^N+?$_~MVY_f9WriJ9B9hc%Ov+K$EWYRbH3I74?FefooK;ths^HNmHK9|=vwTdz;W z#CgqrR+mq6*8k844shVhp;v5v-(B+n5>t^-o3?!0U%+(2*G>0Q&f~DgmMrhUw%dJWK{6<; zqfBIwL*!hR?b6?PCRtcLXh{i1L~}}!w5ZrWPZn%iHV0Q99mco<0@GArye<2LfgZwc zkv)%j^#ISUQyFS!UHmX$QoFKbtfN*tibPJ(J(-Y;S|U~i^MEV-df|fUu)iPlg;aT8Zze39`{CA+|H9v>GpvB3`=3aA?$a z2nixqOKh9Gt0el4Vt>X!|Caq(J3M}98V)I#tm!Rf?wreWI18D=A%JZvmD{ZPy%AqgS_lRzf>iMLp@mcyk`^>Bd1#uP1xl&7KX>-HTKdgOfF#HW6p%+Z4$;Q zXvGfdE#7_Fa%QCB)^9J!voAanzD(;jE8Am7FbCLvcb*#;bdZbMv>L6VVqg!H+0`WQ z>(KsX7P)}rMWl1W%&-5asY&o0`AlWEicMDjY))OX=9jBg#l1hS->Q0C z?FzZZ*EC*Jv+}3eMQhh9L-CC?+^vm&ziIP`HqUqP!Q|hfzl~W=x3x>xpD1q(A5{fpp^W1w@F5i`b|Wl z*-=8L^1EO(bG4A&+3eoQ%QO=2`%+dre}XBWt!Qkd;o(uPT|W@aU3>6kf((+jMhZ!* zvBrk0fvtIS%sn1tDrZ(ytU#}ZHb@RtHM;CCO9Zuz7|i7=J9w}lVMM78QJPJ@H?nPY z-jv5r&~_cQmzXh{+mczpjwL51w4&(0z^2wq^Ls#Hy>NtA{-QZ7FD=$A{PEi<8UaaG2vAv z`poCoNe3D|olsgT*8knN#;&=5MJD?EHtl=TD<2qn4g}?y^~lbRAgyxZvTmp0zq?y6!N4 zkMsSYIG*8qYHzC7@Gq~>qo|^+U?KrKsXyr@_%Y|qrjeWeo?AT(=T@tX0n?_B_`g5V 
zjAS0*sImO4ghAzZ#<)>e+3G<4Al<=(w{jml@34-+QL8NAbXcx^6Q_30NMwhbvx+I` zg50ZCVq;IMsqaxmi}T6V!y~?3uJclH6e`mvku;*QVu~z)_jKVzC zVvS{eADmZqk@!W$bPA;I<^9Cf8H|gPaq?=~RE4eN^mL1J{EXv;7I)z-j|v?zuQQ*? zS{3Y^zKyPFDas?lquFJo10dx~;=SM82c?9UyPD!%ZOgqru6W-M6V@-zR&*00lCSEc z%_8d{o1(gCOOomqsnegKUk)J#T+0?-H+xN>_6|G5-F4Sz_7g> zTtwu$wh=9xs2wkvo-TL+prBAUrHtD=Z%z zb){qt{={$F(qGz@fqgF%R z^=_t3zZKK+tUPz&whm|Z5`4QSDm|KAR`H^)`;K@b=v2qU3%8Rt9Q}isJC=8JKqq#Z zdR}BF%A)1^=^}jqXR|)}*xh$Z$h`n}8P&zuyb!7y{KNQrOU_PYa(=teS*O-z8;)VJ zwt)`QqboCAM)BZzQb==)DiQhrG2)-1L_D^9qJ>vM^R~8G3@KN%W~bqu=_Tpw6)PD! z1y{A}Q|w^;I3}FSw}P=(7KNz!P0Xz{kiXG3AGD2xxB2yyH$U)pZxsY3@Lr{iSFP)g z_nOmoWim@eS!t)_3NTI9}|Z1i{BG$JJ()+FlIg&xYGVl5^f2cN9=oXbsp~yq1)2R%FvK zQTkZ&LQcY9P{l@E8T)m@aTa6IoAz3X$jEIB$M~&?P5UXm2+eq{_Sy+KlXrZql+O+Z zL9!$bU!fzb3>C!QgK>(Kv6%ulrnXn|avN*CPRl}a#&v3d*Q-9h=_3AgyX*kLwTb-d zpyQ~xGJSk(Vqml6)GK1D!c>1*^BL<^eiRcK`~C{o!yYdQL!Cys}e+r)a1$16{KhY6rk$ad>@NRB> z1u9)&oTt_bIbjHC7TY7`+8+CL;C39tG17=#$ah)O;k_oKti5v$AOUd3)mjxjhrJ_G;tuJojQ7D`hXHy=?^V;T|-_Tvu5fYWJ!*vPgGZr$)FFixoqe zpeNZeazzK)tphOY!QMA~k!j4&ios)_BhbAojOmT9M%F+r0_ukLhI=Un-Wz^s5l_a~ z*t6Kiow1#%U&g!}C7>ILsr`RnGR9wl?u57c)n5ZNL z1O;td$R;lFzE-!^x|`(9)k4UTHSpi-({Kw-cR=3G;=k7CY4%~*s-8sfG0wN0K09G z!ER>U!4_*0ax8qnJmS%&_FOB^?-V}9W--QBfx#wb*A~@scITr_!KV#a2yyh!adS4U z5cYLr%!bK{dpWGeTXE9oDHiie&H6YcME(SJ1q7D)OKYw~CXaFr%m>9c^L8#OjJG_lt#1T9lkG{*4R^SKYXYH`|LUI-hA1cDS?(sf(d|b$wW7sYVj% zLAGW?tM9X##Y_z92&L6y{Z&aDyCgNAT~B9o3|;0CJkidVWcw*#HZ;seken*A)-`H8 z`kUbGJB5((QlE*ID~U|t8+Jm8|1GqjOX*sD;Bd%uzwoLclx)~~gCLUxwPhZuSx+GD z!HM?Pm#>|uMsA%f?|l2_L}@YOgVjTWzN;firr#=L#scSB#;$9-@Q)93n+!+e^;%mV zCwwMT=AFbU=lDokRr4vdNb|#U$V`Y?gL^p6P=AOlRm}#xb7M;KDRvIR z_WT<-GjElT6olb49<2YCY^Rh?HOUd;G5Wy8ra4c+_m35+{*NBHF&Xd-bu)Y4!34;H zY~kn3C@T}e6tzva-2ct&4?nWYn%Jgh_7zE1+@7u(Tp8o=w*&%C<9}_|-@te<9MBsM z-5D%lvbmql3nTSTH_A-VOW@}?KO+XjM)e+zt0Ig6bU09S{s(B3!tfP~v<6! 
[GIT binary patch: base85-encoded binary data omitted]
zlF|qVFufc#-`9M9+bRJ~yz(rAxq+&kbvuPT@IirII{lN0&=qKYU0DS0ZP6ogX(#h zpXk2~yt{Sl&I2h?VP#7-V96CN!G(~Uann!($;CtTu_aT5z9Qq6>dk5SDyk|C`P~d3 zl?KQobt_l|>nSRtDIv5vJl(T}K~cr0SJ=cx4&21|Tl_Qh66SFc2Qozwv+*KYVq(KX z{QN+X+c4t1{&a!<={5mX@1l{fRe{-ttC7dKufoZ!7vK02BRbbOfjp z3zk=9Gh3!VGGL+Bc>Q>15)n)QpTg+m`lZIG0Tx@X>)sHU0)f=oeT*89{fR<3mHk1Tzw@8ph(; z6#q^d0zQj(M|q~V=z>yoltR}p>>1!=kwOoBgDbvxWijiIMJFelJ5-7OHO^;NlOZpX)DsfVRG3exDn%bkD(w2OTPct z8u*)c8!&k95j)+R@&9&O7O+R5{)qG6H4%aHB8d}rVk z3B~^o7~U;#3sxM4|E>vP0q&)9tV#6WUHsp{{hzNn?_alr+nO>Lu^AU{fqznBa-wC= Hb$tFGzvD?u literal 0 HcmV?d00001 diff --git a/examples/quantization_w4a16/llama3_example.py b/examples/quantization_w4a16/llama3_example.py index 6bd41c58e..b3c93c1be 100644 --- a/examples/quantization_w4a16/llama3_example.py +++ b/examples/quantization_w4a16/llama3_example.py @@ -6,7 +6,7 @@ from llmcompressor.utils.dev import dispatch_for_generation # Select model and load it. -model_id = "meta-llama/Meta-Llama-3-8B-Instruct" +model_id = "meta-llama/Llama-3.3-70B-Instruct" model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype="auto") tokenizer = AutoTokenizer.from_pretrained(model_id) From b336fa2c0e5fcb2664798af5389c2f757f05b3c6 Mon Sep 17 00:00:00 2001 From: Kyle Sayers Date: Sun, 15 Jun 2025 23:33:27 -0400 Subject: [PATCH 23/36] deprecate sequential_targets on modifiers Signed-off-by: Kyle Sayers --- src/llmcompressor/args/dataset_arguments.py | 10 +++ src/llmcompressor/pipelines/registry.py | 8 ++- .../pipelines/sequential/helpers.py | 66 ++++++++++++------- .../pipelines/sequential/pipeline.py | 4 +- 4 files changed, 60 insertions(+), 28 deletions(-) diff --git a/src/llmcompressor/args/dataset_arguments.py b/src/llmcompressor/args/dataset_arguments.py index f60c9560e..949933f97 100644 --- a/src/llmcompressor/args/dataset_arguments.py +++ b/src/llmcompressor/args/dataset_arguments.py @@ -186,3 +186,13 @@ class DatasetArguments(CustomDatasetArguments): "{module}.{method_name} or {function_name}" }, ) + sequential_targets: Optional[List[str]] = field( + default=None, + metadata={ + "help": "List of layer targets for the sequential pipeline. " + "This is typically a single DecoderLayer. 
" + "Not specifying this argument will cause the sequential pipeline to " + "default to using the `no_split_params` specified by the HF model " + "definition" + }, + ) diff --git a/src/llmcompressor/pipelines/registry.py b/src/llmcompressor/pipelines/registry.py index 2ac384866..cc4c29d8c 100644 --- a/src/llmcompressor/pipelines/registry.py +++ b/src/llmcompressor/pipelines/registry.py @@ -17,8 +17,12 @@ __all__ = ["CalibrationPipeline"] -SEQUENTIAL_MODIFIERS = (AWQModifier, GPTQModifier, SparsityModifierBase) -CALIBRATION_MODIFIERS = (SmoothQuantModifier, *SEQUENTIAL_MODIFIERS) +CALIBRATION_MODIFIERS = ( + SmoothQuantModifier, + AWQModifier, + GPTQModifier, + SparsityModifierBase, +) class CalibrationPipeline(ABC, RegistryMixin): diff --git a/src/llmcompressor/pipelines/sequential/helpers.py b/src/llmcompressor/pipelines/sequential/helpers.py index b7937a2fc..17e6724db 100644 --- a/src/llmcompressor/pipelines/sequential/helpers.py +++ b/src/llmcompressor/pipelines/sequential/helpers.py @@ -2,7 +2,7 @@ import inspect from collections import deque from dataclasses import dataclass -from typing import Any, Dict, List, Optional, Set +from typing import TYPE_CHECKING, Any, Dict, List, Optional, Set import torch from compressed_tensors import has_offloaded_params @@ -23,7 +23,10 @@ from .ast_helpers import autowrap_forwards -__all__ = ["trace_subgraphs", "Subgraph", "get_targets_from_modifiers"] +if TYPE_CHECKING: + from llmcompressor.args.dataset_arguments import DatasetArguments + +__all__ = ["trace_subgraphs", "Subgraph", "get_sequential_targets"] @dataclass @@ -416,44 +419,59 @@ def match_modules(model: Module, target_names: List[str]) -> Set[Module]: ) -def get_targets_from_modifiers( - modifiers: List[Modifier], model: PreTrainedModel +def get_sequential_targets( + modifiers: List[Modifier], model: PreTrainedModel, args: "DatasetArguments" ) -> List[str]: """ - Infer sequential targets from modifiers list + Infer sequential targets from modifiers list and dataset args :param model: model being calibrated :param modifiers: list of modifiers being applied during calibration + :param dataset_args: dataset arguments passed by user :return: list of sequential targets """ - # avoid circular import - from llmcompressor.pipelines.registry import SEQUENTIAL_MODIFIERS - - sequential_modifiers = [ - modifier for modifier in modifiers if isinstance(modifier, SEQUENTIAL_MODIFIERS) + modifier_targets = [ + (modifier, modifier.sequential_targets) + for modifier in modifiers + if getattr(modifier, "sequential_targets", None) is not None ] - if len(sequential_modifiers) >= 2: - types = [type(modifier) for modifier in sequential_modifiers] + # deprecation warning + if len(modifier_targets) > 1: logger.warning( + "Passing sequential targets through modifiers is deprecated, " + "please use `oneshot(sequential_targets=...)`" + ) + + # cannot infer from multiple modifiers + if len(modifier_targets) >= 2: + types = [type(modifier) for modifier, _ in modifier_targets] + raise ValueError( "Cannot infer sequential targets from multiple sequential modifiers " - f"({types}). 
Defaulting to {types[0]}" + f"({types})" ) - elif len(sequential_modifiers) <= 0: - types = [type(modifier) for modifier in modifiers] - raise ValueError(f"Cannot infer sequential targets from list of {types}") - modifier = sequential_modifiers[0] + # resolve single modifier + if len(modifier_targets) == 1: + if args.sequential_targets is not None: + raise ValueError( + f"Got sequential targets from both {type(modifier_targets[0][0])} " + "and dataset arguments `sequential_targets`" + ) + + sequential_targets = modifier_targets[0][1] - # infer sequential targets - if modifier.sequential_targets is None: - sequential_targets = get_no_split_params(model) - elif isinstance(modifier.sequential_targets, str): - sequential_targets = [modifier.sequential_targets] + # if no modifiers, use data args else: - sequential_targets = modifier.sequential_targets + sequential_targets = args.sequential_targets # may be `None` - return sequential_targets + # validate and infer + if sequential_targets is None: + return get_no_split_params(model) + elif isinstance(sequential_targets, str): + return [sequential_targets] + else: + return sequential_targets def add_line_numbers(text: str) -> str: diff --git a/src/llmcompressor/pipelines/sequential/pipeline.py b/src/llmcompressor/pipelines/sequential/pipeline.py index 3e0490b70..628fdf4d2 100644 --- a/src/llmcompressor/pipelines/sequential/pipeline.py +++ b/src/llmcompressor/pipelines/sequential/pipeline.py @@ -11,7 +11,7 @@ from llmcompressor.pipelines.cache import IntermediatesCache from llmcompressor.pipelines.registry import CalibrationPipeline from llmcompressor.pipelines.sequential.helpers import ( - get_targets_from_modifiers, + get_sequential_targets, trace_subgraphs, ) from llmcompressor.utils.helpers import DisableQuantization, calibration_forward_context @@ -64,7 +64,7 @@ def __call__( # prepare to trace subgraphs modifiers = session.get_modifiers() - sequential_targets = get_targets_from_modifiers(modifiers, model) + sequential_targets = get_sequential_targets(modifiers, model, dataset_args) ignore = dataset_args.tracing_ignore # trace subgraphs From 34ef39418fa97d3679a54eebeaaf4f74f66d9745 Mon Sep 17 00:00:00 2001 From: Kyle Sayers Date: Sun, 15 Jun 2025 23:38:54 -0400 Subject: [PATCH 24/36] update examples Signed-off-by: Kyle Sayers --- examples/multimodal_vision/idefics3_example.py | 2 +- examples/multimodal_vision/llava_example.py | 2 +- examples/multimodal_vision/mistral3_example.py | 2 +- examples/multimodal_vision/mllama_example.py | 2 +- examples/multimodal_vision/phi3_vision_example.py | 2 +- examples/multimodal_vision/pixtral_example.py | 2 +- examples/multimodal_vision/qwen2_vl_example.py | 2 +- examples/multimodal_vision/qwen_2_5_vl_example.py | 2 +- 8 files changed, 8 insertions(+), 8 deletions(-) diff --git a/examples/multimodal_vision/idefics3_example.py b/examples/multimodal_vision/idefics3_example.py index 27f230569..09722c127 100644 --- a/examples/multimodal_vision/idefics3_example.py +++ b/examples/multimodal_vision/idefics3_example.py @@ -31,7 +31,6 @@ def data_collator(batch): GPTQModifier( targets="Linear", scheme="W4A16", - sequential_targets=["LlamaDecoderLayer"], ignore=["re:.*lm_head", "re:model.vision_model.*", "re:model.connector.*"], ), ] @@ -91,6 +90,7 @@ def tokenize(sample): num_calibration_samples=NUM_CALIBRATION_SAMPLES, trust_remote_code_model=True, data_collator=data_collator, + sequential_targets=["LlamaDecoderLayer"], ) # Confirm generations of the quantized model look sane. 
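The remaining example updates in this patch apply the same change as the idefics3 hunk above: `sequential_targets` comes off the `GPTQModifier` and is passed to `oneshot(...)`, where it lands in the new `DatasetArguments.sequential_targets` field. Applied, the idefics3 call reads roughly as follows; the imports and the arguments ahead of `num_calibration_samples` are assumed from the standard example layout rather than visible in the hunk.

from llmcompressor import oneshot
from llmcompressor.modifiers.quantization import GPTQModifier

# sequential_targets is no longer set on the modifier
recipe = [
    GPTQModifier(
        targets="Linear",
        scheme="W4A16",
        ignore=["re:.*lm_head", "re:model.vision_model.*", "re:model.connector.*"],
    ),
]

# model, ds, data_collator, MAX_SEQUENCE_LENGTH, and NUM_CALIBRATION_SAMPLES are
# prepared exactly as in the example script above
oneshot(
    model=model,
    dataset=ds,
    recipe=recipe,
    max_seq_length=MAX_SEQUENCE_LENGTH,
    num_calibration_samples=NUM_CALIBRATION_SAMPLES,
    trust_remote_code_model=True,
    data_collator=data_collator,
    sequential_targets=["LlamaDecoderLayer"],  # now a oneshot()/dataset argument
)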
diff --git a/examples/multimodal_vision/llava_example.py b/examples/multimodal_vision/llava_example.py index 4b9d1cf9e..984e8a1fd 100644 --- a/examples/multimodal_vision/llava_example.py +++ b/examples/multimodal_vision/llava_example.py @@ -30,7 +30,6 @@ def data_collator(batch): GPTQModifier( targets="Linear", scheme="W4A16", - sequential_targets=["LlamaDecoderLayer"], ignore=["re:.*lm_head", "re:vision_tower.*", "re:multi_modal_projector.*"], ), ] @@ -46,6 +45,7 @@ def data_collator(batch): num_calibration_samples=NUM_CALIBRATION_SAMPLES, trust_remote_code_model=True, data_collator=data_collator, + sequential_targets=["LlamaDecoderLayer"], ) # Confirm generations of the quantized model look sane. diff --git a/examples/multimodal_vision/mistral3_example.py b/examples/multimodal_vision/mistral3_example.py index 251fe4297..fc3657b0e 100644 --- a/examples/multimodal_vision/mistral3_example.py +++ b/examples/multimodal_vision/mistral3_example.py @@ -43,7 +43,6 @@ def data_collator(batch): GPTQModifier( targets="Linear", scheme="W4A16", - sequential_targets=["MistralDecoderLayer"], ignore=["re:.*lm_head", "re:vision_tower.*", "re:multi_modal_projector.*"], ), ] @@ -59,6 +58,7 @@ def data_collator(batch): num_calibration_samples=NUM_CALIBRATION_SAMPLES, trust_remote_code_model=True, data_collator=data_collator, + sequential_targets=["MistralDecoderLayer"], ) # Confirm generations of the quantized model look sane. diff --git a/examples/multimodal_vision/mllama_example.py b/examples/multimodal_vision/mllama_example.py index 3c5236d1c..7d94a677c 100644 --- a/examples/multimodal_vision/mllama_example.py +++ b/examples/multimodal_vision/mllama_example.py @@ -30,7 +30,6 @@ def data_collator(batch): GPTQModifier( targets="Linear", scheme="W4A16", - sequential_targets=["MllamaSelfAttentionDecoderLayer"], ignore=["re:.*lm_head", "re:multi_modal_projector.*", "re:vision_model.*"], ), ] @@ -46,6 +45,7 @@ def data_collator(batch): num_calibration_samples=NUM_CALIBRATION_SAMPLES, trust_remote_code_model=True, data_collator=data_collator, + sequential_targets=["MllamaSelfAttentionDecoderLayer"], ) # Confirm generations of the quantized model look sane. diff --git a/examples/multimodal_vision/phi3_vision_example.py b/examples/multimodal_vision/phi3_vision_example.py index fd274ea12..324df5d31 100644 --- a/examples/multimodal_vision/phi3_vision_example.py +++ b/examples/multimodal_vision/phi3_vision_example.py @@ -75,7 +75,6 @@ def data_collator(batch): recipe = GPTQModifier( targets="Linear", scheme="W4A16", - sequential_targets=["Phi3DecoderLayer"], ignore=["lm_head", "re:model.vision_embed_tokens.*"], ) @@ -88,6 +87,7 @@ def data_collator(batch): num_calibration_samples=NUM_CALIBRATION_SAMPLES, trust_remote_code_model=True, data_collator=data_collator, + sequential_targets=["Phi3DecoderLayer"], ) # Confirm generations of the quantized model look sane. 
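When `sequential_targets` is omitted from both the modifiers and the oneshot call, `get_sequential_targets` above falls back to `get_no_split_params(model)`, i.e. the no-split modules declared by the HF model definition. A rough illustration of that default for a text-only model (the direct attribute read is a sketch of where the fallback comes from, not the helper's exact implementation):

from transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Meta-Llama-3-8B-Instruct", torch_dtype="auto"
)

# HF model definitions declare which submodules must not be split across
# devices; for Llama this is the decoder layer, so text-only examples can
# omit sequential_targets entirely.
print(model._no_split_modules)  # ['LlamaDecoderLayer']

The multimodal examples in this patch keep an explicit target because their model definitions typically list additional no-split modules (vision blocks, cross-attention layers) that should not be used as the sequential unit.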
diff --git a/examples/multimodal_vision/pixtral_example.py b/examples/multimodal_vision/pixtral_example.py index 6414cca0e..b2b4c7440 100644 --- a/examples/multimodal_vision/pixtral_example.py +++ b/examples/multimodal_vision/pixtral_example.py @@ -36,7 +36,6 @@ def data_collator(batch): GPTQModifier( targets="Linear", scheme="W4A16", - sequential_targets=["MistralDecoderLayer"], ignore=["re:.*lm_head", "re:vision_tower.*", "re:multi_modal_projector.*"], ), ] @@ -52,6 +51,7 @@ def data_collator(batch): num_calibration_samples=NUM_CALIBRATION_SAMPLES, trust_remote_code_model=True, data_collator=data_collator, + sequential_targets=["MistralDecoderLayer"], ) # Confirm generations of the quantized model look sane. diff --git a/examples/multimodal_vision/qwen2_vl_example.py b/examples/multimodal_vision/qwen2_vl_example.py index 94c97398d..14033872d 100644 --- a/examples/multimodal_vision/qwen2_vl_example.py +++ b/examples/multimodal_vision/qwen2_vl_example.py @@ -79,7 +79,6 @@ def data_collator(batch): GPTQModifier( targets="Linear", scheme="W4A16", - sequential_targets=["Qwen2VLDecoderLayer"], ignore=["lm_head", "re:visual.*", "re:model.visual.*"], ), ] @@ -94,6 +93,7 @@ def data_collator(batch): num_calibration_samples=NUM_CALIBRATION_SAMPLES, trust_remote_code_model=True, data_collator=data_collator, + sequential_targets=["Qwen2VLDecoderLayer"], ) # Confirm generations of the quantized model look sane. diff --git a/examples/multimodal_vision/qwen_2_5_vl_example.py b/examples/multimodal_vision/qwen_2_5_vl_example.py index 7923d317d..096596d24 100644 --- a/examples/multimodal_vision/qwen_2_5_vl_example.py +++ b/examples/multimodal_vision/qwen_2_5_vl_example.py @@ -73,7 +73,6 @@ def data_collator(batch): GPTQModifier( targets="Linear", scheme="W4A16", - sequential_targets=["Qwen2_5_VLDecoderLayer"], ignore=["lm_head", "re:visual.*", "re:model.visual.*"], ), ] @@ -88,6 +87,7 @@ def data_collator(batch): num_calibration_samples=NUM_CALIBRATION_SAMPLES, trust_remote_code_model=True, data_collator=data_collator, + sequential_targets=["Qwen2_5_VLDecoderLayer"], ) # Confirm generations of the quantized model look sane. 
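For backwards compatibility, `get_sequential_targets` still honors targets set on a modifier, but that path is deprecated and the ambiguous case is rejected outright. A short sketch of both cases, assuming `model` and `ds` are prepared as in the examples above:

from llmcompressor import oneshot
from llmcompressor.modifiers.quantization import GPTQModifier

# deprecated but still resolved: targets are taken from the modifier,
# and a deprecation warning is logged
oneshot(
    model=model,
    dataset=ds,
    recipe=GPTQModifier(
        targets="Linear", scheme="W4A16", sequential_targets=["LlamaDecoderLayer"]
    ),
)

# ambiguous: targets given on the modifier and to oneshot() -> ValueError
oneshot(
    model=model,
    dataset=ds,
    recipe=GPTQModifier(
        targets="Linear", scheme="W4A16", sequential_targets=["LlamaDecoderLayer"]
    ),
    sequential_targets=["LlamaDecoderLayer"],
)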
From 58fe92972a80941446a9cb66a1a6c07a43c8dd0d Mon Sep 17 00:00:00 2001 From: Kyle Sayers Date: Sun, 15 Jun 2025 23:40:00 -0400 Subject: [PATCH 25/36] fix deprecation warning Signed-off-by: Kyle Sayers --- src/llmcompressor/pipelines/sequential/helpers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/llmcompressor/pipelines/sequential/helpers.py b/src/llmcompressor/pipelines/sequential/helpers.py index 17e6724db..9a1751ea5 100644 --- a/src/llmcompressor/pipelines/sequential/helpers.py +++ b/src/llmcompressor/pipelines/sequential/helpers.py @@ -437,7 +437,7 @@ def get_sequential_targets( ] # deprecation warning - if len(modifier_targets) > 1: + if len(modifier_targets) >= 1: logger.warning( "Passing sequential targets through modifiers is deprecated, " "please use `oneshot(sequential_targets=...)`" From 54ef06a95e430760acf831a3185977b7dfa77bae Mon Sep 17 00:00:00 2001 From: Kyle Sayers Date: Mon, 16 Jun 2025 00:01:46 -0400 Subject: [PATCH 26/36] fix layer sequential pipeline Signed-off-by: Kyle Sayers --- src/llmcompressor/pipelines/layer_sequential/pipeline.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/llmcompressor/pipelines/layer_sequential/pipeline.py b/src/llmcompressor/pipelines/layer_sequential/pipeline.py index 2cfda0d0e..6d862a8a7 100644 --- a/src/llmcompressor/pipelines/layer_sequential/pipeline.py +++ b/src/llmcompressor/pipelines/layer_sequential/pipeline.py @@ -16,7 +16,7 @@ to_next_layer_kwargs, ) from llmcompressor.pipelines.registry import CalibrationPipeline -from llmcompressor.pipelines.sequential.helpers import get_targets_from_modifiers +from llmcompressor.pipelines.sequential.helpers import get_sequential_targets from llmcompressor.utils.helpers import DisableQuantization, calibration_forward_context if TYPE_CHECKING: @@ -68,7 +68,7 @@ def __call__( # find layers modifiers = session.get_modifiers() - sequential_targets, _ = get_targets_from_modifiers(modifiers, model) + sequential_targets = get_sequential_targets(modifiers, model, dataset_args) layers = match_modules(model, sequential_targets) LifecycleCallbacks.calibration_epoch_start() From 4bb86e54b14652c0340e8a2fe227b4381355a2ae Mon Sep 17 00:00:00 2001 From: Kyle Sayers Date: Mon, 16 Jun 2025 00:07:36 -0400 Subject: [PATCH 27/36] remove unused import Signed-off-by: Kyle Sayers --- src/llmcompressor/pipelines/sequential/__init__.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/llmcompressor/pipelines/sequential/__init__.py b/src/llmcompressor/pipelines/sequential/__init__.py index d96ee6987..7c726f6c4 100644 --- a/src/llmcompressor/pipelines/sequential/__init__.py +++ b/src/llmcompressor/pipelines/sequential/__init__.py @@ -1,3 +1,2 @@ # flake8: noqa -from .helpers import get_targets_from_modifiers from .pipeline import * From b2367cef328ac3c2ee6e3808236f323a6d9e30d9 Mon Sep 17 00:00:00 2001 From: Kyle Sayers Date: Mon, 16 Jun 2025 10:46:54 -0400 Subject: [PATCH 28/36] dispatch in pipelines Signed-off-by: Kyle Sayers --- examples/quantization_w4a16/llama3_example.py | 2 +- src/llmcompressor/args/model_arguments.py | 5 +++- src/llmcompressor/entrypoints/oneshot.py | 10 ------- src/llmcompressor/pipelines/basic/pipeline.py | 2 ++ .../pipelines/layer_sequential/pipeline.py | 17 ++++------- .../pipelines/sequential/helpers.py | 29 +++++++++++++++++-- .../pipelines/sequential/pipeline.py | 13 ++------- 7 files changed, 43 insertions(+), 35 deletions(-) diff --git a/examples/quantization_w4a16/llama3_example.py b/examples/quantization_w4a16/llama3_example.py 
index b3c93c1be..6bd41c58e 100644 --- a/examples/quantization_w4a16/llama3_example.py +++ b/examples/quantization_w4a16/llama3_example.py @@ -6,7 +6,7 @@ from llmcompressor.utils.dev import dispatch_for_generation # Select model and load it. -model_id = "meta-llama/Llama-3.3-70B-Instruct" +model_id = "meta-llama/Meta-Llama-3-8B-Instruct" model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype="auto") tokenizer = AutoTokenizer.from_pretrained(model_id) diff --git a/src/llmcompressor/args/model_arguments.py b/src/llmcompressor/args/model_arguments.py index 9cf8a687c..ea3c3936a 100644 --- a/src/llmcompressor/args/model_arguments.py +++ b/src/llmcompressor/args/model_arguments.py @@ -82,7 +82,10 @@ class ModelArguments: ) oneshot_device: Optional[str] = field( default="cuda", - metadata={"help": "Device to run oneshot calibration on"}, + metadata={ + "help": "This argument is deprecated and nonfunctional " + "and will be removed in future release" + }, ) model_revision: str = field( default="main", diff --git a/src/llmcompressor/entrypoints/oneshot.py b/src/llmcompressor/entrypoints/oneshot.py index c1dae7933..54a36abfe 100644 --- a/src/llmcompressor/entrypoints/oneshot.py +++ b/src/llmcompressor/entrypoints/oneshot.py @@ -2,8 +2,6 @@ from datetime import datetime from typing import Optional -import torch -from compressed_tensors.utils import offloaded_dispatch from loguru import logger from torch.utils.data import DataLoader from transformers import PreTrainedModel @@ -125,14 +123,6 @@ def __init__( # initialize the model and processor pre_process(model_args) - # offload to cpu if possible - if "cuda" in str(model_args.oneshot_device) and torch.cuda.is_available(): - offloaded_dispatch( - model_args.model, execution_device=model_args.oneshot_device - ) - else: - logger.warning("CUDA is not available! 
Compressing model on CPU instead") - # Set instance attributes self.model = self.model_args.model self.processor = self.model_args.processor diff --git a/src/llmcompressor/pipelines/basic/pipeline.py b/src/llmcompressor/pipelines/basic/pipeline.py index 15b94786a..35c52f166 100644 --- a/src/llmcompressor/pipelines/basic/pipeline.py +++ b/src/llmcompressor/pipelines/basic/pipeline.py @@ -9,6 +9,7 @@ from llmcompressor.modifiers.utils.pytorch_helpers import apply_pad_mask_to_batch from llmcompressor.pipelines.registry import CalibrationPipeline from llmcompressor.pytorch.utils.helpers import tensors_to_device +from llmcompressor.utils.dev import dispatch_for_generation from llmcompressor.utils.helpers import calibration_forward_context if TYPE_CHECKING: @@ -37,6 +38,7 @@ def __call__( :param dataloader: loads data for calibration :param dataset_args: dataset arguments relevant to pipelines """ + dispatch_for_generation(model) model_device = get_execution_device(model) LifecycleCallbacks.calibration_epoch_start() diff --git a/src/llmcompressor/pipelines/layer_sequential/pipeline.py b/src/llmcompressor/pipelines/layer_sequential/pipeline.py index 6d862a8a7..d8ad73a10 100644 --- a/src/llmcompressor/pipelines/layer_sequential/pipeline.py +++ b/src/llmcompressor/pipelines/layer_sequential/pipeline.py @@ -3,7 +3,6 @@ import torch import tqdm from compressed_tensors.utils import disable_offloading -from loguru import logger from torch.utils.data.dataloader import DataLoader from llmcompressor.core import LifecycleCallbacks, active_session @@ -16,7 +15,10 @@ to_next_layer_kwargs, ) from llmcompressor.pipelines.registry import CalibrationPipeline -from llmcompressor.pipelines.sequential.helpers import get_sequential_targets +from llmcompressor.pipelines.sequential.helpers import ( + dispatch_for_sequential, + get_sequential_targets, +) from llmcompressor.utils.helpers import DisableQuantization, calibration_forward_context if TYPE_CHECKING: @@ -56,15 +58,8 @@ def __call__( """ session = active_session() - # check for offloading - if model.device != torch.device("meta"): - logger.warning( - "Attemping to use sequential pipeline with a model which is not " - "offloaded to the cpu. Deploying a model in this way may lead to more " - "memory usage than is required. 
It is recommended to set " - '`oneshot_device="cuda"` or call `force_cpu_offload` on your model ' - "before compressing" - ) + # prepare model for sequential onloading + dispatch_for_sequential(model) # find layers modifiers = session.get_modifiers() diff --git a/src/llmcompressor/pipelines/sequential/helpers.py b/src/llmcompressor/pipelines/sequential/helpers.py index 9a1751ea5..869f60578 100644 --- a/src/llmcompressor/pipelines/sequential/helpers.py +++ b/src/llmcompressor/pipelines/sequential/helpers.py @@ -5,8 +5,9 @@ from typing import TYPE_CHECKING, Any, Dict, List, Optional, Set import torch -from compressed_tensors import has_offloaded_params +from accelerate.hooks import remove_hook_from_module from compressed_tensors.quantization import find_name_or_class_matches +from compressed_tensors.utils import has_offloaded_params, offloaded_dispatch from loguru import logger from torch.fx import Graph, GraphModule, Node from torch.fx.graph import PythonCode @@ -26,7 +27,12 @@ if TYPE_CHECKING: from llmcompressor.args.dataset_arguments import DatasetArguments -__all__ = ["trace_subgraphs", "Subgraph", "get_sequential_targets"] +__all__ = [ + "trace_subgraphs", + "Subgraph", + "get_sequential_targets", + "dispatch_for_sequential", +] @dataclass @@ -503,3 +509,22 @@ def is_ancestor(module: Module) -> bool: is_ancestor(model) return ancestors + + +def dispatch_for_sequential(model: PreTrainedModel) -> PreTrainedModel: + """ + Dispatch a model for sequential calibration using a sequential pipeline. + The model will be offloaded to the CPU and dispatched to CUDA device if available. + Removes any existing hooks. + + :param model: model to dispatch + :return: dispatched model + """ + remove_hook_from_module(model, recurse=True) + + if torch.cuda.is_available(): + offloaded_dispatch(model, execution_device=torch.device("cuda:0")) + else: + logger.warning("CUDA is not available! Compressing model on CPU instead") + + return model diff --git a/src/llmcompressor/pipelines/sequential/pipeline.py b/src/llmcompressor/pipelines/sequential/pipeline.py index 628fdf4d2..a2a714565 100644 --- a/src/llmcompressor/pipelines/sequential/pipeline.py +++ b/src/llmcompressor/pipelines/sequential/pipeline.py @@ -2,7 +2,6 @@ import torch from compressed_tensors.utils import disable_offloading, get_execution_device -from loguru import logger from torch.utils.data.dataloader import DataLoader from tqdm import tqdm @@ -11,6 +10,7 @@ from llmcompressor.pipelines.cache import IntermediatesCache from llmcompressor.pipelines.registry import CalibrationPipeline from llmcompressor.pipelines.sequential.helpers import ( + dispatch_for_sequential, get_sequential_targets, trace_subgraphs, ) @@ -52,15 +52,8 @@ def __call__( """ session = active_session() - # check for offloading - if model.device != torch.device("meta"): - logger.warning( - "Attemping to use sequential pipeline with a model which is not " - "offloaded to the cpu. Deploying a model in this way may lead to more " - "memory usage than is required. 
It is recommended to set " - '`oneshot_device="cuda"` or call `force_cpu_offload` on your model ' - "before compressing" - ) + # prepare model for sequential onloading + dispatch_for_sequential(model) # prepare to trace subgraphs modifiers = session.get_modifiers() From 06bb6611088a0cb44070031f961d7fbd298f0929 Mon Sep 17 00:00:00 2001 From: Kyle Sayers Date: Mon, 16 Jun 2025 11:09:00 -0400 Subject: [PATCH 29/36] add train dispatch Signed-off-by: Kyle Sayers --- src/llmcompressor/entrypoints/train.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/llmcompressor/entrypoints/train.py b/src/llmcompressor/entrypoints/train.py index 4b5d8b73b..0bfb26e53 100644 --- a/src/llmcompressor/entrypoints/train.py +++ b/src/llmcompressor/entrypoints/train.py @@ -8,6 +8,7 @@ from llmcompressor.core.session_functions import active_session from llmcompressor.datasets.utils import get_processed_dataset from llmcompressor.transformers.finetune.trainer import Trainer +from llmcompressor.utils.dev import dispatch_for_generation from .utils import post_process, pre_process @@ -63,6 +64,7 @@ def train(**kwargs) -> PreTrainedModel: ) pre_process(model_args) + dispatch_for_generation(model_args.model) # train is dispatched same as generation processed_dataset = get_processed_dataset( dataset_args=dataset_args, From a64a777e8af4bf6473f4574b4210da47a9cf8e0f Mon Sep 17 00:00:00 2001 From: Kyle Sayers Date: Mon, 16 Jun 2025 12:01:36 -0400 Subject: [PATCH 30/36] use remove_dispatch Signed-off-by: Kyle Sayers --- src/llmcompressor/entrypoints/README.md | 9 ++------- src/llmcompressor/entrypoints/utils.py | 4 ++++ src/llmcompressor/pipelines/sequential/helpers.py | 9 ++++++--- src/llmcompressor/utils/dev.py | 5 +++-- 4 files changed, 15 insertions(+), 12 deletions(-) diff --git a/src/llmcompressor/entrypoints/README.md b/src/llmcompressor/entrypoints/README.md index 5e907b802..0ead2b7ec 100644 --- a/src/llmcompressor/entrypoints/README.md +++ b/src/llmcompressor/entrypoints/README.md @@ -29,9 +29,7 @@ from llmcompressor.modifiers.quantization import QuantizationModifier MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct" # Load the model -model = AutoModelForCausalLM.from_pretrained( - MODEL_ID, device_map="auto", torch_dtype="auto" -) +model = AutoModelForCausalLM.from_pretrained(MODEL_ID, torch_dtype="auto") # Load the tokenizer tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) @@ -204,9 +202,7 @@ MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct" oneshot_output_dir = "./oneshot_model" # Load the model -model = AutoModelForCausalLM.from_pretrained( - MODEL_ID, device_map="auto", torch_dtype="auto" -) +model = AutoModelForCausalLM.from_pretrained(MODEL_ID, torch_dtype="auto") # Load the tokenizer tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) @@ -226,7 +222,6 @@ from llmcompressor import create_session, train # Student model model = AutoModelForCausalLM.from_pretrained( oneshot_output_dir, - device_map="auto", quantization_config=CompressedTensorsConfig(run_compressed=False), ) diff --git a/src/llmcompressor/entrypoints/utils.py b/src/llmcompressor/entrypoints/utils.py index 4bbc31e82..2c77dc73d 100644 --- a/src/llmcompressor/entrypoints/utils.py +++ b/src/llmcompressor/entrypoints/utils.py @@ -3,6 +3,7 @@ from pathlib import PosixPath from typing import Optional, Tuple +from compressed_tensors.utils import remove_dispatch from loguru import logger from torch.nn import Module from transformers import ( @@ -84,6 +85,9 @@ def post_process( Raises: ValueError: If saving fails due to an invalid `output_dir` or other 
issues. """ + # remove any existing dispatches + remove_dispatch(model_args.model) + if model_args is not None and output_dir is not None: if recipe_args is not None and getattr(recipe_args, "stage", None) is not None: output_dir = os.path.join(output_dir, recipe_args.stage) diff --git a/src/llmcompressor/pipelines/sequential/helpers.py b/src/llmcompressor/pipelines/sequential/helpers.py index 869f60578..4f562818a 100644 --- a/src/llmcompressor/pipelines/sequential/helpers.py +++ b/src/llmcompressor/pipelines/sequential/helpers.py @@ -5,9 +5,12 @@ from typing import TYPE_CHECKING, Any, Dict, List, Optional, Set import torch -from accelerate.hooks import remove_hook_from_module from compressed_tensors.quantization import find_name_or_class_matches -from compressed_tensors.utils import has_offloaded_params, offloaded_dispatch +from compressed_tensors.utils import ( + has_offloaded_params, + offloaded_dispatch, + remove_dispatch, +) from loguru import logger from torch.fx import Graph, GraphModule, Node from torch.fx.graph import PythonCode @@ -520,7 +523,7 @@ def dispatch_for_sequential(model: PreTrainedModel) -> PreTrainedModel: :param model: model to dispatch :return: dispatched model """ - remove_hook_from_module(model, recurse=True) + remove_dispatch(model) if torch.cuda.is_available(): offloaded_dispatch(model, execution_device=torch.device("cuda:0")) diff --git a/src/llmcompressor/utils/dev.py b/src/llmcompressor/utils/dev.py index e773b48f1..57ce74fb1 100644 --- a/src/llmcompressor/utils/dev.py +++ b/src/llmcompressor/utils/dev.py @@ -6,8 +6,8 @@ import torch from accelerate import dispatch_model, infer_auto_device_map -from accelerate.hooks import remove_hook_from_module from accelerate.utils import get_balanced_memory +from compressed_tensors.utils import remove_dispatch from huggingface_hub import snapshot_download from safetensors.torch import save_file from transformers import AutoModelForCausalLM, PreTrainedModel @@ -124,7 +124,8 @@ def dispatch_for_generation(model: PreTrainedModel) -> PreTrainedModel: :param model: model to dispatch :return: model which is dispatched """ - remove_hook_from_module(model, recurse=True) + remove_dispatch(model) + max_memory = get_balanced_memory( model, dtype=model.dtype, From 8f71004e6807e8dfed77e9c65ba136b2e6d9751d Mon Sep 17 00:00:00 2001 From: Kyle Sayers Date: Mon, 16 Jun 2025 13:29:22 -0400 Subject: [PATCH 31/36] fix example Signed-off-by: Kyle Sayers --- examples/quantization_2of4_sparse_w4a16/README.md | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/examples/quantization_2of4_sparse_w4a16/README.md b/examples/quantization_2of4_sparse_w4a16/README.md index c011ecf1d..51e04dd98 100644 --- a/examples/quantization_2of4_sparse_w4a16/README.md +++ b/examples/quantization_2of4_sparse_w4a16/README.md @@ -49,9 +49,7 @@ import torch from transformers import AutoModelForCausalLM model_stub = "neuralmagic/Llama-2-7b-ultrachat200k" -model = AutoModelForCausalLM.from_pretrained( - model_stub, torch_dtype=torch.bfloat16, device_map="auto" -) +model = AutoModelForCausalLM.from_pretrained(model_stub, torch_dtype=torch.bfloat16) dataset = "ultrachat-200k" splits = {"calibration": "train_gen[:5%]", "train": "train_gen"} From 7d7b00d09e1db029d4d9c9686fa11d04535fbca4 Mon Sep 17 00:00:00 2001 From: Kyle Sayers Date: Mon, 16 Jun 2025 14:57:39 -0400 Subject: [PATCH 32/36] remove device arg from e2e Signed-off-by: Kyle Sayers --- src/llmcompressor/entrypoints/README.md | 3 --- src/llmcompressor/entrypoints/utils.py | 3 ++- 
tests/e2e/e2e_utils.py | 17 +++++------------ tests/e2e/vLLM/test_vllm.py | 2 -- 4 files changed, 7 insertions(+), 18 deletions(-) diff --git a/src/llmcompressor/entrypoints/README.md b/src/llmcompressor/entrypoints/README.md index 0ead2b7ec..f023d3c02 100644 --- a/src/llmcompressor/entrypoints/README.md +++ b/src/llmcompressor/entrypoints/README.md @@ -112,7 +112,6 @@ output_dir = "./oneshot_model" # The model to train model = AutoModelForCausalLM.from_pretrained( output_dir, - device_map="auto", quantization_config=CompressedTensorsConfig(run_compressed=False), ) @@ -146,7 +145,6 @@ Comparisons are defined in `/src/llmcompressor/modifiers/distillation/utils/pyto # Define the teacher model distill_teacher = AutoModelForCausalLM.from_pretrained( "meta-llama/Meta-Llama-3-8B-Instruct", - device_map="auto", ) # Define the recipe, use knowledge distillation modifier and target the `model.layers` using a regex with @@ -236,7 +234,6 @@ num_calibration_samples = 8 # The number of workers processing datasets in para # Define teacher model distill_teacher = AutoModelForCausalLM.from_pretrained( "meta-llama/Meta-Llama-3-8B-Instruct", - device_map="auto", ) # Define the recipe, use knowledge distillation modifier and target the `model.layers` using a regex with diff --git a/src/llmcompressor/entrypoints/utils.py b/src/llmcompressor/entrypoints/utils.py index 2c77dc73d..418725d47 100644 --- a/src/llmcompressor/entrypoints/utils.py +++ b/src/llmcompressor/entrypoints/utils.py @@ -86,7 +86,8 @@ def post_process( ValueError: If saving fails due to an invalid `output_dir` or other issues. """ # remove any existing dispatches - remove_dispatch(model_args.model) + if model_args is not None and model_args.model is not None: + remove_dispatch(model_args.model) if model_args is not None and output_dir is not None: if recipe_args is not None and getattr(recipe_args, "stage", None) is not None: diff --git a/tests/e2e/e2e_utils.py b/tests/e2e/e2e_utils.py index 2325b7a34..853d2318b 100644 --- a/tests/e2e/e2e_utils.py +++ b/tests/e2e/e2e_utils.py @@ -14,28 +14,21 @@ def _load_model_and_processor( model: str, model_class: str, - device: str, ): pretrained_model_class = getattr(transformers, model_class) - loaded_model = pretrained_model_class.from_pretrained( - model, device_map=device, torch_dtype="auto" - ) + loaded_model = pretrained_model_class.from_pretrained(model, torch_dtype="auto") processor = AutoProcessor.from_pretrained(model) return loaded_model, processor @log_time -def _run_oneshot(device: str, **oneshot_kwargs): - oneshot( - **oneshot_kwargs, - oneshot_device=device, - ) +def _run_oneshot(**oneshot_kwargs): + oneshot(**oneshot_kwargs) def run_oneshot_for_e2e_testing( model: str, model_class: str, - device: str, num_calibration_samples: int, max_seq_length: int, dataset_id: str, @@ -49,7 +42,7 @@ def run_oneshot_for_e2e_testing( oneshot_kwargs = {} loaded_model, processor = _load_model_and_processor( - model=model, model_class=model_class, device=device + model=model, model_class=model_class ) if dataset_id: @@ -86,6 +79,6 @@ def data_collator(batch): # Apply quantization. 
logger.info("ONESHOT KWARGS", oneshot_kwargs) - _run_oneshot(device=device, **oneshot_kwargs) + _run_oneshot(**oneshot_kwargs) return oneshot_kwargs["model"], processor diff --git a/tests/e2e/vLLM/test_vllm.py b/tests/e2e/vLLM/test_vllm.py index 64d8204e5..89ddb5219 100644 --- a/tests/e2e/vLLM/test_vllm.py +++ b/tests/e2e/vLLM/test_vllm.py @@ -88,7 +88,6 @@ def set_up(self, test_data_file: str): logger.info("========== RUNNING ==============") logger.info(self.save_dir) - self.device = "cuda:0" self.prompts = [ "The capital of France is", "The president of the US is", @@ -105,7 +104,6 @@ def test_vllm(self, test_data_file: str): oneshot_model, tokenizer = run_oneshot_for_e2e_testing( model=self.model, model_class=self.model_class, - device=self.device, num_calibration_samples=self.num_calibration_samples, max_seq_length=self.max_seq_length, scheme=self.scheme, From 501056e44a9f02a1178b680d85af84bb911fd24b Mon Sep 17 00:00:00 2001 From: Kyle Sayers Date: Mon, 16 Jun 2025 15:07:30 -0400 Subject: [PATCH 33/36] simplify pipeline inference logic, add comment Signed-off-by: Kyle Sayers --- src/llmcompressor/pipelines/basic/pipeline.py | 2 +- src/llmcompressor/pipelines/registry.py | 49 ++++--------------- 2 files changed, 11 insertions(+), 40 deletions(-) diff --git a/src/llmcompressor/pipelines/basic/pipeline.py b/src/llmcompressor/pipelines/basic/pipeline.py index 35c52f166..dfb99172e 100644 --- a/src/llmcompressor/pipelines/basic/pipeline.py +++ b/src/llmcompressor/pipelines/basic/pipeline.py @@ -38,7 +38,7 @@ def __call__( :param dataloader: loads data for calibration :param dataset_args: dataset arguments relevant to pipelines """ - dispatch_for_generation(model) + dispatch_for_generation(model) # basic dispatch is identical to generation model_device = get_execution_device(model) LifecycleCallbacks.calibration_epoch_start() diff --git a/src/llmcompressor/pipelines/registry.py b/src/llmcompressor/pipelines/registry.py index cc4c29d8c..2c1a54cf5 100644 --- a/src/llmcompressor/pipelines/registry.py +++ b/src/llmcompressor/pipelines/registry.py @@ -7,23 +7,13 @@ from torch.utils.data.dataloader import DataLoader from llmcompressor.modifiers import Modifier -from llmcompressor.modifiers.awq import AWQModifier -from llmcompressor.modifiers.obcq.sgpt_base import SparsityModifierBase -from llmcompressor.modifiers.quantization import GPTQModifier, QuantizationMixin -from llmcompressor.modifiers.smoothquant import SmoothQuantModifier +from llmcompressor.modifiers.quantization import QuantizationModifier if TYPE_CHECKING: from llmcompressor.args.dataset_arguments import DatasetArguments __all__ = ["CalibrationPipeline"] -CALIBRATION_MODIFIERS = ( - SmoothQuantModifier, - AWQModifier, - GPTQModifier, - SparsityModifierBase, -) - class CalibrationPipeline(ABC, RegistryMixin): @staticmethod @@ -48,7 +38,7 @@ def from_modifiers( :return: CalibrationPipeline instance to be called with data (if not datafree) """ user = standardize_lookup_name(user) if user else None - inferred = standardize_lookup_name(cls._validate_infer_pipeline(modifiers)) + inferred = standardize_lookup_name(cls._infer_pipeline(modifiers)) independent = standardize_lookup_name("independent") if user == independent: @@ -64,30 +54,11 @@ def from_modifiers( return cls.load_from_registry(pipeline) @staticmethod - def _validate_infer_pipeline(modifiers: List[Modifier]) -> str: - if any(isinstance(modifier, CALIBRATION_MODIFIERS) for modifier in modifiers): - return "sequential" - - active_qmods = _get_active_quant_modifiers(modifiers) - if 
len(active_qmods) > 1: - raise ValueError( - f"Recipe contains more than one active quantization config " - f"({active_qmods}). These configs may be conflicting, Please modify " - "your recipe to use at most one quantization config" - ) - - if len(active_qmods) == 1: - quant_modifier = active_qmods[0] - config = quant_modifier.resolve_quantization_config() - if config.requires_calibration_data(): - return "sequential" - - return "datafree" - - -def _get_active_quant_modifiers(modifiers: List[Modifier]) -> List[QuantizationMixin]: - return [ - modifier - for modifier in modifiers - if isinstance(modifier, QuantizationMixin) and modifier.has_config() - ] + def _infer_pipeline(modifiers: List[Modifier]) -> str: + # only in the case of weight-only qmod quantization can we skip calibration + if len(modifiers) == 1 and isinstance(modifiers[0], QuantizationModifier): + config = modifiers[0].resolve_quantization_config() + if not config.requires_calibration_data(): + return "datafree" + + return "sequential" From 74aa7c92def694b9de8fc82869217d2d4db752bd Mon Sep 17 00:00:00 2001 From: Kyle Sayers Date: Mon, 16 Jun 2025 15:12:08 -0400 Subject: [PATCH 34/36] update examples imports Signed-off-by: Kyle Sayers --- examples/awq/qwen3_moe_example.py | 2 +- examples/multimodal_audio/whisper_example.py | 2 +- examples/multimodal_vision/gemma3_example.py | 2 +- examples/multimodal_vision/idefics3_example.py | 2 +- examples/multimodal_vision/llava_example.py | 2 +- examples/multimodal_vision/mistral3_example.py | 2 +- examples/multimodal_vision/mllama_example.py | 2 +- examples/multimodal_vision/phi3_vision_example.py | 2 +- examples/multimodal_vision/pixtral_example.py | 2 +- examples/multimodal_vision/qwen2_vl_example.py | 2 +- examples/multimodal_vision/qwen_2_5_vl_example.py | 2 +- examples/quantization_2of4_sparse_w4a16/llama7b_sparse_w4a16.py | 2 +- examples/quantization_kv_cache/gemma2_fp8_kv_example.py | 2 +- examples/quantization_kv_cache/llama3_fp8_kv_example.py | 2 +- examples/quantization_kv_cache/phi3.5_fp8_kv_example.py | 2 +- examples/quantization_w4a16/llama3_example.py | 2 +- examples/quantization_w4a16_fp4/llama3_example.py | 2 +- examples/quantization_w4a4_fp4/llama3_example.py | 2 +- examples/quantization_w8a8_fp8/gemma2_example.py | 2 +- examples/quantization_w8a8_fp8/llama3.2_vision_example.py | 2 +- examples/quantization_w8a8_fp8/llama3_example.py | 2 +- examples/quantization_w8a8_fp8/llava1.5_example.py | 2 +- examples/quantization_w8a8_fp8/qwen2vl_example.py | 2 +- examples/quantization_w8a8_fp8/whisper_example.py | 2 +- examples/quantization_w8a8_int8/gemma2_example.py | 2 +- examples/quantization_w8a8_int8/llama3_example.py | 2 +- examples/quantizing_moe/deepseek_moe_w4a16.py | 2 +- examples/quantizing_moe/deepseek_moe_w8a8_fp8.py | 2 +- examples/quantizing_moe/deepseek_moe_w8a8_int8.py | 2 +- examples/quantizing_moe/mixtral_moe_w8a8_fp8.py | 2 +- examples/quantizing_moe/qwen_moe_w4a16.py | 2 +- examples/sparse_2of4_quantization_fp8/llama3_8b_2of4.py | 2 +- src/llmcompressor/utils/__init__.py | 1 + 33 files changed, 33 insertions(+), 32 deletions(-) diff --git a/examples/awq/qwen3_moe_example.py b/examples/awq/qwen3_moe_example.py index 96baf5995..5fdc231c9 100644 --- a/examples/awq/qwen3_moe_example.py +++ b/examples/awq/qwen3_moe_example.py @@ -3,7 +3,7 @@ from llmcompressor import oneshot from llmcompressor.modifiers.awq import AWQModifier -from llmcompressor.utils.dev import dispatch_for_generation +from llmcompressor.utils import dispatch_for_generation # Select model and load 
it. MODEL_ID = "Qwen/Qwen3-30B-A3B" diff --git a/examples/multimodal_audio/whisper_example.py b/examples/multimodal_audio/whisper_example.py index f286ddc7f..9c2e494a8 100644 --- a/examples/multimodal_audio/whisper_example.py +++ b/examples/multimodal_audio/whisper_example.py @@ -4,7 +4,7 @@ from llmcompressor import oneshot from llmcompressor.modifiers.quantization import GPTQModifier -from llmcompressor.utils.dev import dispatch_for_generation +from llmcompressor.utils import dispatch_for_generation # Select model and load it. MODEL_ID = "openai/whisper-large-v3" diff --git a/examples/multimodal_vision/gemma3_example.py b/examples/multimodal_vision/gemma3_example.py index 9ac1f4ff7..5437ba36c 100644 --- a/examples/multimodal_vision/gemma3_example.py +++ b/examples/multimodal_vision/gemma3_example.py @@ -5,7 +5,7 @@ from llmcompressor import oneshot from llmcompressor.modifiers.quantization import GPTQModifier -from llmcompressor.utils.dev import dispatch_for_generation +from llmcompressor.utils import dispatch_for_generation # Load model. model_id = "google/gemma-3-4b-it" diff --git a/examples/multimodal_vision/idefics3_example.py b/examples/multimodal_vision/idefics3_example.py index 09722c127..1225349c4 100644 --- a/examples/multimodal_vision/idefics3_example.py +++ b/examples/multimodal_vision/idefics3_example.py @@ -6,7 +6,7 @@ from llmcompressor import oneshot from llmcompressor.modifiers.quantization import GPTQModifier -from llmcompressor.utils.dev import dispatch_for_generation +from llmcompressor.utils import dispatch_for_generation # Load model. model_id = "HuggingFaceM4/Idefics3-8B-Llama3" # or "HuggingFaceTB/SmolVLM-Instruct" diff --git a/examples/multimodal_vision/llava_example.py b/examples/multimodal_vision/llava_example.py index 984e8a1fd..0a17d8c50 100644 --- a/examples/multimodal_vision/llava_example.py +++ b/examples/multimodal_vision/llava_example.py @@ -5,7 +5,7 @@ from llmcompressor import oneshot from llmcompressor.modifiers.quantization import GPTQModifier -from llmcompressor.utils.dev import dispatch_for_generation +from llmcompressor.utils import dispatch_for_generation # Load model. model_id = "llava-hf/llava-1.5-7b-hf" diff --git a/examples/multimodal_vision/mistral3_example.py b/examples/multimodal_vision/mistral3_example.py index fc3657b0e..e70ee43ec 100644 --- a/examples/multimodal_vision/mistral3_example.py +++ b/examples/multimodal_vision/mistral3_example.py @@ -8,7 +8,7 @@ from llmcompressor import oneshot from llmcompressor.modifiers.quantization import GPTQModifier -from llmcompressor.utils.dev import dispatch_for_generation +from llmcompressor.utils import dispatch_for_generation # Load model. model_id = "mistralai/Mistral-Small-3.1-24B-Instruct-2503" diff --git a/examples/multimodal_vision/mllama_example.py b/examples/multimodal_vision/mllama_example.py index 7d94a677c..6672aff2e 100644 --- a/examples/multimodal_vision/mllama_example.py +++ b/examples/multimodal_vision/mllama_example.py @@ -5,7 +5,7 @@ from llmcompressor import oneshot from llmcompressor.modifiers.quantization import GPTQModifier -from llmcompressor.utils.dev import dispatch_for_generation +from llmcompressor.utils import dispatch_for_generation # Load model. 
model_id = "meta-llama/Llama-3.2-11B-Vision-Instruct" diff --git a/examples/multimodal_vision/phi3_vision_example.py b/examples/multimodal_vision/phi3_vision_example.py index 324df5d31..fa4b0feab 100644 --- a/examples/multimodal_vision/phi3_vision_example.py +++ b/examples/multimodal_vision/phi3_vision_example.py @@ -7,7 +7,7 @@ from llmcompressor import oneshot from llmcompressor.modifiers.quantization import GPTQModifier -from llmcompressor.utils.dev import dispatch_for_generation +from llmcompressor.utils import dispatch_for_generation # Load model. model_id = "microsoft/Phi-3-vision-128k-instruct" diff --git a/examples/multimodal_vision/pixtral_example.py b/examples/multimodal_vision/pixtral_example.py index b2b4c7440..a0ed50ef4 100644 --- a/examples/multimodal_vision/pixtral_example.py +++ b/examples/multimodal_vision/pixtral_example.py @@ -5,7 +5,7 @@ from llmcompressor import oneshot from llmcompressor.modifiers.quantization import GPTQModifier -from llmcompressor.utils.dev import dispatch_for_generation +from llmcompressor.utils import dispatch_for_generation # Load model. model_id = "mgoin/pixtral-12b" diff --git a/examples/multimodal_vision/qwen2_vl_example.py b/examples/multimodal_vision/qwen2_vl_example.py index 14033872d..8cccf768e 100644 --- a/examples/multimodal_vision/qwen2_vl_example.py +++ b/examples/multimodal_vision/qwen2_vl_example.py @@ -8,7 +8,7 @@ from llmcompressor import oneshot from llmcompressor.modifiers.quantization import GPTQModifier -from llmcompressor.utils.dev import dispatch_for_generation +from llmcompressor.utils import dispatch_for_generation # Load model. model_id = "Qwen/Qwen2-VL-2B-Instruct" diff --git a/examples/multimodal_vision/qwen_2_5_vl_example.py b/examples/multimodal_vision/qwen_2_5_vl_example.py index 096596d24..10a0edeec 100644 --- a/examples/multimodal_vision/qwen_2_5_vl_example.py +++ b/examples/multimodal_vision/qwen_2_5_vl_example.py @@ -8,7 +8,7 @@ from llmcompressor.modifiers.quantization import GPTQModifier from llmcompressor.transformers import oneshot -from llmcompressor.utils.dev import dispatch_for_generation +from llmcompressor.utils import dispatch_for_generation # Load model. model_id = "Qwen/Qwen2.5-VL-7B-Instruct" diff --git a/examples/quantization_2of4_sparse_w4a16/llama7b_sparse_w4a16.py b/examples/quantization_2of4_sparse_w4a16/llama7b_sparse_w4a16.py index 4bf505047..0b83d7384 100644 --- a/examples/quantization_2of4_sparse_w4a16/llama7b_sparse_w4a16.py +++ b/examples/quantization_2of4_sparse_w4a16/llama7b_sparse_w4a16.py @@ -3,7 +3,7 @@ from transformers import AutoModelForCausalLM, AutoTokenizer from llmcompressor import oneshot, train -from llmcompressor.utils.dev import dispatch_for_generation +from llmcompressor.utils import dispatch_for_generation # load the model in as bfloat16 to save on memory and compute model_stub = "neuralmagic/Llama-2-7b-ultrachat200k" diff --git a/examples/quantization_kv_cache/gemma2_fp8_kv_example.py b/examples/quantization_kv_cache/gemma2_fp8_kv_example.py index 6246d41f7..f753d71dd 100644 --- a/examples/quantization_kv_cache/gemma2_fp8_kv_example.py +++ b/examples/quantization_kv_cache/gemma2_fp8_kv_example.py @@ -2,7 +2,7 @@ from transformers import AutoModelForCausalLM, AutoTokenizer from llmcompressor import oneshot -from llmcompressor.utils.dev import dispatch_for_generation +from llmcompressor.utils import dispatch_for_generation # Select model and load it. 
MODEL_ID = "google/gemma-2-9b-it" diff --git a/examples/quantization_kv_cache/llama3_fp8_kv_example.py b/examples/quantization_kv_cache/llama3_fp8_kv_example.py index 3ee8c38db..339c353fa 100644 --- a/examples/quantization_kv_cache/llama3_fp8_kv_example.py +++ b/examples/quantization_kv_cache/llama3_fp8_kv_example.py @@ -3,7 +3,7 @@ from transformers import AutoModelForCausalLM, AutoTokenizer from llmcompressor import oneshot -from llmcompressor.utils.dev import dispatch_for_generation +from llmcompressor.utils import dispatch_for_generation # Select model and load it. MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct" diff --git a/examples/quantization_kv_cache/phi3.5_fp8_kv_example.py b/examples/quantization_kv_cache/phi3.5_fp8_kv_example.py index 39d832830..0d16e1b22 100644 --- a/examples/quantization_kv_cache/phi3.5_fp8_kv_example.py +++ b/examples/quantization_kv_cache/phi3.5_fp8_kv_example.py @@ -2,7 +2,7 @@ from transformers import AutoModelForCausalLM, AutoTokenizer from llmcompressor import oneshot -from llmcompressor.utils.dev import dispatch_for_generation +from llmcompressor.utils import dispatch_for_generation # Select model and load it. # Phi-3.5 is a special case for KV cache quantization because it has diff --git a/examples/quantization_w4a16/llama3_example.py b/examples/quantization_w4a16/llama3_example.py index 6bd41c58e..89c9d353e 100644 --- a/examples/quantization_w4a16/llama3_example.py +++ b/examples/quantization_w4a16/llama3_example.py @@ -3,7 +3,7 @@ from llmcompressor.modifiers.quantization import GPTQModifier from llmcompressor.transformers import oneshot -from llmcompressor.utils.dev import dispatch_for_generation +from llmcompressor.utils import dispatch_for_generation # Select model and load it. model_id = "meta-llama/Meta-Llama-3-8B-Instruct" diff --git a/examples/quantization_w4a16_fp4/llama3_example.py b/examples/quantization_w4a16_fp4/llama3_example.py index da526cad7..d35de8d30 100644 --- a/examples/quantization_w4a16_fp4/llama3_example.py +++ b/examples/quantization_w4a16_fp4/llama3_example.py @@ -2,7 +2,7 @@ from llmcompressor import oneshot from llmcompressor.modifiers.quantization import QuantizationModifier -from llmcompressor.utils.dev import dispatch_for_generation +from llmcompressor.utils import dispatch_for_generation MODEL_ID = "meta-llama/Llama-3.1-8B-Instruct" diff --git a/examples/quantization_w4a4_fp4/llama3_example.py b/examples/quantization_w4a4_fp4/llama3_example.py index f209a581b..95d01657b 100644 --- a/examples/quantization_w4a4_fp4/llama3_example.py +++ b/examples/quantization_w4a4_fp4/llama3_example.py @@ -3,7 +3,7 @@ from llmcompressor import oneshot from llmcompressor.modifiers.quantization import QuantizationModifier -from llmcompressor.utils.dev import dispatch_for_generation +from llmcompressor.utils import dispatch_for_generation MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct" diff --git a/examples/quantization_w8a8_fp8/gemma2_example.py b/examples/quantization_w8a8_fp8/gemma2_example.py index 5c41a4d35..1b56512b4 100644 --- a/examples/quantization_w8a8_fp8/gemma2_example.py +++ b/examples/quantization_w8a8_fp8/gemma2_example.py @@ -2,7 +2,7 @@ from llmcompressor import oneshot from llmcompressor.modifiers.quantization import QuantizationModifier -from llmcompressor.utils.dev import dispatch_for_generation +from llmcompressor.utils import dispatch_for_generation MODEL_ID = "google/gemma-2-27b-it" diff --git a/examples/quantization_w8a8_fp8/llama3.2_vision_example.py b/examples/quantization_w8a8_fp8/llama3.2_vision_example.py 
index 1c21c23d0..6a1454cd0 100644 --- a/examples/quantization_w8a8_fp8/llama3.2_vision_example.py +++ b/examples/quantization_w8a8_fp8/llama3.2_vision_example.py @@ -2,7 +2,7 @@ from llmcompressor import oneshot from llmcompressor.modifiers.quantization import QuantizationModifier -from llmcompressor.utils.dev import dispatch_for_generation +from llmcompressor.utils import dispatch_for_generation MODEL_ID = "meta-llama/Llama-3.2-11B-Vision-Instruct" diff --git a/examples/quantization_w8a8_fp8/llama3_example.py b/examples/quantization_w8a8_fp8/llama3_example.py index 346012e4e..39c196752 100644 --- a/examples/quantization_w8a8_fp8/llama3_example.py +++ b/examples/quantization_w8a8_fp8/llama3_example.py @@ -2,7 +2,7 @@ from llmcompressor import oneshot from llmcompressor.modifiers.quantization import QuantizationModifier -from llmcompressor.utils.dev import dispatch_for_generation +from llmcompressor.utils import dispatch_for_generation MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct" diff --git a/examples/quantization_w8a8_fp8/llava1.5_example.py b/examples/quantization_w8a8_fp8/llava1.5_example.py index 41a02b156..a03188a61 100644 --- a/examples/quantization_w8a8_fp8/llava1.5_example.py +++ b/examples/quantization_w8a8_fp8/llava1.5_example.py @@ -2,7 +2,7 @@ from llmcompressor import oneshot from llmcompressor.modifiers.quantization import QuantizationModifier -from llmcompressor.utils.dev import dispatch_for_generation +from llmcompressor.utils import dispatch_for_generation MODEL_ID = "llava-hf/llava-1.5-7b-hf" diff --git a/examples/quantization_w8a8_fp8/qwen2vl_example.py b/examples/quantization_w8a8_fp8/qwen2vl_example.py index f52fb5c9e..ebadbe973 100644 --- a/examples/quantization_w8a8_fp8/qwen2vl_example.py +++ b/examples/quantization_w8a8_fp8/qwen2vl_example.py @@ -2,7 +2,7 @@ from llmcompressor import oneshot from llmcompressor.modifiers.quantization import QuantizationModifier -from llmcompressor.utils.dev import dispatch_for_generation +from llmcompressor.utils import dispatch_for_generation MODEL_ID = "Qwen/Qwen2-VL-7B-Instruct" diff --git a/examples/quantization_w8a8_fp8/whisper_example.py b/examples/quantization_w8a8_fp8/whisper_example.py index 403de54a8..2cbbebe7d 100644 --- a/examples/quantization_w8a8_fp8/whisper_example.py +++ b/examples/quantization_w8a8_fp8/whisper_example.py @@ -3,7 +3,7 @@ from llmcompressor import oneshot from llmcompressor.modifiers.quantization import QuantizationModifier -from llmcompressor.utils.dev import dispatch_for_generation +from llmcompressor.utils import dispatch_for_generation MODEL_ID = "openai/whisper-large-v2" diff --git a/examples/quantization_w8a8_int8/gemma2_example.py b/examples/quantization_w8a8_int8/gemma2_example.py index 13c900d4c..d332532b0 100644 --- a/examples/quantization_w8a8_int8/gemma2_example.py +++ b/examples/quantization_w8a8_int8/gemma2_example.py @@ -3,7 +3,7 @@ from llmcompressor import oneshot from llmcompressor.modifiers.quantization import GPTQModifier -from llmcompressor.utils.dev import dispatch_for_generation +from llmcompressor.utils import dispatch_for_generation # 1) Select model and load it. 
MODEL_ID = "google/gemma-2-2b-it" diff --git a/examples/quantization_w8a8_int8/llama3_example.py b/examples/quantization_w8a8_int8/llama3_example.py index 66487dba4..feab87455 100644 --- a/examples/quantization_w8a8_int8/llama3_example.py +++ b/examples/quantization_w8a8_int8/llama3_example.py @@ -4,7 +4,7 @@ from llmcompressor import oneshot from llmcompressor.modifiers.quantization import GPTQModifier from llmcompressor.modifiers.smoothquant import SmoothQuantModifier -from llmcompressor.utils.dev import dispatch_for_generation +from llmcompressor.utils import dispatch_for_generation # Select model and load it. MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct" diff --git a/examples/quantizing_moe/deepseek_moe_w4a16.py b/examples/quantizing_moe/deepseek_moe_w4a16.py index 5de7911fb..9880e9248 100644 --- a/examples/quantizing_moe/deepseek_moe_w4a16.py +++ b/examples/quantizing_moe/deepseek_moe_w4a16.py @@ -4,7 +4,7 @@ from transformers import AutoModelForCausalLM, AutoTokenizer, __version__ from llmcompressor import oneshot -from llmcompressor.utils.dev import dispatch_for_generation +from llmcompressor.utils import dispatch_for_generation # NOTE: transformers 4.49.0 has an attribute error with DeepSeek. # Please consider either downgrading your transformers version to a diff --git a/examples/quantizing_moe/deepseek_moe_w8a8_fp8.py b/examples/quantizing_moe/deepseek_moe_w8a8_fp8.py index e247c77fb..0bc9c24df 100644 --- a/examples/quantizing_moe/deepseek_moe_w8a8_fp8.py +++ b/examples/quantizing_moe/deepseek_moe_w8a8_fp8.py @@ -4,7 +4,7 @@ from llmcompressor import oneshot from llmcompressor.modifiers.quantization import QuantizationModifier -from llmcompressor.utils.dev import dispatch_for_generation +from llmcompressor.utils import dispatch_for_generation # NOTE: transformers 4.49.0 has an attribute error with DeepSeek. # Please consider either downgrading your transformers version to a diff --git a/examples/quantizing_moe/deepseek_moe_w8a8_int8.py b/examples/quantizing_moe/deepseek_moe_w8a8_int8.py index 8648dbbf8..3ec506c34 100644 --- a/examples/quantizing_moe/deepseek_moe_w8a8_int8.py +++ b/examples/quantizing_moe/deepseek_moe_w8a8_int8.py @@ -5,7 +5,7 @@ from llmcompressor import oneshot from llmcompressor.modifiers.quantization import GPTQModifier -from llmcompressor.utils.dev import dispatch_for_generation +from llmcompressor.utils import dispatch_for_generation # NOTE: transformers 4.49.0 has an attribute error with DeepSeek. 
# Please consider either downgrading your transformers version to a diff --git a/examples/quantizing_moe/mixtral_moe_w8a8_fp8.py b/examples/quantizing_moe/mixtral_moe_w8a8_fp8.py index e45217203..a17bf873d 100644 --- a/examples/quantizing_moe/mixtral_moe_w8a8_fp8.py +++ b/examples/quantizing_moe/mixtral_moe_w8a8_fp8.py @@ -5,7 +5,7 @@ from llmcompressor import oneshot from llmcompressor.modifiers.quantization import QuantizationModifier -from llmcompressor.utils.dev import dispatch_for_generation +from llmcompressor.utils import dispatch_for_generation MODEL_ID = "mistralai/Mixtral-8x7B-Instruct-v0.1" diff --git a/examples/quantizing_moe/qwen_moe_w4a16.py b/examples/quantizing_moe/qwen_moe_w4a16.py index 51b8821b9..40a78a9b7 100644 --- a/examples/quantizing_moe/qwen_moe_w4a16.py +++ b/examples/quantizing_moe/qwen_moe_w4a16.py @@ -4,7 +4,7 @@ from llmcompressor.modifiers.quantization import GPTQModifier from llmcompressor.transformers import oneshot -from llmcompressor.utils.dev import dispatch_for_generation +from llmcompressor.utils import dispatch_for_generation # select a Mixture of Experts model for quantization MODEL_ID = "Qwen/Qwen1.5-MoE-A2.7B-Chat" diff --git a/examples/sparse_2of4_quantization_fp8/llama3_8b_2of4.py b/examples/sparse_2of4_quantization_fp8/llama3_8b_2of4.py index 616db364e..590b74611 100644 --- a/examples/sparse_2of4_quantization_fp8/llama3_8b_2of4.py +++ b/examples/sparse_2of4_quantization_fp8/llama3_8b_2of4.py @@ -6,7 +6,7 @@ from llmcompressor import oneshot from llmcompressor.modifiers.obcq import SparseGPTModifier from llmcompressor.modifiers.quantization import QuantizationModifier -from llmcompressor.utils.dev import dispatch_for_generation +from llmcompressor.utils import dispatch_for_generation # Configuration MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct" diff --git a/src/llmcompressor/utils/__init__.py b/src/llmcompressor/utils/__init__.py index 98d5e1c65..c4fb71cdc 100644 --- a/src/llmcompressor/utils/__init__.py +++ b/src/llmcompressor/utils/__init__.py @@ -4,4 +4,5 @@ # flake8: noqa +from .dev import * from .helpers import * From e4487e24a0d0dbd669a0b32d179a65faf5ce3c6a Mon Sep 17 00:00:00 2001 From: Kyle Sayers Date: Mon, 16 Jun 2025 15:17:20 -0400 Subject: [PATCH 35/36] fix call Signed-off-by: Kyle Sayers --- tests/llmcompressor/transformers/tracing/test_models.py | 1 - tests/lmeval/test_lmeval.py | 2 -- 2 files changed, 3 deletions(-) diff --git a/tests/llmcompressor/transformers/tracing/test_models.py b/tests/llmcompressor/transformers/tracing/test_models.py index 327f3d55d..135928902 100644 --- a/tests/llmcompressor/transformers/tracing/test_models.py +++ b/tests/llmcompressor/transformers/tracing/test_models.py @@ -136,7 +136,6 @@ def test_model_trace(model_id, model_class, targets, modality, backends): modality=modality, trust_remote_code=True, skip_weights=True, - device_map="cpu", ) target_modules = get_target_modules(model, targets) diff --git a/tests/lmeval/test_lmeval.py b/tests/lmeval/test_lmeval.py index d4d6e6056..51aa50665 100644 --- a/tests/lmeval/test_lmeval.py +++ b/tests/lmeval/test_lmeval.py @@ -90,7 +90,6 @@ def set_up(self, test_data_file: str): logger.info("========== RUNNING ==============") logger.info(self.scheme) - self.device = "cuda:0" self.num_calibration_samples = 512 self.max_seq_length = 2048 @@ -103,7 +102,6 @@ def test_lm_eval(self, test_data_file: str): oneshot_model, processor = run_oneshot_for_e2e_testing( model=self.model, model_class=self.model_class, - device=self.device, 
num_calibration_samples=self.num_calibration_samples, max_seq_length=self.max_seq_length, scheme=self.scheme, From f134e56e087bba14eaea590d72b2f3653e32b9fe Mon Sep 17 00:00:00 2001 From: Kyle Sayers Date: Mon, 16 Jun 2025 18:00:15 -0400 Subject: [PATCH 36/36] wip: run compression in parallel Signed-off-by: Kyle Sayers --- .../modifiers/quantization/gptq/base.py | 75 ++++++++++++------- .../quantization/gptq/gptq_quantize.py | 21 +++++- 2 files changed, 67 insertions(+), 29 deletions(-) diff --git a/src/llmcompressor/modifiers/quantization/gptq/base.py b/src/llmcompressor/modifiers/quantization/gptq/base.py index 7ae61f3e2..fb8baf0a7 100644 --- a/src/llmcompressor/modifiers/quantization/gptq/base.py +++ b/src/llmcompressor/modifiers/quantization/gptq/base.py @@ -1,5 +1,6 @@ import contextlib import warnings +from concurrent.futures import ThreadPoolExecutor, as_completed from typing import Dict, List, Optional, Tuple, Union import torch @@ -22,12 +23,12 @@ from llmcompressor.modifiers import Modifier from llmcompressor.modifiers.quantization.gptq.gptq_quantize import ( accumulate_hessian, + initialize_linalg, make_empty_hessian, quantize_weight, ) from llmcompressor.modifiers.quantization.quantization import QuantizationMixin from llmcompressor.sentinel import Sentinel -from llmcompressor.utils.metric_logging import CompressionLogger __all__ = ["GPTQModifier"] @@ -252,34 +253,54 @@ def compress_modules(self): """ Quantize modules which have been calibrated """ - for module in list(self._num_samples.keys()): - name = self._module_names[module] - num_samples = self._num_samples[module] - quant_args = getattr_chain(module, "quantization_scheme.weights") - - logger.info(f"Quantizing {name} using {num_samples} samples") - with torch.no_grad(), align_module_device( - module - ), self._maybe_onload_hessian(module), CompressionLogger( - module - ) as comp_logger: - loss, quantized_weight, scale, zero_point, g_idx = quantize_weight( - module=module, - quant_args=quant_args, - hessians_dict=self._hessians, - blocksize=self.block_size, - percdamp=self.dampening_frac, - ) - comp_logger.set_loss(loss) + import time + + start_time = time.time() + + futures = [] + with ThreadPoolExecutor() as executor: + for module in list(self._num_samples.keys()): + initialize_linalg(get_execution_device(module)) + future = executor.submit(self._compress_module, module) + futures.append(future) + + for future in as_completed(futures, timeout=300): # no timeout + name, num_samples, loss = future.result() + logger.info(f"Quantized {name}") + logger.info(f" num_samples={num_samples}") + logger.info(f" loss={loss:.2f}") + + logger.info( + f"Quantized {len(futures)} modules in {time.time() - start_time: .1f}s" + ) + + def _compress_module(self, module: torch.nn.Module) -> Tuple[str, int, float]: + name = self._module_names[module] + num_samples = self._num_samples[module] + quant_args = getattr_chain(module, "quantization_scheme.weights") + + with torch.no_grad(), align_module_device(module), self._maybe_onload_hessian( + module + ): + logger.info(f"Quantizing {name}...") + loss, quantized_weight, scale, zero_point, g_idx = quantize_weight( + module=module, + quant_args=quant_args, + hessians_dict=self._hessians, + blocksize=self.block_size, + percdamp=self.dampening_frac, + ) + + update_offload_parameter(module, "weight", quantized_weight) + update_offload_parameter(module, "weight_scale", scale) + update_offload_parameter(module, "weight_zero_point", zero_point) + if g_idx is not None: + update_offload_parameter(module, 
"weight_g_idx", g_idx) - update_offload_parameter(module, "weight", quantized_weight) - update_offload_parameter(module, "weight_scale", scale) - update_offload_parameter(module, "weight_zero_point", zero_point) - if g_idx is not None: - update_offload_parameter(module, "weight_g_idx", g_idx) + # self._hessians[module] already deleted by quantize_weight + del self._num_samples[module] - # self._hessians[module] already deleted by quantize_weight - del self._num_samples[module] + return name, num_samples, loss def on_end(self, state: State, event: Event, **kwargs): """ diff --git a/src/llmcompressor/modifiers/quantization/gptq/gptq_quantize.py b/src/llmcompressor/modifiers/quantization/gptq/gptq_quantize.py index 4392ed8cf..b3fc63fab 100644 --- a/src/llmcompressor/modifiers/quantization/gptq/gptq_quantize.py +++ b/src/llmcompressor/modifiers/quantization/gptq/gptq_quantize.py @@ -17,8 +17,25 @@ from llmcompressor.pytorch.utils.helpers import tensor_sparsity GPTQ_PRECISION = torch.float32 - -__all__ = ["make_empty_hessian", "accumulate_hessian", "quantize_weight"] +INITIALIZED_DEVICES = set() + +__all__ = [ + "initialize_linalg", + "make_empty_hessian", + "accumulate_hessian", + "quantize_weight", +] + + +def initialize_linalg(device: torch.device): + # pre-load torch.linalg module to avoid loading the module in threads, + # which can cause lazy loading assertion errors + # https://github.com/pytorch/pytorch/blob/main/aten/src/ATen/native/cuda/LinearAlgebraStubs.cpp#L50 # noqa: E501 + # https://github.com/pytorch/ignite/issues/3004 + if device not in INITIALIZED_DEVICES: + _input = torch.ones((1, 1), device=device) + _ = torch.cholesky_inverse(torch.linalg.cholesky(_input)) + INITIALIZED_DEVICES.add(device) def make_empty_hessian(