Commit f30da30

Merge branch 'vllm-project:main' into main
2 parents 5288bec + 53240c6

File tree: 39 files changed (+685, -227 lines)

Lines changed: 19 additions & 0 deletions
@@ -0,0 +1,19 @@
+name: prepare code coverage
+description: installs code coverage dependencies and exports an updated 'PYTEST_ADDOPTS' env var
+
+runs:
+  using: composite
+  steps:
+    - run: |-
+        # install dependencies
+        pip3 install coverage pytest-cov https://github.com/neuralmagic/pytest-nm-releng/archive/v0.4.0.tar.gz
+
+        # generate and source flags
+        FLAGS_FILE="coverage_flags.sh"
+        nmre-generate-coverage-flags --package "llmcompressor" --output-file "$FLAGS_FILE"
+        source "$FLAGS_FILE"
+        rm "$FLAGS_FILE"
+
+        # export defined/updated 'PYTEST_ADDOPTS' env var
+        echo "PYTEST_ADDOPTS=$PYTEST_ADDOPTS" | tee -a "$GITHUB_ENV"
+      shell: bash

.github/workflows/test-check-transformers.yaml

Lines changed: 21 additions & 2 deletions
@@ -1,10 +1,16 @@
 name: Test Checks (Transformers)
 on:
   pull_request:
-    branches: main
+    branches: [ main ]
     types: [ labeled, synchronize ]
   push:
-    branches: main
+    branches: [ main ]
+  workflow_dispatch:
+    inputs:
+      code_coverage:
+        description: if enabled, code coverage metrics will be collected during the test run
+        type: boolean
+        default: false
 
 env:
   CADENCE: "commit"
@@ -72,6 +78,9 @@ jobs:
           BUILD_TYPE=nightly pip3 install .
       - name: "Clean compressed-tensors directory"
         run: rm -r compressed-tensors/
+      - name: "⚙️ Prepare code coverage"
+        if: inputs.code_coverage
+        uses: ./.github/actions/prepare-code-coverage
       - name: "🔬 Running transformers tests"
         if: (success() || failure()) && steps.install.outcome == 'success'
         run: |
@@ -104,3 +113,13 @@ jobs:
         if: (success() || failure()) && steps.install.outcome == 'success'
         run: |
           pytest -v tests/llmcompressor/transformers/kv_cache
+      - name: "Upload coverage report"
+        if: (success() || failure()) && inputs.code_coverage
+        uses: actions/upload-artifact@v4
+        with:
+          name: transformers-tests-coverage-results
+          path: |
+            .coverage
+            coverage-html
+            coverage.json
+          retention-days: 5

.github/workflows/test-check.yaml

Lines changed: 46 additions & 0 deletions
@@ -4,6 +4,12 @@ on:
     branches:
       - main
   push:
+  workflow_dispatch:
+    inputs:
+      code_coverage:
+        description: if enabled, code coverage metrics will be collected during the test run
+        type: boolean
+        default: false
 
 env:
   CADENCE: "commit"
@@ -36,8 +42,21 @@ jobs:
           BUILD_TYPE=nightly pip3 install .
       - name: "Clean compressed-tensors directory"
         run: rm -r compressed-tensors/
+      - name: "⚙️ Prepare code coverage"
+        if: inputs.code_coverage
+        uses: ./.github/actions/prepare-code-coverage
       - name: "🔬 Running base tests"
         run: make test
+      - name: "Upload coverage report"
+        if: (success() || failure()) && inputs.code_coverage
+        uses: actions/upload-artifact@v4
+        with:
+          name: base-tests-coverage-results
+          path: |
+            .coverage
+            coverage-html
+            coverage.json
+          retention-days: 5
 
   pytorch-tests:
     runs-on: ubuntu-22.04
@@ -65,9 +84,23 @@ jobs:
           BUILD_TYPE=nightly pip3 install .
       - name: "Clean compressed-tensors directory"
         run: rm -r compressed-tensors/
+      - name: "⚙️ Prepare code coverage"
+        if: inputs.code_coverage
+        uses: ./.github/actions/prepare-code-coverage
       - name: "🔬 Running pytorch tests"
         run: |
           pytest -v tests/llmcompressor/pytorch
+      - name: "Upload coverage report"
+        if: (success() || failure()) && inputs.code_coverage
+        uses: actions/upload-artifact@v4
+        with:
+          name: pytorch-tests-coverage-results
+          path: |
+            .coverage
+            coverage-html
+            coverage.json
+          retention-days: 5
+
 
   compat-pytorch-1_9-pytorch-tests:
     runs-on: ubuntu-22.04
@@ -95,6 +128,19 @@ jobs:
          BUILD_TYPE=nightly pip3 install .
       - name: "Clean compressed-tensors directory"
         run: rm -r compressed-tensors/
+      - name: "⚙️ Prepare code coverage"
+        if: inputs.code_coverage
+        uses: ./.github/actions/prepare-code-coverage
       - name: "🔬 Running pytorch tests"
         run: |
           pytest -v tests/llmcompressor/pytorch
+      - name: "Upload coverage report"
+        if: (success() || failure()) && inputs.code_coverage
+        uses: actions/upload-artifact@v4
+        with:
+          name: compat-pytorch-tests-coverage-results
+          path: |
+            .coverage
+            coverage-html
+            coverage.json
+          retention-days: 5

README.md

Lines changed: 2 additions & 2 deletions
@@ -18,11 +18,11 @@ Big updates have landed in LLM Compressor! To get a more in-depth look, check ou
 
 Some of the exciting new features include:
 
-* **Large Model Support with Sequential Onloading** As of llm-compressor>=0.6.0, you can now quantize very large language models on a single GPU. Models are broken into disjoint layers which are then onloaded to the GPU one layer at a time. For more information on sequential onloading, see [Big Modeling with Sequential Onloading](examples/big_models_with_sequential_onloading/README.md) as well as the [DeepSeek-R1 Example](examples/quantizing_moe/deepseek_r1_example.py).
+* **Llama4 Quantization Support**: Quantize a Llama4 model to [W4A16](examples/multimodal_vision/llama4_example.py) or [NVFP4](examples/quantization_w4a4_fp4/llama4_example.py). The checkpoint produced can seamlessly run in vLLM.
+* **Large Model Support with Sequential Onloading**: As of llm-compressor>=0.6.0, you can now quantize very large language models on a single GPU. Models are broken into disjoint layers which are then onloaded to the GPU one layer at a time. For more information on sequential onloading, see [Big Modeling with Sequential Onloading](examples/big_models_with_sequential_onloading/README.md) as well as the [DeepSeek-R1 Example](examples/quantizing_moe/deepseek_r1_example.py).
 * **Preliminary FP4 Quantization Support:** Quantize weights and activations to FP4 and seamlessly run the compressed model in vLLM. Model weights and activations are quantized following the NVFP4 [configuration](https://github.com/neuralmagic/compressed-tensors/blob/f5dbfc336b9c9c361b9fe7ae085d5cb0673e56eb/src/compressed_tensors/quantization/quant_scheme.py#L104). See examples of [weight-only quantization](examples/quantization_w4a16_fp4/llama3_example.py) and [fp4 activation support](examples/quantization_w4a4_fp4/llama3_example.py). Support is currently preliminary and additional support will be added for MoEs.
 * **Updated AWQ Support:** Improved support for MoEs with better handling of larger models
 * **Axolotl Sparse Finetuning Integration:** Seamlessly finetune sparse LLMs with our Axolotl integration. Learn how to create [fast sparse open-source models with Axolotl and LLM Compressor](https://developers.redhat.com/articles/2025/06/17/axolotl-meets-llm-compressor-fast-sparse-open). See also the [Axolotl integration docs](https://docs.axolotl.ai/docs/custom_integrations.html#llmcompressor).
-* **Day 0 Llama 4 Support:** Meta utilized LLM Compressor to create the [FP8-quantized Llama-4-Maverick-17B-128E](https://huggingface.co/meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8), optimized for vLLM inference using [compressed-tensors](https://github.com/neuralmagic/compressed-tensors) format.
 
 ### Supported Formats
 * Activation Quantization: W8A8 (int8 and fp8)

examples/big_models_with_sequential_onloading/README.md

Lines changed: 2 additions & 2 deletions
@@ -18,7 +18,7 @@ The Llama 3.3 70b is larger than 80 GB, surpassing the size of 1 A100. However,
 
 ```python
 model_id = "meta-llama/Llama-3.3-70B-Instruct"
-model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype="auto")
+model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype="auto", device_map=None)
 ```
 
 The model is first loaded onto the `cpu`, as indicated through the use of `None` for the `device_map` argument in the `from_pretrained` method when loading the model.
@@ -42,4 +42,4 @@ output = model.generate(**sample, max_new_tokens=100)
 print(tokenizer.decode(output[0]))
 ```
 
-Finally, we call `dispatch_for_generation` to evenly load the model across available devices (potentially offloading the model if required) and run sample generations on the newly quantized model.
+Finally, we call `dispatch_for_generation` to evenly load the model across available devices (potentially offloading the model if required) and run sample generations on the newly quantized model.
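
For orientation, a minimal sketch of that final dispatch-and-generate step is shown below. It is not part of this commit, and the import path for `dispatch_for_generation` is an assumption that may differ between llm-compressor releases.

```python
# Minimal sketch of the final step described in the README above.
# Assumption: dispatch_for_generation is importable from llmcompressor.utils;
# verify the exact path against your installed llm-compressor version.
from llmcompressor.utils import dispatch_for_generation

# After oneshot() completes, the quantized model still resides on the CPU.
# dispatch_for_generation spreads it across the available GPUs (offloading
# layers if the model does not fit) so sample generation can run.
dispatch_for_generation(model)

sample = tokenizer("Hello my name is", return_tensors="pt").to(model.device)
output = model.generate(**sample, max_new_tokens=100)
print(tokenizer.decode(output[0]))
```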

examples/big_models_with_sequential_onloading/llama3.3_70b.py

Lines changed: 5 additions & 1 deletion
@@ -8,7 +8,11 @@
 
 # Select model and load it.
 model_id = "meta-llama/Llama-3.3-70B-Instruct"
-model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype="auto")
+model = AutoModelForCausalLM.from_pretrained(
+    model_id,
+    torch_dtype="auto",
+    device_map=None,
+)
 tokenizer = AutoTokenizer.from_pretrained(model_id)
 
 # Select calibration dataset.

Lines changed: 92 additions & 0 deletions
@@ -0,0 +1,92 @@
+import torch
+from datasets import load_dataset
+from transformers import Llama4ForConditionalGeneration, Llama4Processor
+
+from llmcompressor import oneshot
+from llmcompressor.modeling import prepare_for_calibration
+from llmcompressor.modifiers.quantization import GPTQModifier
+
+# Select model and load it.
+model_id = "meta-llama/Llama-4-Scout-17B-16E-Instruct"
+model = Llama4ForConditionalGeneration.from_pretrained(model_id, torch_dtype="auto")
+processor = Llama4Processor.from_pretrained(model_id)
+# We update `Llama4TextMoe` modules with custom `SequentialLlama4TextMoe`.
+# This change allows compatibility with vllm.
+# To apply your own custom module for experimentation, consider updating
+# `SequentialLlama4TextMoe` under llmcompressor/modeling/llama4.py
+model = prepare_for_calibration(model)
+
+DATASET_ID = "neuralmagic/calibration"
+NUM_CALIBRATION_SAMPLES = 512
+MAX_SEQUENCE_LENGTH = 8192
+
+ds = load_dataset(DATASET_ID, name="LLM", split=f"train[:{NUM_CALIBRATION_SAMPLES}]")
+
+
+def preprocess_function(example):
+    messages = []
+    for message in example["messages"]:
+        messages.append(
+            {
+                "role": message["role"],
+                "content": [{"type": "text", "text": message["content"]}],
+            }
+        )
+
+    return processor.apply_chat_template(
+        messages,
+        return_tensors="pt",
+        padding=False,
+        truncation=True,
+        max_length=MAX_SEQUENCE_LENGTH,
+        tokenize=True,
+        add_special_tokens=False,
+        return_dict=True,
+        add_generation_prompt=False,
+    )
+
+
+ds = ds.map(preprocess_function, batched=False, remove_columns=ds.column_names)
+
+
+def data_collator(batch):
+    assert len(batch) == 1
+    return {
+        key: torch.tensor(value)
+        if key != "pixel_values"
+        else torch.tensor(value, dtype=torch.bfloat16).squeeze(0)
+        for key, value in batch[0].items()
+    }
+
+
+# Configure the quantization algorithm to run.
+recipe = GPTQModifier(
+    targets="Linear",
+    scheme="W4A16",
+    ignore=[
+        "re:.*lm_head",
+        "re:.*self_attn",
+        "re:.*router",
+        "re:vision_model.*",
+        "re:multi_modal_projector.*",
+        "Llama4TextAttention",
+    ],
+)
+
+# Apply algorithms.
+# Due to the large size of Llama4, we specify sequential targets such that
+# only one MLP is loaded into GPU memory at a time.
+oneshot(
+    model=model,
+    dataset=ds,
+    recipe=recipe,
+    max_seq_length=MAX_SEQUENCE_LENGTH,
+    num_calibration_samples=NUM_CALIBRATION_SAMPLES,
+    data_collator=data_collator,
+    sequential_targets=["Llama4TextMLP"],
+)
+
+# Save to disk compressed.
+SAVE_DIR = model_id.rstrip("/").split("/")[-1] + "-W4A16-G128"
+model.save_pretrained(SAVE_DIR, save_compressed=True)
+processor.save_pretrained(SAVE_DIR)
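
The README entry above notes that the W4A16 checkpoint produced by this example can run directly in vLLM. As a rough sketch (not part of this commit), offline inference with vLLM might look like the following; the directory name mirrors `SAVE_DIR` from the script, and the `tensor_parallel_size` value is an assumption about available hardware.

```python
# Sketch only: load the compressed W4A16 checkpoint produced above with
# vLLM's offline API. The directory name mirrors SAVE_DIR from the example;
# tensor_parallel_size=4 is an assumption about the number of GPUs available.
from vllm import LLM, SamplingParams

llm = LLM(
    model="Llama-4-Scout-17B-16E-Instruct-W4A16-G128",
    tensor_parallel_size=4,
    max_model_len=8192,
)
sampling = SamplingParams(temperature=0.0, max_tokens=64)
outputs = llm.generate(["What does weight-only quantization change?"], sampling)
print(outputs[0].outputs[0].text)
```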

examples/quantization_2of4_sparse_w4a16/llama7b_sparse_w4a16.py

Lines changed: 5 additions & 0 deletions
@@ -1,3 +1,7 @@
+# NOTE: Fine tuning can require more steps than is shown in the example
+# See the Axolotl integration blog post for best fine tuning practices
+# https://developers.redhat.com/articles/2025/06/17/axolotl-meets-llm-compressor-fast-sparse-open
+
 from pathlib import Path
 
 import torch
@@ -74,6 +78,7 @@
 )
 
 # Sparse finetune
+# This step can be supplanted by fine tuning via integrated FT libraries such as Axolotl
 train(
     model=(output_path / "sparsity_stage"),
     **oneshot_kwargs,

Lines changed: 93 additions & 0 deletions
@@ -0,0 +1,93 @@
+import torch
+from datasets import load_dataset
+from transformers import Llama4ForConditionalGeneration, Llama4Processor
+
+from llmcompressor import oneshot
+from llmcompressor.modeling import prepare_for_calibration
+from llmcompressor.modifiers.quantization import QuantizationModifier
+
+# Select model and load it.
+model_id = "meta-llama/Llama-4-Scout-17B-16E-Instruct"
+model = Llama4ForConditionalGeneration.from_pretrained(model_id, torch_dtype="auto")
+processor = Llama4Processor.from_pretrained(model_id)
+# We update `Llama4TextMoe` modules with custom `SequentialLlama4TextMoe`.
+# This change allows compatibility with vllm.
+# To apply your own custom module for experimentation, consider updating
+# `SequentialLlama4TextMoe` under llmcompressor/modeling/llama4.py
+model = prepare_for_calibration(model)
+
+DATASET_ID = "neuralmagic/calibration"
+NUM_CALIBRATION_SAMPLES = 20
+MAX_SEQUENCE_LENGTH = 8192
+
+ds = load_dataset(DATASET_ID, name="LLM", split=f"train[:{NUM_CALIBRATION_SAMPLES}]")
+
+
+def preprocess_function(example):
+    messages = []
+    for message in example["messages"]:
+        messages.append(
+            {
+                "role": message["role"],
+                "content": [{"type": "text", "text": message["content"]}],
+            }
+        )
+
+    return processor.apply_chat_template(
+        messages,
+        return_tensors="pt",
+        padding=False,
+        truncation=True,
+        max_length=MAX_SEQUENCE_LENGTH,
+        tokenize=True,
+        add_special_tokens=False,
+        return_dict=True,
+        add_generation_prompt=False,
+    )
+
+
+ds = ds.map(preprocess_function, batched=False, remove_columns=ds.column_names)
+
+
+def data_collator(batch):
+    assert len(batch) == 1
+    return {
+        key: torch.tensor(value)
+        if key != "pixel_values"
+        else torch.tensor(value, dtype=torch.bfloat16).squeeze(0)
+        for key, value in batch[0].items()
+    }
+
+
+# Configure the quantization algorithm to run.
+recipe = QuantizationModifier(
+    targets="Linear",
+    scheme="NVFP4",
+    ignore=[
+        "re:.*lm_head",
+        "re:.*self_attn",
+        "re:.*router",
+        "re:vision_model.*",
+        "re:multi_modal_projector.*",
+        "Llama4TextAttention",
+    ],
+)
+
+# Apply algorithms.
+# Due to the large size of Llama4, we specify sequential targets such that
+# only one MLP is loaded into GPU memory at a time.
+oneshot(
+    model=model,
+    dataset=ds,
+    recipe=recipe,
+    max_seq_length=MAX_SEQUENCE_LENGTH,
+    num_calibration_samples=NUM_CALIBRATION_SAMPLES,
+    sequential_targets=["Llama4TextMLP"],
+    data_collator=data_collator,
+)
+
+
+# Save to disk compressed.
+SAVE_DIR = model_id.rstrip("/").split("/")[-1] + "-NVFP4"
+model.save_pretrained(SAVE_DIR)
+processor.save_pretrained(SAVE_DIR)
