From d09f8311d2c6c938579d3299d499fbd9b9583549 Mon Sep 17 00:00:00 2001
From: Jack Zhang <32371937+jackzhxng@users.noreply.github.com>
Date: Wed, 18 Jun 2025 21:08:07 -0700
Subject: [PATCH 01/17] Update
[ghstack-poisoned]
---
examples/models/llama/config/llm_config.py | 4 +-
examples/models/llama/export_llama_lib.py | 8 +-
extension/llm/export/README.md | 145 +++++++++++++++++++
extension/llm/export/export_llm.py | 55 ++++++-
extension/llm/export/test/test_export_llm.py | 112 ++++++++++++++
extension/llm/install_requirements.sh | 9 ++
6 files changed, 327 insertions(+), 6 deletions(-)
create mode 100644 extension/llm/export/README.md
create mode 100644 extension/llm/export/test/test_export_llm.py
create mode 100755 extension/llm/install_requirements.sh
diff --git a/examples/models/llama/config/llm_config.py b/examples/models/llama/config/llm_config.py
index 034d8af7562..201e3a5414a 100644
--- a/examples/models/llama/config/llm_config.py
+++ b/examples/models/llama/config/llm_config.py
@@ -65,7 +65,9 @@ class BaseConfig:
params: Model parameters, such as n_layers, hidden_size, etc.
If left empty will use defaults specified in model_args.py.
checkpoint: Path to the checkpoint file.
- If left empty, the model will be initialized with random weights.
+ If left empty, the model will be initialized with random weights
+ if it is a Llama model, or its weights will be downloaded from
+ Hugging Face if it is a non-Llama model.
checkpoint_dir: Path to directory containing sharded checkpoint files.
tokenizer_path: Path to the tokenizer file.
metadata: Json string containing metadata information.
diff --git a/examples/models/llama/export_llama_lib.py b/examples/models/llama/export_llama_lib.py
index 1f055d65822..78c6244abee 100644
--- a/examples/models/llama/export_llama_lib.py
+++ b/examples/models/llama/export_llama_lib.py
@@ -53,6 +53,8 @@
)
from executorch.util.activation_memory_profiler import generate_memory_trace
+from omegaconf import DictConfig
+
from ..model_factory import EagerModelFactory
from .source_transformation.apply_spin_quant_r1_r2 import (
fuse_layer_norms,
@@ -571,12 +573,14 @@ def canonical_path(path: Union[str, Path], *, dir: bool = False) -> str:
def export_llama(
- export_options: Union[argparse.Namespace, LlmConfig],
+ export_options: Union[argparse.Namespace, LlmConfig, DictConfig],
) -> str:
if isinstance(export_options, argparse.Namespace):
# Legacy CLI.
llm_config = LlmConfig.from_args(export_options)
- elif isinstance(export_options, LlmConfig):
+ elif isinstance(export_options, (LlmConfig, DictConfig)):
# Hydra CLI.
llm_config = export_options
else:
diff --git a/extension/llm/export/README.md b/extension/llm/export/README.md
new file mode 100644
index 00000000000..e15c7fd7f77
--- /dev/null
+++ b/extension/llm/export/README.md
@@ -0,0 +1,145 @@
+# LLM Export API
+
+This directory contains the unified API for exporting Large Language Models (LLMs) to ExecuTorch. The `export_llm` module provides a streamlined interface to convert various LLM architectures to optimized `.pte` files for on-device inference.
+
+## Overview
+
+The LLM export process transforms a model from its original format to an optimized representation suitable for mobile and edge devices. This involves several key steps:
+
+1. **Model Instantiation**: Load the model architecture and weights from sources like Hugging Face
+2. **Source Transformations**: Apply model-specific optimizations and quantization
+3. **IR Export**: Convert to intermediate representations (EXIR, Edge dialect)
+4. **Graph Transformations**: Apply backend-specific optimizations and PT2E quantization
+5. **Backend Delegation**: Partition operations to hardware-specific backends (XNNPACK, CoreML, QNN, etc.)
+6. **Serialization**: Export to final ExecuTorch `.pte` format
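+
+The same pipeline can also be driven from Python. Below is a minimal, illustrative sketch (not a documented recipe) that builds an `LlmConfig`, overrides a few fields that mirror the CLI options shown later in this README, and passes it to `export_llama`; the return value is assumed to be the path of the serialized `.pte` file.
+
+```python
+from executorch.examples.models.llama.config.llm_config import LlmConfig
+from executorch.examples.models.llama.export_llama_lib import export_llama
+
+# Start from the defaults and override a few fields, mirroring the CLI options.
+llm_config = LlmConfig()
+llm_config.base.model_class = "llama3"
+llm_config.model.use_kv_cache = True
+llm_config.export.max_seq_length = 128
+llm_config.export.output_name = "llama3.pte"
+
+# Runs the export steps above; assumed to return the output artifact path.
+pte_path = export_llama(llm_config)
+print(f"Exported to {pte_path}")
+```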
+
+## Supported Models
+
+- **Llama**: Llama 2, Llama 3, Llama 3.1, Llama 3.2 (1B, 3B, 8B variants)
+- **Qwen**: Qwen 2.5, Qwen 3 (0.6B, 1.7B, 4B variants)
+- **Phi**: Phi-3-Mini, Phi-4-Mini
+- **Stories**: Stories110M (educational model)
+- **SmolLM**: SmolLM2
+
+## Installation
+
+First, install the required dependencies:
+
+```bash
+./extension/llm/install_requirements.sh
+```
+
+## Usage
+
+The export API supports two configuration approaches:
+
+### Option 1: Hydra CLI Arguments
+
+Use structured configuration arguments directly on the command line:
+
+```bash
+python -m extension.llm.export.export_llm \
+ base.model_class=llama3 \
+ model.use_sdpa_with_kv_cache=True \
+ model.use_kv_cache=True \
+ export.max_seq_length=128 \
+ debug.verbose=True \
+ backend.xnnpack.enabled=True \
+ backend.xnnpack.extended_ops=True \
+ quantization.qmode=8da4w
+```
+
+### Option 2: Configuration File
+
+Create a YAML configuration file and reference it:
+
+```bash
+python -m extension.llm.export.export_llm --config my_config.yaml
+```
+
+Example `my_config.yaml`:
+```yaml
+base:
+ model_class: llama3
+ tokenizer_path: /path/to/tokenizer.json
+
+model:
+ use_kv_cache: true
+ use_sdpa_with_kv_cache: true
+ enable_dynamic_shape: true
+
+export:
+ max_seq_length: 512
+ output_dir: ./exported_models
+ output_name: llama3_optimized.pte
+
+quantization:
+ qmode: 8da4w
+ group_size: 32
+
+backend:
+ xnnpack:
+ enabled: true
+ extended_ops: true
+
+debug:
+ verbose: true
+```
+
+**Important**: The two approaches cannot be mixed. Use either Hydra CLI arguments or a config file, not both.
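+
+Under the hood, a `--config` file is merged over the `LlmConfig` defaults before export, so the YAML only needs to list the fields you want to change. A rough sketch of that merge, assuming `omegaconf` is installed (the actual logic lives in `export_llm.py`):
+
+```python
+from executorch.examples.models.llama.config.llm_config import LlmConfig
+from omegaconf import OmegaConf
+
+# Defaults come from the LlmConfig dataclass; the YAML overrides only what it sets.
+defaults = OmegaConf.structured(LlmConfig())
+overrides = OmegaConf.load("my_config.yaml")
+merged = OmegaConf.merge(defaults, overrides)  # this merged config is what gets exported
+```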
+
+## Example Commands
+
+### Export Qwen3 0.6B with XNNPACK backend and quantization
+```bash
+python -m extension.llm.export.export_llm \
+ base.model_class=qwen3-0_6b \
+ base.params=examples/models/qwen3/0_6b_config.json \
+ base.metadata='{"get_bos_id": 151644, "get_eos_ids":[151645]}' \
+ model.use_kv_cache=true \
+ model.use_sdpa_with_kv_cache=true \
+ model.dtype_override=FP32 \
+ export.max_seq_length=512 \
+ export.output_name=qwen3_0_6b.pte \
+ quantization.qmode=8da4w \
+ backend.xnnpack.enabled=true \
+ backend.xnnpack.extended_ops=true \
+ debug.verbose=true
+```
+
+### Export Phi-4-Mini with custom checkpoint
+```bash
+python -m extension.llm.export.export_llm \
+ base.model_class=phi_4_mini \
+ base.checkpoint=/path/to/phi4_checkpoint.pth \
+ base.params=examples/models/phi-4-mini/config.json \
+ base.metadata='{"get_bos_id":151643, "get_eos_ids":[151643]}' \
+ model.use_kv_cache=true \
+ model.use_sdpa_with_kv_cache=true \
+ export.max_seq_length=256 \
+ export.output_name=phi4_mini.pte \
+ backend.xnnpack.enabled=true \
+ debug.verbose=true
+```
+
+### Export with CoreML backend (iOS optimization)
+```bash
+python -m extension.llm.export.export_llm \
+ base.model_class=llama3 \
+ model.use_kv_cache=true \
+ export.max_seq_length=128 \
+ backend.coreml.enabled=true \
+ backend.coreml.compute_units=ALL \
+ quantization.pt2e_quantize=coreml_c4w \
+ debug.verbose=true
+```
+
+## Configuration Options
+
+For a complete reference of all available configuration options, see the [LlmConfig class definition](../../../examples/models/llama/config/llm_config.py), which documents the parameters for the base, model, export, quantization, backend, and debug config groups.
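+
+If you prefer to inspect the options from Python instead of reading the source, one way (assuming `omegaconf` is installed, which the install script above provides) is to dump the structured defaults:
+
+```python
+from executorch.examples.models.llama.config.llm_config import LlmConfig
+from omegaconf import OmegaConf
+
+# Prints every config group (base, model, export, quantization, backend, debug)
+# together with its default value.
+print(OmegaConf.to_yaml(OmegaConf.structured(LlmConfig())))
+```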
+
+## Further Reading
+
+- [Llama Examples](../../../examples/models/llama/README.md) - Comprehensive Llama export guide
+- [LLM Runner](../runner/) - Running exported models
+- [ExecuTorch Documentation](https://pytorch.org/executorch/) - Framework overview
\ No newline at end of file
diff --git a/extension/llm/export/export_llm.py b/extension/llm/export/export_llm.py
index 09a15d6ab58..2af7439b805 100644
--- a/extension/llm/export/export_llm.py
+++ b/extension/llm/export/export_llm.py
@@ -23,23 +23,72 @@
backend.xnnpack.enabled=True \
backend.xnnpack.extended_ops=True \
quantization.qmode="8da4w"
+
+Example usage with a config file:
+python -m extension.llm.export.export_llm \
+ --config example_llm_config.yaml
"""
+import argparse
+import sys
+from typing import List, Optional, Tuple
+
import hydra
+import yaml
from executorch.examples.models.llama.config.llm_config import LlmConfig
from executorch.examples.models.llama.export_llama_lib import export_llama
from hydra.core.config_store import ConfigStore
-from omegaconf import OmegaConf
+from omegaconf import DictConfig, OmegaConf
cs = ConfigStore.instance()
cs.store(name="llm_config", node=LlmConfig)
-@hydra.main(version_base=None, config_path=None, config_name="llm_config")
-def main(llm_config: LlmConfig) -> None:
+def parse_config_arg() -> Tuple[Optional[str], List[str]]:
+ """First parse out the arg for whether to use Hydra or the old CLI."""
+ parser = argparse.ArgumentParser(add_help=True)
+ parser.add_argument("--config", type=str, help="Path to the LlmConfig file")
+ args, remaining = parser.parse_known_args()
+ return args.config, remaining
+
+
+def pop_config_arg() -> str:
+ """
+ Removes '--config' and its value from sys.argv.
+ Assumes --config is specified and argparse has already validated the args.
+ """
+ idx = sys.argv.index("--config")
+ value = sys.argv[idx + 1]
+ del sys.argv[idx : idx + 2]
+ return value
+
+
+@hydra.main(version_base=None, config_name="llm_config")
+def hydra_main(llm_config: LlmConfig) -> None:
export_llama(OmegaConf.to_object(llm_config))
+def main() -> None:
+ config, remaining_args = parse_config_arg()
+ if config:
+ # Check if there are any remaining hydra CLI args when --config is specified
+ # This might change in the future to allow overriding config file values
+ if remaining_args:
+ raise ValueError(
+ "Cannot specify additional CLI arguments when using --config. "
+ f"Found: {remaining_args}. Use either --config file or hydra CLI args, not both."
+ )
+
+ config_file_path = pop_config_arg()
+ default_llm_config = LlmConfig()
+ llm_config_from_file = OmegaConf.load(config_file_path)
+ # Override defaults with values specified in the .yaml provided by --config.
+ merged_llm_config = OmegaConf.merge(default_llm_config, llm_config_from_file)
+ export_llama(merged_llm_config)
+ else:
+ hydra_main()
+
+
if __name__ == "__main__":
main()
diff --git a/extension/llm/export/test/test_export_llm.py b/extension/llm/export/test/test_export_llm.py
new file mode 100644
index 00000000000..970a32c9606
--- /dev/null
+++ b/extension/llm/export/test/test_export_llm.py
@@ -0,0 +1,112 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+import os
+import sys
+import tempfile
+import unittest
+from unittest.mock import MagicMock, patch
+
+from executorch.examples.models.llama.config.llm_config import LlmConfig
+from executorch.extension.llm.export.export_llm import main, parse_config_arg, pop_config_arg
+
+
+class TestExportLlm(unittest.TestCase):
+ def test_parse_config_arg_with_config(self) -> None:
+ """Test parse_config_arg when --config is provided."""
+ # Mock sys.argv to include --config
+ test_argv = ["script.py", "--config", "test_config.yaml", "extra", "args"]
+ with patch.object(sys, "argv", test_argv):
+ config_path, remaining = parse_config_arg()
+ self.assertEqual(config_path, "test_config.yaml")
+ self.assertEqual(remaining, ["extra", "args"])
+
+ def test_parse_config_arg_without_config(self) -> None:
+ """Test parse_config_arg when --config is not provided."""
+ test_argv = ["script.py", "debug.verbose=True"]
+ with patch.object(sys, "argv", test_argv):
+ config_path, remaining = parse_config_arg()
+ self.assertIsNone(config_path)
+ self.assertEqual(remaining, ["debug.verbose=True"])
+
+ def test_pop_config_arg(self) -> None:
+ """Test pop_config_arg removes --config and its value from sys.argv."""
+ test_argv = ["script.py", "--config", "test_config.yaml", "other", "args"]
+ with patch.object(sys, "argv", test_argv):
+ config_path = pop_config_arg()
+ self.assertEqual(config_path, "test_config.yaml")
+ self.assertEqual(sys.argv, ["script.py", "other", "args"])
+
+ @patch("executorch.extension.llm.export.export_llm.export_llama")
+ def test_with_config(self, mock_export_llama: MagicMock) -> None:
+ """Test main function with --config file and no hydra args."""
+ # Create a temporary config file
+ with tempfile.NamedTemporaryFile(mode="w", suffix=".yaml", delete=False) as f:
+ f.write("""
+base:
+ tokenizer_path: /path/to/tokenizer.json
+export:
+ max_seq_length: 256
+""")
+ config_file = f.name
+
+ try:
+ test_argv = ["script.py", "--config", config_file]
+ with patch.object(sys, "argv", test_argv):
+ main()
+
+ # Verify export_llama was called with config
+ mock_export_llama.assert_called_once()
+ called_config = mock_export_llama.call_args[0][0]
+ self.assertEqual(called_config["base"]["tokenizer_path"], "/path/to/tokenizer.json")
+ self.assertEqual(called_config["export"]["max_seq_length"], 256)
+ finally:
+ os.unlink(config_file)
+
+ def test_with_cli_args(self) -> None:
+ """Test main function with only hydra CLI args."""
+ test_argv = ["script.py", "debug.verbose=True"]
+ with patch.object(sys, "argv", test_argv):
+ with patch("executorch.extension.llm.export.export_llm.hydra_main") as mock_hydra:
+ main()
+ mock_hydra.assert_called_once()
+
+ def test_config_with_cli_args_error(self) -> None:
+ """Test that --config rejects additional CLI arguments to prevent mixing approaches."""
+ # Create a temporary config file
+ with tempfile.NamedTemporaryFile(mode="w", suffix=".yaml", delete=False) as f:
+ f.write("base:\n checkpoint: /path/to/checkpoint.pth")
+ config_file = f.name
+
+ try:
+ test_argv = ["script.py", "--config", config_file, "debug.verbose=True"]
+ with patch.object(sys, "argv", test_argv):
+ with self.assertRaises(ValueError) as cm:
+ main()
+
+ error_msg = str(cm.exception)
+ self.assertIn("Cannot specify additional CLI arguments when using --config", error_msg)
+ finally:
+ os.unlink(config_file)
+
+ def test_config_rejects_multiple_cli_args(self) -> None:
+ """Test that --config rejects multiple CLI arguments (not just single ones)."""
+ with tempfile.NamedTemporaryFile(mode="w", suffix=".yaml", delete=False) as f:
+ f.write("export:\n max_seq_length: 128")
+ config_file = f.name
+
+ try:
+ test_argv = ["script.py", "--config", config_file, "debug.verbose=True", "export.output_dir=/tmp"]
+ with patch.object(sys, "argv", test_argv):
+ with self.assertRaises(ValueError):
+ main()
+ finally:
+ os.unlink(config_file)
+
+
+if __name__ == "__main__":
+ unittest.main()
+
diff --git a/extension/llm/install_requirements.sh b/extension/llm/install_requirements.sh
new file mode 100755
index 00000000000..8f322083c03
--- /dev/null
+++ b/extension/llm/install_requirements.sh
@@ -0,0 +1,9 @@
+#!/bin/bash
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+# Install requirements for LLM extension
+pip install "hydra-core>=1.3.0" "omegaconf>=2.3.0"
From 9b8ea72d164fee5817b91d5ebe63ad42c07fc796 Mon Sep 17 00:00:00 2001
From: Jack Zhang <32371937+jackzhxng@users.noreply.github.com>
Date: Wed, 18 Jun 2025 21:08:10 -0700
Subject: [PATCH 02/17] Update
[ghstack-poisoned]
---
examples/models/llama/config/llm_config.py | 169 +++++++++---------
.../models/llama/config/test_llm_config.py | 31 +++-
extension/llm/export/test/test_export_llm.py | 35 +++-
3 files changed, 149 insertions(+), 86 deletions(-)
diff --git a/examples/models/llama/config/llm_config.py b/examples/models/llama/config/llm_config.py
index 201e3a5414a..0504b386f45 100644
--- a/examples/models/llama/config/llm_config.py
+++ b/examples/models/llama/config/llm_config.py
@@ -16,7 +16,6 @@
import ast
import re
from dataclasses import dataclass, field
-from enum import Enum
from typing import ClassVar, List, Optional
@@ -25,32 +24,27 @@
################################################################################
-class ModelType(str, Enum):
- STORIES110M = "stories110m"
- LLAMA2 = "llama2"
- LLAMA3 = "llama3"
- LLAMA3_1 = "llama3_1"
- LLAMA3_2 = "llama3_2"
- LLAMA3_2_VISION = "llama3_2_vision"
- STATIC_LLAMA = "static_llama"
- QWEN2_5 = "qwen2_5"
- QWEN3_0_6B = "qwen3-0_6b"
- QWEN3_1_7B = "qwen3-1_7b"
- QWEN3_4B = "qwen3-4b"
- PHI_4_MINI = "phi_4_mini"
- SMOLLM2 = "smollm2"
+MODEL_TYPE_OPTIONS = [
+ "stories110m",
+ "llama2",
+ "llama3",
+ "llama3_1",
+ "llama3_2",
+ "llama3_2_vision",
+ "static_llama",
+ "qwen2_5",
+ "qwen3-0_6b",
+ "qwen3-1_7b",
+ "qwen3-4b",
+ "phi_4_mini",
+ "smollm2",
+]
-class PreqMode(str, Enum):
- """
- If you are dealing with pre-quantized checkpoints, this used to
- be the way to specify them. Now you don't need to specify these
- options if you use a TorchAo-prequantized checkpoint, but they
- are still around to preserve backward compatibility.
- """
-
- PREQ_8DA4W = "8da4w"
- PREQ_8DA4W_OUT_8DA8W = "8da4w_output_8da8w"
+PREQ_MODE_OPTIONS = [
+ "8da4w",
+ "8da4w_output_8da8w",
+]
@dataclass
@@ -82,7 +76,7 @@ class BaseConfig:
are loaded.
"""
- model_class: ModelType = ModelType.LLAMA3
+ model_class: str = "llama3"
params: Optional[str] = None
checkpoint: Optional[str] = None
checkpoint_dir: Optional[str] = None
@@ -90,26 +84,28 @@ class BaseConfig:
metadata: Optional[str] = None
use_lora: int = 0
fairseq2: bool = False
- preq_mode: Optional[PreqMode] = None
+ preq_mode: Optional[str] = None
preq_group_size: int = 32
preq_embedding_quantize: str = "8,0"
+ def __post_init__(self):
+ if self.model_class not in MODEL_TYPE_OPTIONS:
+ raise ValueError(f"model_class must be one of {MODEL_TYPE_OPTIONS}, got '{self.model_class}'")
+
+ if self.preq_mode is not None and self.preq_mode not in PREQ_MODE_OPTIONS:
+ raise ValueError(f"preq_mode must be one of {PREQ_MODE_OPTIONS}, got '{self.preq_mode}'")
+
################################################################################
################################# ModelConfig ##################################
################################################################################
-class DtypeOverride(str, Enum):
- """
- DType of the model. Highly recommended to use "fp32", unless you want to
- export without a backend, in which case you can also use "bf16". "fp16"
- is not recommended.
- """
-
- FP32 = "fp32"
- FP16 = "fp16"
- BF16 = "bf16"
+DTYPE_OVERRIDE_OPTIONS = [
+ "fp32",
+ "fp16",
+ "bf16",
+]
@dataclass
@@ -147,7 +143,7 @@ class ModelConfig:
[16] pattern specifies all layers have a sliding window of 16.
"""
- dtype_override: DtypeOverride = DtypeOverride.FP32
+ dtype_override: str = "fp32"
enable_dynamic_shape: bool = True
use_shared_embedding: bool = False
use_sdpa_with_kv_cache: bool = False
@@ -160,6 +156,9 @@ class ModelConfig:
local_global_attention: Optional[List[int]] = None
def __post_init__(self):
+ if self.dtype_override not in DTYPE_OVERRIDE_OPTIONS:
+ raise ValueError(f"dtype_override must be one of {DTYPE_OVERRIDE_OPTIONS}, got '{self.dtype_override}'")
+
self._validate_attention_sink()
self._validate_local_global_attention()
@@ -261,31 +260,25 @@ class DebugConfig:
################################################################################
-class Pt2eQuantize(str, Enum):
- """
- Type of backend-specific Pt2e quantization strategy to use.
-
- Pt2e uses a different quantization library that is graph-based
- compared to `qmode`, which is also specified in the QuantizationConfig
- and is source transform-based.
- """
+PT2E_QUANTIZE_OPTIONS = [
+ "xnnpack_dynamic",
+ "xnnpack_dynamic_qc4",
+ "qnn_8a8w",
+ "qnn_16a16w",
+ "qnn_16a4w",
+ "coreml_c4w",
+ "coreml_8a_c8w",
+ "coreml_8a_c4w",
+ "coreml_baseline_8a_c8w",
+ "coreml_baseline_8a_c4w",
+ "vulkan_8w",
+]
- XNNPACK_DYNAMIC = "xnnpack_dynamic"
- XNNPACK_DYNAMIC_QC4 = "xnnpack_dynamic_qc4"
- QNN_8A8W = "qnn_8a8w"
- QNN_16A16W = "qnn_16a16w"
- QNN_16A4W = "qnn_16a4w"
- COREML_C4W = "coreml_c4w"
- COREML_8A_C8W = "coreml_8a_c8w"
- COREML_8A_C4W = "coreml_8a_c4w"
- COREML_BASELINE_8A_C8W = "coreml_baseline_8a_c8w"
- COREML_BASELINE_8A_C4W = "coreml_baseline_8a_c4w"
- VULKAN_8W = "vulkan_8w"
-
-class SpinQuant(str, Enum):
- CUDA = "cuda"
- NATIVE = "native"
+SPIN_QUANT_OPTIONS = [
+ "cuda",
+ "native",
+]
@dataclass
@@ -320,9 +313,9 @@ class QuantizationConfig:
qmode: Optional[str] = None
embedding_quantize: Optional[str] = None
- pt2e_quantize: Optional[Pt2eQuantize] = None
+ pt2e_quantize: Optional[str] = None
group_size: Optional[int] = None
- use_spin_quant: Optional[SpinQuant] = None
+ use_spin_quant: Optional[str] = None
use_qat: bool = False
calibration_tasks: Optional[List[str]] = None
calibration_limit: Optional[int] = None
@@ -330,6 +323,12 @@ class QuantizationConfig:
calibration_data: str = "Once upon a time"
def __post_init__(self):
+ if self.pt2e_quantize is not None and self.pt2e_quantize not in PT2E_QUANTIZE_OPTIONS:
+ raise ValueError(f"pt2e_quantize must be one of {PT2E_QUANTIZE_OPTIONS}, got '{self.pt2e_quantize}'")
+
+ if self.use_spin_quant is not None and self.use_spin_quant not in SPIN_QUANT_OPTIONS:
+ raise ValueError(f"use_spin_quant must be one of {SPIN_QUANT_OPTIONS}, got '{self.use_spin_quant}'")
+
if self.qmode:
self._validate_qmode()
@@ -377,16 +376,18 @@ class XNNPackConfig:
extended_ops: bool = False
-class CoreMLQuantize(str, Enum):
- B4W = "b4w"
- C4W = "c4w"
+COREML_QUANTIZE_OPTIONS = [
+ "b4w",
+ "c4w",
+]
-class CoreMLComputeUnit(str, Enum):
- CPU_ONLY = "cpu_only"
- CPU_AND_GPU = "cpu_and_gpu"
- CPU_AND_NE = "cpu_and_ne"
- ALL = "all"
+COREML_COMPUTE_UNIT_OPTIONS = [
+ "cpu_only",
+ "cpu_and_gpu",
+ "cpu_and_ne",
+ "all",
+]
@dataclass
@@ -398,11 +399,17 @@ class CoreMLConfig:
enabled: bool = False
enable_state: bool = False
preserve_sdpa: bool = False
- quantize: Optional[CoreMLQuantize] = None
+ quantize: Optional[str] = None
ios: int = 15
- compute_units: CoreMLComputeUnit = CoreMLComputeUnit.CPU_ONLY
+ compute_units: str = "cpu_only"
def __post_init__(self):
+ if self.quantize is not None and self.quantize not in COREML_QUANTIZE_OPTIONS:
+ raise ValueError(f"quantize must be one of {COREML_QUANTIZE_OPTIONS}, got '{self.quantize}'")
+
+ if self.compute_units not in COREML_COMPUTE_UNIT_OPTIONS:
+ raise ValueError(f"compute_units must be one of {COREML_COMPUTE_UNIT_OPTIONS}, got '{self.compute_units}'")
+
if self.ios not in (15, 16, 17, 18):
raise ValueError(f"Invalid coreml ios version: {self.ios}")
@@ -481,7 +488,7 @@ def from_args(cls, args: argparse.Namespace) -> "LlmConfig": # noqa: C901
# BaseConfig
if hasattr(args, "model"):
- llm_config.base.model_class = ModelType(args.model)
+ llm_config.base.model_class = args.model
if hasattr(args, "params"):
llm_config.base.params = args.params
if hasattr(args, "checkpoint"):
@@ -499,7 +506,7 @@ def from_args(cls, args: argparse.Namespace) -> "LlmConfig": # noqa: C901
# PreqMode settings
if hasattr(args, "preq_mode") and args.preq_mode:
- llm_config.base.preq_mode = PreqMode(args.preq_mode)
+ llm_config.base.preq_mode = args.preq_mode
if hasattr(args, "preq_group_size"):
llm_config.base.preq_group_size = args.preq_group_size
if hasattr(args, "preq_embedding_quantize"):
@@ -507,7 +514,7 @@ def from_args(cls, args: argparse.Namespace) -> "LlmConfig": # noqa: C901
# ModelConfig
if hasattr(args, "dtype_override"):
- llm_config.model.dtype_override = DtypeOverride(args.dtype_override)
+ llm_config.model.dtype_override = args.dtype_override
if hasattr(args, "enable_dynamic_shape"):
llm_config.model.enable_dynamic_shape = args.enable_dynamic_shape
if hasattr(args, "use_shared_embedding"):
@@ -549,11 +556,11 @@ def from_args(cls, args: argparse.Namespace) -> "LlmConfig": # noqa: C901
if hasattr(args, "embedding_quantize"):
llm_config.quantization.embedding_quantize = args.embedding_quantize
if hasattr(args, "pt2e_quantize") and args.pt2e_quantize:
- llm_config.quantization.pt2e_quantize = Pt2eQuantize(args.pt2e_quantize)
+ llm_config.quantization.pt2e_quantize = args.pt2e_quantize
if hasattr(args, "group_size"):
llm_config.quantization.group_size = args.group_size
if hasattr(args, "use_spin_quant") and args.use_spin_quant:
- llm_config.quantization.use_spin_quant = SpinQuant(args.use_spin_quant)
+ llm_config.quantization.use_spin_quant = args.use_spin_quant
if hasattr(args, "use_qat"):
llm_config.quantization.use_qat = args.use_qat
if hasattr(args, "calibration_tasks"):
@@ -581,13 +588,11 @@ def from_args(cls, args: argparse.Namespace) -> "LlmConfig": # noqa: C901
args, "coreml_preserve_sdpa", False
)
if hasattr(args, "coreml_quantize") and args.coreml_quantize:
- llm_config.backend.coreml.quantize = CoreMLQuantize(args.coreml_quantize)
+ llm_config.backend.coreml.quantize = args.coreml_quantize
if hasattr(args, "coreml_ios"):
llm_config.backend.coreml.ios = args.coreml_ios
if hasattr(args, "coreml_compute_units"):
- llm_config.backend.coreml.compute_units = CoreMLComputeUnit(
- args.coreml_compute_units
- )
+ llm_config.backend.coreml.compute_units = args.coreml_compute_units
# Vulkan
if hasattr(args, "vulkan"):
diff --git a/examples/models/llama/config/test_llm_config.py b/examples/models/llama/config/test_llm_config.py
index 0853e9dbbd8..15513bcd6f2 100644
--- a/examples/models/llama/config/test_llm_config.py
+++ b/examples/models/llama/config/test_llm_config.py
@@ -11,7 +11,6 @@
from executorch.examples.models.llama.config.llm_config import (
BackendConfig,
BaseConfig,
- CoreMLComputeUnit,
CoreMLConfig,
DebugConfig,
ExportConfig,
@@ -66,6 +65,34 @@ def test_shared_embedding_without_lowbit(self):
with self.assertRaises(ValueError):
LlmConfig(model=model_cfg, quantization=qcfg)
+ def test_invalid_model_type(self):
+ with self.assertRaises(ValueError):
+ BaseConfig(model_class="invalid_model")
+
+ def test_invalid_dtype_override(self):
+ with self.assertRaises(ValueError):
+ ModelConfig(dtype_override="invalid_dtype")
+
+ def test_invalid_preq_mode(self):
+ with self.assertRaises(ValueError):
+ BaseConfig(preq_mode="invalid_preq")
+
+ def test_invalid_pt2e_quantize(self):
+ with self.assertRaises(ValueError):
+ QuantizationConfig(pt2e_quantize="invalid_pt2e")
+
+ def test_invalid_spin_quant(self):
+ with self.assertRaises(ValueError):
+ QuantizationConfig(use_spin_quant="invalid_spin")
+
+ def test_invalid_coreml_quantize(self):
+ with self.assertRaises(ValueError):
+ CoreMLConfig(quantize="invalid_quantize")
+
+ def test_invalid_coreml_compute_units(self):
+ with self.assertRaises(ValueError):
+ CoreMLConfig(compute_units="invalid_compute_units")
+
class TestValidConstruction(unittest.TestCase):
@@ -94,7 +121,7 @@ def test_valid_llm_config(self):
backend=BackendConfig(
xnnpack=XNNPackConfig(enabled=False),
coreml=CoreMLConfig(
- enabled=True, ios=17, compute_units=CoreMLComputeUnit.ALL
+ enabled=True, ios=17, compute_units="all"
),
),
)
diff --git a/extension/llm/export/test/test_export_llm.py b/extension/llm/export/test/test_export_llm.py
index 970a32c9606..258a867dc6b 100644
--- a/extension/llm/export/test/test_export_llm.py
+++ b/extension/llm/export/test/test_export_llm.py
@@ -47,9 +47,20 @@ def test_with_config(self, mock_export_llama: MagicMock) -> None:
with tempfile.NamedTemporaryFile(mode="w", suffix=".yaml", delete=False) as f:
f.write("""
base:
+ model_class: llama3
tokenizer_path: /path/to/tokenizer.json
+ preq_mode: 8da4w
+model:
+ dtype_override: fp32
export:
max_seq_length: 256
+quantization:
+ pt2e_quantize: xnnpack_dynamic
+ use_spin_quant: cuda
+backend:
+ coreml:
+ quantize: c4w
+ compute_units: cpu_and_gpu
""")
config_file = f.name
@@ -61,8 +72,15 @@ def test_with_config(self, mock_export_llama: MagicMock) -> None:
# Verify export_llama was called with config
mock_export_llama.assert_called_once()
called_config = mock_export_llama.call_args[0][0]
+ self.assertEqual(called_config["base"]["model_class"], "llama3")
self.assertEqual(called_config["base"]["tokenizer_path"], "/path/to/tokenizer.json")
+ self.assertEqual(called_config["base"]["preq_mode"], "8da4w")
+ self.assertEqual(called_config["model"]["dtype_override"], "fp32")
self.assertEqual(called_config["export"]["max_seq_length"], 256)
+ self.assertEqual(called_config["quantization"]["pt2e_quantize"], "xnnpack_dynamic")
+ self.assertEqual(called_config["quantization"]["use_spin_quant"], "cuda")
+ self.assertEqual(called_config["backend"]["coreml"]["quantize"], "c4w")
+ self.assertEqual(called_config["backend"]["coreml"]["compute_units"], "cpu_and_gpu")
finally:
os.unlink(config_file)
@@ -78,7 +96,13 @@ def test_config_with_cli_args_error(self) -> None:
"""Test that --config rejects additional CLI arguments to prevent mixing approaches."""
# Create a temporary config file
with tempfile.NamedTemporaryFile(mode="w", suffix=".yaml", delete=False) as f:
- f.write("base:\n checkpoint: /path/to/checkpoint.pth")
+ f.write("""
+base:
+ model_class: llama2
+ checkpoint: /path/to/checkpoint.pth
+model:
+ dtype_override: bf16
+""")
config_file = f.name
try:
@@ -95,7 +119,14 @@ def test_config_with_cli_args_error(self) -> None:
def test_config_rejects_multiple_cli_args(self) -> None:
"""Test that --config rejects multiple CLI arguments (not just single ones)."""
with tempfile.NamedTemporaryFile(mode="w", suffix=".yaml", delete=False) as f:
- f.write("export:\n max_seq_length: 128")
+ f.write("""
+base:
+ model_class: qwen2_5
+export:
+ max_seq_length: 128
+quantization:
+ pt2e_quantize: qnn_8a8w
+""")
config_file = f.name
try:
From f31059be1fbe183a307ad21cb14fbbdb89ca4e02 Mon Sep 17 00:00:00 2001
From: Jack Zhang <32371937+jackzhxng@users.noreply.github.com>
Date: Wed, 18 Jun 2025 21:08:14 -0700
Subject: [PATCH 03/17] Update
[ghstack-poisoned]
---
.../demo-apps/android/LlamaDemo/README.md | 2 +-
.../docs/delegates/qualcomm_README.md | 18 +--
.../docs/delegates/xnnpack_README.md | 10 +-
.../LLaMA/docs/delegates/mps_README.md | 2 +-
.../LLaMA/docs/delegates/xnnpack_README.md | 8 +-
.../deepseek-r1-distill-llama-8B/README.md | 24 +--
examples/models/llama/README.md | 140 +++++++++---------
examples/models/llama/UTILS.md | 12 +-
examples/models/llama2/README.md | 2 +-
examples/models/phi_4_mini/README.md | 28 ++--
examples/models/qwen2_5/README.md | 28 ++--
examples/models/qwen3/README.md | 76 +++++-----
12 files changed, 175 insertions(+), 175 deletions(-)
diff --git a/examples/demo-apps/android/LlamaDemo/README.md b/examples/demo-apps/android/LlamaDemo/README.md
index 4b8cafd2d4e..8fed04d7ff5 100644
--- a/examples/demo-apps/android/LlamaDemo/README.md
+++ b/examples/demo-apps/android/LlamaDemo/README.md
@@ -154,7 +154,7 @@ curl -C - -Ls "https://raw.githubusercontent.com/karpathy/llama2.c/master/tokeni
# Create params.json file
touch params.json
echo '{"dim": 768, "multiple_of": 32, "n_heads": 12, "n_layers": 12, "norm_eps": 1e-05, "vocab_size": 32000}' > params.json
-python -m examples.models.llama.export_llama -c stories110M.pt -p params.json -d fp16 -n stories110m_h.pte -kv
+python -m extension.llm.export.export_llm base.checkpoint=stories110M.pt base.params=params.json model.dtype_override="fp16" export.output_name=stories110m_h.pte model.use_kv_cache=True
python -m pytorch_tokenizers.tools.llama2c.convert -t tokenizer.model -o tokenizer.bin
```
### Push model
diff --git a/examples/demo-apps/android/LlamaDemo/docs/delegates/qualcomm_README.md b/examples/demo-apps/android/LlamaDemo/docs/delegates/qualcomm_README.md
index fb9df3c3375..969b6cacab9 100644
--- a/examples/demo-apps/android/LlamaDemo/docs/delegates/qualcomm_README.md
+++ b/examples/demo-apps/android/LlamaDemo/docs/delegates/qualcomm_README.md
@@ -97,7 +97,7 @@ cmake --build cmake-out/examples/models/llama -j16 --config Release
## Export Llama Model
QNN backend currently supports exporting to these data types: fp32, int4/ int8 with PTQ, int4 with SpinQuant (Llama 3 only).
-We also support export for different Qualcomm SoC. We have verified SM8650(V75) and SM8550(V73). To export for different SoC, add “--soc_model SM8550” in your export command. Without setting this flag, the export will default to SM8650.
+We also support export for different Qualcomm SoC. We have verified SM8650(V75) and SM8550(V73). To export for different SoC, add "--soc_model SM8550" in your export command. Without setting this flag, the export will default to SM8650.
### Export with PTQ
We support PTQ by default. The entire export may take ~20 minutes (Llama 3.1 8B). However, there is accuracy regression and we are working on improving it.
@@ -106,12 +106,12 @@ We support PTQ by default. The entire export may take ~20 minutes (Llama 3.1 8B)
Examples:
```
# 4 bits weight only quantize
-python -m examples.models.llama.export_llama --checkpoint "${MODEL_DIR}/consolidated.00.pth" -p "${MODEL_DIR}/params.json" -kv --disable_dynamic_shape --qnn --pt2e_quantize qnn_16a4w -d fp32 --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' --output_name="test.pte”
+python -m extension.llm.export.export_llm base.checkpoint="${MODEL_DIR}/consolidated.00.pth" base.params="${MODEL_DIR}/params.json" model.use_kv_cache=True model.enable_dynamic_shape=False backend.qnn.enabled=True backend.qnn.quantization="qnn_16a4w" model.dtype_override="fp32" base.metadata='{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' export.output_name="test.pte"
```
If the model is really big, it may require model sharding because the Qualcomm DSP is a 32bit system and has a 4GB size limit . For example for Llama 3 8B models, we need to shard the model into 4, but ExecuTorch still packages it into one PTE file. Here is an example:
```
# 8 bits quantization with 4 shards
-python -m examples.models.llama.export_llama --checkpoint "${MODEL_DIR}/consolidated.00.pth" -p "${MODEL_DIR}/params.json" -kv --disable_dynamic_shape --qnn --pt2e_quantize qnn_8a8w -d fp32 --num_sharding 4 --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' --output_name="test.pte”
+python -m extension.llm.export.export_llm base.checkpoint="${MODEL_DIR}/consolidated.00.pth" base.params="${MODEL_DIR}/params.json" model.use_kv_cache=True model.enable_dynamic_shape=False backend.qnn.enabled=True backend.qnn.quantization="qnn_8a8w" model.dtype_override="fp32" backend.qnn.num_sharding=4 base.metadata='{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' export.output_name="test.pte"
```
Note: if you encountered issues below
```
@@ -163,7 +163,7 @@ To export Llama 3 8B instruct with the Qualcomm AI Engine Direct Backend, ensure
* 8B models might need 16GB RAM on the device to run.
```
# Please note that calibration_data must include the prompt template for special tokens.
-python -m examples.models.llama.export_llama -t -p -c --use_kv_cache --qnn --pt2e_quantize qnn_16a4w --disable_dynamic_shape --num_sharding 8 --calibration_tasks wikitext --calibration_limit 1 --calibration_seq_length 128 --optimized_rotation_path --calibration_data "<|start_header_id|>system<|end_header_id|>\n\nYou are a funny chatbot.<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nCould you tell me about Facebook?<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n"
+python -m extension.llm.export.export_llm base.tokenizer_path= base.params= base.checkpoint= model.use_kv_cache=True backend.qnn.enabled=True backend.qnn.quantization="qnn_16a4w" model.enable_dynamic_shape=False backend.qnn.num_sharding=8 backend.qnn.calibration_tasks="wikitext" backend.qnn.calibration_limit=1 backend.qnn.calibration_seq_length=128 backend.qnn.optimized_rotation_path= backend.qnn.calibration_data="<|start_header_id|>system<|end_header_id|>\n\nYou are a funny chatbot.<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nCould you tell me about Facebook?<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n"
```
## Pushing Model and Tokenizer
@@ -210,17 +210,17 @@ Alternative you can also just run the shell script directly as in the root direc
sh examples/demo-apps/android/LlamaDemo/setup-with-qnn.sh
```
This is running the shell script which configures the required core ExecuTorch, Llama2/3, and Android libraries, builds them into AAR, and copies it to the app.
-Note: If you are building the Android app mentioned in the next section on a separate machine (i.e. MacOS but building and exporting for QNN backend on Linux), make sure you copy the aar file generated from setup-with-qnn script to “examples/demo-apps/android/LlamaDemo/app/libs” before building the Android app.
+Note: If you are building the Android app mentioned in the next section on a separate machine (i.e. MacOS but building and exporting for QNN backend on Linux), make sure you copy the aar file generated from setup-with-qnn script to "examples/demo-apps/android/LlamaDemo/app/libs" before building the Android app.
## Run the Android Demo App
-First, make sure your Android phone’s chipset version is compatible with this demo (SM8650, SM8550). You can find the Qualcomm chipset version here in the [mapping](https://docs.qualcomm.com/bundle/publicresource/topics/80-63442-50/overview.html).
+First, make sure your Android phone's chipset version is compatible with this demo (SM8650, SM8550). You can find the Qualcomm chipset version here in the [mapping](https://docs.qualcomm.com/bundle/publicresource/topics/80-63442-50/overview.html).
-If you build and run the setup-with-qnn script on a separate machine rather than where you are building the Android app, make sure you copy the aar file it generated into “examples/demo-apps/android/LlamaDemo/app/libs”
+If you build and run the setup-with-qnn script on a separate machine rather than where you are building the Android app, make sure you copy the aar file it generated into "examples/demo-apps/android/LlamaDemo/app/libs"
### Alternative 1: Android Studio (Recommended)
-Open Android Studio and select “Open an existing Android Studio project” to open examples/demo-apps/android/LlamaDemo.
+Open Android Studio and select "Open an existing Android Studio project" to open examples/demo-apps/android/LlamaDemo.
Run the app (^R). This builds and launches the app on the phone.
### Alternative 2: Command line
@@ -238,4 +238,4 @@ If the app successfully run on your device, you should see something like below:
## Reporting Issues
-If you encountered any bugs or issues following this tutorial please file a bug/issue here on Github.
+If you encounter any bugs or issues while following this tutorial, please file a bug/issue here on GitHub.
\ No newline at end of file
diff --git a/examples/demo-apps/android/LlamaDemo/docs/delegates/xnnpack_README.md b/examples/demo-apps/android/LlamaDemo/docs/delegates/xnnpack_README.md
index de99387f82d..c60bd537e6b 100644
--- a/examples/demo-apps/android/LlamaDemo/docs/delegates/xnnpack_README.md
+++ b/examples/demo-apps/android/LlamaDemo/docs/delegates/xnnpack_README.md
@@ -55,7 +55,7 @@ In this demo app, we support text-only inference with up-to-date Llama models an
Meta has released prequantized INT4 SpinQuant Llama 3.2 models that ExecuTorch supports on the XNNPACK backend.
* Export Llama model and generate .pte file as below:
```
-python -m examples.models.llama.export_llama --model "llama3_2" --checkpoint --params -kv --use_sdpa_with_kv_cache -X -d fp32 --xnnpack-extended-ops --preq_mode 8da4w_output_8da8w --preq_group_size 32 --max_seq_length 2048 --max_context_length 2048 --preq_embedding_quantize 8,0 --use_spin_quant native --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' --output_name "llama3_2_spinquant.pte"
+python -m extension.llm.export.export_llm base.model_class="llama3_2" base.checkpoint= base.params= model.use_kv_cache=True model.use_sdpa_with_kv_cache=True backend.xnnpack.enabled=True model.dtype_override="fp32" backend.xnnpack.extended_ops=True base.preq_mode="8da4w_output_8da8w" base.preq_group_size=32 export.max_seq_length=2048 export.max_context_length=2048 base.preq_embedding_quantize="8,0" quantization.use_spin_quant="native" base.metadata='{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' export.output_name="llama3_2_spinquant.pte"
```
For convenience, an [exported ExecuTorch SpinQuant model](https://huggingface.co/executorch-community/Llama-3.2-1B-Instruct-SpinQuant_INT4_EO8-ET/blob/main/Llama-3.2-1B-Instruct-SpinQuant_INT4_EO8.pte) is available on Hugging Face. The export was created using [this detailed recipe notebook](https://huggingface.co/executorch-community/Llama-3.2-1B-Instruct-SpinQuant_INT4_EO8-ET/blob/main/Export_Recipe_Llama_3_2_1B_Instruct_SpinQuant_INT4_EO8.ipynb).
@@ -63,7 +63,7 @@ For convenience, an [exported ExecuTorch SpinQuant model](https://huggingface.co
Meta has released prequantized INT4 QAT+LoRA Llama 3.2 models that ExecuTorch supports on the XNNPACK backend.
* Export Llama model and generate .pte file as below:
```
-python -m examples.models.llama.export_llama --model "llama3_2" --checkpoint --params -qat -lora 16 -kv --use_sdpa_with_kv_cache -X -d fp32 --xnnpack-extended-ops --preq_mode 8da4w_output_8da8w --preq_group_size 32 --max_seq_length 2048 --max_context_length 2048--preq_embedding_quantize 8,0 --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' --output_name "llama3_2_qat_lora.pte"
+python -m extension.llm.export.export_llm base.model_class="llama3_2" base.checkpoint= base.params= quantization.use_qat=True base.use_lora=16 model.use_kv_cache=True model.use_sdpa_with_kv_cache=True backend.xnnpack.enabled=True model.dtype_override="fp32" backend.xnnpack.extended_ops=True base.preq_mode="8da4w_output_8da8w" base.preq_group_size=32 export.max_seq_length=2048 export.max_context_length=2048 base.preq_embedding_quantize="8,0" base.metadata='{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' export.output_name="llama3_2_qat_lora.pte"
```
For convenience, an [exported ExecuTorch QAT+LoRA model](https://huggingface.co/executorch-community/Llama-3.2-1B-Instruct-QLORA_INT4_EO8-ET/blob/main/Llama-3.2-1B-Instruct-QLORA_INT4_EO8.pte) is available on Hugging Face. The export was created using [this detailed recipe notebook](https://huggingface.co/executorch-community/Llama-3.2-1B-Instruct-QLORA_INT4_EO8-ET/blob/main/Export_Recipe_Llama_3_2_1B_Instruct_QLORA_INT4_EO8.ipynb).
@@ -74,7 +74,7 @@ We have supported BF16 as a data type on the XNNPACK backend for Llama 3.2 1B/3B
* Export Llama model and generate .pte file as below:
```
-python -m examples.models.llama.export_llama --model "llama3_2" --checkpoint --params -kv --use_sdpa_with_kv_cache -X -d bf16 --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' --output_name="llama3_2_bf16.pte"
+python -m extension.llm.export.export_llm base.model_class="llama3_2" base.checkpoint= base.params= model.use_kv_cache=True model.use_sdpa_with_kv_cache=True backend.xnnpack.enabled=True model.dtype_override="bf16" base.metadata='{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' export.output_name="llama3_2_bf16.pte"
```
For convenience, an [exported ExecuTorch bf16 model](https://huggingface.co/executorch-community/Llama-3.2-1B-ET/blob/main/llama3_2-1B.pte) is available on Hugging Face. The export was created using [this detailed recipe notebook](https://huggingface.co/executorch-community/Llama-3.2-1B-ET/blob/main/ExportRecipe_1B.ipynb).
@@ -90,7 +90,7 @@ To safeguard your application, you can use our Llama Guard models for prompt cla
* We prepared this model using the following command
```
-python -m examples.models.llama.export_llama --checkpoint --params -d fp32 -kv --use_sdpa_with_kv_cache --quantization_mode 8da4w --group_size 256 --xnnpack --max_seq_length 8193 --max_context_length 8193 --embedding-quantize 4,32 --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' --output_prune_map --output_name="llama_guard_3_1b_pruned_xnnpack.pte"
+python -m extension.llm.export.export_llm base.checkpoint= base.params= model.dtype_override="fp32" model.use_kv_cache=True model.use_sdpa_with_kv_cache=True quantization.qmode="8da4w" quantization.group_size=256 backend.xnnpack.enabled=True export.max_seq_length=8193 export.max_context_length=8193 quantization.embedding_quantize="4,32" base.metadata='{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' base.output_prune_map= export.output_name="llama_guard_3_1b_pruned_xnnpack.pte"
```
@@ -100,7 +100,7 @@
-python -m examples.models.llama.export_llama --checkpoint --params -kv --use_sdpa_with_kv_cache -X -qmode 8da4w --group_size 128 -d fp32 --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' --output_name="llama.pte"
+python -m extension.llm.export.export_llm base.checkpoint= base.params= model.use_kv_cache=True model.use_sdpa_with_kv_cache=True backend.xnnpack.enabled=True quantization.qmode="8da4w" quantization.group_size=128 model.dtype_override="fp32" base.metadata='{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' export.output_name="llama.pte"
```
You may wonder what the ‘--metadata’ flag is doing. This flag helps export the model with proper special tokens added that the runner can detect EOS tokens easily.
diff --git a/examples/demo-apps/apple_ios/LLaMA/docs/delegates/mps_README.md b/examples/demo-apps/apple_ios/LLaMA/docs/delegates/mps_README.md
index 47352607bca..d6bccc0ef47 100644
--- a/examples/demo-apps/apple_ios/LLaMA/docs/delegates/mps_README.md
+++ b/examples/demo-apps/apple_ios/LLaMA/docs/delegates/mps_README.md
@@ -49,7 +49,7 @@ Install the required packages to export the model
Export the model
```
-python -m examples.models.llama.export_llama --checkpoint "${MODEL_DIR}/consolidated.00.pth" --params "${MODEL_DIR}/params.json" -kv --use_sdpa_with_kv_cache --mps -d fp32 --disable_dynamic_shape -qmode 8da4w -G 32
+python -m extension.llm.export.export_llm base.checkpoint="${MODEL_DIR}/consolidated.00.pth" base.params="${MODEL_DIR}/params.json" model.use_kv_cache=True model.use_sdpa_with_kv_cache=True backend.mps.enabled=True model.dtype_override="fp32" model.enable_dynamic_shape=False quantization.qmode="8da4w" quantization.group_size=32
```
## Pushing Model and Tokenizer
diff --git a/examples/demo-apps/apple_ios/LLaMA/docs/delegates/xnnpack_README.md b/examples/demo-apps/apple_ios/LLaMA/docs/delegates/xnnpack_README.md
index bb33b50f8b7..d64a119e35f 100644
--- a/examples/demo-apps/apple_ios/LLaMA/docs/delegates/xnnpack_README.md
+++ b/examples/demo-apps/apple_ios/LLaMA/docs/delegates/xnnpack_README.md
@@ -51,7 +51,7 @@ In this demo app, we support text-only inference with up-to-date Llama models an
Meta has released prequantized INT4 SpinQuant Llama 3.2 models that ExecuTorch supports on the XNNPACK backend.
* Export Llama model and generate .pte file as below:
```
-python -m examples.models.llama.export_llama --model "llama3_2" --checkpoint --params -kv --use_sdpa_with_kv_cache -X -d fp32 --xnnpack-extended-ops --preq_mode 8da4w_output_8da8w --preq_group_size 32 --max_seq_length 2048 --max_context_length 2048 --preq_embedding_quantize 8,0 --use_spin_quant native --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' --output_name "llama3_2_spinquant.pte"
+python -m extension.llm.export.export_llm base.model_class="llama3_2" base.checkpoint= base.params= model.use_kv_cache=True model.use_sdpa_with_kv_cache=True backend.xnnpack.enabled=True model.dtype_override="fp32" backend.xnnpack.extended_ops=True base.preq_mode="8da4w_output_8da8w" base.preq_group_size=32 export.max_seq_length=2048 export.max_context_length=2048 base.preq_embedding_quantize="8,0" quantization.use_spin_quant="native" base.metadata='{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' export.output_name="llama3_2_spinquant.pte"
```
For convenience, an [exported ExecuTorch SpinQuant model](https://huggingface.co/executorch-community/Llama-3.2-1B-Instruct-SpinQuant_INT4_EO8-ET/blob/main/Llama-3.2-1B-Instruct-SpinQuant_INT4_EO8.pte) is available on Hugging Face. The export was created using [this detailed recipe notebook](https://huggingface.co/executorch-community/Llama-3.2-1B-Instruct-SpinQuant_INT4_EO8-ET/blob/main/Export_Recipe_Llama_3_2_1B_Instruct_SpinQuant_INT4_EO8.ipynb).
@@ -59,7 +59,7 @@ For convenience, an [exported ExecuTorch SpinQuant model](https://huggingface.co
Meta has released prequantized INT4 QAT+LoRA Llama 3.2 models that ExecuTorch supports on the XNNPACK backend.
* Export Llama model and generate .pte file as below:
```
-python -m examples.models.llama.export_llama --model "llama3_2" --checkpoint --params -qat -lora 16 -kv --use_sdpa_with_kv_cache -X -d fp32 --xnnpack-extended-ops --preq_mode 8da4w_output_8da8w --preq_group_size 32 --max_seq_length 2048 --max_context_length 2048 --preq_embedding_quantize 8,0 --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' --output_name "llama3_2_qat_lora.pte"
+python -m extension.llm.export.export_llm base.model_class="llama3_2" base.checkpoint= base.params= quantization.use_qat=True base.use_lora=16 model.use_kv_cache=True model.use_sdpa_with_kv_cache=True backend.xnnpack.enabled=True model.dtype_override="fp32" backend.xnnpack.extended_ops=True base.preq_mode="8da4w_output_8da8w" base.preq_group_size=32 export.max_seq_length=2048 export.max_context_length=2048 base.preq_embedding_quantize="8,0" base.metadata='{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' export.output_name="llama3_2_qat_lora.pte"
```
For convenience, an [exported ExecuTorch QAT+LoRA model](https://huggingface.co/executorch-community/Llama-3.2-1B-Instruct-QLORA_INT4_EO8-ET/blob/main/Llama-3.2-1B-Instruct-QLORA_INT4_EO8.pte) is available on Hugging Face. The export was created using [this detailed recipe notebook](https://huggingface.co/executorch-community/Llama-3.2-1B-Instruct-QLORA_INT4_EO8-ET/blob/main/Export_Recipe_Llama_3_2_1B_Instruct_QLORA_INT4_EO8.ipynb).
@@ -69,7 +69,7 @@ We have supported BF16 as a data type on the XNNPACK backend for Llama 3.2 1B/3B
* Export Llama model and generate .pte file as below:
```
-python -m examples.models.llama.export_llama --model "llama3_2" --checkpoint --params -kv --use_sdpa_with_kv_cache -X -d bf16 --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' --output_name="llama3_2_bf16.pte"
+python -m extension.llm.export.export_llm base.model_class="llama3_2" base.checkpoint= base.params= model.use_kv_cache=True model.use_sdpa_with_kv_cache=True backend.xnnpack.enabled=True model.dtype_override="bf16" base.metadata='{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' export.output_name="llama3_2_bf16.pte"
```
For convenience, an [exported ExecuTorch bf16 model](https://huggingface.co/executorch-community/Llama-3.2-1B-ET/blob/main/llama3_2-1B.pte) is available on Hugging Face. The export was created using [this detailed recipe notebook](https://huggingface.co/executorch-community/Llama-3.2-1B-ET/blob/main/ExportRecipe_1B.ipynb).
@@ -79,7 +79,7 @@ For more detail using Llama 3.2 lightweight models including prompt template, pl
Export the model
```
-python -m examples.models.llama.export_llama --checkpoint -p -kv --use_sdpa_with_kv_cache -X -qmode 8da4w --group_size 128 -d fp32 --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' --embedding-quantize 4,32 --output_name="llama3_kv_sdpa_xnn_qe_4_32.pte"
+python -m extension.llm.export.export_llm base.checkpoint= base.params= model.use_kv_cache=True model.use_sdpa_with_kv_cache=True backend.xnnpack.enabled=True quantization.qmode="8da4w" quantization.group_size=128 model.dtype_override="fp32" base.metadata='{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' quantization.embedding_quantize="4,32" export.output_name="llama3_kv_sdpa_xnn_qe_4_32.pte"
```
### For LLaVA model
diff --git a/examples/models/deepseek-r1-distill-llama-8B/README.md b/examples/models/deepseek-r1-distill-llama-8B/README.md
index 5fd47ad61ec..7695c678337 100644
--- a/examples/models/deepseek-r1-distill-llama-8B/README.md
+++ b/examples/models/deepseek-r1-distill-llama-8B/README.md
@@ -52,18 +52,18 @@ torch.save(sd, "/tmp/deepseek-ai/DeepSeek-R1-Distill-Llama-8B/checkpoint.pth")
5. Generate a PTE file for use with the Llama runner.
```
-python -m examples.models.llama.export_llama \
- --checkpoint /tmp/deepseek-ai/DeepSeek-R1-Distill-Llama-8B/checkpoint.pth \
- -p params.json \
- -kv \
- --use_sdpa_with_kv_cache \
- -X \
- -qmode 8da4w \
- --group_size 128 \
- -d fp16 \
- --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' \
- --embedding-quantize 4,32 \
- --output_name="DeepSeek-R1-Distill-Llama-8B.pte"
+python -m extension.llm.export.export_llm \
+ base.checkpoint=/tmp/deepseek-ai/DeepSeek-R1-Distill-Llama-8B/checkpoint.pth \
+ base.params=params.json \
+ model.use_kv_cache=True \
+ model.use_sdpa_with_kv_cache=True \
+ backend.xnnpack.enabled=True \
+ quantization.qmode="8da4w" \
+ quantization.group_size=128 \
+ model.dtype_override="fp16" \
+ base.metadata='{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' \
+ quantization.embedding_quantize="4,32" \
+ export.output_name="DeepSeek-R1-Distill-Llama-8B.pte"
```
6. Run the model on your desktop for validation or integrate with iOS/Android apps. Instructions for these are available in the Llama [README](../llama/README.md) starting at Step 3.
diff --git a/examples/models/llama/README.md b/examples/models/llama/README.md
index c6f0350fff7..23a377a6611 100644
--- a/examples/models/llama/README.md
+++ b/examples/models/llama/README.md
@@ -167,15 +167,15 @@ Llama 3 8B performance was measured on the Samsung Galaxy S22, S24, and OnePlus
LLAMA_CHECKPOINT=path/to/consolidated.00.pth
LLAMA_PARAMS=path/to/params.json
-python -m examples.models.llama.export_llama \
- --model "llama3_2" \
- --checkpoint "${LLAMA_CHECKPOINT:?}" \
- --params "${LLAMA_PARAMS:?}" \
- -kv \
- --use_sdpa_with_kv_cache \
- -d bf16 \
- --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' \
- --output_name="llama3_2.pte"
+python -m extension.llm.export.export_llm \
+ base.model_class="llama3_2" \
+ base.checkpoint="${LLAMA_CHECKPOINT:?}" \
+ base.params="${LLAMA_PARAMS:?}" \
+ model.use_kv_cache=True \
+ model.use_sdpa_with_kv_cache=True \
+ model.dtype_override="bf16" \
+ base.metadata='{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' \
+ export.output_name="llama3_2.pte"
```
For convenience, an [exported ExecuTorch bf16 model](https://huggingface.co/executorch-community/Llama-3.2-1B-ET/blob/main/llama3_2-1B.pte) is available on Hugging Face. The export was created using [this detailed recipe notebook](https://huggingface.co/executorch-community/Llama-3.2-1B-ET/blob/main/ExportRecipe_1B.ipynb).
@@ -189,23 +189,23 @@ For convenience, an [exported ExecuTorch bf16 model](https://huggingface.co/exec
LLAMA_QUANTIZED_CHECKPOINT=path/to/spinquant/consolidated.00.pth.pth
LLAMA_PARAMS=path/to/spinquant/params.json
-python -m examples.models.llama.export_llama \
- --model "llama3_2" \
- --checkpoint "${LLAMA_QUANTIZED_CHECKPOINT:?}" \
- --params "${LLAMA_PARAMS:?}" \
- --use_sdpa_with_kv_cache \
- -X \
- --xnnpack-extended-ops \
- --preq_mode 8da4w_output_8da8w \
- --preq_group_size 32 \
- --max_seq_length 2048 \
- --max_context_length 2048 \
- --output_name "llama3_2.pte" \
- -kv \
- -d fp32 \
- --preq_embedding_quantize 8,0 \
- --use_spin_quant native \
- --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}'
+python -m extension.llm.export.export_llm \
+ base.model_class="llama3_2" \
+ base.checkpoint="${LLAMA_QUANTIZED_CHECKPOINT:?}" \
+ base.params="${LLAMA_PARAMS:?}" \
+ model.use_sdpa_with_kv_cache=True \
+ backend.xnnpack.enabled=True \
+ backend.xnnpack.extended_ops=True \
+ base.preq_mode="8da4w_output_8da8w" \
+ base.preq_group_size=32 \
+ export.max_seq_length=2048 \
+ export.max_context_length=2048 \
+ export.output_name="llama3_2.pte" \
+ model.use_kv_cache=True \
+ model.dtype_override="fp32" \
+ base.preq_embedding_quantize="8,0" \
+ quantization.use_spin_quant="native" \
+ base.metadata='{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}'
```
For convenience, an [exported ExecuTorch SpinQuant model](https://huggingface.co/executorch-community/Llama-3.2-1B-Instruct-SpinQuant_INT4_EO8-ET/blob/main/Llama-3.2-1B-Instruct-SpinQuant_INT4_EO8.pte) is available on Hugging Face. The export was created using [this detailed recipe notebook](https://huggingface.co/executorch-community/Llama-3.2-1B-Instruct-SpinQuant_INT4_EO8-ET/blob/main/Export_Recipe_Llama_3_2_1B_Instruct_SpinQuant_INT4_EO8.ipynb).
@@ -218,24 +218,24 @@ For convenience, an [exported ExecuTorch SpinQuant model](https://huggingface.co
LLAMA_QUANTIZED_CHECKPOINT=path/to/qlora/consolidated.00.pth.pth
LLAMA_PARAMS=path/to/qlora/params.json
-python -m examples.models.llama.export_llama \
- --model "llama3_2" \
- --checkpoint "${LLAMA_QUANTIZED_CHECKPOINT:?}" \
- --params "${LLAMA_PARAMS:?}" \
- -qat \
- -lora 16 \
- --preq_mode 8da4w_output_8da8w \
- --preq_group_size 32 \
- --preq_embedding_quantize 8,0 \
- --use_sdpa_with_kv_cache \
- -kv \
- -X \
- --xnnpack-extended-ops \
- -d fp32 \
- --max_seq_length 2048 \
- --max_context_length 2048 \
- --output_name "llama3_2.pte" \
- --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}'
+python -m extension.llm.export.export_llm \
+ base.model_class="llama3_2" \
+ base.checkpoint="${LLAMA_QUANTIZED_CHECKPOINT:?}" \
+ base.params="${LLAMA_PARAMS:?}" \
+ quantization.use_qat=True \
+ base.use_lora=16 \
+ base.preq_mode="8da4w_output_8da8w" \
+ base.preq_group_size=32 \
+ base.preq_embedding_quantize="8,0" \
+ model.use_sdpa_with_kv_cache=True \
+ model.use_kv_cache=True \
+ backend.xnnpack.enabled=True \
+ backend.xnnpack.extended_ops=True \
+ model.dtype_override="fp32" \
+ export.max_seq_length=2048 \
+ export.max_context_length=2048 \
+ export.output_name="llama3_2.pte" \
+ base.metadata='{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}'
```
For convenience, an [exported ExecuTorch QAT+LoRA model](https://huggingface.co/executorch-community/Llama-3.2-1B-Instruct-QLORA_INT4_EO8-ET/blob/main/Llama-3.2-1B-Instruct-QLORA_INT4_EO8.pte) is available on Hugging Face. The export was created using [this detailed recipe notebook](https://huggingface.co/executorch-community/Llama-3.2-1B-Instruct-QLORA_INT4_EO8-ET/blob/main/Export_Recipe_Llama_3_2_1B_Instruct_QLORA_INT4_EO8.ipynb).
@@ -247,20 +247,20 @@ You can export and run the original Llama 3 8B instruct model.
2. Export model and generate `.pte` file
```
- python -m examples.models.llama.export_llama \
- --checkpoint \
- -p \
- -kv \
- --use_sdpa_with_kv_cache \
- -X \
- -qmode 8da4w \
- --group_size 128 \
- -d fp32 \
- --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' \
- --embedding-quantize 4,32 \
- --output_name="llama3_kv_sdpa_xnn_qe_4_32.pte"
+ python -m extension.llm.export.export_llm \
+ base.checkpoint= \
+ base.params= \
+ model.use_kv_cache=True \
+ model.use_sdpa_with_kv_cache=True \
+ backend.xnnpack.enabled=True \
+ quantization.qmode="8da4w" \
+ quantization.group_size=128 \
+ model.dtype_override="fp32" \
+ base.metadata='{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' \
+ quantization.embedding_quantize="4,32" \
+ export.output_name="llama3_kv_sdpa_xnn_qe_4_32.pte"
```
- Due to the larger vocabulary size of Llama 3, we recommend quantizing the embeddings with `--embedding-quantize 4,32` as shown above to further reduce the model size.
+ Due to the larger vocabulary size of Llama 3, we recommend quantizing the embeddings with `quantization.embedding_quantize="4,32"` as shown above to further reduce the model size.
If you're interested in deploying on non-CPU backends, [please refer to the non-CPU backend section](non_cpu_backends.md).
@@ -389,22 +389,22 @@ QLINEAR_GROUP_SIZE=128 # Must be multiple of 16
QEMBEDDING_BITWIDTH=4 # Can be 1-8
QEMBEDDING_GROUP_SIZE=32 # Must be multiple of 16
-python -m examples.models.llama.export_llama \
- --model "llama3_2" \
- --checkpoint "${LLAMA_CHECKPOINT:?}" \
- --params "${LLAMA_PARAMS:?}" \
- -kv \
- --use_sdpa_with_kv_cache \
- --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' \
- --output_name="llama3_2.pte" \
- -qmode "torchao:8da${QLINEAR_BITWIDTH}w" \
- --group_size ${QLINEAR_GROUP_SIZE} \
- -E "torchao:${QEMBEDDING_BITWIDTH},${QEMBEDDING_GROUP_SIZE}" \
- -d fp32
+python -m extension.llm.export.export_llm \
+ base.model_class="llama3_2" \
+ base.checkpoint="${LLAMA_CHECKPOINT:?}" \
+ base.params="${LLAMA_PARAMS:?}" \
+ model.use_kv_cache=True \
+ model.use_sdpa_with_kv_cache=True \
+ base.metadata='{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' \
+ export.output_name="llama3_2.pte" \
+ quantization.qmode="torchao:8da${QLINEAR_BITWIDTH}w" \
+ quantization.group_size=${QLINEAR_GROUP_SIZE} \
+ quantization.embedding_quantize="torchao:${QEMBEDDING_BITWIDTH},${QEMBEDDING_GROUP_SIZE}" \
+ model.dtype_override="fp32"
```
A few notes:
-- If your model shares embedding/unembedding weights (like Llama1B and Llama3B do), you can add `--use_shared_embedding` to take advantage of this and reduce memory. When this option is enabled, you can specify whether embeddings are quantized asymmetrically or not by specifying a third argument. For example, `-E "torchao:4,32,true"` means that the embedding is quantized to 4-bits with group_size=32 and is asymmetric (this is the default behavior if you simply use `-E "torchao:4,32"`), whereas `-E "torchao:4,32,false"` means that the embedding is quantized to 4-bits with group_size=32 and is symmetric. If `--use_shared_embedding` is specified, the unembedding (i.e., the final linear layer) is quantized in the same way, but also uses 8-bit dynamically quantized activations.
+- If your model shares embedding/unembedding weights (like Llama1B and Llama3B do), you can add `model.use_shared_embedding=True` to take advantage of this and reduce memory. When this option is enabled, you can specify whether embeddings are quantized asymmetrically or not by specifying a third argument. For example, `quantization.embedding_quantize="torchao:4,32,true"` means that the embedding is quantized to 4-bits with group_size=32 and is asymmetric (this is the default behavior if you simply use `quantization.embedding_quantize="torchao:4,32"`), whereas `quantization.embedding_quantize="torchao:4,32,false"` means that the embedding is quantized to 4-bits with group_size=32 and is symmetric. If `model.use_shared_embedding=True` is specified, the unembedding (i.e., the final linear layer) is quantized in the same way, but also uses 8-bit dynamically quantized activations.
- To do channelwise quantization, set group_size to 0. This works for both linear and embedding layers (see the sketch following this list).
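As an illustrative sketch only (not a tested recipe), the two notes above can be combined with the low-bit export command, reusing the same environment variables; here `quantization.group_size=0` requests channelwise quantization and the trailing `false` makes the embedding quantization symmetric:
```
python -m extension.llm.export.export_llm \
  base.model_class="llama3_2" \
  base.checkpoint="${LLAMA_CHECKPOINT:?}" \
  base.params="${LLAMA_PARAMS:?}" \
  model.use_kv_cache=True \
  model.use_sdpa_with_kv_cache=True \
  model.use_shared_embedding=True \
  model.dtype_override="fp32" \
  base.metadata='{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' \
  export.output_name="llama3_2.pte" \
  quantization.qmode="torchao:8da${QLINEAR_BITWIDTH}w" \
  quantization.group_size=0 \
  quantization.embedding_quantize="torchao:${QEMBEDDING_BITWIDTH},${QEMBEDDING_GROUP_SIZE},false"
```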
Once the model is exported, we need to build ExecuTorch and the runner with the low-bit kernels.
diff --git a/examples/models/llama/UTILS.md b/examples/models/llama/UTILS.md
index 5f760ad7670..25bd7f77080 100644
--- a/examples/models/llama/UTILS.md
+++ b/examples/models/llama/UTILS.md
@@ -19,7 +19,7 @@ From `executorch` root:
```
3. Export model and generate `.pte` file.
```
- python -m examples.models.llama.export_llama -c stories110M.pt -p params.json -X -kv
+ python -m extension.llm.export.export_llm base.checkpoint=stories110M.pt base.params=params.json backend.xnnpack.enabled=True model.use_kv_cache=True
```
## Smaller model delegated to other backends
@@ -27,15 +27,15 @@ From `executorch` root:
Currently we support lowering the stories model to other backends, including CoreML, MPS, and QNN. Please refer to the instructions
for each backend ([CoreML](https://pytorch.org/executorch/main/backends-coreml), [MPS](https://pytorch.org/executorch/main/backends-mps), [QNN](https://pytorch.org/executorch/main/backends-qualcomm)) before trying to lower the model to them. After the backend library is installed, the script to export a lowered model is
-- Lower to CoreML: `python -m examples.models.llama.export_llama -kv --disable_dynamic_shape --coreml -c stories110M.pt -p params.json `
-- MPS: `python -m examples.models.llama.export_llama -kv --disable_dynamic_shape --mps -c stories110M.pt -p params.json `
-- QNN: `python -m examples.models.llama.export_llama -kv --disable_dynamic_shape --qnn -c stories110M.pt -p params.json `
+- Lower to CoreML: `python -m extension.llm.export.export_llm model.use_kv_cache=True model.enable_dynamic_shape=False backend.coreml.enabled=True base.checkpoint=stories110M.pt base.params=params.json`
+- MPS: `python -m extension.llm.export.export_llm model.use_kv_cache=True model.enable_dynamic_shape=False backend.mps.enabled=True base.checkpoint=stories110M.pt base.params=params.json`
+- QNN: `python -m extension.llm.export.export_llm model.use_kv_cache=True model.enable_dynamic_shape=False backend.qnn.enabled=True base.checkpoint=stories110M.pt base.params=params.json`
The iOS LLAMA app supports the CoreML and MPS models, and the Android LLAMA app supports the QNN model. On Android, you can also cross-compile the llama runner binary, push it to the device, and run it, as sketched below.
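For reference, a rough sketch of that flow is shown below. The build output path, model file, and on-device directory are assumptions and will depend on your cross-compilation setup:
```
# Hypothetical paths; adjust to match your Android build output directory and exported artifacts.
adb push cmake-out-android/examples/models/llama/llama_main /data/local/tmp/
adb push stories110M.pte /data/local/tmp/
adb push tokenizer.model /data/local/tmp/
adb shell "cd /data/local/tmp && ./llama_main --model_path=stories110M.pte --tokenizer_path=tokenizer.model --prompt='Once upon a time,'"
```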
For CoreML, there are two additional optional arguments (a combined example follows this list):
-* `--coreml-ios`: Specify the minimum iOS version to deploy (and turn on available optimizations). E.g. `--coreml-ios 18` will turn on [in-place KV cache](https://developer.apple.com/documentation/coreml/mlstate?language=objc) and [fused scaled dot product attention kernel](https://apple.github.io/coremltools/source/coremltools.converters.mil.mil.ops.defs.html#coremltools.converters.mil.mil.ops.defs.iOS18.transformers.scaled_dot_product_attention) (the resulting model will then need at least iOS 18 to run, though)
-* `--coreml-quantize`: Use [quantization tailored for CoreML](https://apple.github.io/coremltools/docs-guides/source/opt-quantization-overview.html). E.g. `--coreml-quantize b4w` will perform per-block 4-bit weight-only quantization in a way tailored for CoreML
+* `backend.coreml.ios`: Specify the minimum iOS version to deploy (and turn on available optimizations). E.g. `backend.coreml.ios=18` will turn on [in-place KV cache](https://developer.apple.com/documentation/coreml/mlstate?language=objc) and [fused scaled dot product attention kernel](https://apple.github.io/coremltools/source/coremltools.converters.mil.mil.ops.defs.html#coremltools.converters.mil.mil.ops.defs.iOS18.transformers.scaled_dot_product_attention) (the resulting model will then need at least iOS 18 to run, though)
+* `backend.coreml.quantize`: Use [quantization tailored for CoreML](https://apple.github.io/coremltools/docs-guides/source/opt-quantization-overview.html). E.g. `backend.coreml.quantize="b4w"` will perform per-block 4-bit weight-only quantization in a way tailored for CoreML
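As a sketch only, assuming both options are layered onto the CoreML export command above, the combined invocation would look like:
```
python -m extension.llm.export.export_llm \
  model.use_kv_cache=True \
  model.enable_dynamic_shape=False \
  backend.coreml.enabled=True \
  backend.coreml.ios=18 \
  backend.coreml.quantize="b4w" \
  base.checkpoint=stories110M.pt \
  base.params=params.json
```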
To deploy the large 8B model on the above backends, [please visit this section](non_cpu_backends.md).
diff --git a/examples/models/llama2/README.md b/examples/models/llama2/README.md
index 615ad3948fc..21f761b7f71 100644
--- a/examples/models/llama2/README.md
+++ b/examples/models/llama2/README.md
@@ -37,7 +37,7 @@ You can export and run the original Llama 2 7B model.
3. Export model and generate `.pte` file:
```
- python -m examples.models.llama.export_llama --checkpoint --params -kv --use_sdpa_with_kv_cache -X -qmode 8da4w --group_size 128 -d fp32
+ python -m extension.llm.export.export_llm base.checkpoint= base.params= model.use_kv_cache=True model.use_sdpa_with_kv_cache=True backend.xnnpack.enabled=True quantization.qmode="8da4w" quantization.group_size=128 model.dtype_override="fp32"
```
4. Create tokenizer.bin.
```
diff --git a/examples/models/phi_4_mini/README.md b/examples/models/phi_4_mini/README.md
index a23e4f49638..c2b3d515ec0 100644
--- a/examples/models/phi_4_mini/README.md
+++ b/examples/models/phi_4_mini/README.md
@@ -7,9 +7,9 @@ Phi-4-mini uses the same example code as Llama, while the checkpoint, model para
All commands for exporting and running Llama on various backends should also be applicable to Phi-4-mini, by swapping the following args:
```
---model phi_4_mini
---params examples/models/phi-4-mini/config.json
---checkpoint
+base.model_class="phi_4_mini"
+base.params="examples/models/phi-4-mini/config.json"
+base.checkpoint=
```
### Generate the Checkpoint
@@ -32,17 +32,17 @@ Export to XNNPack, no quantization:
# Set these paths to point to the downloaded files
PHI_CHECKPOINT=path/to/checkpoint.pth
-python -m examples.models.llama.export_llama \
- --model phi_4_mini \
- --checkpoint "${PHI_CHECKPOINT=path/to/checkpoint.pth:?}" \
- --params examples/models/phi-4-mini/config.json \
- -kv \
- --use_sdpa_with_kv_cache \
- -d fp32 \
- -X \
- --metadata '{"get_bos_id":151643, "get_eos_ids":[151643]}' \
- --output_name="phi-4-mini.pte"
- --verbose
+python -m extension.llm.export.export_llm \
+ base.model_class="phi_4_mini" \
+  base.checkpoint="${PHI_CHECKPOINT:?}" \
+ base.params="examples/models/phi-4-mini/config.json" \
+ model.use_kv_cache=True \
+ model.use_sdpa_with_kv_cache=True \
+ model.dtype_override="fp32" \
+ backend.xnnpack.enabled=True \
+ base.metadata='{"get_bos_id":151643, "get_eos_ids":[151643]}' \
+ export.output_name="phi-4-mini.pte" \
+ debug.verbose=True
```
Run using the executor runner:
diff --git a/examples/models/qwen2_5/README.md b/examples/models/qwen2_5/README.md
index 9bf791a35ed..b40daaca469 100644
--- a/examples/models/qwen2_5/README.md
+++ b/examples/models/qwen2_5/README.md
@@ -7,9 +7,9 @@ Qwen 2.5 uses the same example code as Llama, while the checkpoint, model params
All commands for exporting and running Llama on various backends should also be applicable to Qwen 2.5, by swapping the following args:
```
---model qwen2_5
---params examples/models/qwen2_5/1_5b_config.json
---checkpoint
+base.model_class="qwen2_5"
+base.params="examples/models/qwen2_5/1_5b_config.json"
+base.checkpoint=
```
### Generate the Checkpoint
@@ -32,17 +32,17 @@ Export to XNNPack, no quantization:
# Set these paths to point to the downloaded files
QWEN_CHECKPOINT=path/to/checkpoint.pth
-python -m examples.models.llama.export_llama \
- --model "qwen2_5" \
- --checkpoint "${QWEN_CHECKPOINT:?}" \
- --params examples/models/qwen2_5/1_5b_config.json \
- -kv \
- --use_sdpa_with_kv_cache \
- -d fp32 \
- -X \
- --metadata '{"get_bos_id":151643, "get_eos_ids":[151643]}' \
- --output_name="qwen2_5-1_5b.pte"
- --verbose
+python -m extension.llm.export.export_llm \
+ base.model_class="qwen2_5" \
+ base.checkpoint="${QWEN_CHECKPOINT:?}" \
+ base.params="examples/models/qwen2_5/1_5b_config.json" \
+ model.use_kv_cache=True \
+ model.use_sdpa_with_kv_cache=True \
+ model.dtype_override="fp32" \
+ backend.xnnpack.enabled=True \
+ base.metadata='{"get_bos_id":151643, "get_eos_ids":[151643]}' \
+ export.output_name="qwen2_5-1_5b.pte" \
+ debug.verbose=True
```
Run using the executor runner:
diff --git a/examples/models/qwen3/README.md b/examples/models/qwen3/README.md
index a589d27c19d..acdd4497503 100644
--- a/examples/models/qwen3/README.md
+++ b/examples/models/qwen3/README.md
@@ -7,8 +7,8 @@ Qwen 3 uses the same example code as our optimized Llama model, while the checkp
All commands for exporting and running Llama on various backends should also be applicable to Qwen 3, by swapping the following args:
```
---model [qwen3-0.6b,qwen3-1_7b,qwen3-4b]
---params [examples/models/qwen3/0_6b_config.json,examples/models/qwen3/1_7b_config.json,examples/models/qwen3/4b_config.json]
+base.model_class=[qwen3-0_6b,qwen3-1_7b,qwen3-4b]
+base.params=[examples/models/qwen3/0_6b_config.json,examples/models/qwen3/1_7b_config.json,examples/models/qwen3/4b_config.json]
```
### Example export
@@ -16,50 +16,50 @@ Here is a basic example for exporting Qwen 3, although please refer to the Llama
Export 0.6b to XNNPack, quantized with 8da4w:
```
-python -m examples.models.llama.export_llama \
- --model qwen3-0_6b \
- --params examples/models/qwen3/0_6b_config.json \
- -kv \
- --use_sdpa_with_kv_cache \
- -d fp32 \
- -X \
- --xnnpack-extended-ops \
- -qmode 8da4w \
- --metadata '{"get_bos_id": 151644, "get_eos_ids":[151645]}' \
- --output_name="qwen3-0_6b.pte" \
- --verbose
+python -m extension.llm.export.export_llm \
+ base.model_class="qwen3-0_6b" \
+ base.params="examples/models/qwen3/0_6b_config.json" \
+ model.use_kv_cache=True \
+ model.use_sdpa_with_kv_cache=True \
+ model.dtype_override="fp32" \
+ backend.xnnpack.enabled=True \
+ backend.xnnpack.extended_ops=True \
+ quantization.qmode="8da4w" \
+ base.metadata='{"get_bos_id": 151644, "get_eos_ids":[151645]}' \
+ export.output_name="qwen3-0_6b.pte" \
+ debug.verbose=True
```
Export 1.7b to XNNPack, quantized with 8da4w:
```
-python -m examples.models.llama.export_llama \
- --model qwen3-1_7b \
- --params examples/models/qwen3/1_7b_config.json \
- -kv \
- --use_sdpa_with_kv_cache \
- -d fp32 \
- -X \
- --xnnpack-extended-ops \
- -qmode 8da4w \
- --metadata '{"get_bos_id": 151644, "get_eos_ids":[151645]}' \
- --output_name="qwen3-1_7b.pte" \
- --verbose
+python -m extension.llm.export.export_llm \
+ base.model_class="qwen3-1_7b" \
+ base.params="examples/models/qwen3/1_7b_config.json" \
+ model.use_kv_cache=True \
+ model.use_sdpa_with_kv_cache=True \
+ model.dtype_override="fp32" \
+ backend.xnnpack.enabled=True \
+ backend.xnnpack.extended_ops=True \
+ quantization.qmode="8da4w" \
+ base.metadata='{"get_bos_id": 151644, "get_eos_ids":[151645]}' \
+ export.output_name="qwen3-1_7b.pte" \
+ debug.verbose=True
```
Export 4b to XNNPack, quantized with 8da4w:
```
-python -m examples.models.llama.export_llama \
- --model qwen3-4b \
- --params examples/models/qwen3/4b_config.json \
- -kv \
- --use_sdpa_with_kv_cache \
- -d fp32 \
- -X \
- --xnnpack-extended-ops \
- -qmode 8da4w \
- --metadata '{"get_bos_id": 151644, "get_eos_ids":[151645]}' \
- --output_name="qwen3-4b.pte" \
- --verbose
+python -m extension.llm.export.export_llm \
+ base.model_class="qwen3-4b" \
+ base.params="examples/models/qwen3/4b_config.json" \
+ model.use_kv_cache=True \
+ model.use_sdpa_with_kv_cache=True \
+ model.dtype_override="fp32" \
+ backend.xnnpack.enabled=True \
+ backend.xnnpack.extended_ops=True \
+ quantization.qmode="8da4w" \
+ base.metadata='{"get_bos_id": 151644, "get_eos_ids":[151645]}' \
+ export.output_name="qwen3-4b.pte" \
+ debug.verbose=True
```
### Example run
From 49d56c40df5380fd6ad4b921535eebcad5883e96 Mon Sep 17 00:00:00 2001
From: Jack Zhang <32371937+jackzhxng@users.noreply.github.com>
Date: Fri, 20 Jun 2025 14:18:45 -0700
Subject: [PATCH 04/17] Update
[ghstack-poisoned]
---
examples/models/llama/config/llm_config.py | 80 +++++++--------
examples/models/llama/export_llama_lib.py | 30 +++---
examples/models/llama/model.py | 14 +--
extension/llm/export/test/test_export_llm.py | 101 ++++++++++++++++++-
4 files changed, 162 insertions(+), 63 deletions(-)
diff --git a/examples/models/llama/config/llm_config.py b/examples/models/llama/config/llm_config.py
index 201e3a5414a..9acd633fb21 100644
--- a/examples/models/llama/config/llm_config.py
+++ b/examples/models/llama/config/llm_config.py
@@ -26,19 +26,19 @@
class ModelType(str, Enum):
- STORIES110M = "stories110m"
- LLAMA2 = "llama2"
- LLAMA3 = "llama3"
- LLAMA3_1 = "llama3_1"
- LLAMA3_2 = "llama3_2"
- LLAMA3_2_VISION = "llama3_2_vision"
- STATIC_LLAMA = "static_llama"
- QWEN2_5 = "qwen2_5"
- QWEN3_0_6B = "qwen3-0_6b"
- QWEN3_1_7B = "qwen3-1_7b"
- QWEN3_4B = "qwen3-4b"
- PHI_4_MINI = "phi_4_mini"
- SMOLLM2 = "smollm2"
+ stories110m = "stories110m"
+ llama2 = "llama2"
+ llama3 = "llama3"
+ llama3_1 = "llama3_1"
+ llama3_2 = "llama3_2"
+ llama3_2_vision = "llama3_2_vision"
+ static_llama = "static_llama"
+ qwen2_5 = "qwen2_5"
+ qwen3_0_6b = "qwen3-0_6b"
+ qwen3_1_7b = "qwen3-1_7b"
+ qwen3_4b = "qwen3-4b"
+ phi_4_mini = "phi_4_mini"
+ smollm2 = "smollm2"
class PreqMode(str, Enum):
@@ -49,8 +49,8 @@ class PreqMode(str, Enum):
are still around to preserve backward compatibility.
"""
- PREQ_8DA4W = "8da4w"
- PREQ_8DA4W_OUT_8DA8W = "8da4w_output_8da8w"
+ preq_8da4w = "8da4w"
+ preq_8da4w_out_8da8w = "8da4w_output_8da8w"
@dataclass
@@ -82,7 +82,7 @@ class BaseConfig:
are loaded.
"""
- model_class: ModelType = ModelType.LLAMA3
+ model_class: ModelType = ModelType.llama3
params: Optional[str] = None
checkpoint: Optional[str] = None
checkpoint_dir: Optional[str] = None
@@ -107,9 +107,9 @@ class DtypeOverride(str, Enum):
is not recommended.
"""
- FP32 = "fp32"
- FP16 = "fp16"
- BF16 = "bf16"
+ fp32 = "fp32"
+ fp16 = "fp16"
+ bf16 = "bf16"
@dataclass
@@ -147,7 +147,7 @@ class ModelConfig:
[16] pattern specifies all layers have a sliding window of 16.
"""
- dtype_override: DtypeOverride = DtypeOverride.FP32
+ dtype_override: DtypeOverride = DtypeOverride.fp32
enable_dynamic_shape: bool = True
use_shared_embedding: bool = False
use_sdpa_with_kv_cache: bool = False
@@ -270,22 +270,22 @@ class Pt2eQuantize(str, Enum):
and is source transform-based.
"""
- XNNPACK_DYNAMIC = "xnnpack_dynamic"
- XNNPACK_DYNAMIC_QC4 = "xnnpack_dynamic_qc4"
- QNN_8A8W = "qnn_8a8w"
- QNN_16A16W = "qnn_16a16w"
- QNN_16A4W = "qnn_16a4w"
- COREML_C4W = "coreml_c4w"
- COREML_8A_C8W = "coreml_8a_c8w"
- COREML_8A_C4W = "coreml_8a_c4w"
- COREML_BASELINE_8A_C8W = "coreml_baseline_8a_c8w"
- COREML_BASELINE_8A_C4W = "coreml_baseline_8a_c4w"
- VULKAN_8W = "vulkan_8w"
+ xnnpack_dynamic = "xnnpack_dynamic"
+ xnnpack_dynamic_qc4 = "xnnpack_dynamic_qc4"
+ qnn_8a8w = "qnn_8a8w"
+ qnn_16a16w = "qnn_16a16w"
+ qnn_16a4w = "qnn_16a4w"
+ coreml_c4w = "coreml_c4w"
+ coreml_8a_c8w = "coreml_8a_c8w"
+ coreml_8a_c4w = "coreml_8a_c4w"
+ coreml_baseline_8a_c8w = "coreml_baseline_8a_c8w"
+ coreml_baseline_8a_c4w = "coreml_baseline_8a_c4w"
+ vulkan_8w = "vulkan_8w"
class SpinQuant(str, Enum):
- CUDA = "cuda"
- NATIVE = "native"
+ cuda = "cuda"
+ native = "native"
@dataclass
@@ -378,15 +378,15 @@ class XNNPackConfig:
class CoreMLQuantize(str, Enum):
- B4W = "b4w"
- C4W = "c4w"
+ b4w = "b4w"
+ c4w = "c4w"
class CoreMLComputeUnit(str, Enum):
- CPU_ONLY = "cpu_only"
- CPU_AND_GPU = "cpu_and_gpu"
- CPU_AND_NE = "cpu_and_ne"
- ALL = "all"
+ cpu_only = "cpu_only"
+ cpu_and_gpu = "cpu_and_gpu"
+ cpu_and_ne = "cpu_and_ne"
+ all = "all"
@dataclass
@@ -400,7 +400,7 @@ class CoreMLConfig:
preserve_sdpa: bool = False
quantize: Optional[CoreMLQuantize] = None
ios: int = 15
- compute_units: CoreMLComputeUnit = CoreMLComputeUnit.CPU_ONLY
+ compute_units: CoreMLComputeUnit = CoreMLComputeUnit.cpu_only
def __post_init__(self):
if self.ios not in (15, 16, 17, 18):
diff --git a/examples/models/llama/export_llama_lib.py b/examples/models/llama/export_llama_lib.py
index 78c6244abee..6a706e0fa05 100644
--- a/examples/models/llama/export_llama_lib.py
+++ b/examples/models/llama/export_llama_lib.py
@@ -590,7 +590,7 @@ def export_llama(
# If a checkpoint isn't provided for an HF OSS model, download and convert the
# weights first.
- model_name = llm_config.base.model_class
+ model_name = llm_config.base.model_class.value
if not llm_config.base.checkpoint and model_name in HUGGING_FACE_REPO_IDS:
repo_id = HUGGING_FACE_REPO_IDS[model_name]
if model_name == "qwen2_5":
@@ -668,7 +668,7 @@ def _prepare_for_llama_export(llm_config: LlmConfig) -> LLMEdgeManager:
llm_config.export.output_dir = output_dir_path
# Convert dtype override string to actual type.
- dtype_override = DType[llm_config.model.dtype_override]
+ dtype_override = DType[llm_config.model.dtype_override.value]
edge_manager = _load_llama_model(llm_config)
@@ -702,7 +702,7 @@ def _prepare_for_llama_export(llm_config: LlmConfig) -> LLMEdgeManager:
checkpoint=llm_config.base.checkpoint,
checkpoint_dtype=DType.from_torch_dtype(checkpoint_dtype), # type: ignore
tokenizer_path=llm_config.base.tokenizer_path,
- use_spin_quant=llm_config.quantization.use_spin_quant,
+ use_spin_quant=llm_config.quantization.use_spin_quant.value if llm_config.quantization.use_spin_quant else None,
embedding_quantize=llm_config.quantization.embedding_quantize,
use_shared_embedding=llm_config.model.use_shared_embedding,
quantization_mode=llm_config.quantization.qmode,
@@ -726,7 +726,7 @@ def _prepare_for_llama_export(llm_config: LlmConfig) -> LLMEdgeManager:
vulkan=llm_config.backend.vulkan.enabled,
use_qat=llm_config.quantization.use_qat,
use_lora=llm_config.base.use_lora,
- preq_mode=llm_config.base.preq_mode,
+ preq_mode=llm_config.base.preq_mode.value if llm_config.base.preq_mode else None,
preq_group_size=llm_config.base.preq_group_size,
preq_embedding_quantize=llm_config.base.preq_embedding_quantize,
local_global_attention=llm_config.model.local_global_attention,
@@ -738,25 +738,25 @@ def _prepare_for_llama_export(llm_config: LlmConfig) -> LLMEdgeManager:
def get_quantizer_and_quant_params(llm_config):
pt2e_quant_params = get_pt2e_quantization_params(
- llm_config.quantization.pt2e_quantize, llm_config.quantization.qmode
+ llm_config.quantization.pt2e_quantize.value if llm_config.quantization.pt2e_quantize else None, llm_config.quantization.qmode
)
quantizers = get_pt2e_quantizers(pt2e_quant_params, llm_config.export.so_library)
quant_dtype = None
if llm_config.backend.qnn.enabled and llm_config.quantization.pt2e_quantize:
assert len(quantizers) == 0, "Should not enable both xnnpack and qnn"
qnn_quantizer, quant_dtype = get_qnn_quantizer(
- llm_config.quantization.pt2e_quantize, llm_config.quantization.qmode
+ llm_config.quantization.pt2e_quantize.value, llm_config.quantization.qmode
)
quantizers.append(qnn_quantizer)
if llm_config.backend.coreml.enabled and llm_config.quantization.pt2e_quantize:
assert len(quantizers) == 0, "Should not enable both xnnpack / qnn and coreml"
- coreml_quantizer = get_coreml_quantizer(llm_config.quantization.pt2e_quantize)
+ coreml_quantizer = get_coreml_quantizer(llm_config.quantization.pt2e_quantize.value)
quantizers.append(coreml_quantizer)
if llm_config.backend.vulkan.enabled and llm_config.quantization.pt2e_quantize:
assert (
len(quantizers) == 0
), "Should not enable both vulkan and other quantizers"
- vulkan_quantizer = get_vulkan_quantizer(llm_config.quantization.pt2e_quantize)
+ vulkan_quantizer = get_vulkan_quantizer(llm_config.quantization.pt2e_quantize.value)
quantizers.append(vulkan_quantizer)
logging.info(f"Applying quantizers: {quantizers}")
return pt2e_quant_params, quantizers, quant_dtype
@@ -1033,7 +1033,7 @@ def _export_llama(llm_config: LlmConfig) -> LLMEdgeManager: # noqa: C901
)
additional_passes = []
- if llm_config.base.model_class in TORCHTUNE_DEFINED_MODELS:
+ if llm_config.base.model_class.value in TORCHTUNE_DEFINED_MODELS:
additional_passes = [InitializedMutableBufferPass(["kv_cache_pos"])]
# export_to_edge
@@ -1072,14 +1072,14 @@ def _export_llama(llm_config: LlmConfig) -> LLMEdgeManager: # noqa: C901
mps=llm_config.backend.mps.enabled,
coreml=llm_config.backend.coreml.enabled,
qnn=llm_config.backend.qnn.enabled,
- dtype_override=llm_config.model.dtype_override,
+ dtype_override=llm_config.model.dtype_override.value,
enable_dynamic_shape=llm_config.model.enable_dynamic_shape,
use_kv_cache=llm_config.model.use_kv_cache,
embedding_quantize=llm_config.quantization.embedding_quantize,
- pt2e_quantize=llm_config.quantization.pt2e_quantize,
+ pt2e_quantize=llm_config.quantization.pt2e_quantize.value if llm_config.quantization.pt2e_quantize else None,
coreml_ios=llm_config.backend.coreml.ios,
- coreml_quantize=llm_config.backend.coreml.quantize,
- coreml_compute_units=llm_config.backend.coreml.compute_units,
+ coreml_quantize=llm_config.backend.coreml.quantize.value if llm_config.backend.coreml.quantize else None,
+ coreml_compute_units=llm_config.backend.coreml.compute_units.value,
use_qnn_sha=llm_config.backend.qnn.use_sha,
num_sharding=llm_config.backend.qnn.num_sharding,
soc_model=llm_config.backend.qnn.soc_model,
@@ -1152,7 +1152,7 @@ def _load_llama_model(llm_config: LlmConfig) -> "LLMEdgeManager":
An instance of LLMEdgeManager which contains the eager mode model.
"""
- modelname = llm_config.base.model_class
+ modelname = llm_config.base.model_class.value
if modelname in EXECUTORCH_DEFINED_MODELS:
module_name = "llama"
model_class_name = "Llama2Model" # TODO: Change to "LlamaModel" in examples/models/llama/model.py.
@@ -1173,7 +1173,7 @@ def _load_llama_model(llm_config: LlmConfig) -> "LLMEdgeManager":
)
)
# Convert dtype override string to actual type.
- dtype_override = DType[llm_config.model.dtype_override]
+ dtype_override = DType[llm_config.model.dtype_override.value]
return LLMEdgeManager(
model=model,
diff --git a/examples/models/llama/model.py b/examples/models/llama/model.py
index ec9646be6f4..efea80dde2f 100644
--- a/examples/models/llama/model.py
+++ b/examples/models/llama/model.py
@@ -157,7 +157,7 @@ def __init__(self, llm_config: Optional[LlmConfig] = None):
if model_args.use_scaled_rope:
# Older models don't have use_scaled_rope configuration
- model_name = str(self.llm_config.base.model_class)
+ model_name = self.llm_config.base.model_class.value
assert model_name not in ["llama2", "stories110m"]
# Llama3_2 and newer models in ExecuTorch repo should set larger scale factor
@@ -328,10 +328,10 @@ def get_example_inputs_kvcache_sdpa(self):
def _transform_for_pre_quantization(self, checkpoint, model_args):
assert self.llm_config.base.preq_mode, "preq_mode must be specified"
- assert self.llm_config.base.preq_mode in [
+ assert self.llm_config.base.preq_mode.value in [
"8da4w",
"8da4w_output_8da8w",
- ], f"Quantization mode {self.llm_config.base.preq_mode} is not compatible with SpinQuant."
+ ], f"Quantization mode {self.llm_config.base.preq_mode.value} is not compatible with SpinQuant."
assert self.llm_config.base.preq_group_size, "preq_group_size must be specified"
assert self.llm_config.model.dtype_override, "dtype_override must be specified"
@@ -351,7 +351,7 @@ def _transform_for_pre_quantization(self, checkpoint, model_args):
}
# Transform the output layer first if needed.
- if self.llm_config.base.preq_mode == "8da4w_output_8da8w":
+ if self.llm_config.base.preq_mode.value == "8da4w_output_8da8w":
from .source_transformation.pre_quantization import (
transform_output_linear_for_pre_quantization,
)
@@ -359,14 +359,14 @@ def _transform_for_pre_quantization(self, checkpoint, model_args):
self.model_ = transform_output_linear_for_pre_quantization(
module=self.model_,
checkpoint=checkpoint,
- dtype=mapping[self.llm_config.model.dtype_override],
+ dtype=mapping[self.llm_config.model.dtype_override.value],
)
self.model_ = transform_linear_for_pre_quantization(
self.model_,
checkpoint,
self.llm_config.base.preq_group_size,
- mapping[self.llm_config.model.dtype_override],
+ mapping[self.llm_config.model.dtype_override.value],
)
embedding_bit_width, embedding_group_size = None, None
@@ -390,7 +390,7 @@ def _transform_for_pre_quantization(self, checkpoint, model_args):
self.model_ = transform_embedding_for_pre_quantization(
self.model_,
checkpoint,
- mapping[self.llm_config.model.dtype_override],
+ mapping[self.llm_config.model.dtype_override.value],
int(embedding_bit_width),
embedding_group_size,
)
diff --git a/extension/llm/export/test/test_export_llm.py b/extension/llm/export/test/test_export_llm.py
index 970a32c9606..d36baa6c62c 100644
--- a/extension/llm/export/test/test_export_llm.py
+++ b/extension/llm/export/test/test_export_llm.py
@@ -10,7 +10,16 @@
import unittest
from unittest.mock import MagicMock, patch
-from executorch.examples.models.llama.config.llm_config import LlmConfig
+from executorch.examples.models.llama.config.llm_config import (
+ LlmConfig,
+ ModelType,
+ PreqMode,
+ DtypeOverride,
+ Pt2eQuantize,
+ SpinQuant,
+ CoreMLQuantize,
+ CoreMLComputeUnit
+)
from executorch.extension.llm.export.export_llm import main, parse_config_arg, pop_config_arg
@@ -106,6 +115,96 @@ def test_config_rejects_multiple_cli_args(self) -> None:
finally:
os.unlink(config_file)
+ def test_enum_fields(self) -> None:
+ """Test that all enum fields work correctly with their lowercase keys."""
+ # Test ModelType enum
+ for enum_value in ModelType:
+ self.assertIsNotNone(enum_value.value)
+ self.assertTrue(isinstance(enum_value.value, str))
+
+ # Test specific enum values that were changed from uppercase to lowercase
+ self.assertEqual(ModelType.stories110m.value, "stories110m")
+ self.assertEqual(ModelType.llama2.value, "llama2")
+ self.assertEqual(ModelType.llama3.value, "llama3")
+ self.assertEqual(ModelType.llama3_1.value, "llama3_1")
+ self.assertEqual(ModelType.llama3_2.value, "llama3_2")
+ self.assertEqual(ModelType.llama3_2_vision.value, "llama3_2_vision")
+ self.assertEqual(ModelType.static_llama.value, "static_llama")
+ self.assertEqual(ModelType.qwen2_5.value, "qwen2_5")
+ self.assertEqual(ModelType.qwen3_0_6b.value, "qwen3-0_6b")
+ self.assertEqual(ModelType.qwen3_1_7b.value, "qwen3-1_7b")
+ self.assertEqual(ModelType.qwen3_4b.value, "qwen3-4b")
+ self.assertEqual(ModelType.phi_4_mini.value, "phi_4_mini")
+ self.assertEqual(ModelType.smollm2.value, "smollm2")
+
+ # Test PreqMode enum
+ self.assertEqual(PreqMode.preq_8da4w.value, "8da4w")
+ self.assertEqual(PreqMode.preq_8da4w_out_8da8w.value, "8da4w_output_8da8w")
+
+ # Test DtypeOverride enum
+ self.assertEqual(DtypeOverride.fp32.value, "fp32")
+ self.assertEqual(DtypeOverride.fp16.value, "fp16")
+ self.assertEqual(DtypeOverride.bf16.value, "bf16")
+
+ # Test Pt2eQuantize enum
+ self.assertEqual(Pt2eQuantize.xnnpack_dynamic.value, "xnnpack_dynamic")
+ self.assertEqual(Pt2eQuantize.xnnpack_dynamic_qc4.value, "xnnpack_dynamic_qc4")
+ self.assertEqual(Pt2eQuantize.qnn_8a8w.value, "qnn_8a8w")
+ self.assertEqual(Pt2eQuantize.qnn_16a16w.value, "qnn_16a16w")
+ self.assertEqual(Pt2eQuantize.qnn_16a4w.value, "qnn_16a4w")
+ self.assertEqual(Pt2eQuantize.coreml_c4w.value, "coreml_c4w")
+ self.assertEqual(Pt2eQuantize.coreml_8a_c8w.value, "coreml_8a_c8w")
+ self.assertEqual(Pt2eQuantize.coreml_8a_c4w.value, "coreml_8a_c4w")
+ self.assertEqual(Pt2eQuantize.coreml_baseline_8a_c8w.value, "coreml_baseline_8a_c8w")
+ self.assertEqual(Pt2eQuantize.coreml_baseline_8a_c4w.value, "coreml_baseline_8a_c4w")
+ self.assertEqual(Pt2eQuantize.vulkan_8w.value, "vulkan_8w")
+
+ # Test SpinQuant enum
+ self.assertEqual(SpinQuant.cuda.value, "cuda")
+ self.assertEqual(SpinQuant.native.value, "native")
+
+ # Test CoreMLQuantize enum
+ self.assertEqual(CoreMLQuantize.b4w.value, "b4w")
+ self.assertEqual(CoreMLQuantize.c4w.value, "c4w")
+
+ # Test CoreMLComputeUnit enum
+ self.assertEqual(CoreMLComputeUnit.cpu_only.value, "cpu_only")
+ self.assertEqual(CoreMLComputeUnit.cpu_and_gpu.value, "cpu_and_gpu")
+ self.assertEqual(CoreMLComputeUnit.cpu_and_ne.value, "cpu_and_ne")
+ self.assertEqual(CoreMLComputeUnit.all.value, "all")
+
+ def test_enum_configuration(self) -> None:
+ """Test that enum fields can be properly set in LlmConfig."""
+ config = LlmConfig()
+
+ # Test setting ModelType
+ config.base.model_class = ModelType.llama3
+ self.assertEqual(config.base.model_class.value, "llama3")
+
+ # Test setting DtypeOverride
+ config.model.dtype_override = DtypeOverride.fp16
+ self.assertEqual(config.model.dtype_override.value, "fp16")
+
+ # Test setting PreqMode
+ config.base.preq_mode = PreqMode.preq_8da4w
+ self.assertEqual(config.base.preq_mode.value, "8da4w")
+
+ # Test setting Pt2eQuantize
+ config.quantization.pt2e_quantize = Pt2eQuantize.xnnpack_dynamic
+ self.assertEqual(config.quantization.pt2e_quantize.value, "xnnpack_dynamic")
+
+ # Test setting SpinQuant
+ config.quantization.use_spin_quant = SpinQuant.cuda
+ self.assertEqual(config.quantization.use_spin_quant.value, "cuda")
+
+ # Test setting CoreMLQuantize
+ config.backend.coreml.quantize = CoreMLQuantize.c4w
+ self.assertEqual(config.backend.coreml.quantize.value, "c4w")
+
+ # Test setting CoreMLComputeUnit
+ config.backend.coreml.compute_units = CoreMLComputeUnit.cpu_and_gpu
+ self.assertEqual(config.backend.coreml.compute_units.value, "cpu_and_gpu")
+
if __name__ == "__main__":
unittest.main()
From eedd8333e8ea8bc86bb08c24586f2309f2af6252 Mon Sep 17 00:00:00 2001
From: Jack Zhang <32371937+jackzhxng@users.noreply.github.com>
Date: Fri, 20 Jun 2025 14:22:21 -0700
Subject: [PATCH 05/17] Update
[ghstack-poisoned]
---
extension/llm/export/README.md | 8 --------
extension/llm/install_requirements.sh | 9 ---------
requirements-dev.txt | 2 ++
3 files changed, 2 insertions(+), 17 deletions(-)
delete mode 100755 extension/llm/install_requirements.sh
diff --git a/extension/llm/export/README.md b/extension/llm/export/README.md
index e15c7fd7f77..1ac27306c86 100644
--- a/extension/llm/export/README.md
+++ b/extension/llm/export/README.md
@@ -21,14 +21,6 @@ The LLM export process transforms a model from its original format to an optimiz
- **Stories**: Stories110M (educational model)
- **SmolLM**: SmolLM2
-## Installation
-
-First, install the required dependencies:
-
-```bash
-./extension/llm/install_requirements.sh
-```
-
## Usage
The export API supports two configuration approaches:
diff --git a/extension/llm/install_requirements.sh b/extension/llm/install_requirements.sh
deleted file mode 100755
index 8f322083c03..00000000000
--- a/extension/llm/install_requirements.sh
+++ /dev/null
@@ -1,9 +0,0 @@
-#!/bin/bash
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the BSD-style license found in the
-# LICENSE file in the root directory of this source tree.
-
-# Install requirements for LLM extension
-pip install hydra-core>=1.3.0 omegaconf>=2.3.0
diff --git a/requirements-dev.txt b/requirements-dev.txt
index a4ed212fb65..07c63101eb8 100644
--- a/requirements-dev.txt
+++ b/requirements-dev.txt
@@ -9,3 +9,5 @@ wheel # For building the pip package archive.
zstd # Imported by resolve_buck.py.
lintrunner==0.12.7
lintrunner-adapters==0.12.4
+hydra-core>=1.3.0
+omegaconf>=2.3.0
From 0cffae8114eb35f298762c9308da6f82f95ce693 Mon Sep 17 00:00:00 2001
From: Jack Zhang <32371937+jackzhxng@users.noreply.github.com>
Date: Fri, 20 Jun 2025 14:27:38 -0700
Subject: [PATCH 06/17] Update
[ghstack-poisoned]
---
extension/llm/export/test/test_export_llm.py | 119 +++----------------
1 file changed, 19 insertions(+), 100 deletions(-)
diff --git a/extension/llm/export/test/test_export_llm.py b/extension/llm/export/test/test_export_llm.py
index d36baa6c62c..c4390050235 100644
--- a/extension/llm/export/test/test_export_llm.py
+++ b/extension/llm/export/test/test_export_llm.py
@@ -10,16 +10,7 @@
import unittest
from unittest.mock import MagicMock, patch
-from executorch.examples.models.llama.config.llm_config import (
- LlmConfig,
- ModelType,
- PreqMode,
- DtypeOverride,
- Pt2eQuantize,
- SpinQuant,
- CoreMLQuantize,
- CoreMLComputeUnit
-)
+from executorch.examples.models.llama.config.llm_config import LlmConfig
from executorch.extension.llm.export.export_llm import main, parse_config_arg, pop_config_arg
@@ -56,9 +47,20 @@ def test_with_config(self, mock_export_llama: MagicMock) -> None:
with tempfile.NamedTemporaryFile(mode="w", suffix=".yaml", delete=False) as f:
f.write("""
base:
+ model_class: llama2
tokenizer_path: /path/to/tokenizer.json
+ preq_mode: preq_8da4w
+model:
+ dtype_override: fp16
export:
max_seq_length: 256
+quantization:
+ pt2e_quantize: xnnpack_dynamic
+ use_spin_quant: cuda
+backend:
+ coreml:
+ quantize: c4w
+ compute_units: cpu_and_gpu
""")
config_file = f.name
@@ -71,7 +73,14 @@ def test_with_config(self, mock_export_llama: MagicMock) -> None:
mock_export_llama.assert_called_once()
called_config = mock_export_llama.call_args[0][0]
self.assertEqual(called_config["base"]["tokenizer_path"], "/path/to/tokenizer.json")
+ self.assertEqual(called_config["base"]["model_class"], "llama2")
+ self.assertEqual(called_config["base"]["preq_mode"], "preq_8da4w")
+ self.assertEqual(called_config["model"]["dtype_override"], "fp16")
self.assertEqual(called_config["export"]["max_seq_length"], 256)
+ self.assertEqual(called_config["quantization"]["pt2e_quantize"], "xnnpack_dynamic")
+ self.assertEqual(called_config["quantization"]["use_spin_quant"], "cuda")
+ self.assertEqual(called_config["backend"]["coreml"]["quantize"], "c4w")
+ self.assertEqual(called_config["backend"]["coreml"]["compute_units"], "cpu_and_gpu")
finally:
os.unlink(config_file)
@@ -115,96 +124,6 @@ def test_config_rejects_multiple_cli_args(self) -> None:
finally:
os.unlink(config_file)
- def test_enum_fields(self) -> None:
- """Test that all enum fields work correctly with their lowercase keys."""
- # Test ModelType enum
- for enum_value in ModelType:
- self.assertIsNotNone(enum_value.value)
- self.assertTrue(isinstance(enum_value.value, str))
-
- # Test specific enum values that were changed from uppercase to lowercase
- self.assertEqual(ModelType.stories110m.value, "stories110m")
- self.assertEqual(ModelType.llama2.value, "llama2")
- self.assertEqual(ModelType.llama3.value, "llama3")
- self.assertEqual(ModelType.llama3_1.value, "llama3_1")
- self.assertEqual(ModelType.llama3_2.value, "llama3_2")
- self.assertEqual(ModelType.llama3_2_vision.value, "llama3_2_vision")
- self.assertEqual(ModelType.static_llama.value, "static_llama")
- self.assertEqual(ModelType.qwen2_5.value, "qwen2_5")
- self.assertEqual(ModelType.qwen3_0_6b.value, "qwen3-0_6b")
- self.assertEqual(ModelType.qwen3_1_7b.value, "qwen3-1_7b")
- self.assertEqual(ModelType.qwen3_4b.value, "qwen3-4b")
- self.assertEqual(ModelType.phi_4_mini.value, "phi_4_mini")
- self.assertEqual(ModelType.smollm2.value, "smollm2")
-
- # Test PreqMode enum
- self.assertEqual(PreqMode.preq_8da4w.value, "8da4w")
- self.assertEqual(PreqMode.preq_8da4w_out_8da8w.value, "8da4w_output_8da8w")
-
- # Test DtypeOverride enum
- self.assertEqual(DtypeOverride.fp32.value, "fp32")
- self.assertEqual(DtypeOverride.fp16.value, "fp16")
- self.assertEqual(DtypeOverride.bf16.value, "bf16")
-
- # Test Pt2eQuantize enum
- self.assertEqual(Pt2eQuantize.xnnpack_dynamic.value, "xnnpack_dynamic")
- self.assertEqual(Pt2eQuantize.xnnpack_dynamic_qc4.value, "xnnpack_dynamic_qc4")
- self.assertEqual(Pt2eQuantize.qnn_8a8w.value, "qnn_8a8w")
- self.assertEqual(Pt2eQuantize.qnn_16a16w.value, "qnn_16a16w")
- self.assertEqual(Pt2eQuantize.qnn_16a4w.value, "qnn_16a4w")
- self.assertEqual(Pt2eQuantize.coreml_c4w.value, "coreml_c4w")
- self.assertEqual(Pt2eQuantize.coreml_8a_c8w.value, "coreml_8a_c8w")
- self.assertEqual(Pt2eQuantize.coreml_8a_c4w.value, "coreml_8a_c4w")
- self.assertEqual(Pt2eQuantize.coreml_baseline_8a_c8w.value, "coreml_baseline_8a_c8w")
- self.assertEqual(Pt2eQuantize.coreml_baseline_8a_c4w.value, "coreml_baseline_8a_c4w")
- self.assertEqual(Pt2eQuantize.vulkan_8w.value, "vulkan_8w")
-
- # Test SpinQuant enum
- self.assertEqual(SpinQuant.cuda.value, "cuda")
- self.assertEqual(SpinQuant.native.value, "native")
-
- # Test CoreMLQuantize enum
- self.assertEqual(CoreMLQuantize.b4w.value, "b4w")
- self.assertEqual(CoreMLQuantize.c4w.value, "c4w")
-
- # Test CoreMLComputeUnit enum
- self.assertEqual(CoreMLComputeUnit.cpu_only.value, "cpu_only")
- self.assertEqual(CoreMLComputeUnit.cpu_and_gpu.value, "cpu_and_gpu")
- self.assertEqual(CoreMLComputeUnit.cpu_and_ne.value, "cpu_and_ne")
- self.assertEqual(CoreMLComputeUnit.all.value, "all")
-
- def test_enum_configuration(self) -> None:
- """Test that enum fields can be properly set in LlmConfig."""
- config = LlmConfig()
-
- # Test setting ModelType
- config.base.model_class = ModelType.llama3
- self.assertEqual(config.base.model_class.value, "llama3")
-
- # Test setting DtypeOverride
- config.model.dtype_override = DtypeOverride.fp16
- self.assertEqual(config.model.dtype_override.value, "fp16")
-
- # Test setting PreqMode
- config.base.preq_mode = PreqMode.preq_8da4w
- self.assertEqual(config.base.preq_mode.value, "8da4w")
-
- # Test setting Pt2eQuantize
- config.quantization.pt2e_quantize = Pt2eQuantize.xnnpack_dynamic
- self.assertEqual(config.quantization.pt2e_quantize.value, "xnnpack_dynamic")
-
- # Test setting SpinQuant
- config.quantization.use_spin_quant = SpinQuant.cuda
- self.assertEqual(config.quantization.use_spin_quant.value, "cuda")
-
- # Test setting CoreMLQuantize
- config.backend.coreml.quantize = CoreMLQuantize.c4w
- self.assertEqual(config.backend.coreml.quantize.value, "c4w")
-
- # Test setting CoreMLComputeUnit
- config.backend.coreml.compute_units = CoreMLComputeUnit.cpu_and_gpu
- self.assertEqual(config.backend.coreml.compute_units.value, "cpu_and_gpu")
-
if __name__ == "__main__":
unittest.main()
From 73377ca332c97df1985d77ec5659f2f86fb71063 Mon Sep 17 00:00:00 2001
From: Jack Zhang <32371937+jackzhxng@users.noreply.github.com>
Date: Fri, 20 Jun 2025 16:01:08 -0700
Subject: [PATCH 07/17] Update
[ghstack-poisoned]
---
.ci/configs/README.md | 42 +++++
.ci/configs/ci_stories110m_coreml.yaml | 20 +++
.ci/configs/ci_stories110m_mps.yaml | 20 +++
.ci/configs/ci_stories110m_qnn.yaml | 28 +++
.../ci_stories110m_xnnpack_quantized.yaml | 27 +++
.ci/configs/llama3_coreml_ane.yaml | 27 +++
.ci/configs/llama3_qlora.yaml | 30 ++++
.ci/configs/llama3_spinquant.yaml | 29 ++++
.ci/configs/qwen3_xnnpack_8da4w.yaml | 28 +++
.ci/configs/stories110m_torchao_lowbit.yaml | 26 +++
.ci/configs/xnnpack_8da4w_basic.yaml | 27 +++
.ci/configs/xnnpack_custom_quantized.yaml | 27 +++
.ci/scripts/test_llama.sh | 20 +--
.ci/scripts/test_llama_torchao_lowbit.sh | 20 +--
.ci/scripts/test_model.sh | 12 +-
.github/workflows/android-perf.yml | 142 +++++++--------
.github/workflows/apple-perf.yml | 164 +++++++++---------
17 files changed, 512 insertions(+), 177 deletions(-)
create mode 100644 .ci/configs/README.md
create mode 100644 .ci/configs/ci_stories110m_coreml.yaml
create mode 100644 .ci/configs/ci_stories110m_mps.yaml
create mode 100644 .ci/configs/ci_stories110m_qnn.yaml
create mode 100644 .ci/configs/ci_stories110m_xnnpack_quantized.yaml
create mode 100644 .ci/configs/llama3_coreml_ane.yaml
create mode 100644 .ci/configs/llama3_qlora.yaml
create mode 100644 .ci/configs/llama3_spinquant.yaml
create mode 100644 .ci/configs/qwen3_xnnpack_8da4w.yaml
create mode 100644 .ci/configs/stories110m_torchao_lowbit.yaml
create mode 100644 .ci/configs/xnnpack_8da4w_basic.yaml
create mode 100644 .ci/configs/xnnpack_custom_quantized.yaml
diff --git a/.ci/configs/README.md b/.ci/configs/README.md
new file mode 100644
index 00000000000..c77e758d992
--- /dev/null
+++ b/.ci/configs/README.md
@@ -0,0 +1,42 @@
+# CI Configuration Files for LLM Export
+
+This directory contains YAML configuration files used by CI tests for exporting LLM models with the new `extension.llm.export.export_llm` command.
+
+## Usage
+
+These config files can be used with the export command like this:
+
+```bash
+python -m extension.llm.export.export_llm --config path/to/config.yaml
+```
+
+Or you can override specific parameters:
+
+```bash
+python -m extension.llm.export.export_llm --config ci_stories110m_xnnpack_quantized.yaml base.checkpoint=my_checkpoint.pt
+```
+
+## Configuration Files
+
+### CI Test Configurations
+- `ci_stories110m_xnnpack_quantized.yaml` - Stories110M with XNNPACK quantization (used in test_llama.sh)
+- `ci_stories110m_mps.yaml` - Stories110M with MPS backend
+- `ci_stories110m_coreml.yaml` - Stories110M with CoreML backend
+- `ci_stories110m_qnn.yaml` - Stories110M with QNN backend
+
+### Performance Test Configurations
+- `llama3_spinquant.yaml` - Llama3 with SpinQuant (used in apple-perf.yml, android-perf.yml)
+- `llama3_qlora.yaml` - Llama3 with QLoRA (QAT + LoRA)
+- `llama3_coreml_ane.yaml` - Llama3 with CoreML ANE
+- `xnnpack_8da4w_basic.yaml` - Basic XNNPACK 8da4w quantization
+- `qwen3_xnnpack_8da4w.yaml` - Qwen3 with XNNPACK 8da4w quantization
+
+### Specialized Configurations
+- `stories110m_torchao_lowbit.yaml` - Stories110M with TorchAO lowbit quantization
+- `xnnpack_custom_quantized.yaml` - XNNPACK with custom ops and quantization
+
+## Background
+
+These configuration files were created as part of migrating CI tests from the old `examples.models.llama.export_llama` command to the new `extension.llm.export.export_llm` command with Hydra configuration support.
+
+The config files help reduce duplication in CI scripts and make it easier to maintain consistent export settings across different test scenarios.
\ No newline at end of file
diff --git a/.ci/configs/ci_stories110m_coreml.yaml b/.ci/configs/ci_stories110m_coreml.yaml
new file mode 100644
index 00000000000..9ef9a5d2f72
--- /dev/null
+++ b/.ci/configs/ci_stories110m_coreml.yaml
@@ -0,0 +1,20 @@
+# Configuration for CI test_llama.sh - stories110M with CoreML backend
+
+base:
+ model_class: "stories110m"
+
+model:
+ dtype_override: "fp32"
+ use_kv_cache: true
+ enable_dynamic_shape: false
+
+export:
+ max_seq_length: 128
+ max_context_length: 128
+
+backend:
+ coreml:
+ enabled: true
+
+debug:
+ verbose: true
\ No newline at end of file
diff --git a/.ci/configs/ci_stories110m_mps.yaml b/.ci/configs/ci_stories110m_mps.yaml
new file mode 100644
index 00000000000..4b568a72203
--- /dev/null
+++ b/.ci/configs/ci_stories110m_mps.yaml
@@ -0,0 +1,20 @@
+# Configuration for CI test_llama.sh - stories110M with MPS backend
+
+base:
+ model_class: "stories110m"
+
+model:
+ dtype_override: "fp32"
+ use_kv_cache: true
+ enable_dynamic_shape: false
+
+export:
+ max_seq_length: 128
+ max_context_length: 128
+
+backend:
+ mps:
+ enabled: true
+
+debug:
+ verbose: true
\ No newline at end of file
diff --git a/.ci/configs/ci_stories110m_qnn.yaml b/.ci/configs/ci_stories110m_qnn.yaml
new file mode 100644
index 00000000000..75061aa6c2b
--- /dev/null
+++ b/.ci/configs/ci_stories110m_qnn.yaml
@@ -0,0 +1,28 @@
+# Configuration for CI test_llama.sh - stories110M with QNN backend
+
+base:
+ model_class: "stories110m"
+ tokenizer_path: "tokenizer.model"
+
+model:
+ dtype_override: "fp32"
+ use_kv_cache: true
+ enable_dynamic_shape: false
+
+export:
+ max_seq_length: 128
+ max_context_length: 128
+
+quantization:
+ pt2e_quantize: "qnn_16a16w"
+ calibration_tasks: ["wikitext"]
+ calibration_limit: 1
+ calibration_seq_length: 128
+ calibration_data: "Once"
+
+backend:
+ qnn:
+ enabled: true
+
+debug:
+ verbose: true
\ No newline at end of file
diff --git a/.ci/configs/ci_stories110m_xnnpack_quantized.yaml b/.ci/configs/ci_stories110m_xnnpack_quantized.yaml
new file mode 100644
index 00000000000..6fa692fbc42
--- /dev/null
+++ b/.ci/configs/ci_stories110m_xnnpack_quantized.yaml
@@ -0,0 +1,27 @@
+# Configuration for CI test_llama.sh - stories110M with XNNPACK quantization
+# Used when XNNPACK=ON, CUSTOM=ON, QE=ON modes are enabled
+
+base:
+ model_class: "stories110m"
+
+model:
+ dtype_override: "fp32"
+ use_kv_cache: true
+ use_sdpa_with_kv_cache: true
+
+export:
+ max_seq_length: 128
+ max_context_length: 128
+
+quantization:
+ qmode: "8da4w"
+ group_size: 128
+ embedding_quantize: "8,1024"
+
+backend:
+ xnnpack:
+ enabled: true
+ extended_ops: true
+
+debug:
+ verbose: false
\ No newline at end of file
diff --git a/.ci/configs/llama3_coreml_ane.yaml b/.ci/configs/llama3_coreml_ane.yaml
new file mode 100644
index 00000000000..fb11b0edd86
--- /dev/null
+++ b/.ci/configs/llama3_coreml_ane.yaml
@@ -0,0 +1,27 @@
+# Configuration for Llama3 with CoreML ANE
+# Used in apple-perf.yml
+
+base:
+ model_class: "llama3_2"
+
+model:
+ dtype_override: "fp32"
+ use_kv_cache: true
+ enable_dynamic_shape: false
+
+export:
+ max_seq_length: 128
+ max_context_length: 128
+
+quantization:
+ embedding_quantize: "4,32"
+
+backend:
+ coreml:
+ enabled: true
+ ios: 18
+ quantize: "c4w"
+ compute_units: "cpu_and_ne"
+
+debug:
+ verbose: false
\ No newline at end of file
diff --git a/.ci/configs/llama3_qlora.yaml b/.ci/configs/llama3_qlora.yaml
new file mode 100644
index 00000000000..e97ab40ff2a
--- /dev/null
+++ b/.ci/configs/llama3_qlora.yaml
@@ -0,0 +1,30 @@
+# Configuration for Llama3 with QLoRA (QAT + LoRA)
+# Used in apple-perf.yml and android-perf.yml
+
+base:
+ model_class: "llama3_2"
+ use_lora: 16
+ preq_mode: "8da4w_output_8da8w"
+ preq_group_size: 32
+ preq_embedding_quantize: "8,0"
+
+model:
+ dtype_override: "fp32"
+ use_kv_cache: true
+ use_sdpa_with_kv_cache: true
+ enable_dynamic_shape: false
+
+export:
+ max_seq_length: 2048
+ max_context_length: 2048
+
+quantization:
+ use_qat: true
+
+backend:
+ xnnpack:
+ enabled: true
+ extended_ops: true
+
+debug:
+ verbose: false
\ No newline at end of file
diff --git a/.ci/configs/llama3_spinquant.yaml b/.ci/configs/llama3_spinquant.yaml
new file mode 100644
index 00000000000..4eba1340f16
--- /dev/null
+++ b/.ci/configs/llama3_spinquant.yaml
@@ -0,0 +1,29 @@
+# Configuration for Llama3 with SpinQuant
+# Used in apple-perf.yml and android-perf.yml
+
+base:
+ model_class: "llama3_2"
+ preq_mode: "8da4w_output_8da8w"
+ preq_group_size: 32
+ preq_embedding_quantize: "8,0"
+
+model:
+ dtype_override: "fp32"
+ use_kv_cache: true
+ use_sdpa_with_kv_cache: true
+ enable_dynamic_shape: false
+
+export:
+ max_seq_length: 2048
+ max_context_length: 2048
+
+quantization:
+ use_spin_quant: "native"
+
+backend:
+ xnnpack:
+ enabled: true
+ extended_ops: true
+
+debug:
+ verbose: false
\ No newline at end of file
diff --git a/.ci/configs/qwen3_xnnpack_8da4w.yaml b/.ci/configs/qwen3_xnnpack_8da4w.yaml
new file mode 100644
index 00000000000..433ed92acdc
--- /dev/null
+++ b/.ci/configs/qwen3_xnnpack_8da4w.yaml
@@ -0,0 +1,28 @@
+# Configuration for Qwen3-0.6B with XNNPACK 8da4w quantization
+# Used in apple-perf.yml and android-perf.yml
+
+base:
+ model_class: "qwen3-0_6b"
+ params: "examples/models/qwen3/0_6b_config.json"
+
+model:
+ dtype_override: "fp32"
+ use_kv_cache: true
+ use_sdpa_with_kv_cache: true
+
+export:
+ max_seq_length: 128
+ max_context_length: 128
+
+quantization:
+ qmode: "8da4w"
+ group_size: 32
+ embedding_quantize: "8,0"
+
+backend:
+ xnnpack:
+ enabled: true
+ extended_ops: true
+
+debug:
+ verbose: false
\ No newline at end of file
diff --git a/.ci/configs/stories110m_torchao_lowbit.yaml b/.ci/configs/stories110m_torchao_lowbit.yaml
new file mode 100644
index 00000000000..12970cd9fcd
--- /dev/null
+++ b/.ci/configs/stories110m_torchao_lowbit.yaml
@@ -0,0 +1,26 @@
+# Configuration for stories110M with TorchAO lowbit quantization
+# Used in CI test_llama_torchao_lowbit.sh
+
+base:
+ model_class: "stories110m"
+
+model:
+ dtype_override: "fp32"
+ use_kv_cache: true
+ use_sdpa_with_kv_cache: true
+
+export:
+ max_seq_length: 128
+ max_context_length: 128
+
+quantization:
+ qmode: "torchao:8da3w" # QLINEAR_BITWIDTH=3
+ group_size: 128 # QLINEAR_GROUP_SIZE=128
+ embedding_quantize: "4,32" # QEMBEDDING_BITWIDTH=4, QEMBEDDING_GROUP_SIZE=32
+
+backend:
+ xnnpack:
+ enabled: false
+
+debug:
+ verbose: false
\ No newline at end of file
diff --git a/.ci/configs/xnnpack_8da4w_basic.yaml b/.ci/configs/xnnpack_8da4w_basic.yaml
new file mode 100644
index 00000000000..ce727df6d5c
--- /dev/null
+++ b/.ci/configs/xnnpack_8da4w_basic.yaml
@@ -0,0 +1,27 @@
+# Configuration for basic XNNPACK 8da4w quantization
+# Used in apple-perf.yml and android-perf.yml
+
+base:
+ model_class: "llama3_2"
+
+model:
+ dtype_override: "fp32"
+ use_kv_cache: true
+ use_sdpa_with_kv_cache: true
+
+export:
+ max_seq_length: 128
+ max_context_length: 128
+
+quantization:
+ qmode: "8da4w"
+ group_size: 32
+ embedding_quantize: "8,0"
+
+backend:
+ xnnpack:
+ enabled: true
+ extended_ops: true
+
+debug:
+ verbose: false
\ No newline at end of file
diff --git a/.ci/configs/xnnpack_custom_quantized.yaml b/.ci/configs/xnnpack_custom_quantized.yaml
new file mode 100644
index 00000000000..6101296489d
--- /dev/null
+++ b/.ci/configs/xnnpack_custom_quantized.yaml
@@ -0,0 +1,27 @@
+# Configuration for XNNPACK + custom + quantization
+# Common pattern used in CI test_llama.sh
+
+base:
+ model_class: "stories110m"
+
+model:
+ dtype_override: "fp32"
+ use_kv_cache: true
+ use_sdpa_with_kv_cache: true
+
+export:
+ max_seq_length: 128
+ max_context_length: 128
+
+quantization:
+ qmode: "8da4w"
+ group_size: 128
+ embedding_quantize: "8,1024" # Default from test_llama.sh QE mode
+
+backend:
+ xnnpack:
+ enabled: true
+ extended_ops: true
+
+debug:
+ verbose: false
\ No newline at end of file
diff --git a/.ci/scripts/test_llama.sh b/.ci/scripts/test_llama.sh
index 9f183528719..7371e632521 100644
--- a/.ci/scripts/test_llama.sh
+++ b/.ci/scripts/test_llama.sh
@@ -224,34 +224,34 @@ fi
# Export model.
EXPORTED_MODEL_NAME="${EXPORTED_MODEL_NAME}.pte"
echo "Exporting ${EXPORTED_MODEL_NAME}"
-EXPORT_ARGS="-c ${CHECKPOINT_FILE_NAME} -p ${PARAMS} -d ${DTYPE} -n ${EXPORTED_MODEL_NAME} -kv"
+EXPORT_ARGS="base.checkpoint=${CHECKPOINT_FILE_NAME} base.params=${PARAMS} model.dtype_override=${DTYPE} export.output_name=${EXPORTED_MODEL_NAME} model.use_kv_cache=true"
if [[ "${XNNPACK}" == "ON" ]]; then
- EXPORT_ARGS="${EXPORT_ARGS} -X --xnnpack-extended-ops -qmode 8da4w -G 128"
+ EXPORT_ARGS="${EXPORT_ARGS} backend.xnnpack.enabled=true backend.xnnpack.extended_ops=true quantization.qmode=8da4w quantization.group_size=128"
fi
if [[ "${CUSTOM}" == "ON" ]]; then
- EXPORT_ARGS="${EXPORT_ARGS} --use_sdpa_with_kv_cache"
+ EXPORT_ARGS="${EXPORT_ARGS} model.use_sdpa_with_kv_cache=true"
fi
if [[ "${QE}" == "ON" ]]; then
- EXPORT_ARGS="${EXPORT_ARGS} --embedding-quantize 8,1024"
+ EXPORT_ARGS="${EXPORT_ARGS} quantization.embedding_quantize=8,1024"
fi
if [[ "${MPS}" == "ON" ]]; then
- EXPORT_ARGS="${EXPORT_ARGS} -kv -v --mps --disable_dynamic_shape"
+ EXPORT_ARGS="${EXPORT_ARGS} backend.mps.enabled=true model.enable_dynamic_shape=false debug.verbose=true"
fi
if [[ "${COREML}" == "ON" ]]; then
- EXPORT_ARGS="${EXPORT_ARGS} -kv -v --coreml --disable_dynamic_shape"
+ EXPORT_ARGS="${EXPORT_ARGS} backend.coreml.enabled=true model.enable_dynamic_shape=false debug.verbose=true"
fi
if [[ "${QNN}" == "ON" ]]; then
- EXPORT_ARGS="${EXPORT_ARGS} -kv -v --qnn --disable_dynamic_shape"
+ EXPORT_ARGS="${EXPORT_ARGS} backend.qnn.enabled=true model.enable_dynamic_shape=false debug.verbose=true"
echo "PT2E_QUANTIZE is ${PT2E_QUANTIZE}"
if [[ "${PT2E_QUANTIZE}" == "qnn_16a16w" ]]; then
- EXPORT_ARGS+=" --tokenizer_path tokenizer.model --pt2e_quantize qnn_16a16w --calibration_tasks wikitext --calibration_limit 1 --calibration_seq_length 128 --calibration_data Once "
+ EXPORT_ARGS+=" base.tokenizer_path=tokenizer.model quantization.pt2e_quantize=qnn_16a16w quantization.calibration_tasks=[wikitext] quantization.calibration_limit=1 quantization.calibration_seq_length=128 quantization.calibration_data='Once '"
fi
fi
if [[ "${QUANTIZE_KV_CACHE}" == "ON" ]]; then
- EXPORT_ARGS="${EXPORT_ARGS} --quantize_kv_cache"
+ EXPORT_ARGS="${EXPORT_ARGS} model.quantize_kv_cache=true"
fi
# Add dynamically linked library location
-$PYTHON_EXECUTABLE -m examples.models.llama.export_llama ${EXPORT_ARGS}
+$PYTHON_EXECUTABLE -m extension.llm.export.export_llm ${EXPORT_ARGS}
# Create tokenizer.bin.
echo "Creating tokenizer.bin"
diff --git a/.ci/scripts/test_llama_torchao_lowbit.sh b/.ci/scripts/test_llama_torchao_lowbit.sh
index ac603cc5e83..077a39e6c32 100644
--- a/.ci/scripts/test_llama_torchao_lowbit.sh
+++ b/.ci/scripts/test_llama_torchao_lowbit.sh
@@ -70,16 +70,16 @@ QLINEAR_GROUP_SIZE=128 # Must be multiple of 16
QEMBEDDING_BITWIDTH=4 # Can be 1-8
QEMBEDDING_GROUP_SIZE=32 # Must be multiple of 16
-${PYTHON_EXECUTABLE} -m examples.models.llama.export_llama \
- --checkpoint "${LLAMA_CHECKPOINT:?}" \
- --params "${LLAMA_PARAMS:?}" \
- -kv \
- --use_sdpa_with_kv_cache \
- --output_name=${MODEL_OUT} \
- -qmode "torchao:8da${QLINEAR_BITWIDTH}w" \
- --group_size ${QLINEAR_GROUP_SIZE} \
- -E "torchao:${QEMBEDDING_BITWIDTH},${QEMBEDDING_GROUP_SIZE}" \
- -d fp32
+${PYTHON_EXECUTABLE} -m extension.llm.export.export_llm \
+ base.checkpoint="${LLAMA_CHECKPOINT:?}" \
+ base.params="${LLAMA_PARAMS:?}" \
+ model.use_kv_cache=true \
+ model.use_sdpa_with_kv_cache=true \
+ export.output_name="${MODEL_OUT}" \
+ quantization.qmode="torchao:8da${QLINEAR_BITWIDTH}w" \
+ quantization.group_size=${QLINEAR_GROUP_SIZE} \
+ quantization.embedding_quantize="torchao:${QEMBEDDING_BITWIDTH},${QEMBEDDING_GROUP_SIZE}" \
+ model.dtype_override=fp32
# Test run
./cmake-out/examples/models/llama/llama_main --model_path=$MODEL_OUT --tokenizer_path=$TOKENIZER --prompt="Once upon a time,"
diff --git a/.ci/scripts/test_model.sh b/.ci/scripts/test_model.sh
index 4f8dc7a30e5..bbf879295ae 100755
--- a/.ci/scripts/test_model.sh
+++ b/.ci/scripts/test_model.sh
@@ -86,8 +86,8 @@ test_model() {
if [[ "${MODEL_NAME}" == "llama2" ]]; then
# Install requirements for export_llama
bash examples/models/llama/install_requirements.sh
- # Test export_llama script: python3 -m examples.models.llama.export_llama
- "${PYTHON_EXECUTABLE}" -m examples.models.llama.export_llama --model "${MODEL_NAME}" -c examples/models/llama/params/demo_rand_params.pth -p examples/models/llama/params/demo_config.json
+ # Test export_llm script: python3 -m extension.llm.export.export_llm
+ "${PYTHON_EXECUTABLE}" -m extension.llm.export.export_llm base.model_class="${MODEL_NAME}" base.checkpoint=examples/models/llama/params/demo_rand_params.pth base.params=examples/models/llama/params/demo_config.json
run_portable_executor_runner
rm "./${MODEL_NAME}.pte"
fi
@@ -100,17 +100,17 @@ test_model() {
if [[ "${MODEL_NAME}" == "qwen2_5" ]]; then
# Install requirements for export_llama
bash examples/models/llama/install_requirements.sh
- # Test export_llama script: python3 -m examples.models.llama.export_llama.
+ # Test export_llm script: python3 -m extension.llm.export.export_llm.
# Use Llama random checkpoint with Qwen 2.5 1.5b model configuration.
- "${PYTHON_EXECUTABLE}" -m examples.models.llama.export_llama --model "${MODEL_NAME}" -p examples/models/qwen2_5/1_5b_config.json
+ "${PYTHON_EXECUTABLE}" -m extension.llm.export.export_llm base.model_class="${MODEL_NAME}" base.params=examples/models/qwen2_5/1_5b_config.json
rm "./${MODEL_NAME}.pte"
return # Skip running with portable executor runner since portable doesn't support Qwen's biased linears.
fi
if [[ "${MODEL_NAME}" == "phi_4_mini" ]]; then
# Install requirements for export_llama
bash examples/models/llama/install_requirements.sh
- # Test export_llama script: python3 -m examples.models.llama.export_llama.
- "${PYTHON_EXECUTABLE}" -m examples.models.llama.export_llama --model "${MODEL_NAME}" -p examples/models/phi_4_mini/config.json
+ # Test export_llm script: python3 -m extension.llm.export.export_llm.
+ "${PYTHON_EXECUTABLE}" -m extension.llm.export.export_llm base.model_class="${MODEL_NAME}" base.params=examples/models/phi_4_mini/config.json
run_portable_executor_runner
rm "./${MODEL_NAME}.pte"
return
diff --git a/.github/workflows/android-perf.yml b/.github/workflows/android-perf.yml
index 1a6d63f1bd1..ff134682220 100644
--- a/.github/workflows/android-perf.yml
+++ b/.github/workflows/android-perf.yml
@@ -214,23 +214,23 @@ jobs:
--files "tokenizer.model" "params.json" "consolidated.00.pth"
)
# Export using ExecuTorch's model definition
- python -m examples.models.llama.export_llama \
- --model "llama3_2" \
- --checkpoint "${DOWNLOADED_PATH}/consolidated.00.pth" \
- --params "${DOWNLOADED_PATH}/params.json" \
- --use_sdpa_with_kv_cache \
- -X \
- --xnnpack-extended-ops \
- --preq_mode 8da4w_output_8da8w \
- --preq_group_size 32 \
- --max_seq_length 2048 \
- --max_context_length 2048 \
- --output_name "${OUT_ET_MODEL_NAME}.pte" \
- -kv \
- -d fp32 \
- --preq_embedding_quantize 8,0 \
- --use_spin_quant native \
- --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}'
+ python -m extension.llm.export.export_llm \
+ base.model_class="llama3_2" \
+ base.checkpoint="${DOWNLOADED_PATH}/consolidated.00.pth" \
+ base.params="${DOWNLOADED_PATH}/params.json" \
+ model.use_sdpa_with_kv_cache=true \
+ backend.xnnpack.enabled=true \
+ backend.xnnpack.extended_ops=true \
+ base.preq_mode="8da4w_output_8da8w" \
+ base.preq_group_size=32 \
+ export.max_seq_length=2048 \
+ export.max_context_length=2048 \
+ export.output_name="${OUT_ET_MODEL_NAME}.pte" \
+ model.use_kv_cache=true \
+ model.dtype_override=fp32 \
+ base.preq_embedding_quantize="8,0" \
+ quantization.use_spin_quant=native \
+ base.metadata='{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}'
ls -lh "${OUT_ET_MODEL_NAME}.pte"
elif [[ ${{ matrix.config }} == "llama3_qlora" ]]; then
# QAT + LoRA
@@ -241,53 +241,55 @@ jobs:
--files "tokenizer.model" "params.json" "consolidated.00.pth"
)
# Export using ExecuTorch's model definition
- python -m examples.models.llama.export_llama \
- --model "llama3_2" \
- --checkpoint "${DOWNLOADED_PATH}/consolidated.00.pth" \
- --params "${DOWNLOADED_PATH}/params.json" \
- -qat \
- -lora 16 \
- --preq_mode 8da4w_output_8da8w \
- --preq_group_size 32 \
- --preq_embedding_quantize 8,0 \
- --use_sdpa_with_kv_cache \
- -kv \
- -X \
- --xnnpack-extended-ops \
- -d fp32 \
- --max_seq_length 2048 \
- --max_context_length 2048 \
- --output_name "${OUT_ET_MODEL_NAME}.pte" \
- --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}'
+ python -m extension.llm.export.export_llm \
+ base.model_class="llama3_2" \
+ base.checkpoint="${DOWNLOADED_PATH}/consolidated.00.pth" \
+ base.params="${DOWNLOADED_PATH}/params.json" \
+ quantization.use_qat=true \
+ base.use_lora=16 \
+ base.preq_mode="8da4w_output_8da8w" \
+ base.preq_group_size=32 \
+ base.preq_embedding_quantize="8,0" \
+ model.use_sdpa_with_kv_cache=true \
+ model.use_kv_cache=true \
+ backend.xnnpack.enabled=true \
+ backend.xnnpack.extended_ops=true \
+ model.dtype_override=fp32 \
+ export.max_seq_length=2048 \
+ export.max_context_length=2048 \
+ export.output_name="${OUT_ET_MODEL_NAME}.pte" \
+ base.metadata='{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}'
ls -lh "${OUT_ET_MODEL_NAME}.pte"
elif [[ ${{ matrix.config }} == "llama3_fb16" ]]; then
# Original BF16 version, without any quantization
DOWNLOADED_PATH=$(bash .ci/scripts/download_hf_hub.sh --model_id "${HF_MODEL_REPO}" --subdir "original" --files "tokenizer.model" "params.json" "consolidated.00.pth")
- python -m examples.models.llama.export_llama \
- --model "llama3_2" \
- --checkpoint "${DOWNLOADED_PATH}/consolidated.00.pth" \
- --params "${DOWNLOADED_PATH}/params.json" \
- -kv \
- --use_sdpa_with_kv_cache \
- -X \
- -d bf16 \
- --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' \
- --output_name="${OUT_ET_MODEL_NAME}.pte"
+ python -m extension.llm.export.export_llm \
+ base.model_class="llama3_2" \
+ base.checkpoint="${DOWNLOADED_PATH}/consolidated.00.pth" \
+ base.params="${DOWNLOADED_PATH}/params.json" \
+ model.use_kv_cache=true \
+ model.use_sdpa_with_kv_cache=true \
+ backend.xnnpack.enabled=true \
+ model.dtype_override=bf16 \
+ base.metadata='{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' \
+ export.output_name="${OUT_ET_MODEL_NAME}.pte"
ls -lh "${OUT_ET_MODEL_NAME}.pte"
elif [[ ${{ matrix.config }} == "et_xnnpack_custom_spda_kv_cache_8da4w" ]]; then
DOWNLOADED_PATH=$(bash .ci/scripts/download_hf_hub.sh --model_id "${HF_MODEL_REPO}" --subdir "original" --files "tokenizer.model" "params.json" "consolidated.00.pth")
- python -m examples.models.llama.export_llama \
- --model llama3_2 \
- --checkpoint "${DOWNLOADED_PATH}/consolidated.00.pth" \
- --params "${DOWNLOADED_PATH}/params.json" \
- -kv \
- --use_sdpa_with_kv_cache \
- -d fp32 \
- -X \
- --xnnpack-extended-ops \
- -qmode 8da4w -G 32 -E 8,0 \
- --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' \
- --output_name="${OUT_ET_MODEL_NAME}.pte"
+ python -m extension.llm.export.export_llm \
+ base.model_class=llama3_2 \
+ base.checkpoint="${DOWNLOADED_PATH}/consolidated.00.pth" \
+ base.params="${DOWNLOADED_PATH}/params.json" \
+ model.use_kv_cache=true \
+ model.use_sdpa_with_kv_cache=true \
+ model.dtype_override=fp32 \
+ backend.xnnpack.enabled=true \
+ backend.xnnpack.extended_ops=true \
+ quantization.qmode=8da4w \
+ quantization.group_size=32 \
+ quantization.embedding_quantize="8,0" \
+ base.metadata='{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' \
+ export.output_name="${OUT_ET_MODEL_NAME}.pte"
ls -lh "${OUT_ET_MODEL_NAME}.pte"
elif [[ ${{ matrix.config }} == "llama3_qnn_htp" ]]; then
export QNN_SDK_ROOT=/tmp/qnn/2.28.0.241029
@@ -313,19 +315,19 @@ jobs:
elif [[ "$HF_MODEL_REPO" == "Qwen/Qwen3-0.6B" ]]; then
if [[ ${{ matrix.config }} == "et_xnnpack_custom_spda_kv_cache_8da4w" ]]; then
DOWNLOADED_PATH=$(bash .ci/scripts/download_hf_hub.sh --model_id "${HF_MODEL_REPO}" --subdir "." --files "tokenizer.json")
- python -m examples.models.llama.export_llama \
- --model qwen3-0_6b \
- --params examples/models/qwen3/0_6b_config.json \
- -kv \
- --use_sdpa_with_kv_cache \
- -d fp32 \
- -X \
- --xnnpack-extended-ops \
- -qmode 8da4w \
- -G 32 \
- -E 8,0 \
- --metadata '{"get_bos_id": 151644, "get_eos_ids":[151645]}' \
- --output_name="${OUT_ET_MODEL_NAME}.pte"
+ python -m extension.llm.export.export_llm \
+ base.model_class=qwen3-0_6b \
+ base.params=examples/models/qwen3/0_6b_config.json \
+ model.use_kv_cache=true \
+ model.use_sdpa_with_kv_cache=true \
+ model.dtype_override=fp32 \
+ backend.xnnpack.enabled=true \
+ backend.xnnpack.extended_ops=true \
+ quantization.qmode=8da4w \
+ quantization.group_size=32 \
+ quantization.embedding_quantize="8,0" \
+ base.metadata='{"get_bos_id": 151644, "get_eos_ids":[151645]}' \
+ export.output_name="${OUT_ET_MODEL_NAME}.pte"
ls -lh "${OUT_ET_MODEL_NAME}.pte"
fi
fi
diff --git a/.github/workflows/apple-perf.yml b/.github/workflows/apple-perf.yml
index 0c03f55f82e..bd096b1c9ac 100644
--- a/.github/workflows/apple-perf.yml
+++ b/.github/workflows/apple-perf.yml
@@ -223,23 +223,23 @@ jobs:
--files "tokenizer.model" "params.json" "consolidated.00.pth"
)
# Export using ExecuTorch's model definition
- ${CONDA_RUN} python -m examples.models.llama.export_llama \
- --model "llama3_2" \
- --checkpoint "${DOWNLOADED_PATH}/consolidated.00.pth" \
- --params "${DOWNLOADED_PATH}/params.json" \
- --use_sdpa_with_kv_cache \
- -X \
- --xnnpack-extended-ops \
- --preq_mode 8da4w_output_8da8w \
- --preq_group_size 32 \
- --max_seq_length 2048 \
- --max_context_length 2048 \
- --output_name "${OUT_ET_MODEL_NAME}.pte" \
- -kv \
- -d fp32 \
- --preq_embedding_quantize 8,0 \
- --use_spin_quant native \
- --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}'
+ ${CONDA_RUN} python -m extension.llm.export.export_llm \
+ base.model_class="llama3_2" \
+ base.checkpoint="${DOWNLOADED_PATH}/consolidated.00.pth" \
+ base.params="${DOWNLOADED_PATH}/params.json" \
+ model.use_sdpa_with_kv_cache=true \
+ backend.xnnpack.enabled=true \
+ backend.xnnpack.extended_ops=true \
+ base.preq_mode="8da4w_output_8da8w" \
+ base.preq_group_size=32 \
+ export.max_seq_length=2048 \
+ export.max_context_length=2048 \
+ export.output_name="${OUT_ET_MODEL_NAME}.pte" \
+ model.use_kv_cache=true \
+ model.dtype_override=fp32 \
+ base.preq_embedding_quantize="8,0" \
+ quantization.use_spin_quant=native \
+ base.metadata='{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}'
ls -lh "${OUT_ET_MODEL_NAME}.pte"
elif [[ ${{ matrix.config }} == "llama3_qlora" ]]; then
# QAT + LoRA
@@ -250,87 +250,89 @@ jobs:
--files "tokenizer.model" "params.json" "consolidated.00.pth"
)
# Export using ExecuTorch's model definition
- ${CONDA_RUN} python -m examples.models.llama.export_llama \
- --model "llama3_2" \
- --checkpoint "${DOWNLOADED_PATH}/consolidated.00.pth" \
- --params "${DOWNLOADED_PATH}/params.json" \
- -qat \
- -lora 16 \
- --preq_mode 8da4w_output_8da8w \
- --preq_group_size 32 \
- --preq_embedding_quantize 8,0 \
- --use_sdpa_with_kv_cache \
- -kv \
- -X \
- --xnnpack-extended-ops \
- -d fp32 \
- --max_seq_length 2048 \
- --max_context_length 2048 \
- --output_name "${OUT_ET_MODEL_NAME}.pte" \
- --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}'
+ ${CONDA_RUN} python -m extension.llm.export.export_llm \
+ base.model_class="llama3_2" \
+ base.checkpoint="${DOWNLOADED_PATH}/consolidated.00.pth" \
+ base.params="${DOWNLOADED_PATH}/params.json" \
+ quantization.use_qat=true \
+ base.use_lora=16 \
+ base.preq_mode="8da4w_output_8da8w" \
+ base.preq_group_size=32 \
+ base.preq_embedding_quantize="8,0" \
+ model.use_sdpa_with_kv_cache=true \
+ model.use_kv_cache=true \
+ backend.xnnpack.enabled=true \
+ backend.xnnpack.extended_ops=true \
+ model.dtype_override=fp32 \
+ export.max_seq_length=2048 \
+ export.max_context_length=2048 \
+ export.output_name="${OUT_ET_MODEL_NAME}.pte" \
+ base.metadata='{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}'
ls -lh "${OUT_ET_MODEL_NAME}.pte"
elif [[ ${{ matrix.config }} == "llama3_fb16" ]]; then
# Original BF16 version, without any quantization
DOWNLOADED_PATH=$(bash .ci/scripts/download_hf_hub.sh --model_id "${HF_MODEL_REPO}" --subdir "original" --files "tokenizer.model" "params.json" "consolidated.00.pth")
- ${CONDA_RUN} python -m examples.models.llama.export_llama \
- --model "llama3_2" \
- --checkpoint "${DOWNLOADED_PATH}/consolidated.00.pth" \
- --params "${DOWNLOADED_PATH}/params.json" \
- -kv \
- --use_sdpa_with_kv_cache \
- -X \
- -d bf16 \
- --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' \
- --output_name="${OUT_ET_MODEL_NAME}.pte"
+ ${CONDA_RUN} python -m extension.llm.export.export_llm \
+ base.model_class="llama3_2" \
+ base.checkpoint="${DOWNLOADED_PATH}/consolidated.00.pth" \
+ base.params="${DOWNLOADED_PATH}/params.json" \
+ model.use_kv_cache=true \
+ model.use_sdpa_with_kv_cache=true \
+ backend.xnnpack.enabled=true \
+ model.dtype_override=bf16 \
+ base.metadata='{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' \
+ export.output_name="${OUT_ET_MODEL_NAME}.pte"
ls -lh "${OUT_ET_MODEL_NAME}.pte"
elif [[ ${{ matrix.config }} == "et_xnnpack_custom_spda_kv_cache_8da4w" ]]; then
DOWNLOADED_PATH=$(bash .ci/scripts/download_hf_hub.sh --model_id "${HF_MODEL_REPO}" --subdir "original" --files "tokenizer.model" "params.json" "consolidated.00.pth")
- ${CONDA_RUN} python -m examples.models.llama.export_llama \
- --model llama3_2 \
- --checkpoint "${DOWNLOADED_PATH}/consolidated.00.pth" \
- --params "${DOWNLOADED_PATH}/params.json" \
- -kv \
- --use_sdpa_with_kv_cache \
- -d fp32 \
- -X \
- --xnnpack-extended-ops \
- -qmode 8da4w -G 32 -E 8,0 \
- --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' \
- --output_name="${OUT_ET_MODEL_NAME}.pte"
+ ${CONDA_RUN} python -m extension.llm.export.export_llm \
+ base.model_class=llama3_2 \
+ base.checkpoint="${DOWNLOADED_PATH}/consolidated.00.pth" \
+ base.params="${DOWNLOADED_PATH}/params.json" \
+ model.use_kv_cache=true \
+ model.use_sdpa_with_kv_cache=true \
+ model.dtype_override=fp32 \
+ backend.xnnpack.enabled=true \
+ backend.xnnpack.extended_ops=true \
+ quantization.qmode=8da4w \
+ quantization.group_size=32 \
+ quantization.embedding_quantize="8,0" \
+ base.metadata='{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' \
+ export.output_name="${OUT_ET_MODEL_NAME}.pte"
ls -lh "${OUT_ET_MODEL_NAME}.pte"
elif [[ ${{ matrix.config }} == "llama3_coreml_ane" ]]; then
# ANE
DOWNLOADED_PATH=$(bash .ci/scripts/download_hf_hub.sh --model_id "${HF_MODEL_REPO}" --subdir "original" --files "tokenizer.model" "params.json" "consolidated.00.pth")
- ${CONDA_RUN} python -m examples.models.llama.export_llama \
- --checkpoint "${DOWNLOADED_PATH}/consolidated.00.pth" \
- --params "${DOWNLOADED_PATH}/params.json" \
- -E "4,32" \
- -kv \
- --disable_dynamic_shape \
- --coreml \
- --coreml-ios 18 \
- --coreml-quantize c4w \
- --coreml-compute-units cpu_and_ne \
- --output_name="${OUT_ET_MODEL_NAME}.pte"
+ ${CONDA_RUN} python -m extension.llm.export.export_llm \
+ base.checkpoint="${DOWNLOADED_PATH}/consolidated.00.pth" \
+ base.params="${DOWNLOADED_PATH}/params.json" \
+ quantization.embedding_quantize="4,32" \
+ model.use_kv_cache=true \
+ model.enable_dynamic_shape=false \
+ backend.coreml.enabled=true \
+ backend.coreml.ios=18 \
+ backend.coreml.quantize=c4w \
+ backend.coreml.compute_units=cpu_and_ne \
+ export.output_name="${OUT_ET_MODEL_NAME}.pte"
ls -lh "${OUT_ET_MODEL_NAME}.pte"
fi
elif [[ "$HF_MODEL_REPO" == "Qwen/Qwen3-0.6B" ]]; then
OUT_ET_MODEL_NAME=${OUT_ET_MODEL_NAME}_llm
if [[ ${{ matrix.config }} == "et_xnnpack_custom_spda_kv_cache_8da4w" ]]; then
DOWNLOADED_PATH=$(bash .ci/scripts/download_hf_hub.sh --model_id "${HF_MODEL_REPO}" --subdir "." --files "tokenizer.json")
- ${CONDA_RUN} python -m examples.models.llama.export_llama \
- --model qwen3-0_6b \
- --params examples/models/qwen3/0_6b_config.json \
- -kv \
- --use_sdpa_with_kv_cache \
- -d fp32 \
- -X \
- --xnnpack-extended-ops \
- -qmode 8da4w \
- -G 32 \
- -E 8,0 \
- --metadata '{"get_bos_id": 151644, "get_eos_ids":[151645]}' \
- --output_name="${OUT_ET_MODEL_NAME}.pte"
+ ${CONDA_RUN} python -m extension.llm.export.export_llm \
+ base.model_class=qwen3-0_6b \
+ base.params=examples/models/qwen3/0_6b_config.json \
+ model.use_kv_cache=true \
+ model.use_sdpa_with_kv_cache=true \
+ model.dtype_override=fp32 \
+ backend.xnnpack.enabled=true \
+ backend.xnnpack.extended_ops=true \
+ quantization.qmode=8da4w \
+ quantization.group_size=32 \
+ quantization.embedding_quantize="8,0" \
+ base.metadata='{"get_bos_id": 151644, "get_eos_ids":[151645]}' \
+ export.output_name="${OUT_ET_MODEL_NAME}.pte"
ls -lh "${OUT_ET_MODEL_NAME}.pte"
fi
fi
From bd8b8984aed25e3833c2823431ac7d00daf6bcda Mon Sep 17 00:00:00 2001
From: Jack Zhang <32371937+jackzhxng@users.noreply.github.com>
Date: Fri, 20 Jun 2025 16:14:05 -0700
Subject: [PATCH 08/17] Update
[ghstack-poisoned]
---
.ci/configs/README.md | 42 -------------------
.ci/configs/ci_stories110m_coreml.yaml | 20 ---------
.ci/configs/ci_stories110m_mps.yaml | 20 ---------
.ci/configs/ci_stories110m_qnn.yaml | 28 -------------
.../ci_stories110m_xnnpack_quantized.yaml | 27 ------------
.ci/configs/llama3_coreml_ane.yaml | 27 ------------
.ci/configs/llama3_qlora.yaml | 30 -------------
.ci/configs/llama3_spinquant.yaml | 29 -------------
.ci/configs/qwen3_xnnpack_8da4w.yaml | 28 -------------
.ci/configs/stories110m_torchao_lowbit.yaml | 26 ------------
.ci/configs/xnnpack_8da4w_basic.yaml | 27 ------------
.ci/configs/xnnpack_custom_quantized.yaml | 27 ------------
.../LlamaDemo/run_instrumentation_test.sh | 2 +-
.../executorch_android/android_test_setup.sh | 2 +-
14 files changed, 2 insertions(+), 333 deletions(-)
delete mode 100644 .ci/configs/README.md
delete mode 100644 .ci/configs/ci_stories110m_coreml.yaml
delete mode 100644 .ci/configs/ci_stories110m_mps.yaml
delete mode 100644 .ci/configs/ci_stories110m_qnn.yaml
delete mode 100644 .ci/configs/ci_stories110m_xnnpack_quantized.yaml
delete mode 100644 .ci/configs/llama3_coreml_ane.yaml
delete mode 100644 .ci/configs/llama3_qlora.yaml
delete mode 100644 .ci/configs/llama3_spinquant.yaml
delete mode 100644 .ci/configs/qwen3_xnnpack_8da4w.yaml
delete mode 100644 .ci/configs/stories110m_torchao_lowbit.yaml
delete mode 100644 .ci/configs/xnnpack_8da4w_basic.yaml
delete mode 100644 .ci/configs/xnnpack_custom_quantized.yaml
diff --git a/.ci/configs/README.md b/.ci/configs/README.md
deleted file mode 100644
index c77e758d992..00000000000
--- a/.ci/configs/README.md
+++ /dev/null
@@ -1,42 +0,0 @@
-# CI Configuration Files for LLM Export
-
-This directory contains YAML configuration files used by CI tests for exporting LLM models with the new `extension.llm.export.export_llm` command.
-
-## Usage
-
-These config files can be used with the export command like this:
-
-```bash
-python -m extension.llm.export.export_llm --config path/to/config.yaml
-```
-
-Or you can override specific parameters:
-
-```bash
-python -m extension.llm.export.export_llm --config ci_stories110m_xnnpack_quantized.yaml base.checkpoint=my_checkpoint.pt
-```
-
-## Configuration Files
-
-### CI Test Configurations
-- `ci_stories110m_xnnpack_quantized.yaml` - Stories110M with XNNPACK quantization (used in test_llama.sh)
-- `ci_stories110m_mps.yaml` - Stories110M with MPS backend
-- `ci_stories110m_coreml.yaml` - Stories110M with CoreML backend
-- `ci_stories110m_qnn.yaml` - Stories110M with QNN backend
-
-### Performance Test Configurations
-- `llama3_spinquant.yaml` - Llama3 with SpinQuant (used in apple-perf.yml, android-perf.yml)
-- `llama3_qlora.yaml` - Llama3 with QLoRA (QAT + LoRA)
-- `llama3_coreml_ane.yaml` - Llama3 with CoreML ANE
-- `xnnpack_8da4w_basic.yaml` - Basic XNNPACK 8da4w quantization
-- `qwen3_xnnpack_8da4w.yaml` - Qwen3 with XNNPACK 8da4w quantization
-
-### Specialized Configurations
-- `stories110m_torchao_lowbit.yaml` - Stories110M with TorchAO lowbit quantization
-- `xnnpack_custom_quantized.yaml` - XNNPACK with custom ops and quantization
-
-## Background
-
-These configuration files were created as part of migrating CI tests from the old `examples.models.llama.export_llama` command to the new `extension.llm.export.export_llm` command with hydra configuration support.
-
-The config files help reduce duplication in CI scripts and make it easier to maintain consistent export settings across different test scenarios.
\ No newline at end of file
diff --git a/.ci/configs/ci_stories110m_coreml.yaml b/.ci/configs/ci_stories110m_coreml.yaml
deleted file mode 100644
index 9ef9a5d2f72..00000000000
--- a/.ci/configs/ci_stories110m_coreml.yaml
+++ /dev/null
@@ -1,20 +0,0 @@
-# Configuration for CI test_llama.sh - stories110M with CoreML backend
-
-base:
- model_class: "stories110m"
-
-model:
- dtype_override: "fp32"
- use_kv_cache: true
- enable_dynamic_shape: false
-
-export:
- max_seq_length: 128
- max_context_length: 128
-
-backend:
- coreml:
- enabled: true
-
-debug:
- verbose: true
\ No newline at end of file
diff --git a/.ci/configs/ci_stories110m_mps.yaml b/.ci/configs/ci_stories110m_mps.yaml
deleted file mode 100644
index 4b568a72203..00000000000
--- a/.ci/configs/ci_stories110m_mps.yaml
+++ /dev/null
@@ -1,20 +0,0 @@
-# Configuration for CI test_llama.sh - stories110M with MPS backend
-
-base:
- model_class: "stories110m"
-
-model:
- dtype_override: "fp32"
- use_kv_cache: true
- enable_dynamic_shape: false
-
-export:
- max_seq_length: 128
- max_context_length: 128
-
-backend:
- mps:
- enabled: true
-
-debug:
- verbose: true
\ No newline at end of file
diff --git a/.ci/configs/ci_stories110m_qnn.yaml b/.ci/configs/ci_stories110m_qnn.yaml
deleted file mode 100644
index 75061aa6c2b..00000000000
--- a/.ci/configs/ci_stories110m_qnn.yaml
+++ /dev/null
@@ -1,28 +0,0 @@
-# Configuration for CI test_llama.sh - stories110M with QNN backend
-
-base:
- model_class: "stories110m"
- tokenizer_path: "tokenizer.model"
-
-model:
- dtype_override: "fp32"
- use_kv_cache: true
- enable_dynamic_shape: false
-
-export:
- max_seq_length: 128
- max_context_length: 128
-
-quantization:
- pt2e_quantize: "qnn_16a16w"
- calibration_tasks: ["wikitext"]
- calibration_limit: 1
- calibration_seq_length: 128
- calibration_data: "Once"
-
-backend:
- qnn:
- enabled: true
-
-debug:
- verbose: true
\ No newline at end of file
diff --git a/.ci/configs/ci_stories110m_xnnpack_quantized.yaml b/.ci/configs/ci_stories110m_xnnpack_quantized.yaml
deleted file mode 100644
index 6fa692fbc42..00000000000
--- a/.ci/configs/ci_stories110m_xnnpack_quantized.yaml
+++ /dev/null
@@ -1,27 +0,0 @@
-# Configuration for CI test_llama.sh - stories110M with XNNPACK quantization
-# Used when XNNPACK=ON, CUSTOM=ON, QE=ON modes are enabled
-
-base:
- model_class: "stories110m"
-
-model:
- dtype_override: "fp32"
- use_kv_cache: true
- use_sdpa_with_kv_cache: true
-
-export:
- max_seq_length: 128
- max_context_length: 128
-
-quantization:
- qmode: "8da4w"
- group_size: 128
- embedding_quantize: "8,1024"
-
-backend:
- xnnpack:
- enabled: true
- extended_ops: true
-
-debug:
- verbose: false
\ No newline at end of file
diff --git a/.ci/configs/llama3_coreml_ane.yaml b/.ci/configs/llama3_coreml_ane.yaml
deleted file mode 100644
index fb11b0edd86..00000000000
--- a/.ci/configs/llama3_coreml_ane.yaml
+++ /dev/null
@@ -1,27 +0,0 @@
-# Configuration for Llama3 with CoreML ANE
-# Used in apple-perf.yml
-
-base:
- model_class: "llama3_2"
-
-model:
- dtype_override: "fp32"
- use_kv_cache: true
- enable_dynamic_shape: false
-
-export:
- max_seq_length: 128
- max_context_length: 128
-
-quantization:
- embedding_quantize: "4,32"
-
-backend:
- coreml:
- enabled: true
- ios: 18
- quantize: "c4w"
- compute_units: "cpu_and_ne"
-
-debug:
- verbose: false
\ No newline at end of file
diff --git a/.ci/configs/llama3_qlora.yaml b/.ci/configs/llama3_qlora.yaml
deleted file mode 100644
index e97ab40ff2a..00000000000
--- a/.ci/configs/llama3_qlora.yaml
+++ /dev/null
@@ -1,30 +0,0 @@
-# Configuration for Llama3 with QLoRA (QAT + LoRA)
-# Used in apple-perf.yml and android-perf.yml
-
-base:
- model_class: "llama3_2"
- use_lora: 16
- preq_mode: "8da4w_output_8da8w"
- preq_group_size: 32
- preq_embedding_quantize: "8,0"
-
-model:
- dtype_override: "fp32"
- use_kv_cache: true
- use_sdpa_with_kv_cache: true
- enable_dynamic_shape: false
-
-export:
- max_seq_length: 2048
- max_context_length: 2048
-
-quantization:
- use_qat: true
-
-backend:
- xnnpack:
- enabled: true
- extended_ops: true
-
-debug:
- verbose: false
\ No newline at end of file
diff --git a/.ci/configs/llama3_spinquant.yaml b/.ci/configs/llama3_spinquant.yaml
deleted file mode 100644
index 4eba1340f16..00000000000
--- a/.ci/configs/llama3_spinquant.yaml
+++ /dev/null
@@ -1,29 +0,0 @@
-# Configuration for Llama3 with SpinQuant
-# Used in apple-perf.yml and android-perf.yml
-
-base:
- model_class: "llama3_2"
- preq_mode: "8da4w_output_8da8w"
- preq_group_size: 32
- preq_embedding_quantize: "8,0"
-
-model:
- dtype_override: "fp32"
- use_kv_cache: true
- use_sdpa_with_kv_cache: true
- enable_dynamic_shape: false
-
-export:
- max_seq_length: 2048
- max_context_length: 2048
-
-quantization:
- use_spin_quant: "native"
-
-backend:
- xnnpack:
- enabled: true
- extended_ops: true
-
-debug:
- verbose: false
\ No newline at end of file
diff --git a/.ci/configs/qwen3_xnnpack_8da4w.yaml b/.ci/configs/qwen3_xnnpack_8da4w.yaml
deleted file mode 100644
index 433ed92acdc..00000000000
--- a/.ci/configs/qwen3_xnnpack_8da4w.yaml
+++ /dev/null
@@ -1,28 +0,0 @@
-# Configuration for Qwen3-0.6B with XNNPACK 8da4w quantization
-# Used in apple-perf.yml and android-perf.yml
-
-base:
- model_class: "qwen3-0_6b"
- params: "examples/models/qwen3/0_6b_config.json"
-
-model:
- dtype_override: "fp32"
- use_kv_cache: true
- use_sdpa_with_kv_cache: true
-
-export:
- max_seq_length: 128
- max_context_length: 128
-
-quantization:
- qmode: "8da4w"
- group_size: 32
- embedding_quantize: "8,0"
-
-backend:
- xnnpack:
- enabled: true
- extended_ops: true
-
-debug:
- verbose: false
\ No newline at end of file
diff --git a/.ci/configs/stories110m_torchao_lowbit.yaml b/.ci/configs/stories110m_torchao_lowbit.yaml
deleted file mode 100644
index 12970cd9fcd..00000000000
--- a/.ci/configs/stories110m_torchao_lowbit.yaml
+++ /dev/null
@@ -1,26 +0,0 @@
-# Configuration for stories110M with TorchAO lowbit quantization
-# Used in CI test_llama_torchao_lowbit.sh
-
-base:
- model_class: "stories110m"
-
-model:
- dtype_override: "fp32"
- use_kv_cache: true
- use_sdpa_with_kv_cache: true
-
-export:
- max_seq_length: 128
- max_context_length: 128
-
-quantization:
- qmode: "torchao:8da3w" # QLINEAR_BITWIDTH=3
- group_size: 128 # QLINEAR_GROUP_SIZE=128
- embedding_quantize: "4,32" # QEMBEDDING_BITWIDTH=4, QEMBEDDING_GROUP_SIZE=32
-
-backend:
- xnnpack:
- enabled: false
-
-debug:
- verbose: false
\ No newline at end of file
diff --git a/.ci/configs/xnnpack_8da4w_basic.yaml b/.ci/configs/xnnpack_8da4w_basic.yaml
deleted file mode 100644
index ce727df6d5c..00000000000
--- a/.ci/configs/xnnpack_8da4w_basic.yaml
+++ /dev/null
@@ -1,27 +0,0 @@
-# Configuration for basic XNNPACK 8da4w quantization
-# Used in apple-perf.yml and android-perf.yml
-
-base:
- model_class: "llama3_2"
-
-model:
- dtype_override: "fp32"
- use_kv_cache: true
- use_sdpa_with_kv_cache: true
-
-export:
- max_seq_length: 128
- max_context_length: 128
-
-quantization:
- qmode: "8da4w"
- group_size: 32
- embedding_quantize: "8,0"
-
-backend:
- xnnpack:
- enabled: true
- extended_ops: true
-
-debug:
- verbose: false
\ No newline at end of file
diff --git a/.ci/configs/xnnpack_custom_quantized.yaml b/.ci/configs/xnnpack_custom_quantized.yaml
deleted file mode 100644
index 6101296489d..00000000000
--- a/.ci/configs/xnnpack_custom_quantized.yaml
+++ /dev/null
@@ -1,27 +0,0 @@
-# Configuration for XNNPACK + custom + quantization
-# Common pattern used in CI test_llama.sh
-
-base:
- model_class: "stories110m"
-
-model:
- dtype_override: "fp32"
- use_kv_cache: true
- use_sdpa_with_kv_cache: true
-
-export:
- max_seq_length: 128
- max_context_length: 128
-
-quantization:
- qmode: "8da4w"
- group_size: 128
- embedding_quantize: "8,1024" # Default from test_llama.sh QE mode
-
-backend:
- xnnpack:
- enabled: true
- extended_ops: true
-
-debug:
- verbose: false
\ No newline at end of file
diff --git a/examples/demo-apps/android/LlamaDemo/run_instrumentation_test.sh b/examples/demo-apps/android/LlamaDemo/run_instrumentation_test.sh
index ff59fc56b2c..8c1ad52ef8b 100644
--- a/examples/demo-apps/android/LlamaDemo/run_instrumentation_test.sh
+++ b/examples/demo-apps/android/LlamaDemo/run_instrumentation_test.sh
@@ -14,7 +14,7 @@ curl -C - -Ls "https://raw.githubusercontent.com/karpathy/llama2.c/master/tokeni
# Create params.json file
touch params.json
echo '{"dim": 768, "multiple_of": 32, "n_heads": 12, "n_layers": 12, "norm_eps": 1e-05, "vocab_size": 32000}' > params.json
-python -m examples.models.llama.export_llama -c stories110M.pt -p params.json -d fp16 -n stories110m_h.pte -kv
+python -m extension.llm.export.export_llm base.checkpoint=stories110M.pt base.params=params.json model.dtype_override=fp16 export.output_name=stories110m_h.pte model.use_kv_cache=true
python -m pytorch_tokenizers.tools.llama2c.convert -t tokenizer.model -o tokenizer.bin
adb mkdir -p /data/local/tmp/llama
diff --git a/extension/android/executorch_android/android_test_setup.sh b/extension/android/executorch_android/android_test_setup.sh
index 682a1d16787..f521dac30c5 100644
--- a/extension/android/executorch_android/android_test_setup.sh
+++ b/extension/android/executorch_android/android_test_setup.sh
@@ -25,7 +25,7 @@ prepare_tinyllama() {
# Create params.json file
touch params.json
echo '{"dim": 288, "multiple_of": 32, "n_heads": 6, "n_layers": 6, "norm_eps": 1e-05, "vocab_size": 32000}' > params.json
- python -m examples.models.llama.export_llama -c stories15M.pt -p params.json -d fp16 -n stories15m_h.pte -kv
+ python -m extension.llm.export.export_llm base.checkpoint=stories15M.pt base.params=params.json model.dtype_override=fp16 export.output_name=stories15m_h.pte model.use_kv_cache=true
python -m pytorch_tokenizers.tools.llama2c.convert -t tokenizer.model -o tokenizer.bin
cp stories15m_h.pte "${BASEDIR}/src/androidTest/resources/stories.pte"
From f3016c04fc61ecc1bdc77a01a3d823331c8d25a9 Mon Sep 17 00:00:00 2001
From: Jack Zhang <32371937+jackzhxng@users.noreply.github.com>
Date: Fri, 20 Jun 2025 17:28:47 -0700
Subject: [PATCH 09/17] Update
[ghstack-poisoned]
---
.ci/scripts/test_llama.sh | 9 +++------
.github/workflows/android-perf.yml | 10 +++++-----
.github/workflows/apple-perf.yml | 10 +++++-----
3 files changed, 13 insertions(+), 16 deletions(-)
diff --git a/.ci/scripts/test_llama.sh b/.ci/scripts/test_llama.sh
index 7371e632521..6c9ab314448 100644
--- a/.ci/scripts/test_llama.sh
+++ b/.ci/scripts/test_llama.sh
@@ -54,10 +54,7 @@ PT2E_QUANTIZE="${PT2E_QUANTIZE:-}"
# Default CMake Build Type to release mode
CMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE:-Release}
-if [[ $# -lt 4 ]]; then # Assuming 4 mandatory args
- echo "Expecting atleast 4 positional arguments"
- echo "Usage: [...]"
-fi
+# Argument validation is done individually below for each required parameter
if [[ -z "${MODEL_NAME:-}" ]]; then
echo "Missing model name, exiting..."
exit 1
@@ -232,7 +229,7 @@ if [[ "${CUSTOM}" == "ON" ]]; then
EXPORT_ARGS="${EXPORT_ARGS} model.use_sdpa_with_kv_cache=true"
fi
if [[ "${QE}" == "ON" ]]; then
- EXPORT_ARGS="${EXPORT_ARGS} quantization.embedding_quantize=8,1024"
+ EXPORT_ARGS="${EXPORT_ARGS} quantization.embedding_quantize=\"8,1024\""
fi
if [[ "${MPS}" == "ON" ]]; then
EXPORT_ARGS="${EXPORT_ARGS} backend.mps.enabled=true model.enable_dynamic_shape=false debug.verbose=true"
@@ -244,7 +241,7 @@ if [[ "${QNN}" == "ON" ]]; then
EXPORT_ARGS="${EXPORT_ARGS} backend.qnn.enabled=true model.enable_dynamic_shape=false debug.verbose=true"
echo "PT2E_QUANTIZE is ${PT2E_QUANTIZE}"
if [[ "${PT2E_QUANTIZE}" == "qnn_16a16w" ]]; then
- EXPORT_ARGS+=" base.tokenizer_path=tokenizer.model quantization.pt2e_quantize=qnn_16a16w quantization.calibration_tasks=[wikitext] quantization.calibration_limit=1 quantization.calibration_seq_length=128 quantization.calibration_data='Once '"
+ EXPORT_ARGS+=" base.tokenizer_path=tokenizer.model quantization.pt2e_quantize=qnn_16a16w quantization.calibration_tasks=\"[wikitext]\" quantization.calibration_limit=1 quantization.calibration_seq_length=128 quantization.calibration_data=\"Once\""
fi
fi
if [[ "${QUANTIZE_KV_CACHE}" == "ON" ]]; then
diff --git a/.github/workflows/android-perf.yml b/.github/workflows/android-perf.yml
index ff134682220..06dd87101fb 100644
--- a/.github/workflows/android-perf.yml
+++ b/.github/workflows/android-perf.yml
@@ -230,7 +230,7 @@ jobs:
model.dtype_override=fp32 \
base.preq_embedding_quantize="8,0" \
quantization.use_spin_quant=native \
- base.metadata='{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}'
+ base.metadata="{\"get_bos_id\":128000,\"get_eos_ids\":[128009,128001]}"
ls -lh "${OUT_ET_MODEL_NAME}.pte"
elif [[ ${{ matrix.config }} == "llama3_qlora" ]]; then
# QAT + LoRA
@@ -258,7 +258,7 @@ jobs:
export.max_seq_length=2048 \
export.max_context_length=2048 \
export.output_name="${OUT_ET_MODEL_NAME}.pte" \
- base.metadata='{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}'
+ base.metadata="{\"get_bos_id\":128000,\"get_eos_ids\":[128009,128001]}"
ls -lh "${OUT_ET_MODEL_NAME}.pte"
elif [[ ${{ matrix.config }} == "llama3_fb16" ]]; then
# Original BF16 version, without any quantization
@@ -271,7 +271,7 @@ jobs:
model.use_sdpa_with_kv_cache=true \
backend.xnnpack.enabled=true \
model.dtype_override=bf16 \
- base.metadata='{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' \
+ base.metadata="{\"get_bos_id\":128000,\"get_eos_ids\":[128009,128001]}" \
export.output_name="${OUT_ET_MODEL_NAME}.pte"
ls -lh "${OUT_ET_MODEL_NAME}.pte"
elif [[ ${{ matrix.config }} == "et_xnnpack_custom_spda_kv_cache_8da4w" ]]; then
@@ -288,7 +288,7 @@ jobs:
quantization.qmode=8da4w \
quantization.group_size=32 \
quantization.embedding_quantize="8,0" \
- base.metadata='{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' \
+ base.metadata="{\"get_bos_id\":128000,\"get_eos_ids\":[128009,128001]}" \
export.output_name="${OUT_ET_MODEL_NAME}.pte"
ls -lh "${OUT_ET_MODEL_NAME}.pte"
elif [[ ${{ matrix.config }} == "llama3_qnn_htp" ]]; then
@@ -326,7 +326,7 @@ jobs:
quantization.qmode=8da4w \
quantization.group_size=32 \
quantization.embedding_quantize="8,0" \
- base.metadata='{"get_bos_id": 151644, "get_eos_ids":[151645]}' \
+ base.metadata="{\"get_bos_id\":151644,\"get_eos_ids\":[151645]}" \
export.output_name="${OUT_ET_MODEL_NAME}.pte"
ls -lh "${OUT_ET_MODEL_NAME}.pte"
fi
diff --git a/.github/workflows/apple-perf.yml b/.github/workflows/apple-perf.yml
index bd096b1c9ac..c4054023646 100644
--- a/.github/workflows/apple-perf.yml
+++ b/.github/workflows/apple-perf.yml
@@ -239,7 +239,7 @@ jobs:
model.dtype_override=fp32 \
base.preq_embedding_quantize="8,0" \
quantization.use_spin_quant=native \
- base.metadata='{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}'
+ base.metadata="{\"get_bos_id\":128000,\"get_eos_ids\":[128009,128001]}"
ls -lh "${OUT_ET_MODEL_NAME}.pte"
elif [[ ${{ matrix.config }} == "llama3_qlora" ]]; then
# QAT + LoRA
@@ -267,7 +267,7 @@ jobs:
export.max_seq_length=2048 \
export.max_context_length=2048 \
export.output_name="${OUT_ET_MODEL_NAME}.pte" \
- base.metadata='{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}'
+ base.metadata="{\"get_bos_id\":128000,\"get_eos_ids\":[128009,128001]}"
ls -lh "${OUT_ET_MODEL_NAME}.pte"
elif [[ ${{ matrix.config }} == "llama3_fb16" ]]; then
# Original BF16 version, without any quantization
@@ -280,7 +280,7 @@ jobs:
model.use_sdpa_with_kv_cache=true \
backend.xnnpack.enabled=true \
model.dtype_override=bf16 \
- base.metadata='{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' \
+ base.metadata="{\"get_bos_id\":128000,\"get_eos_ids\":[128009,128001]}" \
export.output_name="${OUT_ET_MODEL_NAME}.pte"
ls -lh "${OUT_ET_MODEL_NAME}.pte"
elif [[ ${{ matrix.config }} == "et_xnnpack_custom_spda_kv_cache_8da4w" ]]; then
@@ -297,7 +297,7 @@ jobs:
quantization.qmode=8da4w \
quantization.group_size=32 \
quantization.embedding_quantize="8,0" \
- base.metadata='{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' \
+ base.metadata="{\"get_bos_id\":128000,\"get_eos_ids\":[128009,128001]}" \
export.output_name="${OUT_ET_MODEL_NAME}.pte"
ls -lh "${OUT_ET_MODEL_NAME}.pte"
elif [[ ${{ matrix.config }} == "llama3_coreml_ane" ]]; then
@@ -331,7 +331,7 @@ jobs:
quantization.qmode=8da4w \
quantization.group_size=32 \
quantization.embedding_quantize="8,0" \
- base.metadata='{"get_bos_id": 151644, "get_eos_ids":[151645]}' \
+ base.metadata="{\"get_bos_id\":151644,\"get_eos_ids\":[151645]}" \
export.output_name="${OUT_ET_MODEL_NAME}.pte"
ls -lh "${OUT_ET_MODEL_NAME}.pte"
fi
From 1af1b27274ddc736608dd7566f26d98c2596dbe6 Mon Sep 17 00:00:00 2001
From: Jack Zhang <32371937+jackzhxng@users.noreply.github.com>
Date: Fri, 20 Jun 2025 18:44:29 -0700
Subject: [PATCH 10/17] Update
[ghstack-poisoned]
---
extension/llm/export/test/test_export_llm.py | 12 ++++++------
1 file changed, 6 insertions(+), 6 deletions(-)
diff --git a/extension/llm/export/test/test_export_llm.py b/extension/llm/export/test/test_export_llm.py
index c4390050235..0932d3b1bd6 100644
--- a/extension/llm/export/test/test_export_llm.py
+++ b/extension/llm/export/test/test_export_llm.py
@@ -74,13 +74,13 @@ def test_with_config(self, mock_export_llama: MagicMock) -> None:
called_config = mock_export_llama.call_args[0][0]
self.assertEqual(called_config["base"]["tokenizer_path"], "/path/to/tokenizer.json")
self.assertEqual(called_config["base"]["model_class"], "llama2")
- self.assertEqual(called_config["base"]["preq_mode"], "preq_8da4w")
- self.assertEqual(called_config["model"]["dtype_override"], "fp16")
+ self.assertEqual(called_config["base"]["preq_mode"].value, "8da4w")
+ self.assertEqual(called_config["model"]["dtype_override"].value, "fp16")
self.assertEqual(called_config["export"]["max_seq_length"], 256)
- self.assertEqual(called_config["quantization"]["pt2e_quantize"], "xnnpack_dynamic")
- self.assertEqual(called_config["quantization"]["use_spin_quant"], "cuda")
- self.assertEqual(called_config["backend"]["coreml"]["quantize"], "c4w")
- self.assertEqual(called_config["backend"]["coreml"]["compute_units"], "cpu_and_gpu")
+ self.assertEqual(called_config["quantization"]["pt2e_quantize"].value, "xnnpack_dynamic")
+ self.assertEqual(called_config["quantization"]["use_spin_quant"].value, "cuda")
+ self.assertEqual(called_config["backend"]["coreml"]["quantize"].value, "c4w")
+ self.assertEqual(called_config["backend"]["coreml"]["compute_units"].value, "cpu_and_gpu")
finally:
os.unlink(config_file)
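The assertion changes above switch to comparing `.value` because enum-typed fields survive OmegaConf's structured-config conversion as Enum members rather than plain strings. A minimal, self-contained sketch of that behavior (the class and field below are illustrative stand-ins, not the real LlmConfig definitions):

```python
# Minimal sketch: enum-typed fields in an OmegaConf structured config stay Enum
# members, so tests compare `.value` to get the underlying string.
from dataclasses import dataclass
from enum import Enum

from omegaconf import OmegaConf


class PreqMode(Enum):
    PREQ_8DA4W = "8da4w"


@dataclass
class Base:
    preq_mode: PreqMode = PreqMode.PREQ_8DA4W


cfg = OmegaConf.structured(Base)
assert cfg.preq_mode == PreqMode.PREQ_8DA4W  # still an Enum member...
assert cfg.preq_mode.value == "8da4w"        # ...hence the .value comparisons
```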
From 392821bab3f2a62ba8cfd493b89253a6ce1306a4 Mon Sep 17 00:00:00 2001
From: Jack Zhang <32371937+jackzhxng@users.noreply.github.com>
Date: Fri, 20 Jun 2025 20:13:18 -0700
Subject: [PATCH 11/17] Update
[ghstack-poisoned]
---
examples/models/llama/export_llama_lib.py | 37 +++++++++++---
extension/llm/export/export_llm.py | 5 +-
extension/llm/export/test/test_export_llm.py | 53 ++++++++++++++------
3 files changed, 71 insertions(+), 24 deletions(-)
diff --git a/examples/models/llama/export_llama_lib.py b/examples/models/llama/export_llama_lib.py
index 6a706e0fa05..3c66f496e51 100644
--- a/examples/models/llama/export_llama_lib.py
+++ b/examples/models/llama/export_llama_lib.py
@@ -702,7 +702,11 @@ def _prepare_for_llama_export(llm_config: LlmConfig) -> LLMEdgeManager:
checkpoint=llm_config.base.checkpoint,
checkpoint_dtype=DType.from_torch_dtype(checkpoint_dtype), # type: ignore
tokenizer_path=llm_config.base.tokenizer_path,
- use_spin_quant=llm_config.quantization.use_spin_quant.value if llm_config.quantization.use_spin_quant else None,
+ use_spin_quant=(
+ llm_config.quantization.use_spin_quant.value
+ if llm_config.quantization.use_spin_quant
+ else None
+ ),
embedding_quantize=llm_config.quantization.embedding_quantize,
use_shared_embedding=llm_config.model.use_shared_embedding,
quantization_mode=llm_config.quantization.qmode,
@@ -726,7 +730,9 @@ def _prepare_for_llama_export(llm_config: LlmConfig) -> LLMEdgeManager:
vulkan=llm_config.backend.vulkan.enabled,
use_qat=llm_config.quantization.use_qat,
use_lora=llm_config.base.use_lora,
- preq_mode=llm_config.base.preq_mode.value if llm_config.base.preq_mode else None,
+ preq_mode=(
+ llm_config.base.preq_mode.value if llm_config.base.preq_mode else None
+ ),
preq_group_size=llm_config.base.preq_group_size,
preq_embedding_quantize=llm_config.base.preq_embedding_quantize,
local_global_attention=llm_config.model.local_global_attention,
@@ -738,7 +744,12 @@ def _prepare_for_llama_export(llm_config: LlmConfig) -> LLMEdgeManager:
def get_quantizer_and_quant_params(llm_config):
pt2e_quant_params = get_pt2e_quantization_params(
- llm_config.quantization.pt2e_quantize.value if llm_config.quantization.pt2e_quantize else None, llm_config.quantization.qmode
+ (
+ llm_config.quantization.pt2e_quantize.value
+ if llm_config.quantization.pt2e_quantize
+ else None
+ ),
+ llm_config.quantization.qmode,
)
quantizers = get_pt2e_quantizers(pt2e_quant_params, llm_config.export.so_library)
quant_dtype = None
@@ -750,13 +761,17 @@ def get_quantizer_and_quant_params(llm_config):
quantizers.append(qnn_quantizer)
if llm_config.backend.coreml.enabled and llm_config.quantization.pt2e_quantize:
assert len(quantizers) == 0, "Should not enable both xnnpack / qnn and coreml"
- coreml_quantizer = get_coreml_quantizer(llm_config.quantization.pt2e_quantize.value)
+ coreml_quantizer = get_coreml_quantizer(
+ llm_config.quantization.pt2e_quantize.value
+ )
quantizers.append(coreml_quantizer)
if llm_config.backend.vulkan.enabled and llm_config.quantization.pt2e_quantize:
assert (
len(quantizers) == 0
), "Should not enable both vulkan and other quantizers"
- vulkan_quantizer = get_vulkan_quantizer(llm_config.quantization.pt2e_quantize.value)
+ vulkan_quantizer = get_vulkan_quantizer(
+ llm_config.quantization.pt2e_quantize.value
+ )
quantizers.append(vulkan_quantizer)
logging.info(f"Applying quantizers: {quantizers}")
return pt2e_quant_params, quantizers, quant_dtype
@@ -1076,9 +1091,17 @@ def _export_llama(llm_config: LlmConfig) -> LLMEdgeManager: # noqa: C901
enable_dynamic_shape=llm_config.model.enable_dynamic_shape,
use_kv_cache=llm_config.model.use_kv_cache,
embedding_quantize=llm_config.quantization.embedding_quantize,
- pt2e_quantize=llm_config.quantization.pt2e_quantize.value if llm_config.quantization.pt2e_quantize else None,
+ pt2e_quantize=(
+ llm_config.quantization.pt2e_quantize.value
+ if llm_config.quantization.pt2e_quantize
+ else None
+ ),
coreml_ios=llm_config.backend.coreml.ios,
- coreml_quantize=llm_config.backend.coreml.quantize.value if llm_config.backend.coreml.quantize else None,
+ coreml_quantize=(
+ llm_config.backend.coreml.quantize.value
+ if llm_config.backend.coreml.quantize
+ else None
+ ),
coreml_compute_units=llm_config.backend.coreml.compute_units.value,
use_qnn_sha=llm_config.backend.qnn.use_sha,
num_sharding=llm_config.backend.qnn.num_sharding,
diff --git a/extension/llm/export/export_llm.py b/extension/llm/export/export_llm.py
index 2af7439b805..e995b329f30 100644
--- a/extension/llm/export/export_llm.py
+++ b/extension/llm/export/export_llm.py
@@ -34,12 +34,11 @@
from typing import Any, List, Tuple
import hydra
-import yaml
from executorch.examples.models.llama.config.llm_config import LlmConfig
from executorch.examples.models.llama.export_llama_lib import export_llama
from hydra.core.config_store import ConfigStore
-from omegaconf import DictConfig, OmegaConf
+from omegaconf import OmegaConf
cs = ConfigStore.instance()
cs.store(name="llm_config", node=LlmConfig)
@@ -79,7 +78,7 @@ def main() -> None:
"Cannot specify additional CLI arguments when using --config. "
f"Found: {remaining_args}. Use either --config file or hydra CLI args, not both."
)
-
+
config_file_path = pop_config_arg()
default_llm_config = LlmConfig()
llm_config_from_file = OmegaConf.load(config_file_path)
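The hunk ends right after the `--config` file is loaded; presumably the loaded values are then layered over a default `LlmConfig`. A rough sketch of such a merge with OmegaConf, under that assumption (the surrounding code is not shown here, and the path is hypothetical):

```python
# Rough sketch (assumption): values from the --config YAML are merged over the
# defaults, with the file winning on conflicts. OmegaConf.merge accepts the
# dataclass instance directly and returns a DictConfig.
from executorch.examples.models.llama.config.llm_config import LlmConfig
from omegaconf import OmegaConf

default_llm_config = LlmConfig()
llm_config_from_file = OmegaConf.load("path/to/config.yaml")  # hypothetical path
merged = OmegaConf.merge(default_llm_config, llm_config_from_file)
```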
diff --git a/extension/llm/export/test/test_export_llm.py b/extension/llm/export/test/test_export_llm.py
index 0932d3b1bd6..7d17b7819d3 100644
--- a/extension/llm/export/test/test_export_llm.py
+++ b/extension/llm/export/test/test_export_llm.py
@@ -10,8 +10,11 @@
import unittest
from unittest.mock import MagicMock, patch
-from executorch.examples.models.llama.config.llm_config import LlmConfig
-from executorch.extension.llm.export.export_llm import main, parse_config_arg, pop_config_arg
+from executorch.extension.llm.export.export_llm import (
+ main,
+ parse_config_arg,
+ pop_config_arg,
+)
class TestExportLlm(unittest.TestCase):
@@ -45,7 +48,8 @@ def test_with_config(self, mock_export_llama: MagicMock) -> None:
"""Test main function with --config file and no hydra args."""
# Create a temporary config file
with tempfile.NamedTemporaryFile(mode="w", suffix=".yaml", delete=False) as f:
- f.write("""
+ f.write(
+ """
base:
model_class: llama2
tokenizer_path: /path/to/tokenizer.json
@@ -61,7 +65,8 @@ def test_with_config(self, mock_export_llama: MagicMock) -> None:
coreml:
quantize: c4w
compute_units: cpu_and_gpu
-""")
+"""
+ )
config_file = f.name
try:
@@ -72,15 +77,25 @@ def test_with_config(self, mock_export_llama: MagicMock) -> None:
# Verify export_llama was called with config
mock_export_llama.assert_called_once()
called_config = mock_export_llama.call_args[0][0]
- self.assertEqual(called_config["base"]["tokenizer_path"], "/path/to/tokenizer.json")
+ self.assertEqual(
+ called_config["base"]["tokenizer_path"], "/path/to/tokenizer.json"
+ )
self.assertEqual(called_config["base"]["model_class"], "llama2")
self.assertEqual(called_config["base"]["preq_mode"].value, "8da4w")
self.assertEqual(called_config["model"]["dtype_override"].value, "fp16")
self.assertEqual(called_config["export"]["max_seq_length"], 256)
- self.assertEqual(called_config["quantization"]["pt2e_quantize"].value, "xnnpack_dynamic")
- self.assertEqual(called_config["quantization"]["use_spin_quant"].value, "cuda")
- self.assertEqual(called_config["backend"]["coreml"]["quantize"].value, "c4w")
- self.assertEqual(called_config["backend"]["coreml"]["compute_units"].value, "cpu_and_gpu")
+ self.assertEqual(
+ called_config["quantization"]["pt2e_quantize"].value, "xnnpack_dynamic"
+ )
+ self.assertEqual(
+ called_config["quantization"]["use_spin_quant"].value, "cuda"
+ )
+ self.assertEqual(
+ called_config["backend"]["coreml"]["quantize"].value, "c4w"
+ )
+ self.assertEqual(
+ called_config["backend"]["coreml"]["compute_units"].value, "cpu_and_gpu"
+ )
finally:
os.unlink(config_file)
@@ -88,7 +103,9 @@ def test_with_cli_args(self) -> None:
"""Test main function with only hydra CLI args."""
test_argv = ["script.py", "debug.verbose=True"]
with patch.object(sys, "argv", test_argv):
- with patch("executorch.extension.llm.export.export_llm.hydra_main") as mock_hydra:
+ with patch(
+ "executorch.extension.llm.export.export_llm.hydra_main"
+ ) as mock_hydra:
main()
mock_hydra.assert_called_once()
@@ -104,9 +121,12 @@ def test_config_with_cli_args_error(self) -> None:
with patch.object(sys, "argv", test_argv):
with self.assertRaises(ValueError) as cm:
main()
-
+
error_msg = str(cm.exception)
- self.assertIn("Cannot specify additional CLI arguments when using --config", error_msg)
+ self.assertIn(
+ "Cannot specify additional CLI arguments when using --config",
+ error_msg,
+ )
finally:
os.unlink(config_file)
@@ -117,7 +137,13 @@ def test_config_rejects_multiple_cli_args(self) -> None:
config_file = f.name
try:
- test_argv = ["script.py", "--config", config_file, "debug.verbose=True", "export.output_dir=/tmp"]
+ test_argv = [
+ "script.py",
+ "--config",
+ config_file,
+ "debug.verbose=True",
+ "export.output_dir=/tmp",
+ ]
with patch.object(sys, "argv", test_argv):
with self.assertRaises(ValueError):
main()
@@ -127,4 +153,3 @@ def test_config_rejects_multiple_cli_args(self) -> None:
if __name__ == "__main__":
unittest.main()
-
From acd2079be7d10dfd7afc60b2fbf1a3821dfce6dd Mon Sep 17 00:00:00 2001
From: Jack Zhang <32371937+jackzhxng@users.noreply.github.com>
Date: Fri, 20 Jun 2025 20:17:46 -0700
Subject: [PATCH 12/17] Update
[ghstack-poisoned]
---
extension/llm/export/export_llm.py | 5 ++-
extension/llm/export/test/test_export_llm.py | 37 ++++++++++++++------
2 files changed, 29 insertions(+), 13 deletions(-)
diff --git a/extension/llm/export/export_llm.py b/extension/llm/export/export_llm.py
index 2af7439b805..e995b329f30 100644
--- a/extension/llm/export/export_llm.py
+++ b/extension/llm/export/export_llm.py
@@ -34,12 +34,11 @@
from typing import Any, List, Tuple
import hydra
-import yaml
from executorch.examples.models.llama.config.llm_config import LlmConfig
from executorch.examples.models.llama.export_llama_lib import export_llama
from hydra.core.config_store import ConfigStore
-from omegaconf import DictConfig, OmegaConf
+from omegaconf import OmegaConf
cs = ConfigStore.instance()
cs.store(name="llm_config", node=LlmConfig)
@@ -79,7 +78,7 @@ def main() -> None:
"Cannot specify additional CLI arguments when using --config. "
f"Found: {remaining_args}. Use either --config file or hydra CLI args, not both."
)
-
+
config_file_path = pop_config_arg()
default_llm_config = LlmConfig()
llm_config_from_file = OmegaConf.load(config_file_path)
diff --git a/extension/llm/export/test/test_export_llm.py b/extension/llm/export/test/test_export_llm.py
index 970a32c9606..1f230233867 100644
--- a/extension/llm/export/test/test_export_llm.py
+++ b/extension/llm/export/test/test_export_llm.py
@@ -10,8 +10,11 @@
import unittest
from unittest.mock import MagicMock, patch
-from executorch.examples.models.llama.config.llm_config import LlmConfig
-from executorch.extension.llm.export.export_llm import main, parse_config_arg, pop_config_arg
+from executorch.extension.llm.export.export_llm import (
+ main,
+ parse_config_arg,
+ pop_config_arg,
+)
class TestExportLlm(unittest.TestCase):
@@ -45,12 +48,14 @@ def test_with_config(self, mock_export_llama: MagicMock) -> None:
"""Test main function with --config file and no hydra args."""
# Create a temporary config file
with tempfile.NamedTemporaryFile(mode="w", suffix=".yaml", delete=False) as f:
- f.write("""
+ f.write(
+ """
base:
tokenizer_path: /path/to/tokenizer.json
export:
max_seq_length: 256
-""")
+"""
+ )
config_file = f.name
try:
@@ -61,7 +66,9 @@ def test_with_config(self, mock_export_llama: MagicMock) -> None:
# Verify export_llama was called with config
mock_export_llama.assert_called_once()
called_config = mock_export_llama.call_args[0][0]
- self.assertEqual(called_config["base"]["tokenizer_path"], "/path/to/tokenizer.json")
+ self.assertEqual(
+ called_config["base"]["tokenizer_path"], "/path/to/tokenizer.json"
+ )
self.assertEqual(called_config["export"]["max_seq_length"], 256)
finally:
os.unlink(config_file)
@@ -70,7 +77,9 @@ def test_with_cli_args(self) -> None:
"""Test main function with only hydra CLI args."""
test_argv = ["script.py", "debug.verbose=True"]
with patch.object(sys, "argv", test_argv):
- with patch("executorch.extension.llm.export.export_llm.hydra_main") as mock_hydra:
+ with patch(
+ "executorch.extension.llm.export.export_llm.hydra_main"
+ ) as mock_hydra:
main()
mock_hydra.assert_called_once()
@@ -86,9 +95,12 @@ def test_config_with_cli_args_error(self) -> None:
with patch.object(sys, "argv", test_argv):
with self.assertRaises(ValueError) as cm:
main()
-
+
error_msg = str(cm.exception)
- self.assertIn("Cannot specify additional CLI arguments when using --config", error_msg)
+ self.assertIn(
+ "Cannot specify additional CLI arguments when using --config",
+ error_msg,
+ )
finally:
os.unlink(config_file)
@@ -99,7 +111,13 @@ def test_config_rejects_multiple_cli_args(self) -> None:
config_file = f.name
try:
- test_argv = ["script.py", "--config", config_file, "debug.verbose=True", "export.output_dir=/tmp"]
+ test_argv = [
+ "script.py",
+ "--config",
+ config_file,
+ "debug.verbose=True",
+ "export.output_dir=/tmp",
+ ]
with patch.object(sys, "argv", test_argv):
with self.assertRaises(ValueError):
main()
@@ -109,4 +127,3 @@ def test_config_rejects_multiple_cli_args(self) -> None:
if __name__ == "__main__":
unittest.main()
-
From 8f9faa2dc98f5b58f0889b7db266449185697e64 Mon Sep 17 00:00:00 2001
From: Jack Zhang <32371937+jackzhxng@users.noreply.github.com>
Date: Mon, 23 Jun 2025 11:40:03 -0700
Subject: [PATCH 13/17] Update
[ghstack-poisoned]
---
.github/workflows/android-perf.yml | 18 +++++++++---------
.github/workflows/apple-perf.yml | 18 +++++++++---------
examples/models/llama/config/llm_config.py | 13 +++++++++----
examples/models/llama/export_llama_lib.py | 12 ++++++------
examples/models/qwen3/README.md | 20 ++++++++++----------
extension/llm/export/README.md | 2 +-
6 files changed, 44 insertions(+), 39 deletions(-)
diff --git a/.github/workflows/android-perf.yml b/.github/workflows/android-perf.yml
index 7d8f0a80153..7b509be28b9 100644
--- a/.github/workflows/android-perf.yml
+++ b/.github/workflows/android-perf.yml
@@ -228,9 +228,9 @@ jobs:
export.output_name="${OUT_ET_MODEL_NAME}.pte" \
model.use_kv_cache=true \
model.dtype_override=fp32 \
- base.preq_embedding_quantize="8,0" \
+ base.preq_embedding_quantize='8,0' \
quantization.use_spin_quant=native \
- base.metadata="\{\"get_bos_id\":128000,\"get_eos_ids\":[128009,128001]\}"
+ base.metadata='{"get_bos_id":128000,"get_eos_ids":[128009,128001]}'
ls -lh "${OUT_ET_MODEL_NAME}.pte"
elif [[ ${{ matrix.config }} == "llama3_qlora" ]]; then
# QAT + LoRA
@@ -249,7 +249,7 @@ jobs:
base.use_lora=16 \
base.preq_mode="8da4w_output_8da8w" \
base.preq_group_size=32 \
- base.preq_embedding_quantize="8,0" \
+ base.preq_embedding_quantize='8,0' \
model.use_sdpa_with_kv_cache=true \
model.use_kv_cache=true \
backend.xnnpack.enabled=true \
@@ -258,7 +258,7 @@ jobs:
export.max_seq_length=2048 \
export.max_context_length=2048 \
export.output_name="${OUT_ET_MODEL_NAME}.pte" \
- base.metadata="\{\"get_bos_id\":128000,\"get_eos_ids\":[128009,128001]\}"
+ base.metadata='{"get_bos_id":128000,"get_eos_ids":[128009,128001]}'
ls -lh "${OUT_ET_MODEL_NAME}.pte"
elif [[ ${{ matrix.config }} == "llama3_fb16" ]]; then
# Original BF16 version, without any quantization
@@ -271,7 +271,7 @@ jobs:
model.use_sdpa_with_kv_cache=true \
backend.xnnpack.enabled=true \
model.dtype_override=bf16 \
- base.metadata="\{\"get_bos_id\":128000,\"get_eos_ids\":[128009,128001]\}" \
+ base.metadata='{"get_bos_id":128000,"get_eos_ids":[128009,128001]}' \
export.output_name="${OUT_ET_MODEL_NAME}.pte"
ls -lh "${OUT_ET_MODEL_NAME}.pte"
elif [[ ${{ matrix.config }} == "et_xnnpack_custom_spda_kv_cache_8da4w" ]]; then
@@ -287,8 +287,8 @@ jobs:
backend.xnnpack.extended_ops=true \
quantization.qmode=8da4w \
quantization.group_size=32 \
- quantization.embedding_quantize="8,0" \
- base.metadata="\{\"get_bos_id\":128000,\"get_eos_ids\":[128009,128001]\}" \
+ quantization.embedding_quantize='8,0' \
+ base.metadata='{"get_bos_id":128000,"get_eos_ids":[128009,128001]}' \
export.output_name="${OUT_ET_MODEL_NAME}.pte"
ls -lh "${OUT_ET_MODEL_NAME}.pte"
elif [[ ${{ matrix.config }} == "llama3_qnn_htp" ]]; then
@@ -325,8 +325,8 @@ jobs:
backend.xnnpack.extended_ops=true \
quantization.qmode=8da4w \
quantization.group_size=32 \
- quantization.embedding_quantize="8,0" \
- base.metadata="\{\"get_bos_id\":151644,\"get_eos_ids\":[151645]\}" \
+ quantization.embedding_quantize='8,0' \
+ base.metadata='{"get_bos_id":151644,"get_eos_ids":[151645]}' \
export.output_name="${OUT_ET_MODEL_NAME}.pte"
ls -lh "${OUT_ET_MODEL_NAME}.pte"
fi
diff --git a/.github/workflows/apple-perf.yml b/.github/workflows/apple-perf.yml
index 003191250e3..1bcc2adba51 100644
--- a/.github/workflows/apple-perf.yml
+++ b/.github/workflows/apple-perf.yml
@@ -237,9 +237,9 @@ jobs:
export.output_name="${OUT_ET_MODEL_NAME}.pte" \
model.use_kv_cache=true \
model.dtype_override=fp32 \
- base.preq_embedding_quantize="8,0" \
+ base.preq_embedding_quantize='8,0' \
quantization.use_spin_quant=native \
- base.metadata="\{\"get_bos_id\":128000,\"get_eos_ids\":[128009,128001]\}"
+ base.metadata='{"get_bos_id":128000,"get_eos_ids":[128009,128001]}'
ls -lh "${OUT_ET_MODEL_NAME}.pte"
elif [[ ${{ matrix.config }} == "llama3_qlora" ]]; then
# QAT + LoRA
@@ -258,7 +258,7 @@ jobs:
base.use_lora=16 \
base.preq_mode="8da4w_output_8da8w" \
base.preq_group_size=32 \
- base.preq_embedding_quantize="8,0" \
+ base.preq_embedding_quantize='8,0' \
model.use_sdpa_with_kv_cache=true \
model.use_kv_cache=true \
backend.xnnpack.enabled=true \
@@ -267,7 +267,7 @@ jobs:
export.max_seq_length=2048 \
export.max_context_length=2048 \
export.output_name="${OUT_ET_MODEL_NAME}.pte" \
- base.metadata="\{\"get_bos_id\":128000,\"get_eos_ids\":[128009,128001]\}"
+ base.metadata='{"get_bos_id":128000,"get_eos_ids":[128009,128001]}'
ls -lh "${OUT_ET_MODEL_NAME}.pte"
elif [[ ${{ matrix.config }} == "llama3_fb16" ]]; then
# Original BF16 version, without any quantization
@@ -280,7 +280,7 @@ jobs:
model.use_sdpa_with_kv_cache=true \
backend.xnnpack.enabled=true \
model.dtype_override=bf16 \
- base.metadata="\{\"get_bos_id\":128000,\"get_eos_ids\":[128009,128001]\}" \
+ base.metadata='{"get_bos_id":128000,"get_eos_ids":[128009,128001]}' \
export.output_name="${OUT_ET_MODEL_NAME}.pte"
ls -lh "${OUT_ET_MODEL_NAME}.pte"
elif [[ ${{ matrix.config }} == "et_xnnpack_custom_spda_kv_cache_8da4w" ]]; then
@@ -296,8 +296,8 @@ jobs:
backend.xnnpack.extended_ops=true \
quantization.qmode=8da4w \
quantization.group_size=32 \
- quantization.embedding_quantize="8,0" \
- base.metadata="\{\"get_bos_id\":128000,\"get_eos_ids\":[128009,128001]\}" \
+ quantization.embedding_quantize='8,0' \
+ base.metadata='{"get_bos_id":128000,"get_eos_ids":[128009,128001]}' \
export.output_name="${OUT_ET_MODEL_NAME}.pte"
ls -lh "${OUT_ET_MODEL_NAME}.pte"
elif [[ ${{ matrix.config }} == "llama3_coreml_ane" ]]; then
@@ -330,8 +330,8 @@ jobs:
backend.xnnpack.extended_ops=true \
quantization.qmode=8da4w \
quantization.group_size=32 \
- quantization.embedding_quantize="8,0" \
- base.metadata="\{\"get_bos_id\":151644,\"get_eos_ids\":[151645]\}" \
+ quantization.embedding_quantize='8,0' \
+ base.metadata='{"get_bos_id":151644,"get_eos_ids":[151645]}' \
export.output_name="${OUT_ET_MODEL_NAME}.pte"
ls -lh "${OUT_ET_MODEL_NAME}.pte"
fi
diff --git a/examples/models/llama/config/llm_config.py b/examples/models/llama/config/llm_config.py
index 9acd633fb21..72342199dfd 100644
--- a/examples/models/llama/config/llm_config.py
+++ b/examples/models/llama/config/llm_config.py
@@ -10,6 +10,11 @@
Configurations for exporting Llama.
Uses dataclasses, which integrate with OmegaConf and Hydra.
+
+Note:
+- Hydra is a bit finicky with string values that include quotation marks; please
+refer to https://hydra.cc/docs/1.2/advanced/override_grammar/basic/#quoted-values
+for more information.
"""
import argparse
@@ -34,9 +39,9 @@ class ModelType(str, Enum):
llama3_2_vision = "llama3_2_vision"
static_llama = "static_llama"
qwen2_5 = "qwen2_5"
- qwen3_0_6b = "qwen3-0_6b"
- qwen3_1_7b = "qwen3-1_7b"
- qwen3_4b = "qwen3-4b"
+ qwen3_0_6b = "qwen3_0_6b"
+ qwen3_1_7b = "qwen3_1_7b"
+ qwen3_4b = "qwen3_4b"
phi_4_mini = "phi_4_mini"
smollm2 = "smollm2"
@@ -71,7 +76,7 @@ class BaseConfig:
checkpoint_dir: Path to directory containing sharded checkpoint files.
tokenizer_path: Path to the tokenizer file.
metadata: Json string containing metadata information.
- e.g. '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}'
+ e.g. '"{\"get_bos_id\":128000, \"get_eos_ids\":[128009, 128001]}"'
use_lora: Rank of the LoRA, if set to 0 then this means no LoRA. For use with QAT.
fairseq2: For legacy internal use cases, this is safe to ignore.
preq_mode: Legacy option to specify how prequantized weights are loaded.
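As a quick illustration of the quoting that the docstring note above relies on, here is a minimal sketch (not part of the patch) of what actually reaches Hydra for the two value styles used throughout this series. The printf calls simply echo the argument the exporter would receive; the override names are the real ones from llm_config.py, but no export is performed.

```bash
# Outer single quotes are consumed by the shell; the escaped inner quotes
# survive, so Hydra receives one quoted JSON string instead of parsing the
# braces and commas itself.
printf '%s\n' base.metadata='"{\"get_bos_id\":128000, \"get_eos_ids\":[128009, 128001]}"'
# -> base.metadata="{\"get_bos_id\":128000, \"get_eos_ids\":[128009, 128001]}"

# Escaped quotes keep literal quotes in the argument, so Hydra sees '8,0'
# as a single quoted string rather than a bare 8,0.
printf '%s\n' quantization.embedding_quantize=\'8,0\'
# -> quantization.embedding_quantize='8,0'
```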
diff --git a/examples/models/llama/export_llama_lib.py b/examples/models/llama/export_llama_lib.py
index 3c66f496e51..4ed29c4c103 100644
--- a/examples/models/llama/export_llama_lib.py
+++ b/examples/models/llama/export_llama_lib.py
@@ -104,9 +104,9 @@
"llama3_2",
"static_llama",
"qwen2_5",
- "qwen3-0_6b",
- "qwen3-1_7b",
- "qwen3-4b",
+ "qwen3_0_6b",
+ "qwen3_1_7b",
+ "qwen3_4b",
"phi_4_mini",
"smollm2",
]
@@ -115,9 +115,9 @@
"qwen2_5": "Qwen/Qwen2.5-1.5B",
"phi_4_mini": "microsoft/Phi-4-mini-instruct",
"smollm2": "HuggingFaceTB/SmolLM-135M",
- "qwen3-0_6b": "Qwen/Qwen3-0.6B",
- "qwen3-1_7b": "Qwen/Qwen3-1.7B",
- "qwen3-4b": "Qwen/Qwen3-4B",
+ "qwen3_0_6b": "Qwen/Qwen3-0.6B",
+ "qwen3_1_7b": "Qwen/Qwen3-1.7B",
+ "qwen3_4b": "Qwen/Qwen3-4B",
}
diff --git a/examples/models/qwen3/README.md b/examples/models/qwen3/README.md
index acdd4497503..b59021a3a83 100644
--- a/examples/models/qwen3/README.md
+++ b/examples/models/qwen3/README.md
@@ -7,7 +7,7 @@ Qwen 3 uses the same example code as our optimized Llama model, while the checkp
All commands for exporting and running Llama on various backends should also be applicable to Qwen 3, by swapping the following args:
```
-base.model_class=[qwen3-0_6b,qwen3-1_7b,qwen3-4b]
+base.model_class=[qwen3_0_6b,qwen3_1_7b,qwen3_4b]
base.params=[examples/models/qwen3/0_6b_config.json,examples/models/qwen3/1_7b_config.json,examples/models/qwen3/4b_config.json]
```
@@ -17,7 +17,7 @@ Here is a basic example for exporting Qwen 3, although please refer to the Llama
Export 0.6b to XNNPack, quantized with 8da4w:
```
python -m extension.llm.export.export_llm \
- base.model_class="qwen3-0_6b" \
+ base.model_class="qwen3_0_6b" \
base.params="examples/models/qwen3/0_6b_config.json" \
model.use_kv_cache=True \
model.use_sdpa_with_kv_cache=True \
@@ -26,14 +26,14 @@ python -m extension.llm.export.export_llm \
backend.xnnpack.extended_ops=True \
quantization.qmode="8da4w" \
base.metadata='{"get_bos_id": 151644, "get_eos_ids":[151645]}' \
- export.output_name="qwen3-0_6b.pte" \
+ export.output_name="qwen3_0_6b.pte" \
debug.verbose=True
```
Export 1.7b to XNNPack, quantized with 8da4w:
```
python -m extension.llm.export.export_llm \
- base.model_class="qwen3-1_7b" \
+ base.model_class="qwen3_1_7b" \
base.params="examples/models/qwen3/1_7b_config.json" \
model.use_kv_cache=True \
model.use_sdpa_with_kv_cache=True \
@@ -42,14 +42,14 @@ python -m extension.llm.export.export_llm \
backend.xnnpack.extended_ops=True \
quantization.qmode="8da4w" \
base.metadata='{"get_bos_id": 151644, "get_eos_ids":[151645]}' \
- export.output_name="qwen3-1_7b.pte" \
+ export.output_name="qwen3_1_7b.pte" \
debug.verbose=True
```
Export 4b to XNNPack, quantized with 8da4w:
```
python -m extension.llm.export.export_llm \
- base.model_class="qwen3-4b" \
+ base.model_class="qwen3_4b" \
base.params="examples/models/qwen3/4b_config.json" \
model.use_kv_cache=True \
model.use_sdpa_with_kv_cache=True \
@@ -58,7 +58,7 @@ python -m extension.llm.export.export_llm \
backend.xnnpack.extended_ops=True \
quantization.qmode="8da4w" \
base.metadata='{"get_bos_id": 151644, "get_eos_ids":[151645]}' \
- export.output_name="qwen3-4b.pte" \
+ export.output_name="qwen3_4b.pte" \
debug.verbose=True
```
@@ -66,8 +66,8 @@ python -m extension.llm.export.export_llm \
With ExecuTorch pybindings:
```
python -m examples.models.llama.runner.native
- --model qwen3-0_6b \
- --pte qwen3-0_6b.pte \
+ --model qwen3_0_6b \
+ --pte qwen3_0_6b.pte \
--tokenizer ~/.cache/huggingface/hub/models--Qwen--Qwen3-0.6B/snapshots/a9c98e602b9d36d2a2f7ba1eb0f5f31e4e8e5143/tokenizer.json \
--tokenizer_config ~/.cache/huggingface/hub/models--Qwen--Qwen3-0.6B/snapshots/a9c98e602b9d36d2a2f7ba1eb0f5f31e4e8e5143/tokenizer_config.json \
--prompt "Who is the president of the US?" \
@@ -80,7 +80,7 @@ python -m examples.models.llama.runner.native
With ExecuTorch's sample c++ runner (see the Llama README's [Step 3: Run on your computer to validate](../llama/README.md#step-3-run-on-your-computer-to-validate) to build the runner):
```
cmake-out/examples/models/llama/llama_main
- --model_path qwen3-0_6b.pte
+ --model_path qwen3_0_6b.pte
--tokenizer_path ~/.cache/huggingface/hub/models--Qwen--Qwen3-0.6B/snapshots/a9c98e602b9d36d2a2f7ba1eb0f5f31e4e8e5143/tokenizer.json
--prompt="Who is the president of the US?"
```
diff --git a/extension/llm/export/README.md b/extension/llm/export/README.md
index 1ac27306c86..96f36acc1b4 100644
--- a/extension/llm/export/README.md
+++ b/extension/llm/export/README.md
@@ -85,7 +85,7 @@ debug:
### Export Qwen3 0.6B with XNNPACK backend and quantization
```bash
python -m extension.llm.export.export_llm \
- base.model_class=qwen3-0_6b \
+ base.model_class=qwen3_0_6b \
base.params=examples/models/qwen3/0_6b_config.json \
base.metadata='{"get_bos_id": 151644, "get_eos_ids":[151645]}' \
model.use_kv_cache=true \
From 199ff957a8a4d9e4b7f4cdf165a4c456c5d5f7a7 Mon Sep 17 00:00:00 2001
From: Jack Zhang <32371937+jackzhxng@users.noreply.github.com>
Date: Mon, 23 Jun 2025 11:43:12 -0700
Subject: [PATCH 14/17] Update
[ghstack-poisoned]
---
.github/workflows/android-perf.yml | 8 ++++----
.github/workflows/apple-perf.yml | 8 ++++----
2 files changed, 8 insertions(+), 8 deletions(-)
diff --git a/.github/workflows/android-perf.yml b/.github/workflows/android-perf.yml
index 7b509be28b9..1bede7c3f27 100644
--- a/.github/workflows/android-perf.yml
+++ b/.github/workflows/android-perf.yml
@@ -228,7 +228,7 @@ jobs:
export.output_name="${OUT_ET_MODEL_NAME}.pte" \
model.use_kv_cache=true \
model.dtype_override=fp32 \
- base.preq_embedding_quantize='8,0' \
+ base.preq_embedding_quantize=\'8,0\' \
quantization.use_spin_quant=native \
base.metadata='{"get_bos_id":128000,"get_eos_ids":[128009,128001]}'
ls -lh "${OUT_ET_MODEL_NAME}.pte"
@@ -249,7 +249,7 @@ jobs:
base.use_lora=16 \
base.preq_mode="8da4w_output_8da8w" \
base.preq_group_size=32 \
- base.preq_embedding_quantize='8,0' \
+ base.preq_embedding_quantize=\'8,0\' \
model.use_sdpa_with_kv_cache=true \
model.use_kv_cache=true \
backend.xnnpack.enabled=true \
@@ -287,7 +287,7 @@ jobs:
backend.xnnpack.extended_ops=true \
quantization.qmode=8da4w \
quantization.group_size=32 \
- quantization.embedding_quantize='8,0' \
+ quantization.embedding_quantize=\'8,0\' \
base.metadata='{"get_bos_id":128000,"get_eos_ids":[128009,128001]}' \
export.output_name="${OUT_ET_MODEL_NAME}.pte"
ls -lh "${OUT_ET_MODEL_NAME}.pte"
@@ -325,7 +325,7 @@ jobs:
backend.xnnpack.extended_ops=true \
quantization.qmode=8da4w \
quantization.group_size=32 \
- quantization.embedding_quantize='8,0' \
+ quantization.embedding_quantize=\'8,0\' \
base.metadata='{"get_bos_id":151644,"get_eos_ids":[151645]}' \
export.output_name="${OUT_ET_MODEL_NAME}.pte"
ls -lh "${OUT_ET_MODEL_NAME}.pte"
diff --git a/.github/workflows/apple-perf.yml b/.github/workflows/apple-perf.yml
index 1bcc2adba51..1155ef2a7b2 100644
--- a/.github/workflows/apple-perf.yml
+++ b/.github/workflows/apple-perf.yml
@@ -237,7 +237,7 @@ jobs:
export.output_name="${OUT_ET_MODEL_NAME}.pte" \
model.use_kv_cache=true \
model.dtype_override=fp32 \
- base.preq_embedding_quantize='8,0' \
+ base.preq_embedding_quantize=\'8,0\' \
quantization.use_spin_quant=native \
base.metadata='{"get_bos_id":128000,"get_eos_ids":[128009,128001]}'
ls -lh "${OUT_ET_MODEL_NAME}.pte"
@@ -258,7 +258,7 @@ jobs:
base.use_lora=16 \
base.preq_mode="8da4w_output_8da8w" \
base.preq_group_size=32 \
- base.preq_embedding_quantize='8,0' \
+ base.preq_embedding_quantize=\'8,0\' \
model.use_sdpa_with_kv_cache=true \
model.use_kv_cache=true \
backend.xnnpack.enabled=true \
@@ -296,7 +296,7 @@ jobs:
backend.xnnpack.extended_ops=true \
quantization.qmode=8da4w \
quantization.group_size=32 \
- quantization.embedding_quantize='8,0' \
+ quantization.embedding_quantize=\'8,0\' \
base.metadata='{"get_bos_id":128000,"get_eos_ids":[128009,128001]}' \
export.output_name="${OUT_ET_MODEL_NAME}.pte"
ls -lh "${OUT_ET_MODEL_NAME}.pte"
@@ -330,7 +330,7 @@ jobs:
backend.xnnpack.extended_ops=true \
quantization.qmode=8da4w \
quantization.group_size=32 \
- quantization.embedding_quantize='8,0' \
+ quantization.embedding_quantize=\'8,0\' \
base.metadata='{"get_bos_id":151644,"get_eos_ids":[151645]}' \
export.output_name="${OUT_ET_MODEL_NAME}.pte"
ls -lh "${OUT_ET_MODEL_NAME}.pte"
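A note on the escaping change in this patch: in these workflow run steps the command goes through the shell, so '8,0' loses its quotes before Hydra ever sees them, while \'8,0\' keeps them. A minimal sketch of the difference (illustrative only, not part of the workflow):

```bash
# What the exporter receives for each form of the override.
printf '%s\n' quantization.embedding_quantize='8,0'    # -> quantization.embedding_quantize=8,0
printf '%s\n' quantization.embedding_quantize=\'8,0\'  # -> quantization.embedding_quantize='8,0'
```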
From 0118873d79a6997fae75252a880804ed73c2c6cc Mon Sep 17 00:00:00 2001
From: Jack Zhang <32371937+jackzhxng@users.noreply.github.com>
Date: Mon, 23 Jun 2025 11:57:35 -0700
Subject: [PATCH 15/17] Update
[ghstack-poisoned]
---
.../docs/delegates/qualcomm_README.md | 4 ++--
.../docs/delegates/xnnpack_README.md | 10 +++++-----
.../LLaMA/docs/delegates/xnnpack_README.md | 8 ++++----
.../deepseek-r1-distill-llama-8B/README.md | 4 ++--
examples/models/llama/README.md | 20 +++++++++----------
examples/models/phi_4_mini/README.md | 2 +-
examples/models/qwen2_5/README.md | 2 +-
examples/models/qwen3/README.md | 6 +++---
8 files changed, 28 insertions(+), 28 deletions(-)
diff --git a/examples/demo-apps/android/LlamaDemo/docs/delegates/qualcomm_README.md b/examples/demo-apps/android/LlamaDemo/docs/delegates/qualcomm_README.md
index 969b6cacab9..360e92a5f30 100644
--- a/examples/demo-apps/android/LlamaDemo/docs/delegates/qualcomm_README.md
+++ b/examples/demo-apps/android/LlamaDemo/docs/delegates/qualcomm_README.md
@@ -106,12 +106,12 @@ We support PTQ by default. The entire export may take ~20 minutes (Llama 3.1 8B)
Examples:
```
# 4 bits weight only quantize
-python -m extension.llm.export.export_llm base.checkpoint="${MODEL_DIR}/consolidated.00.pth" base.params="${MODEL_DIR}/params.json" model.use_kv_cache=True model.enable_dynamic_shape=False backend.qnn.enabled=True backend.qnn.quantization="qnn_16a4w" model.dtype_override="fp32" base.metadata='{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' export.output_name="test.pte"
+python -m extension.llm.export.export_llm base.checkpoint="${MODEL_DIR}/consolidated.00.pth" base.params="${MODEL_DIR}/params.json" model.use_kv_cache=True model.enable_dynamic_shape=False backend.qnn.enabled=True backend.qnn.quantization="qnn_16a4w" model.dtype_override="fp32" base.metadata='"{\"get_bos_id\":128000, \"get_eos_ids\":[128009, 128001]}"' export.output_name="test.pte"
```
If the model is really big, it may require model sharding because the Qualcomm DSP is a 32-bit system and has a 4GB size limit. For example, for Llama 3 8B models, we need to shard the model into 4, but ExecuTorch still packages it into one PTE file. Here is an example:
```
# 8 bits quantization with 4 shards
-python -m extension.llm.export.export_llm base.checkpoint="${MODEL_DIR}/consolidated.00.pth" base.params="${MODEL_DIR}/params.json" model.use_kv_cache=True model.enable_dynamic_shape=False backend.qnn.enabled=True backend.qnn.quantization="qnn_8a8w" model.dtype_override="fp32" backend.qnn.num_sharding=4 base.metadata='{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' export.output_name="test.pte"
+python -m extension.llm.export.export_llm base.checkpoint="${MODEL_DIR}/consolidated.00.pth" base.params="${MODEL_DIR}/params.json" model.use_kv_cache=True model.enable_dynamic_shape=False backend.qnn.enabled=True backend.qnn.quantization="qnn_8a8w" model.dtype_override="fp32" backend.qnn.num_sharding=4 base.metadata='"{\"get_bos_id\":128000, \"get_eos_ids\":[128009, 128001]}"' export.output_name="test.pte"
```
Note: if you encountered issues below
```
diff --git a/examples/demo-apps/android/LlamaDemo/docs/delegates/xnnpack_README.md b/examples/demo-apps/android/LlamaDemo/docs/delegates/xnnpack_README.md
index c60bd537e6b..baf8ffb7071 100644
--- a/examples/demo-apps/android/LlamaDemo/docs/delegates/xnnpack_README.md
+++ b/examples/demo-apps/android/LlamaDemo/docs/delegates/xnnpack_README.md
@@ -55,7 +55,7 @@ In this demo app, we support text-only inference with up-to-date Llama models an
Meta has released prequantized INT4 SpinQuant Llama 3.2 models that ExecuTorch supports on the XNNPACK backend.
* Export Llama model and generate .pte file as below:
```
-python -m extension.llm.export.export_llm base.model_class="llama3_2" base.checkpoint= base.params= model.use_kv_cache=True model.use_sdpa_with_kv_cache=True backend.xnnpack.enabled=True model.dtype_override="fp32" backend.xnnpack.extended_ops=True base.preq_mode="8da4w_output_8da8w" base.preq_group_size=32 export.max_seq_length=2048 export.max_context_length=2048 base.preq_embedding_quantize="8,0" quantization.use_spin_quant="native" base.metadata='{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' export.output_name="llama3_2_spinquant.pte"
+python -m extension.llm.export.export_llm base.model_class="llama3_2" base.checkpoint= base.params= model.use_kv_cache=True model.use_sdpa_with_kv_cache=True backend.xnnpack.enabled=True model.dtype_override="fp32" backend.xnnpack.extended_ops=True base.preq_mode="8da4w_output_8da8w" base.preq_group_size=32 export.max_seq_length=2048 export.max_context_length=2048 base.preq_embedding_quantize=\'8,0\' quantization.use_spin_quant="native" base.metadata='"{\"get_bos_id\":128000, \"get_eos_ids\":[128009, 128001]}"' export.output_name="llama3_2_spinquant.pte"
```
For convenience, an [exported ExecuTorch SpinQuant model](https://huggingface.co/executorch-community/Llama-3.2-1B-Instruct-SpinQuant_INT4_EO8-ET/blob/main/Llama-3.2-1B-Instruct-SpinQuant_INT4_EO8.pte) is available on Hugging Face. The export was created using [this detailed recipe notebook](https://huggingface.co/executorch-community/Llama-3.2-1B-Instruct-SpinQuant_INT4_EO8-ET/blob/main/Export_Recipe_Llama_3_2_1B_Instruct_SpinQuant_INT4_EO8.ipynb).
@@ -63,7 +63,7 @@ For convenience, an [exported ExecuTorch SpinQuant model](https://huggingface.co
Meta has released prequantized INT4 QAT+LoRA Llama 3.2 models that ExecuTorch supports on the XNNPACK backend.
* Export Llama model and generate .pte file as below:
```
-python -m extension.llm.export.export_llm base.model_class="llama3_2" base.checkpoint= base.params= quantization.use_qat=True base.use_lora=16 model.use_kv_cache=True model.use_sdpa_with_kv_cache=True backend.xnnpack.enabled=True model.dtype_override="fp32" backend.xnnpack.extended_ops=True base.preq_mode="8da4w_output_8da8w" base.preq_group_size=32 export.max_seq_length=2048 export.max_context_length=2048 base.preq_embedding_quantize="8,0" base.metadata='{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' export.output_name="llama3_2_qat_lora.pte"
+python -m extension.llm.export.export_llm base.model_class="llama3_2" base.checkpoint= base.params= quantization.use_qat=True base.use_lora=16 model.use_kv_cache=True model.use_sdpa_with_kv_cache=True backend.xnnpack.enabled=True model.dtype_override="fp32" backend.xnnpack.extended_ops=True base.preq_mode="8da4w_output_8da8w" base.preq_group_size=32 export.max_seq_length=2048 export.max_context_length=2048 base.preq_embedding_quantize=\'8,0\' base.metadata='"{\"get_bos_id\":128000, \"get_eos_ids\":[128009, 128001]}"' export.output_name="llama3_2_qat_lora.pte"
```
For convenience, an [exported ExecuTorch QAT+LoRA model](https://huggingface.co/executorch-community/Llama-3.2-1B-Instruct-QLORA_INT4_EO8-ET/blob/main/Llama-3.2-1B-Instruct-QLORA_INT4_EO8.pte) is available on Hugging Face. The export was created using [this detailed recipe notebook](https://huggingface.co/executorch-community/Llama-3.2-1B-Instruct-QLORA_INT4_EO8-ET/blob/main/Export_Recipe_Llama_3_2_1B_Instruct_QLORA_INT4_EO8.ipynb).
@@ -74,7 +74,7 @@ We have supported BF16 as a data type on the XNNPACK backend for Llama 3.2 1B/3B
* Export Llama model and generate .pte file as below:
```
-python -m extension.llm.export.export_llm base.model_class="llama3_2" base.checkpoint= base.params= model.use_kv_cache=True model.use_sdpa_with_kv_cache=True backend.xnnpack.enabled=True model.dtype_override="bf16" base.metadata='{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' export.output_name="llama3_2_bf16.pte"
+python -m extension.llm.export.export_llm base.model_class="llama3_2" base.checkpoint= base.params= model.use_kv_cache=True model.use_sdpa_with_kv_cache=True backend.xnnpack.enabled=True model.dtype_override="bf16" base.metadata='"{\"get_bos_id\":128000, \"get_eos_ids\":[128009, 128001]}"' export.output_name="llama3_2_bf16.pte"
```
For convenience, an [exported ExecuTorch bf16 model](https://huggingface.co/executorch-community/Llama-3.2-1B-ET/blob/main/llama3_2-1B.pte) is available on Hugging Face. The export was created using [this detailed recipe notebook](https://huggingface.co/executorch-community/Llama-3.2-1B-ET/blob/main/ExportRecipe_1B.ipynb).
@@ -90,7 +90,7 @@ To safeguard your application, you can use our Llama Guard models for prompt cla
* We prepared this model using the following command
```
-python -m extension.llm.export.export_llm base.checkpoint= base.params= model.dtype_override="fp32" model.use_kv_cache=True model.use_sdpa_with_kv_cache=True quantization.qmode="8da4w" quantization.group_size=256 backend.xnnpack.enabled=True export.max_seq_length=8193 export.max_context_length=8193 quantization.embedding_quantize="4,32" base.metadata='{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' base.output_prune_map= export.output_name="llama_guard_3_1b_pruned_xnnpack.pte"
+python -m extension.llm.export.export_llm base.checkpoint= base.params= model.dtype_override="fp32" model.use_kv_cache=True model.use_sdpa_with_kv_cache=True quantization.qmode="8da4w" quantization.group_size=256 backend.xnnpack.enabled=True export.max_seq_length=8193 export.max_context_length=8193 quantization.embedding_quantize=\'4,32\' base.metadata='"{\"get_bos_id\":128000, \"get_eos_ids\":[128009, 128001]}"' base.output_prune_map= export.output_name="llama_guard_3_1b_pruned_xnnpack.pte"
```
@@ -100,7 +100,7 @@
-python -m extension.llm.export.export_llm base.checkpoint= base.params= model.use_kv_cache=True model.use_sdpa_with_kv_cache=True backend.xnnpack.enabled=True quantization.qmode="8da4w" quantization.group_size=128 model.dtype_override="fp32" base.metadata='{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' export.output_name="llama.pte"
+python -m extension.llm.export.export_llm base.checkpoint= base.params= model.use_kv_cache=True model.use_sdpa_with_kv_cache=True backend.xnnpack.enabled=True quantization.qmode="8da4w" quantization.group_size=128 model.dtype_override="fp32" base.metadata='"{\"get_bos_id\":128000, \"get_eos_ids\":[128009, 128001]}"' export.output_name="llama.pte"
```
You may wonder what the `base.metadata` flag is doing. This flag helps export the model with the proper special tokens added so that the runner can detect EOS tokens easily.
diff --git a/examples/demo-apps/apple_ios/LLaMA/docs/delegates/xnnpack_README.md b/examples/demo-apps/apple_ios/LLaMA/docs/delegates/xnnpack_README.md
index d64a119e35f..6cca65339da 100644
--- a/examples/demo-apps/apple_ios/LLaMA/docs/delegates/xnnpack_README.md
+++ b/examples/demo-apps/apple_ios/LLaMA/docs/delegates/xnnpack_README.md
@@ -51,7 +51,7 @@ In this demo app, we support text-only inference with up-to-date Llama models an
Meta has released prequantized INT4 SpinQuant Llama 3.2 models that ExecuTorch supports on the XNNPACK backend.
* Export Llama model and generate .pte file as below:
```
-python -m extension.llm.export.export_llm base.model_class="llama3_2" base.checkpoint= base.params= model.use_kv_cache=True model.use_sdpa_with_kv_cache=True backend.xnnpack.enabled=True model.dtype_override="fp32" backend.xnnpack.extended_ops=True base.preq_mode="8da4w_output_8da8w" base.preq_group_size=32 export.max_seq_length=2048 export.max_context_length=2048 base.preq_embedding_quantize="8,0" quantization.use_spin_quant="native" base.metadata='{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' export.output_name="llama3_2_spinquant.pte"
+python -m extension.llm.export.export_llm base.model_class="llama3_2" base.checkpoint= base.params= model.use_kv_cache=True model.use_sdpa_with_kv_cache=True backend.xnnpack.enabled=True model.dtype_override="fp32" backend.xnnpack.extended_ops=True base.preq_mode="8da4w_output_8da8w" base.preq_group_size=32 export.max_seq_length=2048 export.max_context_length=2048 base.preq_embedding_quantize=\'8,0\' quantization.use_spin_quant="native" base.metadata='"{\"get_bos_id\":128000, \"get_eos_ids\":[128009, 128001]}"' export.output_name="llama3_2_spinquant.pte"
```
For convenience, an [exported ExecuTorch SpinQuant model](https://huggingface.co/executorch-community/Llama-3.2-1B-Instruct-SpinQuant_INT4_EO8-ET/blob/main/Llama-3.2-1B-Instruct-SpinQuant_INT4_EO8.pte) is available on Hugging Face. The export was created using [this detailed recipe notebook](https://huggingface.co/executorch-community/Llama-3.2-1B-Instruct-SpinQuant_INT4_EO8-ET/blob/main/Export_Recipe_Llama_3_2_1B_Instruct_SpinQuant_INT4_EO8.ipynb).
@@ -59,7 +59,7 @@ For convenience, an [exported ExecuTorch SpinQuant model](https://huggingface.co
Meta has released prequantized INT4 QAT+LoRA Llama 3.2 models that ExecuTorch supports on the XNNPACK backend.
* Export Llama model and generate .pte file as below:
```
-python -m extension.llm.export.export_llm base.model_class="llama3_2" base.checkpoint= base.params= quantization.use_qat=True base.use_lora=16 model.use_kv_cache=True model.use_sdpa_with_kv_cache=True backend.xnnpack.enabled=True model.dtype_override="fp32" backend.xnnpack.extended_ops=True base.preq_mode="8da4w_output_8da8w" base.preq_group_size=32 export.max_seq_length=2048 export.max_context_length=2048 base.preq_embedding_quantize="8,0" base.metadata='{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' export.output_name="llama3_2_qat_lora.pte"
+python -m extension.llm.export.export_llm base.model_class="llama3_2" base.checkpoint= base.params= quantization.use_qat=True base.use_lora=16 model.use_kv_cache=True model.use_sdpa_with_kv_cache=True backend.xnnpack.enabled=True model.dtype_override="fp32" backend.xnnpack.extended_ops=True base.preq_mode="8da4w_output_8da8w" base.preq_group_size=32 export.max_seq_length=2048 export.max_context_length=2048 base.preq_embedding_quantize=\'8,0\' base.metadata='"{\"get_bos_id\":128000, \"get_eos_ids\":[128009, 128001]}"' export.output_name="llama3_2_qat_lora.pte"
```
For convenience, an [exported ExecuTorch QAT+LoRA model](https://huggingface.co/executorch-community/Llama-3.2-1B-Instruct-QLORA_INT4_EO8-ET/blob/main/Llama-3.2-1B-Instruct-QLORA_INT4_EO8.pte) is available on Hugging Face. The export was created using [this detailed recipe notebook](https://huggingface.co/executorch-community/Llama-3.2-1B-Instruct-QLORA_INT4_EO8-ET/blob/main/Export_Recipe_Llama_3_2_1B_Instruct_QLORA_INT4_EO8.ipynb).
@@ -69,7 +69,7 @@ We have supported BF16 as a data type on the XNNPACK backend for Llama 3.2 1B/3B
* Export Llama model and generate .pte file as below:
```
-python -m extension.llm.export.export_llm base.model_class="llama3_2" base.checkpoint= base.params= model.use_kv_cache=True model.use_sdpa_with_kv_cache=True backend.xnnpack.enabled=True model.dtype_override="bf16" base.metadata='{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' export.output_name="llama3_2_bf16.pte"
+python -m extension.llm.export.export_llm base.model_class="llama3_2" base.checkpoint= base.params= model.use_kv_cache=True model.use_sdpa_with_kv_cache=True backend.xnnpack.enabled=True model.dtype_override="bf16" base.metadata='"{\"get_bos_id\":128000, \"get_eos_ids\":[128009, 128001]}"' export.output_name="llama3_2_bf16.pte"
```
For convenience, an [exported ExecuTorch bf16 model](https://huggingface.co/executorch-community/Llama-3.2-1B-ET/blob/main/llama3_2-1B.pte) is available on Hugging Face. The export was created using [this detailed recipe notebook](https://huggingface.co/executorch-community/Llama-3.2-1B-ET/blob/main/ExportRecipe_1B.ipynb).
@@ -79,7 +79,7 @@ For more detail using Llama 3.2 lightweight models including prompt template, pl
Export the model
```
-python -m extension.llm.export.export_llm base.checkpoint= base.params= model.use_kv_cache=True model.use_sdpa_with_kv_cache=True backend.xnnpack.enabled=True quantization.qmode="8da4w" quantization.group_size=128 model.dtype_override="fp32" base.metadata='{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' quantization.embedding_quantize="4,32" export.output_name="llama3_kv_sdpa_xnn_qe_4_32.pte"
+python -m extension.llm.export.export_llm base.checkpoint= base.params= model.use_kv_cache=True model.use_sdpa_with_kv_cache=True backend.xnnpack.enabled=True quantization.qmode="8da4w" quantization.group_size=128 model.dtype_override="fp32" base.metadata='"{\"get_bos_id\":128000, \"get_eos_ids\":[128009, 128001]}"' quantization.embedding_quantize=\'4,32\' export.output_name="llama3_kv_sdpa_xnn_qe_4_32.pte"
```
### For LLaVA model
diff --git a/examples/models/deepseek-r1-distill-llama-8B/README.md b/examples/models/deepseek-r1-distill-llama-8B/README.md
index 7695c678337..f05dd9990a2 100644
--- a/examples/models/deepseek-r1-distill-llama-8B/README.md
+++ b/examples/models/deepseek-r1-distill-llama-8B/README.md
@@ -61,8 +61,8 @@ python -m extension.llm.export.export_llm \
quantization.qmode="8da4w" \
quantization.group_size=128 \
model.dtype_override="fp16" \
- base.metadata='{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' \
- quantization.embedding_quantize="4,32" \
+ base.metadata='"{\"get_bos_id\":128000, \"get_eos_ids\":[128009, 128001]}"' \
+ quantization.embedding_quantize=\'4,32\' \
export.output_name="DeepSeek-R1-Distill-Llama-8B.pte"
```
diff --git a/examples/models/llama/README.md b/examples/models/llama/README.md
index 23a377a6611..e555043c44d 100644
--- a/examples/models/llama/README.md
+++ b/examples/models/llama/README.md
@@ -174,7 +174,7 @@ python -m extension.llm.export.export_llm \
model.use_kv_cache=True \
model.use_sdpa_with_kv_cache=True \
model.dtype_override="bf16" \
- base.metadata='{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' \
+ base.metadata='"{\"get_bos_id\":128000, \"get_eos_ids\":[128009, 128001]}"' \
export.output_name="llama3_2.pte"
```
For convenience, an [exported ExecuTorch bf16 model](https://huggingface.co/executorch-community/Llama-3.2-1B-ET/blob/main/llama3_2-1B.pte) is available on Hugging Face. The export was created using [this detailed recipe notebook](https://huggingface.co/executorch-community/Llama-3.2-1B-ET/blob/main/ExportRecipe_1B.ipynb).
@@ -203,9 +203,9 @@ python -m extension.llm.export.export_llm \
export.output_name="llama3_2.pte" \
model.use_kv_cache=True \
model.dtype_override="fp32" \
- base.preq_embedding_quantize="8,0" \
+ base.preq_embedding_quantize=\'8,0\' \
quantization.use_spin_quant="native" \
- base.metadata='{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}'
+ base.metadata='"{\"get_bos_id\":128000, \"get_eos_ids\":[128009, 128001]}"'
```
For convenience, an [exported ExecuTorch SpinQuant model](https://huggingface.co/executorch-community/Llama-3.2-1B-Instruct-SpinQuant_INT4_EO8-ET/blob/main/Llama-3.2-1B-Instruct-SpinQuant_INT4_EO8.pte) is available on Hugging Face. The export was created using [this detailed recipe notebook](https://huggingface.co/executorch-community/Llama-3.2-1B-Instruct-SpinQuant_INT4_EO8-ET/blob/main/Export_Recipe_Llama_3_2_1B_Instruct_SpinQuant_INT4_EO8.ipynb).
@@ -226,7 +226,7 @@ python -m extension.llm.export.export_llm \
base.use_lora=16 \
base.preq_mode="8da4w_output_8da8w" \
base.preq_group_size=32 \
- base.preq_embedding_quantize="8,0" \
+ base.preq_embedding_quantize=\'8,0\' \
model.use_sdpa_with_kv_cache=True \
model.use_kv_cache=True \
backend.xnnpack.enabled=True \
@@ -235,7 +235,7 @@ python -m extension.llm.export.export_llm \
export.max_seq_length=2048 \
export.max_context_length=2048 \
export.output_name="llama3_2.pte" \
- base.metadata='{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}'
+ base.metadata='"{\"get_bos_id\":128000, \"get_eos_ids\":[128009, 128001]}"'
```
For convenience, an [exported ExecuTorch QAT+LoRA model](https://huggingface.co/executorch-community/Llama-3.2-1B-Instruct-QLORA_INT4_EO8-ET/blob/main/Llama-3.2-1B-Instruct-QLORA_INT4_EO8.pte) is available on Hugging Face. The export was created using [this detailed recipe notebook](https://huggingface.co/executorch-community/Llama-3.2-1B-Instruct-QLORA_INT4_EO8-ET/blob/main/Export_Recipe_Llama_3_2_1B_Instruct_QLORA_INT4_EO8.ipynb).
@@ -256,11 +256,11 @@ You can export and run the original Llama 3 8B instruct model.
quantization.qmode="8da4w" \
quantization.group_size=128 \
model.dtype_override="fp32" \
- base.metadata='{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' \
- quantization.embedding_quantize="4,32" \
+ base.metadata='"{\"get_bos_id\":128000, \"get_eos_ids\":[128009, 128001]}"' \
+ quantization.embedding_quantize=\'4,32\' \
export.output_name="llama3_kv_sdpa_xnn_qe_4_32.pte"
```
- Due to the larger vocabulary size of Llama 3, we recommend quantizing the embeddings with `quantization.embedding_quantize="4,32"` as shown above to further reduce the model size.
+ Due to the larger vocabulary size of Llama 3, we recommend quantizing the embeddings with `quantization.embedding_quantize=\'4,32\'` as shown above to further reduce the model size.
If you're interested in deploying on non-CPU backends, [please refer the non-cpu-backend section](non_cpu_backends.md)
@@ -395,11 +395,11 @@ python -m extension.llm.export.export_llm \
base.params="${LLAMA_PARAMS:?}" \
model.use_kv_cache=True \
model.use_sdpa_with_kv_cache=True \
- base.metadata='{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' \
+ base.metadata='"{\"get_bos_id\":128000, \"get_eos_ids\":[128009, 128001]}"' \
export.output_name="llama3_2.pte" \
quantization.qmode="torchao:8da${QLINEAR_BITWIDTH}w" \
quantization.group_size=${QLINEAR_GROUP_SIZE} \
- quantization.embedding_quantize="torchao:${QEMBEDDING_BITWIDTH},${QEMBEDDING_GROUP_SIZE}" \
+ quantization.embedding_quantize=\'torchao:${QEMBEDDING_BITWIDTH},${QEMBEDDING_GROUP_SIZE}\' \
model.dtype_override="fp32"
```
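One more quoting detail worth calling out for the torchao override above: the value contains shell variables, so wrapping it in plain single quotes would block their expansion. The escaped-quote form keeps the variables expandable while still handing Hydra a quoted string. A small sketch under assumed example values (4-bit, group size 32; in the README these come from the surrounding recipe):

```bash
# Illustrative values only.
QEMBEDDING_BITWIDTH=4
QEMBEDDING_GROUP_SIZE=32
printf '%s\n' quantization.embedding_quantize=\'torchao:${QEMBEDDING_BITWIDTH},${QEMBEDDING_GROUP_SIZE}\'
# -> quantization.embedding_quantize='torchao:4,32'
```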
diff --git a/examples/models/phi_4_mini/README.md b/examples/models/phi_4_mini/README.md
index c2b3d515ec0..d168d54226e 100644
--- a/examples/models/phi_4_mini/README.md
+++ b/examples/models/phi_4_mini/README.md
@@ -40,7 +40,7 @@ python -m extension.llm.export.export_llm \
model.use_sdpa_with_kv_cache=True \
model.dtype_override="fp32" \
backend.xnnpack.enabled=True \
- base.metadata='{"get_bos_id":151643, "get_eos_ids":[151643]}' \
+ base.metadata='"{\"get_bos_id\":151643, \"get_eos_ids\":[151643]}"' \
export.output_name="phi-4-mini.pte" \
debug.verbose=True
```
diff --git a/examples/models/qwen2_5/README.md b/examples/models/qwen2_5/README.md
index b40daaca469..57784169ece 100644
--- a/examples/models/qwen2_5/README.md
+++ b/examples/models/qwen2_5/README.md
@@ -40,7 +40,7 @@ python -m extension.llm.export.export_llm \
model.use_sdpa_with_kv_cache=True \
model.dtype_override="fp32" \
backend.xnnpack.enabled=True \
- base.metadata='{"get_bos_id":151643, "get_eos_ids":[151643]}' \
+ base.metadata='"{\"get_bos_id\":151643, \"get_eos_ids\":[151643]}"' \
export.output_name="qwen2_5-1_5b.pte" \
debug.verbose=True
```
diff --git a/examples/models/qwen3/README.md b/examples/models/qwen3/README.md
index acdd4497503..d31d491adf2 100644
--- a/examples/models/qwen3/README.md
+++ b/examples/models/qwen3/README.md
@@ -25,7 +25,7 @@ python -m extension.llm.export.export_llm \
backend.xnnpack.enabled=True \
backend.xnnpack.extended_ops=True \
quantization.qmode="8da4w" \
- base.metadata='{"get_bos_id": 151644, "get_eos_ids":[151645]}' \
+ base.metadata='"{\"get_bos_id\": 151644, \"get_eos_ids\":[151645]}"' \
export.output_name="qwen3-0_6b.pte" \
debug.verbose=True
```
@@ -41,7 +41,7 @@ python -m extension.llm.export.export_llm \
backend.xnnpack.enabled=True \
backend.xnnpack.extended_ops=True \
quantization.qmode="8da4w" \
- base.metadata='{"get_bos_id": 151644, "get_eos_ids":[151645]}' \
+ base.metadata='"{\"get_bos_id\": 151644, \"get_eos_ids\":[151645]}"' \
export.output_name="qwen3-1_7b.pte" \
debug.verbose=True
```
@@ -57,7 +57,7 @@ python -m extension.llm.export.export_llm \
backend.xnnpack.enabled=True \
backend.xnnpack.extended_ops=True \
quantization.qmode="8da4w" \
- base.metadata='{"get_bos_id": 151644, "get_eos_ids":[151645]}' \
+ base.metadata='"{\"get_bos_id\": 151644, \"get_eos_ids\":[151645]}"' \
export.output_name="qwen3-4b.pte" \
debug.verbose=True
```
From 63a4002eaf0b77f620054d711b2e3c0def164dbb Mon Sep 17 00:00:00 2001
From: Jack Zhang <32371937+jackzhxng@users.noreply.github.com>
Date: Mon, 23 Jun 2025 13:34:05 -0700
Subject: [PATCH 16/17] Update
[ghstack-poisoned]
---
.github/workflows/android-perf.yml | 10 +++++-----
.github/workflows/apple-perf.yml | 12 ++++++------
2 files changed, 11 insertions(+), 11 deletions(-)
diff --git a/.github/workflows/android-perf.yml b/.github/workflows/android-perf.yml
index 1bede7c3f27..2f72594354a 100644
--- a/.github/workflows/android-perf.yml
+++ b/.github/workflows/android-perf.yml
@@ -230,7 +230,7 @@ jobs:
model.dtype_override=fp32 \
base.preq_embedding_quantize=\'8,0\' \
quantization.use_spin_quant=native \
- base.metadata='{"get_bos_id":128000,"get_eos_ids":[128009,128001]}'
+ base.metadata='"{\"get_bos_id\":128000,\"get_eos_ids\":[128009,128001]}"'
ls -lh "${OUT_ET_MODEL_NAME}.pte"
elif [[ ${{ matrix.config }} == "llama3_qlora" ]]; then
# QAT + LoRA
@@ -258,7 +258,7 @@ jobs:
export.max_seq_length=2048 \
export.max_context_length=2048 \
export.output_name="${OUT_ET_MODEL_NAME}.pte" \
- base.metadata='{"get_bos_id":128000,"get_eos_ids":[128009,128001]}'
+ base.metadata='"{\"get_bos_id\":128000,\"get_eos_ids\":[128009,128001]}"'
ls -lh "${OUT_ET_MODEL_NAME}.pte"
elif [[ ${{ matrix.config }} == "llama3_fb16" ]]; then
# Original BF16 version, without any quantization
@@ -271,7 +271,7 @@ jobs:
model.use_sdpa_with_kv_cache=true \
backend.xnnpack.enabled=true \
model.dtype_override=bf16 \
- base.metadata='{"get_bos_id":128000,"get_eos_ids":[128009,128001]}' \
+ base.metadata='"{\"get_bos_id\":128000,\"get_eos_ids\":[128009,128001]}"' \
export.output_name="${OUT_ET_MODEL_NAME}.pte"
ls -lh "${OUT_ET_MODEL_NAME}.pte"
elif [[ ${{ matrix.config }} == "et_xnnpack_custom_spda_kv_cache_8da4w" ]]; then
@@ -288,7 +288,7 @@ jobs:
quantization.qmode=8da4w \
quantization.group_size=32 \
quantization.embedding_quantize=\'8,0\' \
- base.metadata='{"get_bos_id":128000,"get_eos_ids":[128009,128001]}' \
+ base.metadata='"{\"get_bos_id\":128000,\"get_eos_ids\":[128009,128001]}"' \
export.output_name="${OUT_ET_MODEL_NAME}.pte"
ls -lh "${OUT_ET_MODEL_NAME}.pte"
elif [[ ${{ matrix.config }} == "llama3_qnn_htp" ]]; then
@@ -326,7 +326,7 @@ jobs:
quantization.qmode=8da4w \
quantization.group_size=32 \
quantization.embedding_quantize=\'8,0\' \
- base.metadata='{"get_bos_id":151644,"get_eos_ids":[151645]}' \
+ base.metadata='"{\"get_bos_id\":151644,\"get_eos_ids\":[151645]}"' \
export.output_name="${OUT_ET_MODEL_NAME}.pte"
ls -lh "${OUT_ET_MODEL_NAME}.pte"
fi
diff --git a/.github/workflows/apple-perf.yml b/.github/workflows/apple-perf.yml
index 1155ef2a7b2..b9134c675cf 100644
--- a/.github/workflows/apple-perf.yml
+++ b/.github/workflows/apple-perf.yml
@@ -239,7 +239,7 @@ jobs:
model.dtype_override=fp32 \
base.preq_embedding_quantize=\'8,0\' \
quantization.use_spin_quant=native \
- base.metadata='{"get_bos_id":128000,"get_eos_ids":[128009,128001]}'
+ base.metadata='"{\"get_bos_id\":128000,\"get_eos_ids\":[128009,128001]}"'
ls -lh "${OUT_ET_MODEL_NAME}.pte"
elif [[ ${{ matrix.config }} == "llama3_qlora" ]]; then
# QAT + LoRA
@@ -267,7 +267,7 @@ jobs:
export.max_seq_length=2048 \
export.max_context_length=2048 \
export.output_name="${OUT_ET_MODEL_NAME}.pte" \
- base.metadata='{"get_bos_id":128000,"get_eos_ids":[128009,128001]}'
+ base.metadata='"{\"get_bos_id\":128000,\"get_eos_ids\":[128009,128001]}"'
ls -lh "${OUT_ET_MODEL_NAME}.pte"
elif [[ ${{ matrix.config }} == "llama3_fb16" ]]; then
# Original BF16 version, without any quantization
@@ -280,7 +280,7 @@ jobs:
model.use_sdpa_with_kv_cache=true \
backend.xnnpack.enabled=true \
model.dtype_override=bf16 \
- base.metadata='{"get_bos_id":128000,"get_eos_ids":[128009,128001]}' \
+ base.metadata='"{\"get_bos_id\":128000,\"get_eos_ids\":[128009,128001]}"' \
export.output_name="${OUT_ET_MODEL_NAME}.pte"
ls -lh "${OUT_ET_MODEL_NAME}.pte"
elif [[ ${{ matrix.config }} == "et_xnnpack_custom_spda_kv_cache_8da4w" ]]; then
@@ -297,7 +297,7 @@ jobs:
quantization.qmode=8da4w \
quantization.group_size=32 \
quantization.embedding_quantize=\'8,0\' \
- base.metadata='{"get_bos_id":128000,"get_eos_ids":[128009,128001]}' \
+ base.metadata='"{\"get_bos_id\":128000,\"get_eos_ids\":[128009,128001]}"' \
export.output_name="${OUT_ET_MODEL_NAME}.pte"
ls -lh "${OUT_ET_MODEL_NAME}.pte"
elif [[ ${{ matrix.config }} == "llama3_coreml_ane" ]]; then
@@ -306,7 +306,7 @@ jobs:
${CONDA_RUN} python -m extension.llm.export.export_llm \
base.checkpoint="${DOWNLOADED_PATH}/consolidated.00.pth" \
base.params="${DOWNLOADED_PATH}/params.json" \
- quantization.embedding_quantize="4,32" \
+ quantization.embedding_quantize=\'4,32\' \
model.use_kv_cache=true \
model.enable_dynamic_shape=false \
backend.coreml.enabled=true \
@@ -331,7 +331,7 @@ jobs:
quantization.qmode=8da4w \
quantization.group_size=32 \
quantization.embedding_quantize=\'8,0\' \
- base.metadata='{"get_bos_id":151644,"get_eos_ids":[151645]}' \
+ base.metadata='"{\"get_bos_id\":151644,\"get_eos_ids\":[151645]}"' \
export.output_name="${OUT_ET_MODEL_NAME}.pte"
ls -lh "${OUT_ET_MODEL_NAME}.pte"
fi
From a3013e89de120bb4250f4b3c05b4ec2c86e18a60 Mon Sep 17 00:00:00 2001
From: Jack Zhang <32371937+jackzhxng@users.noreply.github.com>
Date: Mon, 23 Jun 2025 14:10:06 -0700
Subject: [PATCH 17/17] Update
[ghstack-poisoned]
---
.github/workflows/android-perf.yml | 2 +-
.github/workflows/apple-perf.yml | 2 +-
2 files changed, 2 insertions(+), 2 deletions(-)
diff --git a/.github/workflows/android-perf.yml b/.github/workflows/android-perf.yml
index 2f72594354a..a7c2b9ca14c 100644
--- a/.github/workflows/android-perf.yml
+++ b/.github/workflows/android-perf.yml
@@ -316,7 +316,7 @@ jobs:
if [[ ${{ matrix.config }} == "et_xnnpack_custom_spda_kv_cache_8da4w" ]]; then
DOWNLOADED_PATH=$(bash .ci/scripts/download_hf_hub.sh --model_id "${HF_MODEL_REPO}" --subdir "." --files "tokenizer.json")
python -m extension.llm.export.export_llm \
- base.model_class=qwen3-0_6b \
+ base.model_class=qwen3_0_6b \
base.params=examples/models/qwen3/0_6b_config.json \
model.use_kv_cache=true \
model.use_sdpa_with_kv_cache=true \
diff --git a/.github/workflows/apple-perf.yml b/.github/workflows/apple-perf.yml
index b9134c675cf..6b1666da642 100644
--- a/.github/workflows/apple-perf.yml
+++ b/.github/workflows/apple-perf.yml
@@ -321,7 +321,7 @@ jobs:
if [[ ${{ matrix.config }} == "et_xnnpack_custom_spda_kv_cache_8da4w" ]]; then
DOWNLOADED_PATH=$(bash .ci/scripts/download_hf_hub.sh --model_id "${HF_MODEL_REPO}" --subdir "." --files "tokenizer.json")
${CONDA_RUN} python -m extension.llm.export.export_llm \
- base.model_class=qwen3-0_6b \
+ base.model_class=qwen3_0_6b \
base.params=examples/models/qwen3/0_6b_config.json \
model.use_kv_cache=true \
model.use_sdpa_with_kv_cache=true \